In [1]:
import pandas as pd

In [2]:
import pickle

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [83]:
filename1 = 'fhv_tripdata_2021-01.parquet'
filename2 = 'fhv_tripdata_2021-02.parquet'

df = pd.read_parquet(filename1)
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [60]:
df.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
dtype: object

In [61]:
# Question 1
print("The number of records Jan 2021 FHV data is ",df.shape[0])

The number of records Jan 2021 FHV data is  1154112


In [62]:
# Question 2

In [63]:
df['duration'] = df.dropOff_datetime - df.pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [64]:
print("Average duration in Jan 2021 FHV is ",df.duration.mean())

Average duration in Jan 2021 FHV is  19.1672240937939


In [65]:
# Question 3

In [66]:
df.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               958267
DOlocationID               162220
SR_Flag                   1154112
Affiliated_base_number        885
duration                        0
dtype: int64

In [67]:
df.PUlocationID.fillna(-1,inplace=True)
df.DOlocationID.fillna(-1,inplace=True)

In [68]:
fraction = (len(df[df.PUlocationID==-1])/len(df))*100
print(" the fractions of missing values for the pickup location ID is %.2f %s"%(fraction,"%"))

 the fractions of missing values for the pickup location ID is 83.03 %


In [69]:
# Question 4

In [70]:
def read_dataframe(df):

    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [72]:
categorical = ['PUlocationID', 'DOlocationID']
df1 = read_dataframe(df)
df1[categorical] = df1[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [73]:
train_dicts = df1[categorical].to_dict(orient='records')
train_dicts

[{'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '71.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '91.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '37.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '89.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '177.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '225.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '63.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '67.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '22.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '14.0'},
 {'PUlocationID': '-1.0', 'DO

In [74]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [75]:
print("The dimension of the matrix is ",X_train.shape[1])

The dimension of the matrix is  525


In [79]:
target = 'duration'
y_train = df1[target].values

In [80]:
# Question 5

In [81]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

rmse = mean_squared_error(y_train, y_pred, squared=False)
print("the RMSE on train is ",rmse)

the RMSE on train is  10.528519107211832


In [None]:
# Question 6

In [84]:
data = pd.read_parquet(filename2)
data.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037


In [86]:
data.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               884691
DOlocationID               152352
SR_Flag                   1037692
Affiliated_base_number          0
dtype: int64

In [87]:
data.PUlocationID.fillna(-1,inplace=True)
data.DOlocationID.fillna(-1,inplace=True)

In [88]:
data1 = read_dataframe(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [89]:
categorical1 = ['PUlocationID', 'DOlocationID']
data1[categorical1] = data1[categorical1].astype(str)

In [90]:
train_dicts1 = data1[categorical].to_dict(orient='records')
train_dicts1

[{'PUlocationID': '173.0', 'DOlocationID': '82.0'},
 {'PUlocationID': '173.0', 'DOlocationID': '56.0'},
 {'PUlocationID': '82.0', 'DOlocationID': '129.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '225.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '26.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '169.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '161.0'},
 {'PUlocationID': '13.0', 'DOlocationID': '182.0'},
 {'PUlocationID': '152.0', 'DOlocationID': '244.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '265.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '237.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '248.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '248.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '159.0'},
 {'PUlocationID':

In [91]:
X_val = dv.transform(train_dicts1)

In [92]:
target = 'duration'
y_val = data1[target].values

In [93]:
y_pred = lr.predict(X_val)

rmse1 = mean_squared_error(y_val, y_pred, squared=False)

In [94]:
print("the RMSE on validation is ",rmse1)

the RMSE on validation is  11.014283203471416
