In [28]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [3]:
# Read the 2023-01 yellow-taxi trip file; the model below is fitted on this frame.
df = pd.read_parquet(path='./data/yellow_tripdata_2023-01.parquet')

In [4]:
# Number of columns in the raw frame (second entry of the shape tuple).
df.shape[1]

19

In [5]:
# Trip duration as a timedelta: drop-off timestamp minus pick-up timestamp.
df['duration'] = pd.to_datetime(df['tpep_dropoff_datetime']).sub(
    pd.to_datetime(df['tpep_pickup_datetime'])
)

In [6]:
# Convert the timedelta column to minutes (float). The vectorised `.dt`
# accessor yields the same values as calling `total_seconds()` on every row
# through `.apply`, but avoids one Python-level call per row.
df['duration'] = df['duration'].dt.total_seconds() / 60

In [7]:
# Preview the duration column (values are in minutes after the conversion).
df['duration']

0           8.433333
1           6.316667
2          12.750000
3           9.616667
4          10.833333
             ...    
3066761    13.983333
3066762    19.450000
3066763    24.516667
3066764    13.000000
3066765    14.400000
Name: duration, Length: 3066766, dtype: float64

In [8]:
# Standard deviation of trip duration, computed before any rows are filtered out.
df['duration'].std()

42.594351241920904

In [9]:
# Row count before the duration filter is applied.
df.shape[0]

3066766

In [10]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` makes the result an independent frame, so the later column
# assignment on `df` does not trigger SettingWithCopyWarning.
df = df[df['duration'].between(1, 60)].copy()

In [11]:
# Row count after the duration filter.
df.shape[0]

3009173

In [12]:
# Percentage of rows kept by the duration filter.
# Both counts are copied by hand from the row-count outputs before and after
# filtering, so they must be updated manually if the input data changes.
rows_before, rows_after = 3066766, 3009173
fraction = 100 * rows_after / rows_before
fraction

98.1220282212598

In [13]:
# List the column labels; 'duration' is the column added earlier in this notebook.
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [15]:
# Pick-up and drop-off zone IDs are used as categorical features, so cast
# them to strings before they are turned into feature dicts.
categorical = ['PULocationID', 'DOLocationID']
df[categorical] = df[categorical].astype({name: str for name in categorical})

In [16]:
# Turn each row's categorical columns into a dict, then fit the vectorizer
# on those dicts and build the training feature matrix in one step.
dicts = df.loc[:, categorical].to_dict(orient='records')
dv = DictVectorizer()
X = dv.fit_transform(dicts)

In [20]:
# Count the feature columns produced by the fitted vectorizer.
len(dv.get_feature_names_out())

515

In [39]:
# Print the learned feature names; each has the form '<column>=<value>'.
print(dv.get_feature_names_out())

['DOLocationID=1' 'DOLocationID=10' 'DOLocationID=100' 'DOLocationID=101'
 'DOLocationID=102' 'DOLocationID=106' 'DOLocationID=107'
 'DOLocationID=108' 'DOLocationID=109' 'DOLocationID=11'
 'DOLocationID=111' 'DOLocationID=112' 'DOLocationID=113'
 'DOLocationID=114' 'DOLocationID=115' 'DOLocationID=116'
 'DOLocationID=117' 'DOLocationID=118' 'DOLocationID=119'
 'DOLocationID=12' 'DOLocationID=120' 'DOLocationID=121'
 'DOLocationID=122' 'DOLocationID=123' 'DOLocationID=124'
 'DOLocationID=125' 'DOLocationID=126' 'DOLocationID=127'
 'DOLocationID=128' 'DOLocationID=129' 'DOLocationID=13'
 'DOLocationID=130' 'DOLocationID=131' 'DOLocationID=132'
 'DOLocationID=133' 'DOLocationID=134' 'DOLocationID=135'
 'DOLocationID=136' 'DOLocationID=137' 'DOLocationID=138'
 'DOLocationID=139' 'DOLocationID=14' 'DOLocationID=140'
 'DOLocationID=141' 'DOLocationID=142' 'DOLocationID=143'
 'DOLocationID=144' 'DOLocationID=145' 'DOLocationID=146'
 'DOLocationID=147' 'DOLocationID=148' 'DOLocationID=149'
 '

In [21]:
# Regression target: the trip duration column of the filtered training frame.
target = 'duration'
y = df.loc[:, target]

In [24]:
# Fit a plain linear regression on the vectorized training features.
lr = LinearRegression()
lr.fit(X=X, y=y)

In [26]:
# Predictions on the same data the model was fitted on.
pred = lr.predict(X=X)

In [29]:
# RMSE on the training data (same units as the target: minutes).
root_mean_squared_error(y_true=y, y_pred=pred)

7.6492619633678824

In [30]:
# Read the 2023-02 yellow-taxi trip file, used below as the validation set.
df_val = pd.read_parquet(path='./data/yellow_tripdata_2023-02.parquet')

In [32]:
# Trip duration in minutes for the validation frame: drop-off minus pick-up,
# converted with the vectorised `.dt.total_seconds()` instead of a row-wise
# `.apply`, which would make one Python-level call per row.
df_val['duration'] = (
    pd.to_datetime(df_val['tpep_dropoff_datetime'])
    - pd.to_datetime(df_val['tpep_pickup_datetime'])
).dt.total_seconds() / 60
df_val['duration']

0           1.683333
1           0.233333
2           0.233333
3          32.083333
4          13.300000
             ...    
2913950    19.000000
2913951    11.133333
2913952    14.000000
2913953     7.000000
2913954     9.800000
Name: duration, Length: 2913955, dtype: float64

In [33]:
# Keep only validation trips lasting between 1 and 60 minutes (inclusive),
# matching the filter applied to the training frame. `.copy()` makes the
# result an independent frame, so the later column assignment on `df_val`
# does not trigger SettingWithCopyWarning.
df_val = df_val[df_val['duration'].between(1, 60)].copy()

In [34]:
# Same categorical columns as for training, cast to strings on the validation frame.
categorical = ['PULocationID', 'DOLocationID']
df_val[categorical] = df_val[categorical].astype({name: str for name in categorical})

In [41]:
# Encode validation rows with the already-fitted vectorizer (transform only,
# no refit), so the columns line up with the training matrix.
dicts_val = df_val.loc[:, categorical].to_dict(orient='records')
X_val = dv.transform(dicts_val)

In [42]:
# Predict durations for the validation features with the fitted model.
y_pred = lr.predict(X=X_val)

In [43]:
# Ground-truth durations for the filtered validation frame.
target = 'duration'
y_val = df_val.loc[:, target]

In [44]:
# RMSE on the validation data.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

7.81181893596011