In [42]:
import requests
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import root_mean_squared_error

## Data Loading

In [1]:
!mkdir data

In [6]:
files = [('green_tripdata_2024-10.parquet', './data'), 
         ('green_tripdata_2024-11.parquet', './data')]

print("Downloading started:...")

for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"
    response = requests.get(url, stream=True)

    with open(save_path, "wb") as handle:
        for data in response.iter_content(chunk_size=1024):
            handle.write(data)
    

Downloading started:...


In [8]:
train_data = pd.read_parquet("data/green_tripdata_2024-10.parquet")
test_data = pd.read_parquet("data/green_tripdata_2024-11.parquet")

In [9]:
train_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-10-01 00:52:13,2024-10-01 01:02:39,N,1.0,75,238,1.0,2.1,12.8,1.0,0.5,0.0,0.0,,1.0,18.05,1.0,1.0,2.75
1,2,2024-10-01 00:56:34,2024-10-01 01:03:51,N,1.0,134,82,1.0,4.86,19.8,1.0,0.5,0.0,0.0,,1.0,22.3,2.0,1.0,0.0
2,2,2024-10-01 00:23:31,2024-10-01 00:45:17,N,1.0,202,260,1.0,3.77,22.6,1.0,0.5,0.0,0.0,,1.0,25.1,2.0,1.0,0.0
3,2,2024-10-01 00:25:02,2024-10-01 00:37:16,N,1.0,130,218,1.0,3.11,15.6,1.0,0.5,0.0,0.0,,1.0,18.1,2.0,1.0,0.0
4,2,2024-10-01 00:11:11,2024-10-01 00:25:43,N,1.0,42,94,2.0,4.48,21.9,1.0,0.5,1.0,0.0,,1.0,25.4,1.0,1.0,0.0


In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56147 entries, 0 to 56146
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               56147 non-null  int32         
 1   lpep_pickup_datetime   56147 non-null  datetime64[us]
 2   lpep_dropoff_datetime  56147 non-null  datetime64[us]
 3   store_and_fwd_flag     54502 non-null  object        
 4   RatecodeID             54502 non-null  float64       
 5   PULocationID           56147 non-null  int32         
 6   DOLocationID           56147 non-null  int32         
 7   passenger_count        54502 non-null  float64       
 8   trip_distance          56147 non-null  float64       
 9   fare_amount            56147 non-null  float64       
 10  extra                  56147 non-null  float64       
 11  mta_tax                56147 non-null  float64       
 12  tip_amount             56147 non-null  float64       
 13  t

In [10]:
num_features = ['total_amount', 'trip_distance', 'passenger_count']
cat_features = ['PULocationID', 'DOLocationID']

In [37]:
def preprocessing(data):
    data['duration'] = data['lpep_dropoff_datetime'] - data['lpep_pickup_datetime']
    data.duration = data.duration.apply(lambda td: td.total_seconds()/60)
    data = data[(data.duration >= 3.) & (data.duration <= 90.)] 
    data.fillna(0, inplace=True) #maybe debug this later
    return data

In [38]:
X_train = preprocessing(train_data)[num_features + cat_features]
y_train = preprocessing(train_data)['duration']

X_test = preprocessing(test_data)[num_features + cat_features]
y_test = preprocessing(test_data)['duration']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.fillna(0, inplace=True)


In [35]:
X_train[X_train.passenger_count == np.nan]

Unnamed: 0,total_amount,trip_distance,passenger_count,PULocationID,DOLocationID


In [39]:
X_train.isnull().sum()

total_amount       0
trip_distance      0
passenger_count    0
PULocationID       0
DOLocationID       0
dtype: int64

In [40]:
X_test.isnull().sum()

total_amount       0
trip_distance      0
passenger_count    0
PULocationID       0
DOLocationID       0
dtype: int64

In [44]:
model = LinearRegression()
model.fit(X_train, y_train)

In [45]:
train_pred = model.predict(X_train)
root_mean_squared_error(train_pred, y_train)

6.929022561152728

In [46]:
test_pred = model.predict(X_test)
root_mean_squared_error(test_pred, y_test)

6.671806092249931

In [None]:
test_pred.mean, test_pred.s