In [1]:
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.


In [2]:
# read in data
data_202301 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
data_202302 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [3]:
def prep_df(df):
    # make copy of df
    df_new = df.copy()

    # calc duration in mins
    df_new["duration_min"] = (df_new.tpep_dropoff_datetime - df_new.tpep_pickup_datetime).dt.total_seconds() / 60

    # filter for trips between 1min & 60mins only
    df_new = df_new[(df_new.duration_min >= 1) & (df_new.duration_min <= 60)]

    # convert x var to string
    x_var = ["PULocationID", "DOLocationID"]
    df_new[x_var] = df_new[x_var].astype(str)

    return df_new

In [4]:
# Q1
len(data_202301.axes[1])

19

In [5]:
# Q2

# prep dataset
data_202301_filtered = prep_df(data_202301)

In [6]:
# calc std deviation
data_202301_filtered["duration_min"].std()

np.float64(9.939385620151036)

In [7]:
# Q3
data_202301_filtered.duration_min.describe()

count    3.009173e+06
mean     1.420486e+01
std      9.939386e+00
min      1.000000e+00
25%      7.216667e+00
50%      1.155000e+01
75%      1.818333e+01
max      6.000000e+01
Name: duration_min, dtype: float64

In [8]:
# calc % records left
len(data_202301_filtered.axes[0]) / len(data_202301.axes[0]) * 100

98.1220282212598

In [9]:
# Q4

x_var = ["PULocationID", "DOLocationID"]

# convert to list of dict, where each dict = 1 row
x_var_list_of_dict = data_202301_filtered[x_var].to_dict(orient='records')
x_var_list_of_dict[:10]

[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'},
 {'PULocationID': '161', 'DOLocationID': '137'},
 {'PULocationID': '239', 'DOLocationID': '143'},
 {'PULocationID': '142', 'DOLocationID': '200'},
 {'PULocationID': '164', 'DOLocationID': '236'},
 {'PULocationID': '141', 'DOLocationID': '107'}]

In [10]:
# initiate DictVectorizer object
dv = DictVectorizer()
x_train = dv.fit_transform(x_var_list_of_dict)

In [20]:
x_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6018346 stored elements and shape (3009173, 515)>

In [12]:
# Q5

# extract response var
y_var = ["duration_min"]
y_train = data_202301_filtered[y_var].values

In [13]:
# fit LR model
lr = LinearRegression()
lr.fit(x_train, y_train)

In [14]:
# calc fitted y values based on train data
y_pred_train = lr.predict(x_train)

In [15]:
# calc RMSE
mean_squared_error(y_train, y_pred_train, squared = False)



np.float64(7.649261932106969)

In [16]:
# Q5

# prep dataset
data_202302_filtered = prep_df(data_202302)
data_202302_filtered.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration_min
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0,1.683333
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.8,1.0,N,132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25,32.083333
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0,13.3
5,1,2023-02-01 00:52:40,2023-02-01 01:07:18,1.0,5.1,1.0,N,148,236,1,21.9,3.5,0.5,5.35,0.0,1.0,32.25,2.5,0.0,14.633333
6,1,2023-02-01 00:12:39,2023-02-01 00:40:36,1.0,8.9,1.0,N,137,244,1,41.5,3.5,0.5,3.5,0.0,1.0,50.0,2.5,0.0,27.95


In [17]:
# extract x_test
x_test_list_of_dict = data_202302_filtered[x_var].to_dict(orient='records')
x_test = dv.fit_transform(x_test_list_of_dict)

# extract y_test
y_test = data_202302_filtered[y_var].values

In [21]:
x_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5711902 stored elements and shape (2855951, 514)>

In [22]:
# calc test RMSE
y_pred_test = lr.predict(x_test)
mean_squared_error(y_test, y_pred_test, squared = False)

ValueError: X has 514 features, but LinearRegression is expecting 515 features as input.