In [1]:
import numpy as np
import pandas as pd
pd.__version__

'2.2.3'

In [2]:
import sklearn
sklearn.__version__

'1.6.1'

## Download the data
We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records".
Download the data for January and February 2023.

In [3]:
# January 2023
df_jan = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
# February 2023
df_feb = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

In [4]:
df_jan.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0


## Q1: How many columns does the January 2023 data have?

In [5]:
print(len(df_jan.columns))

19


## Q2: Compute Duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.
What's the standard deviation of the trips duration in January?

In [6]:
def get_total_minutes(row):
    seconds = row.total_seconds()
    minutes = seconds / 60
    return minutes
    
duration = df_jan["tpep_dropoff_datetime"] - df_jan["tpep_pickup_datetime"]
duration = duration.apply(lambda x: get_total_minutes(x))

In [7]:
duration

0           8.433333
1           6.316667
2          12.750000
3           9.616667
4          10.833333
             ...    
3066761    13.983333
3066762    19.450000
3066763    24.516667
3066764    13.000000
3066765    14.400000
Length: 3066766, dtype: float64

In [8]:
print(f"The standard deviation is: {np.std(duration)}")

The standard deviation is: 42.59434429744777


## Q3: Dropping Outliers
Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records left after you dropped the outliers?

In [9]:
duration_no_outlier = duration[(duration >= 1) & (duration <= 60)]
fraction = np.round(len(duration_no_outlier)/len(duration),2) * 100
print(f"The fraction of the record left after dropping is: {fraction} %")

The fraction of the record left after dropping is: 98.0 %


## Q4: One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

   - Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
   - Fit a dictionary vectorizer
   - Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [30]:
categorical = ["PULocationID","DOLocationID"]
df_jan[categorical] = df_jan.loc[:, categorical].astype(str)
df_jan[categorical].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 2 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   PULocationID  object
 1   DOLocationID  object
dtypes: object(2)
memory usage: 46.8+ MB


In [31]:
features_dict = pu_do.to_dict(orient="records")

In [34]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer()
X_train = dv.fit_transform(features_dict)


In [36]:
print(X_train.shape[1])

518


In [32]:
dv.feature_names_

['DOLocationID=1',
 'DOLocationID=10',
 'DOLocationID=100',
 'DOLocationID=101',
 'DOLocationID=102',
 'DOLocationID=105',
 'DOLocationID=106',
 'DOLocationID=107',
 'DOLocationID=108',
 'DOLocationID=109',
 'DOLocationID=11',
 'DOLocationID=111',
 'DOLocationID=112',
 'DOLocationID=113',
 'DOLocationID=114',
 'DOLocationID=115',
 'DOLocationID=116',
 'DOLocationID=117',
 'DOLocationID=118',
 'DOLocationID=119',
 'DOLocationID=12',
 'DOLocationID=120',
 'DOLocationID=121',
 'DOLocationID=122',
 'DOLocationID=123',
 'DOLocationID=124',
 'DOLocationID=125',
 'DOLocationID=126',
 'DOLocationID=127',
 'DOLocationID=128',
 'DOLocationID=129',
 'DOLocationID=13',
 'DOLocationID=130',
 'DOLocationID=131',
 'DOLocationID=132',
 'DOLocationID=133',
 'DOLocationID=134',
 'DOLocationID=135',
 'DOLocationID=136',
 'DOLocationID=137',
 'DOLocationID=138',
 'DOLocationID=139',
 'DOLocationID=14',
 'DOLocationID=140',
 'DOLocationID=141',
 'DOLocationID=142',
 'DOLocationID=143',
 'DOLocationID=144',

## Q5: Training a model
Now let's use the feature matrix from the previous step to train a model.

    - Train a plain linear regression model with default parameters, where duration is the response variable
    - Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [50]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

In [51]:
y_train = duration.values

In [52]:
model = LinearRegression()
model.fit(X_train,y_train)

In [46]:
y_pred = model.predict(X_train)

In [49]:
sns.distplot(y_pred, label="Predictions")

NameError: name 'sns' is not defined

In [55]:
mse = mean_squared_error(y_train, y_pred)
m

print("MSE:", mse)

MSE: 1763.7058208570154
