In [1]:
import datetime
import pandas as pd


from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_parquet("../data/fhv_tripdata_2021-01.parquet")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154112 entries, 0 to 1154111
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1154112 non-null  object        
 1   pickup_datetime         1154112 non-null  datetime64[ns]
 2   dropOff_datetime        1154112 non-null  datetime64[ns]
 3   PUlocationID            195845 non-null   float64       
 4   DOlocationID            991892 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1153227 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(3)
memory usage: 61.6+ MB


In [3]:
categorical = ['PUlocationID', 'DOlocationID']

## Q1: number of records?

In [4]:
num_records_orig = df.shape[0]
print(f"Number of records: {num_records_orig}")

Number of records: 1154112


## Q2: avg duration in Jan (minutes)

In [5]:
df["duration"] = (df["dropOff_datetime"] - df["pickup_datetime"]) / datetime.timedelta(minutes=1)
print(f"Average ride time: {df['duration'].mean():.4f}")

Average ride time: 19.1672


## Analyze duration distribution.
Remove outliers (keep only records with duration [1, 60])
How many records did you remove?

In [6]:
df.duration.describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

In [7]:
df = df[(df.duration >= 1) & (df.duration <= 60)]
num_records_no_outliers = df.shape[0]
print(f"Number of outliers: {num_records_orig - num_records_no_outliers}")

Number of outliers: 44286


## Q3: fraction of missing values for the pickup ID

In [8]:
df[categorical] = df[categorical].fillna(value=-1)
frac_missing_pickup_id = df[df.PUlocationID == -1].shape[0] / num_records_no_outliers
print(f"Pickup ID is missing in {frac_missing_pickup_id * 100:.2f}% cases")

Pickup ID is missing in 83.53% cases


## Q4: apply one-hot encoding to the pickup and dropoff location IDs. What is the dimensionality of the matrix?

In [9]:
vect = DictVectorizer()
train_dicts = df[categorical].astype(str).to_dict(orient="records")

X_train = vect.fit_transform(train_dicts)
print(f"Dimensionality: {X_train.shape[1]}")

Dimensionality: 525


## Q5: Train a plain linear regression model with default parameters. What's the RMSE on train?

In [10]:
y_train = df.duration.values
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_train)

rmse = mean_squared_error(y_train, y_pred, squared=False)
print(f"Model RMSE on train is {rmse:.2f}")

Model RMSE on train is 10.53


## Q6: apply this model to the validation dataset (Feb 2021). What's the RMSE on validation?

In [11]:
df_val = pd.read_parquet("../data/fhv_tripdata_2021-02.parquet")
df_val["duration"] = (df_val["dropOff_datetime"] - df_val["pickup_datetime"]) / datetime.timedelta(minutes=1)
y_val = df_val["duration"].values

val_dicts = df_val[categorical].astype(str).to_dict(orient="records")
X_val = vect.transform(val_dicts)

y_pred_val = model.predict(X_val)

rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)
print(f"Model RMSE on validation is {rmse_val:.2f}")

Model RMSE on validation is 161.00


The answer doesn't seem to be close to any of the suggested options. 
Let's try removing outliers from the validation set. Questionable decision, but this will ensure that input data has the same distribution as the training data. 
In production we might have another model or another special way of dealing with outliers

In [12]:
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]
y_val = df_val["duration"].values

val_dicts = df_val[categorical].astype(str).to_dict(orient="records")
X_val = vect.transform(val_dicts)

y_pred_val = model.predict(X_val)

rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)
print(f"Model RMSE on validation is {rmse_val:.2f}")

Model RMSE on validation is 11.36
