In [1]:
import pandas as pd

In [2]:
import pickle

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [5]:
df_train = df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')
df_val = pd.read_parquet('./data/fhv_tripdata_2021-02.parquet')

In [6]:
df_train.describe

<bound method NDFrame.describe of         dispatching_base_num     pickup_datetime    dropOff_datetime  \
0                     B00009 2021-01-01 00:27:00 2021-01-01 00:44:00   
1                     B00009 2021-01-01 00:50:00 2021-01-01 01:07:00   
2                     B00013 2021-01-01 00:01:00 2021-01-01 01:51:00   
3                     B00037 2021-01-01 00:13:09 2021-01-01 00:21:26   
4                     B00037 2021-01-01 00:38:31 2021-01-01 00:53:44   
...                      ...                 ...                 ...   
1154107               B03266 2021-01-31 23:43:03 2021-01-31 23:51:48   
1154108               B03284 2021-01-31 23:50:27 2021-02-01 00:48:03   
1154109      B03285          2021-01-31 23:13:46 2021-01-31 23:29:58   
1154110      B03285          2021-01-31 23:58:03 2021-02-01 00:17:29   
1154111               B03321 2021-01-31 23:39:00 2021-02-01 00:15:00   

         PUlocationID  DOlocationID SR_Flag Affiliated_base_number  
0                 NaN           

### Q1: Read the data for January. How many records are there?

In [7]:
1154112

1154112

### Q2: Computing duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the average trip duration in January?

In [8]:
df_train['duration'] = df_train.dropOff_datetime - df_train.pickup_datetime
df_train.duration = df_train.duration.apply(lambda td: td.total_seconds() / 60)

df_val['duration'] = df_val.dropOff_datetime - df_val.pickup_datetime
df_val.duration = df_val.duration.apply(lambda td: td.total_seconds() / 60)


df_train['duration']

0           17.000000
1           17.000000
2          110.000000
3            8.283333
4           15.216667
              ...    
1154107      8.750000
1154108     57.600000
1154109     16.200000
1154110     19.433333
1154111     36.000000
Name: duration, Length: 1154112, dtype: float64

In [9]:
df_train['duration'].mean()

19.1672240937939

19.1672240937939


In [10]:
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)].copy()
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()

### Q3. Missing values
The features we'll use for our model are the pickup and dropoff location IDs.

But they have a lot of missing values there. Let's replace them with "-1".

What's the fractions of missing values for the pickup location ID? I.e. fraction of "-1"s after you filled the NAs.

In [11]:
len(df_train.PUlocationID.loc[df_train.PUlocationID.isna()]) / len(df_train)

0.8352732770722617

In [12]:
df_train.PUlocationID = df_train.PUlocationID.fillna(-1).astype(str)
df_train.DOlocationID = df_train.DOlocationID.fillna(-1).astype(str)

df_val.PUlocationID = df_val.PUlocationID.fillna(-1).astype(str)
df_val.DOlocationID = df_val.DOlocationID.fillna(-1).astype(str)

### Q4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

Turn the dataframe into a list of dictionaries
Fit a dictionary vectorizer
Get a feature matrix from it
What's the dimensionality of this matrix? (The number of columns).

In [13]:
dv = DictVectorizer()

train_dicts = df_train[['PUlocationID', 'DOlocationID']].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[['PUlocationID', 'DOlocationID']].to_dict(orient='records')
X_val = dv.transform(val_dicts)


In [14]:
train_dicts[:10]

[{'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '71.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '91.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '37.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'}]

In [15]:
X_train.shape

(1109826, 525)

In [16]:
X_val.shape

(990113, 525)

#### 2

#### Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.

Train a plain linear regression model with default parameters
Calculate the RMSE of the model on the training data
What's the RMSE on train?

5.52
10.52
15.52
20.52


In [17]:
y_train = df_train['duration'].values
y_val = df_val['duration'].values

In [18]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)


10.52851910722287

#### Q6. Evaluating the model
Now let's apply this model to the validation dataset (Feb 2021).

What's the RMSE on validation?

6.01
11.01
16.01
21.01

In [19]:
y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)


11.014283229248326