In [1]:
import pandas as pd

In [2]:
df_jan = pd.read_parquet('data/fhv_tripdata_2021-01.parquet')
df_feb = pd.read_parquet('data/fhv_tripdata_2021-02.parquet')
df_jan.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


# Q1. Downloading the data

In [3]:
print(f'There are {len(df_jan)} records in the January 2021 data.')

There are 1154112 records in the January 2021 data.


# Q2. Computing duration

In [4]:
df_jan['duration'] = (df_jan['dropOff_datetime'] - \
                      df_jan['pickup_datetime']).dt.total_seconds() / 60.0
df_feb['duration'] = (df_feb['dropOff_datetime'] - \
                      df_feb['pickup_datetime']).dt.total_seconds() / 60.0

In [5]:
mean_duration = df_jan['duration'].mean()
print(f'The average trip duration for January is {mean_duration:.2f}.')

The average trip duration for January is 19.17.


# Data preparation

In [6]:
df_jan_normal = df_jan[(df_jan['duration'] <= 60) & (df_jan['duration'] >= 1)]
df_feb_normal = df_feb[(df_feb['duration'] <= 60) & (df_feb['duration'] >= 1)]

In [7]:
drop_jan = len(df_jan) - len(df_jan_normal)
drop_feb = len(df_feb) - len(df_feb_normal)

print(f'There are {drop_jan} records dropped in January and {drop_feb} '
      f'records dropped in Feburary.')

There are 44286 records dropped in January and 47579 records dropped in Feburary.


# Q3. Missing values

In [8]:
df_jan_normal['PUlocationID'].fillna(-1, inplace=True)
df_jan_normal['DOlocationID'].fillna(-1, inplace=True)
df_feb_normal['PUlocationID'].fillna(-1, inplace=True)
df_feb_normal['DOlocationID'].fillna(-1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [9]:
fraction_missing = (df_jan_normal['PUlocationID'] == -1).mean()
print(f'The fraction of missing values for pickup location ID in January'
      f' is {fraction_missing:.2f}.')

The fraction of missing values for pickup location ID in January is 0.84.


# Q4. One-hot encoding

In [10]:
from sklearn.feature_extraction import DictVectorizer

In [11]:
df_jan_normal.loc[:, 'PUlocationID'] = df_jan_normal['PUlocationID'].astype(str)
df_jan_normal.loc[:, 'DOlocationID'] = df_jan_normal['DOlocationID'].astype(str)
df_feb_normal.loc[:, 'PUlocationID'] = df_feb_normal['PUlocationID'].astype(str)
df_feb_normal.loc[:, 'DOlocationID'] = df_feb_normal['DOlocationID'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [12]:
jan_records = df_jan_normal[['PUlocationID', 'DOlocationID']]\
                    .to_dict(orient='records')

In [13]:
v = DictVectorizer()
X_train = v.fit_transform(jan_records)

In [14]:
n_cols = X_train.shape[1]
print(f'The dimensionality of the matrix is {n_cols}.')

The dimensionality of the matrix is 525.


# Q5. Training a model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [16]:
y_train = df_jan_normal['duration']

reg = LinearRegression().fit(X_train, y_train)

In [17]:
y_pred = reg.predict(X_train)

In [18]:
rmse = mean_squared_error(y_train, y_pred, squared=False)
print(f'The RMSE on train is {rmse:.2f}.')

The RMSE on train is 10.53.


# Q6. Evaluating the model

In [19]:
feb_records = df_feb_normal[['PUlocationID', 'DOlocationID']]\
                    .to_dict(orient='records')

In [20]:
X_val = v.transform(feb_records)

In [21]:
y_val = df_feb_normal['duration']
y_pred_val = reg.predict(X_val)

rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)
print(f'The RMSE on validation is {rmse_val:.2f}')

The RMSE on validation is 11.01
