In [1]:
!python -V

Python 3.11.7


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle

### Question 1: Downloading the data
How many columns in January dataset?

In [3]:
!ls ../data/

yellow_tripdata_2023-01.parquet  yellow_tripdata_2023-02.parquet


In [4]:
train_dataset = '../data/yellow_tripdata_2023-01.parquet'
val_dataset = '../data/yellow_tripdata_2023-02.parquet'

In [6]:
df = pd.read_parquet(train_dataset)
len(df.columns)

19

In [7]:
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

### Question 2: Computing duration
Standard deviation of the trips duration in January

In [8]:
df["duration"] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df.duration.std()

Timedelta('0 days 00:42:35.661074')

### Question 3: Dropping Outliers
What fraction of the records are left after you dropped the outliers?

In [9]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    # calculate duration from data provided
    df["duration"] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: round((td.seconds / 60), 0))

    # drop outliers
    df = df[(df.duration >=1) & (df.duration <= 60)]

    # prep data for DictVectorizer
    categorical = ["PULocationID", "DOLocationID"]
    numeric = ["trip_distance"]
    df[categorical] = df[categorical].astype(str)
    dv_dict = df[categorical + numeric].to_dict(orient="records")
    
    return df, dv_dict
    

In [10]:
df_train, train_dict = read_dataframe(train_dataset)
df_val, val_dict = read_dataframe(val_dataset)

In [11]:
len(df_train) / len(df)

0.9837790036800982

### Question 4: One-hot encoding
What's the dimensionality of the OHE matrix (number of columns)?

In [12]:
dv = DictVectorizer()

In [13]:
X_train = dv.fit_transform(train_dict)
len(dv.feature_names_)

516

### Question 5: Training a model
What's the RMSE on train?

In [14]:
target = 'duration'
y_train = df_train[target].values

In [15]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

In [16]:
mean_squared_error(y_train, y_pred, squared=False)

7.723925235932851

### Question 6: Evaluating the model
What's the RMSE on validation?

In [17]:
X_val = dv.transform(val_dict)
y_val = df_val[target].values
y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)

7.907250437756618

### Save the model and vector matrix

In [18]:
with open("../models/24-05-17_lin_reg.bin", "wb") as f_out:
    pickle.dump((dv, lr), f_out)

### Notes from video
Not part of homework

In [None]:
sns.distplot(df_train.duration)

In [None]:
df_train.duration.describe(percentiles=[0.95, 0.98, 0.99])

In [None]:
sns.distplot(y_pred, label="prediction")
sns.distplot(y_train, label="actual")
plt.legend()