In [9]:
import pandas as pd
import numpy as np

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt

In [11]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [12]:
# Paths to the yellow-taxi trip parquet files used throughout the notebook.
# January 2023 is later passed to read_and_transfrom as the training split,
# February 2023 as the validation split.
train_filepath = "./data/yellow_tripdata_2023-01.parquet"
valid_filepath = "./data/yellow_tripdata_2023-02.parquet"

# *Q1.* Download the data
### Read the data for January. How many columns are there?

In [None]:
# Load the raw January trips. Reuse the path from the configuration cell so the
# file location is defined in exactly one place.
yellow_jan = pd.read_parquet(train_filepath)

In [None]:
# Q1 answer: number of columns in the raw January frame.
n_columns = len(yellow_jan.columns)
print(n_columns)

# *Q2.* Computing duration
### What's the standard deviation of the trip durations in January?

In [None]:
# Trip duration in minutes (dropoff minus pickup). The vectorized `.dt`
# accessor converts the whole timedelta column at once instead of calling a
# Python lambda for every row.
yellow_jan['duration'] = (
    yellow_jan['tpep_dropoff_datetime'] - yellow_jan['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [None]:
# Preview the first five rows to sanity-check the derived `duration` column.
yellow_jan.head(n=5)

In [None]:
# List the column names, which now include the derived `duration` column.
yellow_jan.columns

In [None]:
# Q2 answer: standard deviation of the trip duration in minutes.
# np.std is called with its default ddof=0, i.e. the population standard deviation.
duration_std = np.std(yellow_jan["duration"])
print(duration_std)

# *Q3.* Dropping outliers
### What fraction of the records is left after you drop the outliers?

In [None]:
# Summary statistics of the duration column, to see how extreme the outliers are.
yellow_jan["duration"].describe()

In [None]:
# Keep only trips whose duration lies strictly between 1 and 60 minutes.
longer_than_min = yellow_jan["duration"].gt(1)
shorter_than_max = yellow_jan["duration"].lt(60)
yellow_jan_filtered = yellow_jan.loc[longer_than_min & shorter_than_max]

In [None]:
# Q3 answer: share of the January records that survive the outlier filter.
kept_fraction = len(yellow_jan_filtered) / len(yellow_jan)
print(f"{kept_fraction:.2%}")

# *Q4.* One-hot encoding
### What's the dimensionality of this matrix (number of columns)?

In [13]:
def read_and_transfrom(filepath):
    """Load one month of trip records and prepare it for modelling.

    Steps:
      1. Read the parquet file at ``filepath``.
      2. Cast the pickup/dropoff location IDs to strings.
      3. Add a ``duration`` column: dropoff minus pickup time, in minutes.
      4. Keep only rows whose duration is strictly between 1 and 60 minutes.

    Parameters
    ----------
    filepath : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips, with the original index labels of the kept rows.
    """
    categorical = ['PULocationID', 'DOLocationID']

    df = pd.read_parquet(filepath)
    df[categorical] = df[categorical].astype('str')

    # Vectorized timedelta -> minutes conversion; avoids calling a Python
    # lambda once per row, which is slow on multi-million-row months.
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Strict bounds on both sides: durations of exactly 1 or 60 minutes are dropped.
    within_bounds = (df['duration'] > 1) & (df['duration'] < 60)
    return df[within_bounds]

In [19]:
# Run the shared loader on both months, in order: the January file becomes the
# training frame and the February file the validation frame.
df_train, df_val = (
    read_and_transfrom(month_path)
    for month_path in (train_filepath, valid_filepath)
)

In [22]:
# Feature columns (pickup / dropoff location IDs) and the regression target.
categorical = ['PULocationID', 'DOLocationID']
target = 'duration'

# DictVectorizer consumes one {column: value} dict per trip.
train_dict = df_train[categorical].to_dict(orient = 'records')
val_dict = df_val[categorical].to_dict(orient = 'records')

# Learn the feature vocabulary on the training month only, then reuse the
# fitted vectorizer for validation so both matrices share the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

y_train, y_val = df_train[target], df_val[target]

# Q4 answer: number of columns in the encoded training matrix.
print(X_train.shape[1])

515


# *Q5.* Training a model

### What's the RMSE on train?

In [17]:
# Fit a plain linear regression on the encoded location features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Q5 answer: RMSE on the training data itself. The predictions must be the
# `y_pred_train` computed here so the cell runs on a fresh kernel.
y_pred_train = lr.predict(X_train)
print(root_mean_squared_error(y_train, y_pred_train))

7.647511908824672


# *Q6.* Evaluating the model
### What's the RMSE on validation?

In [23]:
# Q6 answer: RMSE of the fitted model on the held-out February data.
y_pred_val = lr.predict(X_val)
rmse_val = root_mean_squared_error(y_val, y_pred_val)
print(rmse_val)

7.808398219721731
