In [1]:
import os
import joblib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename):
    """
    Function to read the dataframe and perform initial preprocessing.
    """
    try:
        if filename.endswith('.csv'):
            df = pd.read_csv(filename)
            df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
            df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
        elif filename.endswith('.parquet'):
            df = pd.read_parquet(filename)
            print(df.dtypes)
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return None

    df['duration'] = (df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']).dt.total_seconds() / 60
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()
    categorical = ['PULocationID', 'DOLocationID']
    df.loc[:, categorical] = df[categorical].astype(str)
    
    return df

In [3]:
def train_model(df_train, df_val, categorical, numerical, target):
    """
    Function to train the model.
    """
    dv = DictVectorizer()
    train_dicts = df_train[categorical + numerical].to_dict(orient='records')
    X_train = dv.fit_transform(train_dicts)

    val_dicts = df_val[categorical + numerical].to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    y_train = df_train[target].values
    y_val = df_val[target].values

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)

    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    return dv, lr, rmse

In [4]:
def save_model(dv, model, model_path):
    """
    Function to save the model.
    """
    if not os.path.exists(os.path.dirname(model_path)):
        os.makedirs(os.path.dirname(model_path))

    with open(model_path, 'wb') as f_out:
        joblib.dump((dv, model), f_out)


In [5]:
# df_train = read_dataframe('../data/green_tripdata_2021-01.parquet')
# df_val = read_dataframe('../data/green_tripdata_2021-02.parquet')

df_train = read_dataframe('C:/Users/dimi/anaconda3/envs/project/mlops-zoomcamp/data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('C:/Users/dimi/anaconda3/envs/project/mlops-zoomcamp/data/green_tripdata_2021-02.parquet')


df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'

dv, model, rmse = train_model(df_train, df_val, categorical, numerical, target)
print(f"RMSE: {rmse}")

save_model(dv, model, 'models/lin_reg.bin')

VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                      int64
DOLocationID                      int64
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                        object
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
dtype: object
VendorID                          int64
lpep_pickup_datetime     datetime64[ns]
lpep_dropoff_datetime    datetime64[ns]
store_and_fwd_flag               object
RatecodeID                