In [1]:
import os
import joblib
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename):
    """
    Reads a dataframe from a file and performs initial preprocessing.

    Args:
        filename (str): The path to the file to read.

    Returns:
        df (DataFrame): The preprocessed dataframe.
    """
    
    num_records = None

    try:
        # Check the file extension and read the file accordingly
        if filename.endswith('.csv'):
            df = pd.read_csv(filename)
            # Convert columns to datetime
            df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
            df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
        elif filename.endswith('.parquet'):
            df = pd.read_parquet(filename)
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return None
    
    
    num_columns = df.shape[1]
    num_records_before = df.shape[0]
    
    # Calculate trip duration in minutes
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    std_duration = df['duration'].std()
    
    
    

    # Filter out records with duration less than 1 minute or more than 60 minutes
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    num_records_after = df.shape[0]
    records_dropped = num_records_before - num_records_after

    # Calculate fraction of records remaining
    fraction_remaining = num_records_after / num_records_before

    
        # Extract month from filename
    month = filename.split('_')[-1].split('.')[0]
    if month == '2022-01':
        print(f'The dataframe has {num_columns} columns.')
        print(f'Standard deviation of duration for January: {std_duration} minutes')
        print(f'Number of records before: {num_records_before}')
        print(f'Number of records after: {num_records_after}')
        print(f'Number of records dropped: {records_dropped}')
        print(f'Fraction of records left after dropping outliers: {fraction_remaining:.2%}')
        
        
    return df


In [3]:
def train_model(df_train, df_val, categorical, target):
    """
    Function to train the model and calculate RMSE for train and validation sets.

    Args:
        df_train (DataFrame): The training data.
        df_val (DataFrame): The validation data.
        categorical (list): List of categorical features.
        target (str): The target variable.

    Returns:
        dv (DictVectorizer): The fitted DictVectorizer.
        lr (LinearRegression): The trained linear regression model.
        rmse_train (float): RMSE for the training set.
        rmse_val (float): RMSE for the validation set.
    """
    
    # Initialize DictVectorizer
    dv = DictVectorizer()
    
    # Transform the dataframe into a list of dictionaries
    train_dicts = df_train[categorical].to_dict(orient='records')
    
    # Fit and transform the data
    X_train = dv.fit_transform(train_dicts)
    
    # The dimensionality of this matrix is the number of columns in X_train
    dimensionality = X_train.shape[1]
    print(f'The dimensionality of the matrix is {dimensionality}')
    print(len(dv.feature_names_))

    # Transform the validation dataframe into a list of dictionaries
    val_dicts = df_val[categorical].to_dict(orient='records')
    
    # Transform the validation data to a feature matrix using the same DictVectorizer
    X_val = dv.transform(val_dicts)

    # Get the target values for the training and validation data
    y_train = df_train[target].values
    y_val = df_val[target].values

    # Initialize a linear regression model
    lr = LinearRegression()
    
    # Train the model on the training data
    lr.fit(X_train, y_train)

    # Make predictions on the training and validation data
    y_pred_train = lr.predict(X_train)
    y_pred_val = lr.predict(X_val)

    # Calculate the root mean square error of the predictions on the training and validation data
    rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
    rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)
    
    print(f'Training RMSE: {rmse_train}')
    print(f'Validation RMSE: {rmse_val}')

    return dv, lr, rmse_train, rmse_val


In [4]:
def save_model(dv, model, model_path):
    """
    Function to save the model.
    """
    if not os.path.exists(os.path.dirname(model_path)):
        os.makedirs(os.path.dirname(model_path))

    with open(model_path, 'wb') as f_out:
        joblib.dump((dv, model), f_out)


In [5]:
df_train = read_dataframe('~/anaconda3/envs/project/mlops-zoomcamp/data/yellow_tripdata_2022-01.parquet')
df_val = read_dataframe('~/anaconda3/envs/project/mlops-zoomcamp/data/yellow_tripdata_2022-02.parquet')


The dataframe has 19 columns.
Standard deviation of duration for January: 46.44530513776499 minutes
Number of records before: 2463931
Number of records after: 2421440
Number of records dropped: 42491
Fraction of records left after dropping outliers: 98.28%


In [6]:
categorical = ['PULocationID', 'DOLocationID']
target = 'duration'

dv, model, rmse_train, rmse_val = train_model(df_train, df_val, categorical, target)

# save_model(dv, model, 'models/lin_reg.bin')

The dimensionality of the matrix is 2
2
Training RMSE: 8.920327827581444
Validation RMSE: 9.638272212087236
