<a href="https://colab.research.google.com/github/emailmenojunk/datascience/blob/main/ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
#Load the yellow_tripdata_2022-01.parquet file into Pandas


In [None]:
# Read the January 2022 yellow-taxi trip records from the Colab working
# directory, then preview the first five rows as the cell output.
df = pd.read_parquet("/content/yellow_tripdata_2022-01.parquet")
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [None]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis="index", how="any")



In [None]:
# Create new feature, 'trip_duration': the trip length in minutes.
#
# New columns must be created with bracket assignment. `df.trip_duration = ...`
# only sets a Python attribute on the DataFrame object (pandas emits a
# UserWarning) and leaves the frame without a 'trip_duration' column.
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Timedelta between drop-off and pick-up, converted from seconds to minutes.
df['trip_duration'] = (
    df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
).dt.total_seconds() / 60

  df.trip_duration=df.tpep_dropoff_datetime-df.tpep_pickup_datetime


In [None]:
# Guarantee the target column exists (trip length in minutes), then treat every
# other column of the frame as a candidate feature.
pickup_to_dropoff = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['trip_duration'] = pickup_to_dropoff.dt.total_seconds() / 60

feature_columns = [column for column in df.columns if column != 'trip_duration']

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df['trip_duration'],
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline model: always predict the mean of the training target.
# The target (y_train) is 'trip_duration' in minutes, not the fare, so the
# printed label says so. The baseline MAE here is measured on the training set.
# NOTE: the variable name `mean_total_amount` is kept so that any other cell
# referring to it keeps working; it holds the mean trip duration.
mean_total_amount = y_train.mean()

# Constant prediction vector, one entry per training row.
y_pred_baseline = np.full(len(y_train), mean_total_amount)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print(f"Mean Trip Duration (minutes): {mean_total_amount:.2f}")
print(f"Baseline MAE: {mae_baseline:.2f}")




In [None]:
# Show the dtype of every feature column to decide how each one is preprocessed.
dtypes_by_column = X_train.dtypes
print(dtypes_by_column)

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object


In [None]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Partition the feature columns by dtype. The two timestamp columns are
# explicitly kept out of the numeric group; they end up in neither group.
datetime_columns = ('tpep_pickup_datetime', 'tpep_dropoff_datetime')
numerical_features = [
    name
    for name in X_train.select_dtypes(include=np.number).columns
    if name not in datetime_columns
]
categorical_features = list(X_train.select_dtypes(include='object').columns)

# Numeric columns are standardised; object columns are one-hot encoded.
# handle_unknown='ignore' lets the encoder cope with categories that only
# appear in the test data.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Columns that belong to neither group (e.g. the timestamps) are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

# Fit on the training data and report the shape before and after.
X_train_processed = preprocessor.fit_transform(X_train)

print("Shape of original X_train:", X_train.shape)
print("Shape of processed X_train:", X_train_processed.shape)

Shape of original X_train: (1913942, 19)
Shape of processed X_train: (1913942, 18)


In [None]:
# Names of the output columns produced by the fitted ColumnTransformer.
processed_feature_names = preprocessor.get_feature_names_out()
feature_count = len(processed_feature_names)

print("Processed feature names:", processed_feature_names)
print("Number of processed features:", feature_count)

Processed feature names: ['num__VendorID' 'num__passenger_count' 'num__trip_distance'
 'num__RatecodeID' 'num__PULocationID' 'num__DOLocationID'
 'num__payment_type' 'num__fare_amount' 'num__extra' 'num__mta_tax'
 'num__tip_amount' 'num__tolls_amount' 'num__improvement_surcharge'
 'num__total_amount' 'num__congestion_surcharge' 'num__airport_fee'
 'cat__store_and_fwd_flag_N' 'cat__store_and_fwd_flag_Y']
Number of processed features: 18


In [None]:
# Linear regression model: preprocessing and the regressor chained in one pipeline.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# Train on the training split, then predict on the held-out test split
# (fit returns the pipeline itself, so the calls can be chained).
predict = pipeline.fit(X_train, y_train).predict(X_test)

# Mean absolute error of the linear model on the test data.
rl_mae = mean_absolute_error(y_test, predict)


In [None]:
# Random forest model 1: 10 trees, seeded for reproducibility, behind the same
# preprocessing step as the linear model.
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=42)
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', rf_regressor),
    ]
)

# Train on the training split; the fitted pipeline is the cell output.
rf_pipeline.fit(X_train, y_train)



In [None]:
# Random forest model 1 (n_estimators=10, random_state=42): predict on the
# test split and score with mean absolute error.
rf_predict = rf_pipeline.predict(X_test)
rf_mae = mean_absolute_error(y_true=y_test, y_pred=rf_predict)

In [None]:
# Report the test-set mean absolute error of both models, one per line.
print(
    "\n--- Model Performance on Test Data ---",
    f"Random Forest Regressor MAE: {rf_mae:.2f}",
    f"Linear Regression MAE: {rl_mae:.2f}",
    sep="\n",
)



--- Model Performance on Test Data ---
Random Forest Regressor MAE: 4.75
Linear Regression MAE: 5.91


In [None]:
# Compare linear regression and random forest on test MAE (lower is better).
if rf_mae < rl_mae:
    verdict = "\nThe Random Forest model performed better than the Linear Regression model."
elif rl_mae < rf_mae:
    verdict = "\nThe Linear Regression model performed better than the Random Forest model."
else:
    verdict = "\nBoth models performed about the same."
print(verdict)


The Random Forest model performed better than the Linear Regression model.


In [None]:
# Experiment with a different configuration to see how it affects the model.
# Random forest model 2: 15 trees, random_state=42, trained on all CPU cores.
rf_model2 = RandomForestRegressor(
    n_estimators=15,
    random_state=42,
    n_jobs=-1,
)

# Same preprocessing step as model 1, followed by the new regressor.
rf_pipeline2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', rf_model2),
    ]
)

# Train on the training split; the fitted pipeline is the cell output.
rf_pipeline2.fit(X_train, y_train)

In [None]:
# Random forest model 2: predict on the test split and score with MAE.
rf_predict2 = rf_pipeline2.predict(X_test)

# Score model 2's own predictions. Using `rf_predict` (model 1's output) here
# would make rf2_mae a copy of rf_mae and any model comparison meaningless.
rf2_mae = mean_absolute_error(y_test, rf_predict2)

In [None]:
# Compare the two random forest configurations on test MAE (lower is better).
print("\n--- Random Forest Model 1 Vs Model 2 Performance on Test Data ---")
print(f"Random Forest Regressor 1 MAE: {rf_mae:.2f}")
print(f"Random Forest Regressor 2 MAE: {rf2_mae:.2f}")
if rf_mae < rf2_mae:
    print("\nThe Random Forest model 1 performed better than the Random Forest model 2.")
elif rf2_mae < rf_mae:
    # Both models are random forests; no linear model is involved in this comparison.
    print("\nThe Random Forest model 2 performed better than the Random Forest model 1.")
else:
    print("\nBoth models performed about the same.")


--- Model Performance on Test Data ---
Random Forest Regressor 1 MAE: 4.75
Random Forest Regressor 2 MAE: 4.75

Both models performed about the same.


In [None]:
# Grid search over the random forest pipeline.
# Keys use the '<step>__<param>' form so they reach the 'regressor' step.
param_grid = {
    'regressor__n_estimators': [5, 10],
    'regressor__max_depth': [5, 10],
    'regressor__min_samples_split': [2],
}

# 2-fold cross-validation, scored by negative MAE, using all CPU cores.
print("Starting Random Forest Grid Search on full training data...")
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=2,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)
print("Random Forest Grid Search complete!")

# Best hyperparameter combination found by the search.
best_params = grid_search.best_params_
print(f"\nBest parameters found: {best_params}")

# The search's best estimator is already a fitted pipeline, so it is taken
# as-is; no additional training happens here.
print("\nTraining final model with best parameters on full training data...")
final_rf_pipeline = grid_search.best_estimator_
print("Final Random Forest training complete!")

# Predict on the held-out test split and score with mean absolute error.
final_rf_predictions = final_rf_pipeline.predict(X_test)
final_rf_mae = mean_absolute_error(y_test, final_rf_predictions)
print(f"Final Random Forest Regressor MAE on Test Data: {final_rf_mae:.2f}")


Starting Random Forest Grid Search on full training data...
Fitting 2 folds for each of 4 candidates, totalling 8 fits
Random Forest Grid Search complete!

Best parameters found: {'regressor__max_depth': 10, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 10}

Training final model with best parameters on full training data...
Final Random Forest training complete!
Final Random Forest Regressor MAE on Test Data: 3.67
