This file is used to analyze the modeling pipeline and the modeling results.

In [15]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

from preprocess import *


# Read the cleaned base dataset and discard its 'Unnamed: 0' column
# (presumably an index that was written out with the CSV - confirm).
data = (
    pd.read_csv("../Cleaning_Engineering/base_cleaned.csv")
    .drop(columns=['Unnamed: 0'])
)

# Preview the first rows of the dataframe
data.head()


Unnamed: 0,time_stamp_x,distance,cab_type,source,destination,name,car_type,weekday,rush_hour,temp,clouds,pressure,rain,humidity,wind,is_raining,temp_groups,surge_multiplier,price
0,1543203646318,3.03,Lyft,Boston University,Theatre District,Lux Black XL,Luxury SUV,1,0,41.07,0.86,1014.39,,0.92,1.36,0,40,1.0,34.0
1,1543203646319,1.3,Uber,South Station,Theatre District,Black,Luxury,1,0,40.86,0.87,1014.39,,0.93,1.6,0,40,1.0,18.5
2,1543203646320,2.43,Lyft,Northeastern University,Beacon Hill,Lyft,Base,1,0,40.81,0.89,1014.35,,0.93,1.36,0,40,1.0,10.5
3,1543203646320,2.71,Uber,Theatre District,Fenway,UberXL,Base XL,1,0,40.8,0.87,1014.39,,0.93,1.55,0,40,1.0,32.0
4,1543203646320,2.71,Uber,Theatre District,Fenway,UberX,Base,1,0,40.8,0.87,1014.39,,0.93,1.55,0,40,1.0,19.5


In [16]:
def data_transform(data):
    """Split ``data`` 80/20 and encode its features for modelling.

    The ``price`` column is the target; every other column is a candidate
    feature. ``int64``/``float64`` columns are standardised and ``object``
    columns are one-hot encoded. Columns of any other dtype are not listed in
    either transformer (ColumnTransformer drops unlisted columns by default).

    The preprocessor is fitted on the training split only and then applied to
    the validation split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``price`` column plus feature columns.

    Returns
    -------
    tuple
        ``(X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns)``
        where ``final_columns`` holds the encoded feature names reported by
        the fitted preprocessor.
    """
    features = data.drop('price', axis=1)
    target = data['price']

    # Group the feature columns by dtype in a single pass over the dtypes
    column_dtypes = features.dtypes
    numerical_cols = [col for col, dtype in column_dtypes.items() if dtype in ['int64', 'float64']]
    categorical_cols = [col for col, dtype in column_dtypes.items() if dtype == 'object']

    # Scale numeric columns; one-hot encode categoricals, ignoring categories
    # that only show up at transform time
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ]
    )

    # 80% training / 20% validation, reproducible through the fixed seed
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, target, train_size=0.8, test_size=0.2, random_state=0
    )

    # Fit on the training split only, then reuse the fitted transformers
    X_train_prepared = preprocessor.fit_transform(X_train)
    X_valid_prepared = preprocessor.transform(X_valid)

    return (
        X_train_prepared,
        y_train,
        X_valid_prepared,
        y_valid,
        preprocessor.get_feature_names_out(),
    )


In [17]:
def run_models(X_train_prepared, y_train, X_valid_prepared, y_valid):
    """Fit a fixed set of regressors and report their validation scores.

    Each model is fitted on the training split, scored on the validation
    split, and its MSE and R^2 are printed on one line.

    The returned model is the one with the lowest validation MSE among the
    models that expose ``feature_importances_`` (callers build a feature
    importance table from the second return value, so models without that
    attribute are not eligible). Ties keep the model listed first.

    Parameters
    ----------
    X_train_prepared, X_valid_prepared
        Encoded feature matrices, as produced by ``data_transform``.
    y_train, y_valid
        Targets matching the two feature matrices.

    Returns
    -------
    tuple
        ``(best_model, best_model.feature_importances_)``.

    Raises
    ------
    ValueError
        If no model in the list exposes ``feature_importances_``.
    """
    # Candidate models; entries that are commented out are skipped
    models = [
        ('Linear Regression', LinearRegression()),
        ('Ridge Regression', Ridge(random_state=0)),
        ('Lasso Regression', Lasso(random_state=0)),
        ('Decision Tree Regression', DecisionTreeRegressor(random_state=0)),
        ("SGD Regression", SGDRegressor(random_state=0)),
        #('RandomForestRegressor', RandomForestRegressor(random_state=0)),
        ('GradienBoostingRegressor', GradientBoostingRegressor(random_state= 0)),
        #('SVR', SVR())
    ]

    best_model = None
    best_mse = None
    for name, model in models:
        model.fit(X_train_prepared, y_train)

        # Score on the held-out validation split
        y_pred_valid = model.predict(X_valid_prepared)
        mse = mean_squared_error(y_valid, y_pred_valid)
        r2 = r2_score(y_valid, y_pred_valid)
        print(f"{name} - MSE: {mse:.2f}, R^2: {r2:.2f}")

        # Select by score rather than by display name, so renaming or
        # disabling an entry above cannot leave best_model unassigned.
        has_importances = hasattr(model, 'feature_importances_')
        if has_importances and (best_mse is None or mse < best_mse):
            best_model, best_mse = model, mse

    if best_model is None:
        raise ValueError("No fitted model exposes feature_importances_")

    return (best_model, best_model.feature_importances_)

In [18]:
# Fit and score every candidate model on the base dataset
(X_train_prepared, y_train,
 X_valid_prepared, y_valid, final_columns) = data_transform(get_base_data())
model, feature_importances = run_models(
    X_train_prepared, y_train, X_valid_prepared, y_valid
)

Linear Regression - MSE: 12.60, R^2: 0.84
Ridge Regression - MSE: 12.60, R^2: 0.84
Lasso Regression - MSE: 38.07, R^2: 0.52
Decision Tree Regression - MSE: 4.70, R^2: 0.94
SGD Regression - MSE: 12.61, R^2: 0.84
GradienBoostingRegressor - MSE: 7.31, R^2: 0.91


In [19]:
# Check for overfitting: score the returned model on the data it was trained on
y_pred_train = model.predict(X_train_prepared)

# Same two metrics as on the validation split, for a direct comparison
mse, r2 = mean_squared_error(y_train, y_pred_train), r2_score(y_train, y_pred_train)
print(f"MSE: {mse:.2f}, R^2: {r2:.2f}")

MSE: 4.29, R^2: 0.95


In [20]:
# Pair each encoded feature name with its importance, most important first
features_df = (
    pd.DataFrame({'feature': final_columns, "importance": feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Show the ranked table
print(features_df)


                                     feature  importance
33                  cat__car_type_Luxury SUV    0.560024
32                      cat__car_type_Luxury    0.204527
31                     cat__car_type_Base XL    0.067769
18                 cat__destination_Back Bay    0.019468
24                cat__destination_North End    0.018184
22       cat__destination_Financial District    0.017436
10            cat__source_Financial District    0.012848
6                       cat__source_Back Bay    0.012208
11              cat__source_Haymarket Square    0.009146
23         cat__destination_Haymarket Square    0.006860
34                      cat__car_type_Shared    0.006584
5                         cat__cab_type_Uber    0.006395
20        cat__destination_Boston University    0.006164
27            cat__destination_South Station    0.005410
8              cat__source_Boston University    0.005332
15                 cat__source_South Station    0.005124
12                     cat__sou

In [21]:
# Fit and score every candidate model on the dynamic dataset
(X_train_prepared, y_train,
 X_valid_prepared, y_valid, final_columns) = data_transform(get_dynamic_data())
model, feature_importances = run_models(
    X_train_prepared, y_train, X_valid_prepared, y_valid
)

Linear Regression - MSE: 17.15, R^2: 0.80
Ridge Regression - MSE: 17.15, R^2: 0.80
Lasso Regression - MSE: 43.21, R^2: 0.50
Decision Tree Regression - MSE: 8.50, R^2: 0.90
SGD Regression - MSE: 17.16, R^2: 0.80
GradienBoostingRegressor - MSE: 11.22, R^2: 0.87


In [22]:
# Rank the encoded features of the dynamic-data model by importance
features_df = (
    pd.DataFrame({'feature': final_columns, "importance": feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Show the ranked table
print(features_df)


                                     feature  importance
33                  cat__car_type_Luxury SUV    0.549692
32                      cat__car_type_Luxury    0.208626
31                     cat__car_type_Base XL    0.067296
15                 cat__source_South Station    0.020181
24                cat__destination_North End    0.017617
18                 cat__destination_Back Bay    0.013322
22       cat__destination_Financial District    0.012656
10            cat__source_Financial District    0.012350
11              cat__source_Haymarket Square    0.010629
6                       cat__source_Back Bay    0.010284
23         cat__destination_Haymarket Square    0.006935
5                         cat__cab_type_Uber    0.006647
34                      cat__car_type_Shared    0.006620
20        cat__destination_Boston University    0.006223
27            cat__destination_South Station    0.005701
21                   cat__destination_Fenway    0.005663
26  cat__destination_Northeaste

In [23]:
# Fit and score every candidate model on the demand dataset
(X_train_prepared, y_train,
 X_valid_prepared, y_valid, final_columns) = data_transform(get_demand_data())
model, feature_importances = run_models(
    X_train_prepared, y_train, X_valid_prepared, y_valid
)

Linear Regression - MSE: 86102.39, R^2: 0.38
Ridge Regression - MSE: 71454.32, R^2: 0.49
Lasso Regression - MSE: 58939.17, R^2: 0.58
Decision Tree Regression - MSE: 0.00, R^2: 1.00
SGD Regression - MSE: 58925.90, R^2: 0.58
GradienBoostingRegressor - MSE: 19666.77, R^2: 0.86


In [24]:
# These functions test how the models perform specifically on the surge data (surge_multiplier > 1)
def data_transform_surge(data):
    """Prepare ``data`` like ``data_transform`` and also return the validation index.

    The split and preprocessing are delegated to ``data_transform`` so the two
    helpers cannot drift apart; the only addition here is the index of the
    validation rows, which lets callers look those rows up again in the
    un-encoded frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``price`` column plus feature columns.

    Returns
    -------
    tuple
        ``(X_train_prepared, y_train, X_valid_prepared, y_valid,
        final_columns, valid_indices)``.
    """
    X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns = data_transform(data)

    # train_test_split selects the same rows, in the same order, from the
    # features and the target, so y_valid carries the validation rows' index.
    valid_indices = y_valid.index

    return (X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns, valid_indices)


def run_models_surge(X_train_prepared, y_train, X_valid_prepared, y_valid, valid_indices,
                     source_data=None):
    """Fit the candidate regressors and score them on surge validation rows only.

    Models are trained on the full training split. Evaluation is restricted to
    validation rows whose ``surge_multiplier`` is greater than 1. One line with
    MSE and R^2 is printed per model; nothing is returned.

    Parameters
    ----------
    X_train_prepared, X_valid_prepared
        Encoded feature matrices, as produced by ``data_transform_surge``.
    y_train, y_valid
        Targets matching the two feature matrices.
    valid_indices
        Index labels of the validation rows, in the same order as the rows of
        ``X_valid_prepared``.
    source_data : pandas.DataFrame, optional
        Frame with a ``surge_multiplier`` column that can be looked up with
        ``.loc[valid_indices]``. When omitted, the notebook-level ``data``
        frame is used, which is what this function always did before.

    Raises
    ------
    ValueError
        If no validation row has ``surge_multiplier`` greater than 1.
    """
    if source_data is None:
        # Backward-compatible fallback on the notebook-level frame.
        # NOTE(review): `data` is rebound by other cells, so pass source_data
        # explicitly when the frame being modelled is not that global.
        source_data = data

    # The surge subset does not depend on the model, so build it once.
    # A plain boolean array selects rows by position in both containers.
    surge_mask = (source_data.loc[valid_indices, 'surge_multiplier'] > 1).to_numpy()
    if not surge_mask.any():
        raise ValueError("No validation rows with surge_multiplier > 1")
    X_valid_surge = X_valid_prepared[surge_mask]
    y_valid_surge = y_valid[surge_mask]

    models = [
        ('Linear Regression', LinearRegression()),
        ('Ridge Regression', Ridge(random_state=0)),
        ('Lasso Regression', Lasso(random_state=0)),
        ('Decision Tree Regression', DecisionTreeRegressor(random_state=0)),
        ("SGD Regression", SGDRegressor(random_state=0)),
        #('RandomForestRegressor', RandomForestRegressor(random_state=0)),
        ('GradienBoostingRegressor', GradientBoostingRegressor(random_state= 0)),
        #('SVR', SVR())
    ]

    for name, model in models:
        model.fit(X_train_prepared, y_train)

        # Predict and evaluate on surge rows only
        y_pred_surge = model.predict(X_valid_surge)
        mse_surge = mean_squared_error(y_valid_surge, y_pred_surge)
        r2_surge = r2_score(y_valid_surge, y_pred_surge)
        print(f"{name} (Surge Data) - MSE: {mse_surge:.2f}, R^2: {r2_surge:.2f}")

    



In [25]:
# Evaluate the dynamic-data models on validation rows with surge > 1 only
(X_train_prepared, y_train, X_valid_prepared,
 y_valid, final_columns, valid_indices) = data_transform_surge(get_dynamic_data())
run_models_surge(
    X_train_prepared, y_train, X_valid_prepared, y_valid, valid_indices
)

Linear Regression (Surge Data) - MSE: 151.09, R^2: 0.24
Ridge Regression (Surge Data) - MSE: 151.09, R^2: 0.24
Lasso Regression (Surge Data) - MSE: 246.61, R^2: -0.24
Decision Tree Regression (Surge Data) - MSE: 112.26, R^2: 0.44
SGD Regression (Surge Data) - MSE: 151.62, R^2: 0.24
GradienBoostingRegressor (Surge Data) - MSE: 133.32, R^2: 0.33


In [26]:
# Grid search for hyperparameter tuning of the decision tree on the base data.
# Pipeline and GridSearchCV come from the notebook's import cell.

# NOTE(review): this rebinds the notebook-level `data`, which run_models_surge
# reads; re-running the surge cell after this one may see a different frame.
data = get_base_data()

# Separate the features (X) and the target variable (y)
X = data.drop('price', axis=1)
y = data['price']

# Preprocessing for numerical features (int64/float64 columns only)
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
numerical_transformer = StandardScaler()

# Preprocessing for categorical features (object columns)
categorical_cols = [cname for cname in X.columns if X[cname].dtype == 'object']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Keep preprocessing inside the pipeline so the search refits the scaler and
# encoder together with the regressor for every cross-validation fit.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DecisionTreeRegressor(random_state=0))])

# Parameter grid to search; the `regressor__` prefix routes each value to the
# pipeline step named 'regressor'.
param_grid = {
    'regressor__max_depth': [3, 5, 10, 20, 30],
    'regressor__min_samples_split': [2, 4, 6]
}

# 5-fold cross-validated search scored by negative MSE
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

# Same 80/20 split and seed as data_transform; the search only sees the training part
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
grid_search.fit(X_train, y_train)

# Pipeline refitted with the best parameter combination
best_model = grid_search.best_estimator_

# Predict and evaluate on the held-out validation split
y_pred_valid = best_model.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred_valid)
r2 = r2_score(y_valid, y_pred_valid)

print(f"Best model - MSE: {mse:.2f}, R^2: {r2:.2f}")


Best model - MSE: 4.70, R^2: 0.94


In [27]:
# Parameter combination selected by the grid search above.
# NOTE(review): the recorded result picks max_depth=30, the largest value in
# the grid, so a wider max_depth range may be worth searching.
grid_search.best_params_

{'regressor__max_depth': 30, 'regressor__min_samples_split': 6}