In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score



# Read the cleaned CSV from the sibling Cleaning_Engineering directory
BASE_CLEANED_CSV = "../Cleaning_Engineering/base_cleaned.csv"
data = pd.read_csv(BASE_CLEANED_CSV)

# Preview the first five rows as the cell output
data.head(5)


Unnamed: 0.1,Unnamed: 0,time_stamp_x,distance,cab_type,source,destination,name,car_type,weekday,rush_hour,temp,clouds,pressure,rain,humidity,wind,is_raining,temp_groups,surge_multiplier,price
0,0,1543203646318,3.03,Lyft,Boston University,Theatre District,Lux Black XL,Luxury SUV,1,0,41.07,0.86,1014.39,,0.92,1.36,0,40,1.0,34.0
1,1,1543203646319,1.3,Uber,South Station,Theatre District,Black,Luxury,1,0,40.86,0.87,1014.39,,0.93,1.6,0,40,1.0,18.5
2,2,1543203646320,2.43,Lyft,Northeastern University,Beacon Hill,Lyft,Base,1,0,40.81,0.89,1014.35,,0.93,1.36,0,40,1.0,10.5
3,3,1543203646320,2.71,Uber,Theatre District,Fenway,UberXL,Base XL,1,0,40.8,0.87,1014.39,,0.93,1.55,0,40,1.0,32.0
4,4,1543203646320,2.71,Uber,Theatre District,Fenway,UberX,Base,1,0,40.8,0.87,1014.39,,0.93,1.55,0,40,1.0,19.5


In [2]:
from preprocess import *

In [3]:
def data_transform(data):
    """Split ``data`` into train/validation sets and preprocess the features.

    The ``price`` column is the target; every other column is a candidate
    feature. Columns whose dtype is ``int64``/``float64`` are standardised and
    ``object`` columns are one-hot encoded. Columns of any other dtype are not
    listed in either transformer and are therefore left out of the output by
    the ``ColumnTransformer`` (its default ``remainder='drop'``).

    The preprocessor is fitted on the training split only and then applied to
    the validation split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``price`` column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns)``
        where ``final_columns`` holds the output feature names reported by the
        fitted preprocessor.
    """
    # Separate the features from the target variable
    features = data.drop('price', axis=1)
    target = data['price']

    # Route each column to a transformer based on its dtype
    numerical_cols = [cname for cname in features.columns
                      if features[cname].dtype in ['int64', 'float64']]
    categorical_cols = [cname for cname in features.columns
                        if features[cname].dtype == 'object']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            # Categories unseen during fit are encoded as all zeros instead of raising
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ])

    # 80% training / 20% validation, fixed seed for reproducible splits
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, target, train_size=0.8, test_size=0.2, random_state=0)

    # Fit on the training split only, then reuse the fitted transformers
    X_train_prepared = preprocessor.fit_transform(X_train)
    X_valid_prepared = preprocessor.transform(X_valid)

    final_columns = preprocessor.get_feature_names_out()
    return (X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns)


In [4]:
def run_models(X_train_prepared, y_train, X_valid_prepared, y_valid):
    """Fit a fixed set of regressors and print their validation MSE and R^2.

    Parameters
    ----------
    X_train_prepared, y_train
        Preprocessed training features and the matching target values.
    X_valid_prepared, y_valid
        Preprocessed validation features and the matching target values.

    Returns
    -------
    tuple
        ``(tree_model, tree_model.feature_importances_)`` for the fitted
        ``DecisionTreeRegressor``. The tree is always the model returned; it
        is not chosen by comparing scores.
    """
    # Candidate models; the commented-out entries are disabled
    models = [
        ('Linear Regression', LinearRegression()),
        ('Ridge Regression', Ridge(random_state=0)),
        ('Lasso Regression', Lasso(random_state=0)),
        ('Decision Tree Regression', DecisionTreeRegressor(random_state=0)),
        ("SGD Regression", SGDRegressor(random_state=0)),
        #('RandomForestRegressor', RandomForestRegressor(random_state=0)),
        ('GradienBoostingRegressor', GradientBoostingRegressor(random_state= 0)),
        #('SVR', SVR())
    ]

    tree_model = None
    for name, model in models:
        model.fit(X_train_prepared, y_train)

        # Score on the held-out validation split
        y_pred_valid = model.predict(X_valid_prepared)
        mse = mean_squared_error(y_valid, y_pred_valid)
        r2 = r2_score(y_valid, y_pred_valid)
        print(f"{name} - MSE: {mse:.2f}, R^2: {r2:.2f}")

        # Identify the tree by type rather than by its display label so that
        # renaming the label cannot leave the return value unbound
        if isinstance(model, DecisionTreeRegressor):
            tree_model = model

    return (tree_model, tree_model.feature_importances_)

In [9]:
# Drop the leftover index column written by an earlier CSV export.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')



In [10]:
# Keep only the columns used for modelling, plus the price target
user_data = data.loc[:, [
    "cab_type", "source", "destination", "car_type",
    "weekday", "rush_hour", "is_raining", "temp_groups",
    "surge_multiplier", "price",
]]

In [11]:
# Rows whose surge multiplier is exactly 1.0
user_data_no_surge = user_data.loc[user_data["surge_multiplier"].eq(1.0)]


In [10]:
# Experiment: train and score every model on the rows with surge_multiplier == 1.0.
# `model` is the fitted decision tree returned by run_models, and
# `feature_importances` are that tree's importances.
X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns = data_transform(user_data_no_surge)
model, feature_importances = run_models(X_train_prepared, y_train, X_valid_prepared, y_valid)

Linear Regression - MSE: 12.60, R^2: 0.84
Ridge Regression - MSE: 12.60, R^2: 0.84
Lasso Regression - MSE: 38.07, R^2: 0.52
Decision Tree Regression - MSE: 4.70, R^2: 0.94
SGD Regression - MSE: 12.61, R^2: 0.84
GradienBoostingRegressor - MSE: 7.31, R^2: 0.91


In [5]:
# Experiment: same pipeline on the frame returned by get_base_data().
# NOTE(review): get_base_data is presumably provided by `from preprocess import *`;
# what rows/columns it returns is not visible here — confirm.
# This cell overwrites the kernel-wide `model`, `feature_importances` and split variables.
X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns = data_transform(get_base_data())
model, feature_importances = run_models(X_train_prepared, y_train, X_valid_prepared, y_valid)

Linear Regression - MSE: 12.60, R^2: 0.84
Ridge Regression - MSE: 12.60, R^2: 0.84
Lasso Regression - MSE: 38.07, R^2: 0.52
Decision Tree Regression - MSE: 4.70, R^2: 0.94
SGD Regression - MSE: 12.61, R^2: 0.84
GradienBoostingRegressor - MSE: 7.31, R^2: 0.91


In [6]:
# Experiment: same pipeline on the frame returned by get_dynamic_data().
# NOTE(review): get_dynamic_data is presumably provided by `from preprocess import *`;
# what rows/columns it returns is not visible here — confirm.
# This cell overwrites the kernel-wide `model`, `feature_importances` and split variables.
X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns = data_transform(get_dynamic_data())
model, feature_importances = run_models(X_train_prepared, y_train, X_valid_prepared, y_valid)

Linear Regression - MSE: 17.15, R^2: 0.80
Ridge Regression - MSE: 17.15, R^2: 0.80
Lasso Regression - MSE: 43.21, R^2: 0.50
Decision Tree Regression - MSE: 8.50, R^2: 0.90
SGD Regression - MSE: 17.16, R^2: 0.80
GradienBoostingRegressor - MSE: 11.22, R^2: 0.87


In [16]:
# These functions evaluate how the models perform on the surge rows (surge_multiplier > 1) of the validation set


def data_transform_surge(data):
    """Preprocess ``data`` like ``data_transform`` and also return the validation index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``price`` column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns,
        valid_indices)`` where ``valid_indices`` is the pandas index of the rows
        placed in the validation split.
    """
    # Delegate to data_transform so the two preprocessing paths cannot drift apart
    X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns = data_transform(data)

    # train_test_split splits features and target with the same row selection,
    # so the validation target's index labels the held-out rows
    valid_indices = y_valid.index

    return (X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns, valid_indices)


def run_models_surge(X_train_prepared, y_train, X_valid_prepared, y_valid, valid_indices, surge_source=None):
    """Fit the model set on all training rows and score only the surge validation rows.

    Parameters
    ----------
    X_train_prepared, y_train
        Preprocessed training features and the matching target values.
    X_valid_prepared, y_valid
        Preprocessed validation features and the matching target values.
    valid_indices
        Index labels of the validation rows, used to look up each row's
        ``surge_multiplier``.
    surge_source : pandas.DataFrame, optional
        Frame holding a ``surge_multiplier`` column indexed by the same labels
        as ``valid_indices``. When omitted, the notebook-global ``data`` is
        used, which is what this function always read before the argument
        existed.

    Returns
    -------
    None
        Results are printed, one line per model.
    """
    models = [
        ('Linear Regression', LinearRegression()),
        ('Ridge Regression', Ridge(random_state=0)),
        ('Lasso Regression', Lasso(random_state=0)),
        ('Decision Tree Regression', DecisionTreeRegressor(random_state=0)),
        ("SGD Regression", SGDRegressor(random_state=0)),
        #('RandomForestRegressor', RandomForestRegressor(random_state=0)),
        ('GradienBoostingRegressor', GradientBoostingRegressor(random_state= 0)),
        #('SVR', SVR())
    ]

    # Fall back to the notebook-global frame for backward compatibility
    frame = data if surge_source is None else surge_source

    # The surge subset does not depend on the model, so build it once.
    # A plain NumPy boolean mask indexes both the prepared matrix (dense or
    # sparse) and the target positionally, without relying on index alignment.
    surge_mask = (frame.loc[valid_indices, 'surge_multiplier'] > 1).to_numpy()
    X_valid_surge = X_valid_prepared[surge_mask]
    y_valid_surge = y_valid[surge_mask]

    for name, model in models:
        model.fit(X_train_prepared, y_train)

        # Predict and evaluate on surge rows only
        y_pred_surge = model.predict(X_valid_surge)
        mse_surge = mean_squared_error(y_valid_surge, y_pred_surge)
        r2_surge = r2_score(y_valid_surge, y_pred_surge)
        print(f"{name} (Surge Data) - MSE: {mse_surge:.2f}, R^2: {r2_surge:.2f}")

    



In [17]:
# Train on every row of user_data (surge and non-surge), then score only the
# validation rows whose surge_multiplier is above 1.
# NOTE(review): the saved output of this cell contains shape tuples that
# run_models_surge, as written, does not print — the output looks stale; re-run to refresh.
X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns,valid_indices = data_transform_surge(user_data)
run_models_surge(X_train_prepared, y_train, X_valid_prepared, y_valid, valid_indices)

(4151,) (127596,)
Linear Regression (Surge Data) - MSE: 60.98, R^2: 0.69
(4151,) (127596,)
Ridge Regression (Surge Data) - MSE: 60.98, R^2: 0.69
(4151,) (127596,)
Lasso Regression (Surge Data) - MSE: 142.10, R^2: 0.29
(4151,) (127596,)
Decision Tree Regression (Surge Data) - MSE: 19.28, R^2: 0.90
(4151,) (127596,)
SGD Regression (Surge Data) - MSE: 62.45, R^2: 0.69
(4151,) (127596,)
GradienBoostingRegressor (Surge Data) - MSE: 29.87, R^2: 0.85


In [27]:
# Pair each preprocessed feature name with the decision tree's importance
# score (from the most recently executed run_models call), largest first
features_df = (
    pd.DataFrame({'feature': final_columns, "importance": feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Show the ranked table as plain text
print(features_df)


                                     feature  importance
34                  cat__car_type_Luxury SUV    0.560326
33                      cat__car_type_Luxury    0.204222
32                     cat__car_type_Base XL    0.067297
19                 cat__destination_Back Bay    0.018861
25                cat__destination_North End    0.016605
23       cat__destination_Financial District    0.014897
7                       cat__source_Back Bay    0.012584
11            cat__source_Financial District    0.012466
16                 cat__source_South Station    0.009621
12              cat__source_Haymarket Square    0.009144
28            cat__destination_South Station    0.007074
24         cat__destination_Haymarket Square    0.007062
35                      cat__car_type_Shared    0.006627
21        cat__destination_Boston University    0.006105
5                         cat__cab_type_Lyft    0.006102
13                     cat__source_North End    0.005195
27  cat__destination_Northeaste

In [12]:
# Experiment: train and score on all of user_data, so surge_multiplier is a
# feature and surge rows are included. user_data_surge is an alias, not a copy.
user_data_surge = user_data
X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns = data_transform(user_data_surge)
model, feature_importances = run_models(X_train_prepared, y_train, X_valid_prepared, y_valid)

Linear Regression - MSE: 14.02, R^2: 0.84
Ridge Regression - MSE: 14.02, R^2: 0.84
Lasso Regression - MSE: 40.33, R^2: 0.54
Decision Tree Regression - MSE: 5.10, R^2: 0.94
SGD Regression - MSE: 14.07, R^2: 0.84
GradienBoostingRegressor - MSE: 8.27, R^2: 0.90


In [29]:
# Pair each preprocessed feature name with the decision tree's importance
# score (from the most recently executed run_models call), largest first
features_df = (
    pd.DataFrame({'feature': final_columns, "importance": feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Show the ranked table as plain text
print(features_df)


                                     feature  importance
33                  cat__car_type_Luxury SUV    0.433046
4                      num__surge_multiplier    0.165543
32                      cat__car_type_Luxury    0.143077
30                        cat__car_type_Base    0.038293
22       cat__destination_Financial District    0.028351
15                 cat__source_South Station    0.025064
24                cat__destination_North End    0.019416
6                       cat__source_Back Bay    0.012812
10            cat__source_Financial District    0.012327
27            cat__destination_South Station    0.010119
20        cat__destination_Boston University    0.009949
25            cat__destination_North Station    0.009560
18                 cat__destination_Back Bay    0.009421
23         cat__destination_Haymarket Square    0.009057
12                     cat__source_North End    0.008862
16              cat__source_Theatre District    0.007723
3                           num

In [30]:
# Depth of the decision tree left in `model` by the most recently executed run_models call
model.get_depth()

32

In [33]:
# Check for overfitting by scoring the model on the training set.
# NOTE(review): `model`, `X_train_prepared` and `y_train` are whatever the most
# recently executed modelling cell left in the kernel — confirm they come from
# the same run before comparing these numbers with the validation scores.

# Predict on the training set
y_pred_train = model.predict(X_train_prepared)

# Evaluate the fit on the training data
mse = mean_squared_error(y_train, y_pred_train)
r2 = r2_score(y_train, y_pred_train)
print(f"MSE: {mse:.2f}, R^2: {r2:.2f}")

MSE: 7.52, R^2: 0.96


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the decision tree on the base data.
# A dedicated name is used for this frame: rebinding the notebook-wide `data`
# here would change what run_models_surge reads through its global lookup.
base_data = get_base_data()

# Separate the features (X) and the target variable (y)
X = base_data.drop('price', axis=1)
y = base_data['price']


# Preprocessing for numerical features
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
numerical_transformer = StandardScaler()

# Preprocessing for categorical features
categorical_cols = [cname for cname in X.columns if X[cname].dtype == 'object']
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Preprocessing lives inside the pipeline so it is re-fitted on each
# cross-validation training fold
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', DecisionTreeRegressor(random_state=0))])

# Define the parameter grid to search
param_grid = {
    'regressor__max_depth': [3, 5, 10, 20, 30],
    'regressor__min_samples_split': [2, 4, 6]
}

# Set up the grid search with 5-fold cross-validation, scored by negative MSE
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')

# Hold out 20% for a final check; the search only sees the training split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best estimator (the full pipeline, refitted on X_train)
best_model = grid_search.best_estimator_

# Predict and evaluate on the held-out validation split
y_pred_valid = best_model.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred_valid)
r2 = r2_score(y_valid, y_pred_valid)

print(f"Best model - MSE: {mse:.2f}, R^2: {r2:.2f}")


Best model - MSE: 4.70, R^2: 0.94


In [9]:
# Hyper-parameter combination with the best cross-validated score.
# NOTE(review): if the selected max_depth is the largest value in param_grid,
# the optimum may lie beyond the grid — consider widening it.
grid_search.best_params_

{'regressor__max_depth': 30, 'regressor__min_samples_split': 6}

In [20]:
import pandas as pd

# Average price per (source, destination, car_type), computed separately for
# rows without surge and rows with surge
df = get_cleaned_data()
route_keys = ['source', 'destination', 'car_type']

# Rows where surge_multiplier equals 1
df_surge_1 = df.loc[df['surge_multiplier'] == 1]
mean_price_surge_1 = df_surge_1.groupby(route_keys)['price'].mean().reset_index()

# Rows where surge_multiplier is above 1
df_surge_greater_1 = df.loc[df['surge_multiplier'] > 1]
mean_price_surge_greater_1 = df_surge_greater_1.groupby(route_keys)['price'].mean().reset_index()

# Output the results
print("Mean Price for Surge = 1:\n", mean_price_surge_1)
print("\nMean Price for Surge > 1:\n", mean_price_surge_greater_1)


Mean Price for Surge = 1:
        source        destination                car_type      price
0    Back Bay  Boston University                    Base   8.002237
1    Back Bay  Boston University                 Base XL  13.023863
2    Back Bay  Boston University                  Luxury  16.054951
3    Back Bay  Boston University              Luxury SUV  25.960104
4    Back Bay  Boston University                  Shared   6.374568
..        ...                ...                     ...        ...
427  West End      South Station                 Base XL  14.167363
428  West End      South Station                  Luxury  18.524408
429  West End      South Station              Luxury SUV  28.076548
430  West End      South Station                  Shared   7.013661
431  West End      South Station  Wheel Chair Accessible   8.378272

[432 rows x 4 columns]

Mean Price for Surge > 1:
        source              destination    car_type      price
0    Back Bay        Boston University     

In [21]:
# Display the per-group mean prices for rows with surge_multiplier == 1
mean_price_surge_1

Unnamed: 0,source,destination,car_type,price
0,Back Bay,Boston University,Base,8.002237
1,Back Bay,Boston University,Base XL,13.023863
2,Back Bay,Boston University,Luxury,16.054951
3,Back Bay,Boston University,Luxury SUV,25.960104
4,Back Bay,Boston University,Shared,6.374568
...,...,...,...,...
427,West End,South Station,Base XL,14.167363
428,West End,South Station,Luxury,18.524408
429,West End,South Station,Luxury SUV,28.076548
430,West End,South Station,Shared,7.013661


In [22]:
# Display the per-group mean prices for rows with surge_multiplier > 1
mean_price_surge_greater_1


Unnamed: 0,source,destination,car_type,price
0,Back Bay,Boston University,Base,11.160377
1,Back Bay,Boston University,Base XL,17.674528
2,Back Bay,Boston University,Luxury,23.599057
3,Back Bay,Boston University,Luxury SUV,40.344340
4,Back Bay,Fenway,Base,10.849057
...,...,...,...,...
283,West End,Northeastern University,Luxury SUV,46.771429
284,West End,South Station,Base,10.592593
285,West End,South Station,Base XL,17.944444
286,West End,South Station,Luxury,23.444444


In [23]:
# Inner-join the two mean-price tables on the grouping keys; groups that have
# no surge rows (or no non-surge rows) are left out. The overlapping `price`
# column becomes price_x (no surge, left) and price_y (surge, right).
merged_surge_info = pd.merge(
    mean_price_surge_1,
    mean_price_surge_greater_1,
    how='inner',
    on=['source', 'destination', 'car_type'],
)

In [26]:
# price_x comes from the left frame of the merge (surge_multiplier == 1) and
# price_y from the right frame (surge_multiplier > 1), so mean_diff is the
# no-surge mean minus the surge mean: a negative value means the surge rows
# were more expensive on average for that group.
merged_surge_info['mean_diff']= merged_surge_info['price_x'] - merged_surge_info['price_y']

In [33]:
# Unweighted average of the per-group differences: every
# (source, destination, car_type) group counts once, regardless of how many rides it has
merged_surge_info['mean_diff'].mean()

-7.35146569832585