In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score



# Load the ride data from a CSV file located in the working directory.
# NOTE(review): the displayed frame contains `Unnamed: 0.1` and `Unnamed: 0`
# columns — presumably row indices written out by earlier saves; confirm.
data = pd.read_csv("base_cleaned.csv")

# Show the first few rows of the dataframe
data.head()


Unnamed: 0.1,Unnamed: 0,distance,cab_type,source,destination,name,car_type,weekday,rush_hour,temp,clouds,pressure,rain,humidity,wind,is_raining,temp_groups,surge_multiplier,price
0,0,3.03,Lyft,Boston University,Theatre District,Lux Black XL,Luxury SUV,1,0,41.07,0.86,1014.39,,0.92,1.36,0,40,1.0,34.0
1,1,1.3,Uber,South Station,Theatre District,Black,Luxury,1,0,40.86,0.87,1014.39,,0.93,1.6,0,40,1.0,18.5
2,2,2.43,Lyft,Northeastern University,Beacon Hill,Lyft,Base,1,0,40.81,0.89,1014.35,,0.93,1.36,0,40,1.0,10.5
3,3,2.71,Uber,Theatre District,Fenway,UberXL,Base XL,1,0,40.8,0.87,1014.39,,0.93,1.55,0,40,1.0,32.0
4,4,2.71,Uber,Theatre District,Fenway,UberX,Base,1,0,40.8,0.87,1014.39,,0.93,1.55,0,40,1.0,19.5


In [15]:
def data_transform(data):
    """Split ``data`` into train/validation sets and preprocess the features.

    The ``price`` column is the target; every other column is a candidate
    feature. ``int64``/``float64`` columns are standardized and ``object``
    columns are one-hot encoded. Columns of any other dtype are not handed to
    either transformer, so ``ColumnTransformer`` (default ``remainder='drop'``)
    leaves them out of the prepared matrices.

    The preprocessor is fitted on the training split only and then applied to
    the validation split, so no validation statistics leak into the scaling
    or the encoder categories.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``price`` column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns)``
        where ``final_columns`` holds the transformed feature names reported by
        the fitted preprocessor (``get_feature_names_out()``).

    Raises
    ------
    KeyError
        If ``data`` has no ``price`` column.
    """
    # Separate the features (X) and the target variable (y)
    X = data.drop('price', axis=1)
    y = data['price']

    # Numerical features are scaled to zero mean / unit variance
    numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
    numerical_transformer = StandardScaler()

    # Categorical features are one-hot encoded; categories unseen during fit
    # are encoded as all zeros instead of raising at transform time
    categorical_cols = [cname for cname in X.columns if X[cname].dtype == 'object']
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    # Bundle preprocessing for numerical and categorical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)])

    # Split the dataset into training (80%) and validation (20%) sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

    # Fit the preprocessor on the training data only ...
    X_train_prepared = preprocessor.fit_transform(X_train)

    # ... and reuse the fitted parameters on the validation data
    X_valid_prepared = preprocessor.transform(X_valid)

    final_columns = preprocessor.get_feature_names_out()
    return (X_train_prepared, y_train, X_valid_prepared, y_valid, final_columns)


In [22]:
def run_models(X_train_prepared, y_train, X_valid_prepared, y_valid):
    """Fit several regressors, print their validation scores, return the tree.

    Each model is fitted on the training data and scored on the validation
    data; one line per model is printed with its MSE and R^2.

    Parameters
    ----------
    X_train_prepared, X_valid_prepared
        Preprocessed feature matrices for the training / validation splits.
    y_train, y_valid
        Target values matching the two feature matrices.

    Returns
    -------
    tuple
        ``(model, feature_importances)`` where ``model`` is the fitted
        ``DecisionTreeRegressor`` and ``feature_importances`` is its
        ``feature_importances_`` attribute. The decision tree is returned
        regardless of how the other models scored.
    """
    # Keep a direct handle on the tree so the return value does not depend on
    # matching a display label inside the loop.
    tree_model = DecisionTreeRegressor(random_state=0)

    # Candidate models, fitted and reported in this order
    models = [
        ('Linear Regression', LinearRegression()),
        ('Ridge Regression', Ridge(random_state=0)),
        ('Lasso Regression', Lasso(random_state=0)),
        ('Decision Tree Regression', tree_model),
        ("SGD Regression", SGDRegressor(random_state=0)),
    ]

    for name, model in models:
        # Fit on the training split, predict on the validation split
        model.fit(X_train_prepared, y_train)
        y_pred_valid = model.predict(X_valid_prepared)

        # Evaluate the model
        mse = mean_squared_error(y_valid, y_pred_valid)
        r2 = r2_score(y_valid, y_pred_valid)

        # Print the results
        print(f"{name} - MSE: {mse:.2f}, R^2: {r2:.2f}")

    return (tree_model, tree_model.feature_importances_)

In [17]:
# NOTE: this cell rebinds `data`. Re-running it without reloading the CSV
# raises KeyError in drop() (columns already removed) and would shrink the
# split again.
data = data.drop(columns=['Unnamed: 0',"name"])

# Assign the filled column back instead of calling fillna(inplace=True) on the
# `data["rain"]` selection: under pandas Copy-on-Write the in-place call acts
# on a temporary object and leaves `data` unchanged.
data["rain"] = data["rain"].fillna(0.0)

# Hold out the last 10% of rows (by position) as `test_data`; the first 90%
# stay in `data`. The boundary is computed once so both slices agree.
split_at = int(data.shape[0]*.9)
test_data = data.iloc[split_at:]
data = data.iloc[:split_at]



In [18]:
# Columns kept for modelling: the selected features plus the `price` target
user_columns = [
    "cab_type",
    "source",
    "destination",
    "car_type",
    "weekday",
    "rush_hour",
    "is_raining",
    "temp_groups",
    "surge_multiplier",
    "price",
]
user_data = data[user_columns]

In [26]:
# Rows whose surge multiplier is exactly 1.0
no_surge_rows = user_data["surge_multiplier"].eq(1.0)
user_data_no_surge = user_data[no_surge_rows]

# Split + preprocess, then fit and score every candidate model
(X_train_prepared, y_train,
 X_valid_prepared, y_valid,
 final_columns) = data_transform(user_data_no_surge)
model, feature_importances = run_models(
    X_train_prepared, y_train, X_valid_prepared, y_valid
)

Linear Regression - MSE: 12.38, R^2: 0.84
Ridge Regression - MSE: 12.38, R^2: 0.84
Lasso Regression - MSE: 37.51, R^2: 0.52
Decision Tree Regression - MSE: 4.60, R^2: 0.94
SGD Regression - MSE: 12.40, R^2: 0.84


In [27]:
# Pair every transformed feature name with its importance, largest first
features_df = (
    pd.DataFrame({'feature': final_columns, "importance": feature_importances})
    .sort_values(by='importance', ascending=False)
)

# Display the feature importances
print(features_df)


                                     feature  importance
34                  cat__car_type_Luxury SUV    0.560326
33                      cat__car_type_Luxury    0.204222
32                     cat__car_type_Base XL    0.067297
19                 cat__destination_Back Bay    0.018861
25                cat__destination_North End    0.016605
23       cat__destination_Financial District    0.014897
7                       cat__source_Back Bay    0.012584
11            cat__source_Financial District    0.012466
16                 cat__source_South Station    0.009621
12              cat__source_Haymarket Square    0.009144
28            cat__destination_South Station    0.007074
24         cat__destination_Haymarket Square    0.007062
35                      cat__car_type_Shared    0.006627
21        cat__destination_Boston University    0.006105
5                         cat__cab_type_Lyft    0.006102
13                     cat__source_North End    0.005195
27  cat__destination_Northeaste

In [28]:
# Rows whose surge multiplier is anything other than 1.0
surge_rows = user_data["surge_multiplier"].ne(1.0)
user_data_surge = user_data[surge_rows]

# Split + preprocess, then fit and score every candidate model.
# This rebinds the same globals as the no-surge run (including `model`).
(X_train_prepared, y_train,
 X_valid_prepared, y_valid,
 final_columns) = data_transform(user_data_surge)
model, feature_importances = run_models(
    X_train_prepared, y_train, X_valid_prepared, y_valid
)

Linear Regression - MSE: 35.75, R^2: 0.81
Ridge Regression - MSE: 35.75, R^2: 0.81
Lasso Regression - MSE: 68.82, R^2: 0.64
Decision Tree Regression - MSE: 19.28, R^2: 0.90
SGD Regression - MSE: 35.73, R^2: 0.81


In [29]:
# Build the (feature name, importance) table for the most recent run
importance_table = pd.DataFrame(
    {'feature': final_columns, "importance": feature_importances}
)
# Most important features first
features_df = importance_table.sort_values(by='importance', ascending=False)

# Display the feature importances
print(features_df)


                                     feature  importance
33                  cat__car_type_Luxury SUV    0.433046
4                      num__surge_multiplier    0.165543
32                      cat__car_type_Luxury    0.143077
30                        cat__car_type_Base    0.038293
22       cat__destination_Financial District    0.028351
15                 cat__source_South Station    0.025064
24                cat__destination_North End    0.019416
6                       cat__source_Back Bay    0.012812
10            cat__source_Financial District    0.012327
27            cat__destination_South Station    0.010119
20        cat__destination_Boston University    0.009949
25            cat__destination_North Station    0.009560
18                 cat__destination_Back Bay    0.009421
23         cat__destination_Haymarket Square    0.009057
12                     cat__source_North End    0.008862
16              cat__source_Theatre District    0.007723
3                           num

In [30]:
# Depth of the fitted decision tree currently bound to `model`.
# NOTE(review): `model` is whatever the last run_models() call returned; in
# file order that is the surge-subset run — confirm this is the intended one.
model.get_depth()

32

In [33]:
# Check for overfitting by scoring the model on the data it was trained on

# Predict on the training set
y_pred_train = model.predict(X_train_prepared)

# Training-set metrics, for comparison with the validation metrics printed
# by run_models
mse, r2 = (
    mean_squared_error(y_train, y_pred_train),
    r2_score(y_train, y_pred_train),
)
print(f"MSE: {mse:.2f}, R^2: {r2:.2f}")

MSE: 7.52, R^2: 0.96


In [None]:
# TODO: take a deeper look at the decision tree regression

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=844e6b55-b111-410c-b292-5db3c2123d3f' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>