In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Read the cleaned dataset from the Cleaning_Engineering folder
data = pd.read_csv(filepath_or_buffer="../Cleaning_Engineering/base_cleaned.csv")

# Preview the first five rows as the cell output
data.head(n=5)


Unnamed: 0.1,Unnamed: 0,distance,cab_type,source,destination,name,car_type,weekday,rush_hour,temp,clouds,pressure,rain,humidity,wind,is_raining,temp_groups,surge_multiplier,price
0,0,3.03,Lyft,Boston University,Theatre District,Lux Black XL,Luxury SUV,1,0,41.07,0.86,1014.39,,0.92,1.36,0,40,1.0,34.0
1,1,1.3,Uber,South Station,Theatre District,Black,Luxury,1,0,40.86,0.87,1014.39,,0.93,1.6,0,40,1.0,18.5
2,2,2.43,Lyft,Northeastern University,Beacon Hill,Lyft,Base,1,0,40.81,0.89,1014.35,,0.93,1.36,0,40,1.0,10.5
3,3,2.71,Uber,Theatre District,Fenway,UberXL,Base XL,1,0,40.8,0.87,1014.39,,0.93,1.55,0,40,1.0,32.0
4,4,2.71,Uber,Theatre District,Fenway,UberX,Base,1,0,40.8,0.87,1014.39,,0.93,1.55,0,40,1.0,19.5


In [6]:
# Remove the 'Unnamed: 0' and 'name' columns before modelling.
data = data.drop(columns=['Unnamed: 0', "name"])

# Replace missing rain readings with 0.0. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the column selection:
# the chained in-place form is deprecated and leaves the frame unchanged when
# pandas Copy-on-Write is active.
data["rain"] = data["rain"].fillna(0.0)

# Hold out the last 10% of rows as `test_data`; the first 90% remain in `data`.
# The cut point is computed once, before `data` is rebound.
holdout_start = int(data.shape[0] * .9)
test_data = data.iloc[holdout_start:]
data = data.iloc[:holdout_start]



In [7]:
# Window sizes, as whole-row counts derived from the length of `data`:
# the first training window covers 40% of the rows, each test window 10%.
initial_train_size = int(0.4 * len(data))
test_window_size = int(0.1 * len(data))

# Define the function to perform a single sliding window split
def sliding_window_split(data, start_index, train_size, test_size):
    """Cut one fixed-length train window and the test window that follows it.

    The train window is ``data[start_index : start_index + train_size]`` and
    the test window is the next ``test_size`` items directly after it.
    Returns ``(train_data, test_data)``.
    """
    split_at = start_index + train_size
    train_window = slice(start_index, split_at)
    test_window = slice(split_at, split_at + test_size)
    return data[train_window], data[test_window]

# First sliding window: train on the leading block, test on the block right after it
sliding_train_data, sliding_test_data = sliding_window_split(
    data,
    start_index=0,
    train_size=initial_train_size,
    test_size=test_window_size,
)

# Show (train_shape, test_shape) as the cell output
sliding_train_data.shape, sliding_test_data.shape


((229671, 17), (57417, 17))

In [8]:
# Define the function to perform a single expanding window split
def expanding_window_split(data, start_index, end_index, test_size):
    """Cut a train window ending at ``end_index`` and the test window after it.

    The train window is ``data[start_index:end_index]``; the test window is the
    ``test_size`` items starting at ``end_index``.
    Returns ``(train_data, test_data)``.
    """
    test_stop = end_index + test_size
    return data[start_index:end_index], data[end_index:test_stop]

# First expanding window: train on rows [0, initial_train_size), test on the next window
expanding_train_data, expanding_test_data = expanding_window_split(
    data,
    start_index=0,
    end_index=initial_train_size,
    test_size=test_window_size,
)

# Show (train_shape, test_shape) as the cell output
expanding_train_data.shape, expanding_test_data.shape


((229671, 17), (57417, 17))

In [18]:
def data_transform(data):
    """Split ``data`` into train/validation sets and preprocess the features.

    The ``price`` column is the target; all remaining columns are candidate
    features. Numeric columns are standardised with ``StandardScaler`` and
    object/categorical columns are one-hot encoded (categories unseen during
    fitting are ignored at transform time). The preprocessor is fitted on the
    training split only and then applied to the validation split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``price`` column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train_prepared, y_train, X_valid_prepared, y_valid)`` where the
        ``*_prepared`` entries are the outputs of the fitted preprocessor.
    """
    # Separate the features (X) and the target variable (y)
    X = data.drop(columns='price')
    y = data['price']

    # Pick columns by dtype family rather than by exact dtype name, so that
    # e.g. int32/float32 or ``category`` columns are not silently left out.
    # Column order follows X.columns in both lists.
    numerical_cols = X.select_dtypes(include='number').columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # Bundle preprocessing for numerical and categorical features.
    # Columns in neither list are not passed to any transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ])

    # Split the dataset into training (80%) and validation (20%) sets.
    # NOTE(review): this is a seeded random split, not a chronological one;
    # confirm that is intended given the window-split helpers in this notebook.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.8, test_size=0.2, random_state=0)

    # Fit the preprocessing on the training data only, then reuse the fitted
    # preprocessor for the validation data.
    X_train_prepared = preprocessor.fit_transform(X_train)
    X_valid_prepared = preprocessor.transform(X_valid)

    return (X_train_prepared, y_train, X_valid_prepared, y_valid)


In [19]:
def run_models(X_train_prepared, y_train, X_valid_prepared, y_valid):
    """Fit a fixed set of regressors and report their validation metrics.

    Each model is fitted on the training data, used to predict the validation
    data, and scored with mean squared error and R^2. One summary line per
    model is printed.

    Parameters
    ----------
    X_train_prepared, y_train
        Preprocessed training features and the matching targets.
    X_valid_prepared, y_valid
        Preprocessed validation features and the matching targets.

    Returns
    -------
    list of tuple
        One ``(name, mse, r2)`` entry per model, in the order the models were
        fitted. Callers that only want the printed report can ignore it (in a
        notebook, end the call with ``;`` to suppress the echoed value).
    """
    # Define a list of (display name, estimator) pairs
    models = [
        ('Linear Regression', LinearRegression()),
        ('Ridge Regression', Ridge(random_state=0)),
        ('Lasso Regression', Lasso(random_state=0)),
        ('Decision Tree Regression', DecisionTreeRegressor(random_state=0)),
        ("SGD Regression", SGDRegressor(random_state=0)),
    ]

    # List to store results
    results = []

    # Loop through the list of models
    for name, model in models:
        # Fit the model
        model.fit(X_train_prepared, y_train)

        # Predict on the validation set
        y_pred_valid = model.predict(X_valid_prepared)

        # Evaluate the model
        mse = mean_squared_error(y_valid, y_pred_valid)
        r2 = r2_score(y_valid, y_pred_valid)

        # Store the results
        results.append((name, mse, r2))

        # Print the results
        print(f"{name} - MSE: {mse:.2f}, R^2: {r2:.2f}")

    # Return the collected metrics so they can be compared or tabulated later
    # instead of being discarded when the function exits.
    return results

In [11]:
# Baseline: fit and score every model on the full (surge + non-surge) data
(X_train_prepared, y_train, X_valid_prepared, y_valid) = data_transform(data=data)
run_models(
    X_train_prepared=X_train_prepared,
    y_train=y_train,
    X_valid_prepared=X_valid_prepared,
    y_valid=y_valid,
)

done with training set
Linear Regression - MSE: 7.56, R^2: 0.91
Ridge Regression - MSE: 7.56, R^2: 0.91
Lasso Regression - MSE: 30.94, R^2: 0.64
Decision Tree Regression - MSE: 7.80, R^2: 0.91
SGD Regression - MSE: 7.58, R^2: 0.91


In [17]:
# Rides without surge pricing: keep rows whose surge multiplier equals 1.0
data_no_surge = data.loc[data["surge_multiplier"].eq(1.0)]
(X_train_prepared, y_train, X_valid_prepared, y_valid) = data_transform(data=data_no_surge)
run_models(
    X_train_prepared=X_train_prepared,
    y_train=y_train,
    X_valid_prepared=X_valid_prepared,
    y_valid=y_valid,
)

done with training set
Linear Regression - MSE: 6.44, R^2: 0.92
Ridge Regression - MSE: 6.44, R^2: 0.92
Lasso Regression - MSE: 29.08, R^2: 0.63
Decision Tree Regression - MSE: 7.38, R^2: 0.91
SGD Regression - MSE: 6.45, R^2: 0.92


In [20]:
# Surge rides only: keep rows whose surge multiplier differs from 1.0
data_surge = data.loc[data["surge_multiplier"].ne(1.0)]
(X_train_prepared, y_train, X_valid_prepared, y_valid) = data_transform(data=data_surge)
run_models(
    X_train_prepared=X_train_prepared,
    y_train=y_train,
    X_valid_prepared=X_valid_prepared,
    y_valid=y_valid,
)

Linear Regression - MSE: 20.16, R^2: 0.90
Ridge Regression - MSE: 20.16, R^2: 0.90
Lasso Regression - MSE: 39.99, R^2: 0.79
Decision Tree Regression - MSE: 24.21, R^2: 0.87
SGD Regression - MSE: 20.26, R^2: 0.90


In [21]:
# Inspect the non-surge subset (rows with surge_multiplier == 1.0); the bare
# expression makes the frame this cell's displayed output.
data_no_surge

Unnamed: 0,distance,cab_type,source,destination,car_type,weekday,rush_hour,temp,clouds,pressure,rain,humidity,wind,is_raining,temp_groups,surge_multiplier,price
0,3.03,Lyft,Boston University,Theatre District,Luxury SUV,1,0,41.07,0.86,1014.39,0.0,0.92,1.36,0,40,1.0,34.0
1,1.30,Uber,South Station,Theatre District,Luxury,1,0,40.86,0.87,1014.39,0.0,0.93,1.60,0,40,1.0,18.5
2,2.43,Lyft,Northeastern University,Beacon Hill,Base,1,0,40.81,0.89,1014.35,0.0,0.93,1.36,0,40,1.0,10.5
3,2.71,Uber,Theatre District,Fenway,Base XL,1,0,40.80,0.87,1014.39,0.0,0.93,1.55,0,40,1.0,32.0
4,2.71,Uber,Theatre District,Fenway,Base,1,0,40.80,0.87,1014.39,0.0,0.93,1.55,0,40,1.0,19.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574173,2.71,Uber,Back Bay,North End,Base,1,0,38.84,1.00,1005.53,0.0,0.90,10.93,0,30,1.0,11.0
574174,1.00,Uber,Haymarket Square,Financial District,Shared,1,0,38.86,1.00,1005.53,0.0,0.90,11.11,0,30,1.0,6.5
574175,1.44,Uber,Back Bay,Boston University,Base,1,0,38.84,1.00,1005.53,0.0,0.90,10.93,0,30,1.0,8.0
574176,1.44,Uber,Back Bay,Boston University,Shared,1,0,38.84,1.00,1005.53,0.0,0.90,10.93,0,30,1.0,8.0


In [22]:
# Inspect the surge subset (rows with surge_multiplier != 1.0); the bare
# expression makes the frame this cell's displayed output.
data_surge

Unnamed: 0,distance,cab_type,source,destination,car_type,weekday,rush_hour,temp,clouds,pressure,rain,humidity,wind,is_raining,temp_groups,surge_multiplier,price
9,4.46,Lyft,Boston University,Financial District,Luxury,1,0,41.07,0.86,1014.39,0.0,0.92,1.36,0,40,1.75,47.5
10,4.46,Lyft,Boston University,Financial District,Luxury,1,0,41.07,0.86,1014.39,0.0,0.92,1.36,0,40,1.75,62.5
42,2.40,Lyft,Fenway,Beacon Hill,Base XL,1,0,40.84,0.88,1014.35,0.0,0.93,1.31,0,40,1.25,19.5
50,1.66,Lyft,Back Bay,Fenway,Luxury,1,0,41.04,0.87,1014.39,0.0,0.92,1.46,0,40,1.75,26.0
78,2.70,Lyft,Boston University,Beacon Hill,Luxury,1,0,40.66,0.86,1014.17,0.0,0.92,2.55,0,40,1.25,22.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574107,2.41,Lyft,Northeastern University,Beacon Hill,Base,1,0,38.65,1.00,1005.53,0.0,0.91,10.88,0,30,1.25,10.5
574153,1.50,Lyft,North Station,Haymarket Square,Luxury,1,0,38.77,1.00,1005.54,0.0,0.91,11.06,0,30,1.25,16.5
574155,3.53,Lyft,Boston University,North Station,Base XL,1,0,38.77,1.00,1005.57,0.0,0.90,10.69,0,30,1.50,26.0
574158,1.69,Lyft,Theatre District,North End,Base,1,0,38.74,1.00,1005.51,0.0,0.91,11.09,0,30,2.00,16.5


In [29]:
# Mean surge price for Lyft "Base" rides from Fenway to Beacon Hill
data_surge.loc[
    data_surge["cab_type"].eq("Lyft")
    & data_surge["source"].eq("Fenway")
    & data_surge["destination"].eq("Beacon Hill")
    & data_surge["car_type"].eq("Base"),
    "price",
].mean()


12.476190476190476

In [32]:
# Surge rows for Lyft "Base" rides from Fenway to Beacon Hill
data_surge.loc[
    data_surge["cab_type"].eq("Lyft")
    & data_surge["source"].eq("Fenway")
    & data_surge["destination"].eq("Beacon Hill")
    & data_surge["car_type"].eq("Base")
]

Unnamed: 0,distance,cab_type,source,destination,car_type,weekday,rush_hour,temp,clouds,pressure,rain,humidity,wind,is_raining,temp_groups,surge_multiplier,price
3396,2.47,Lyft,Fenway,Beacon Hill,Base,1,1,40.97,0.98,1014.26,0.0000,0.92,1.37,0,40,1.25,10.5
22876,2.41,Lyft,Fenway,Beacon Hill,Base,1,0,44.32,0.91,1008.99,0.0030,0.87,9.36,1,40,1.50,13.5
23261,2.36,Lyft,Fenway,Beacon Hill,Base,1,0,44.32,0.91,1008.99,0.0030,0.87,9.36,1,40,1.25,11.0
24318,2.37,Lyft,Fenway,Beacon Hill,Base,1,0,44.49,0.97,1008.08,0.0330,0.88,9.10,1,40,2.00,16.5
26530,2.37,Lyft,Fenway,Beacon Hill,Base,1,0,44.51,1.00,1008.36,0.0360,0.88,9.42,1,40,1.50,13.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535746,2.41,Lyft,Fenway,Beacon Hill,Base,0,1,39.07,0.44,1022.78,0.0000,0.75,7.08,0,30,1.50,10.5
549503,2.40,Lyft,Fenway,Beacon Hill,Base,0,0,41.80,0.87,1020.11,0.0000,0.75,9.48,0,40,1.25,10.5
554655,2.39,Lyft,Fenway,Beacon Hill,Base,0,0,42.97,0.99,1016.12,0.0000,0.74,9.78,0,40,1.50,11.0
565629,2.47,Lyft,Fenway,Beacon Hill,Base,1,0,40.15,1.00,1010.91,0.0151,0.91,10.08,1,40,1.25,11.0


In [30]:
# Mean non-surge price for Lyft "Base" rides from Fenway to Beacon Hill
data_no_surge.loc[
    data_no_surge["cab_type"].eq("Lyft")
    & data_no_surge["source"].eq("Fenway")
    & data_no_surge["destination"].eq("Beacon Hill")
    & data_no_surge["car_type"].eq("Base"),
    "price",
].mean()


9.52158273381295

In [35]:
# Non-surge rows for Lyft "Base" rides from Fenway to Beacon Hill
data_no_surge.loc[
    data_no_surge["cab_type"].eq("Lyft")
    & data_no_surge["source"].eq("Fenway")
    & data_no_surge["destination"].eq("Beacon Hill")
    & data_no_surge["car_type"].eq("Base")
]


Unnamed: 0,distance,cab_type,source,destination,car_type,weekday,rush_hour,temp,clouds,pressure,rain,humidity,wind,is_raining,temp_groups,surge_multiplier,price
428,2.38,Lyft,Fenway,Beacon Hill,Base,1,0,40.54,0.94,1013.75,0.000,0.92,2.78,0,40,1.0,9.0
800,2.39,Lyft,Fenway,Beacon Hill,Base,1,0,40.17,0.91,1013.78,0.000,0.94,2.67,0,40,1.0,10.5
1048,2.42,Lyft,Fenway,Beacon Hill,Base,1,0,40.26,1.00,1014.14,0.000,0.92,0.65,0,40,1.0,9.0
2574,2.40,Lyft,Fenway,Beacon Hill,Base,1,1,40.99,0.98,1014.34,0.000,0.93,1.78,0,40,1.0,9.0
4984,2.44,Lyft,Fenway,Beacon Hill,Base,1,0,40.65,1.00,1014.74,0.000,0.94,0.75,0,40,1.0,10.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564884,2.40,Lyft,Fenway,Beacon Hill,Base,1,0,41.55,0.96,1011.23,0.063,0.84,10.58,1,40,1.0,10.5
565200,2.41,Lyft,Fenway,Beacon Hill,Base,1,0,41.55,0.96,1011.23,0.063,0.84,10.58,1,40,1.0,9.0
567320,2.40,Lyft,Fenway,Beacon Hill,Base,1,0,39.48,1.00,1009.76,0.007,0.93,10.11,1,30,1.0,10.5
571700,2.42,Lyft,Fenway,Beacon Hill,Base,1,0,38.73,1.00,1007.25,0.000,0.91,11.04,0,30,1.0,9.0


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=844e6b55-b111-410c-b292-5db3c2123d3f' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>