In [25]:
# DATA MANIPULATION
import pandas as pd
pd.set_option("display.max_columns",None) # Show all columns of a Pandas DataFrame

# DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns

# STATISTICS
from statsmodels.graphics.gofplots import qqplot
# This function plots your sample distribution against a Normal distribution,
# to check whether your sample is normally distributed or not

# PIPELINE AND COLUMNTRANSFORMER
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import set_config; set_config(display="diagram")


# TRAIN/TEST SPLIT
from sklearn.model_selection import train_test_split

# SCALERS
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

# ENCODER
from sklearn.preprocessing import OneHotEncoder

# LINEAR MODELS
from sklearn.linear_model import Ridge, ElasticNet

# TREES AND ENSEMBLE METHODS
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

# METRICS
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score

#GridSearchCV
from sklearn.model_selection import GridSearchCV

In [26]:
df = pd.read_csv("../data/raw/data.csv")
# Drop irrelevant columns
df.drop(columns = ['car_ID', 'CarName'], inplace = True)
df["cylindernumber"] = df["cylindernumber"].map({"four":4,
                                                 "six":6,
                                                 "five":5,
                                                 "eight":8,
                                                 "two":2,
                                                 "twelve":12,
                                                 "three":3})

In [27]:
# Define features and target

X = df.drop(columns = ["price"])
y = df["price"]

In [28]:
X_cat = X.select_dtypes(include=['object'])
X_num = X.select_dtypes(exclude = ['object'])

In [29]:
# Column Transformer with the 3 Scaler

scalers = ColumnTransformer(
                                [
                                    ("robust_scaler", RobustScaler(), features_robust),
                                    ("standard_scaler", StandardScaler(), features_standard),
                                    ("minmax_scaler", MinMaxScaler(), features_minmax)
                                ])

In [30]:
# Pipeline (if we had NaN, we could add an Imputer)

num_transformer = make_pipeline(

                                scalers
                                            )

num_transformer

In [31]:
# Define the numerical and categorical features
num_features = X_num.columns
cat_features  = X_cat.columns
 
# Define the transformers for numerical and categorical features
num_transformer = make_pipeline(
                                scalers,
                            )

cat_transformer = make_pipeline(
                                OneHotEncoder(sparse_output=False, 
                                              handle_unknown='ignore')
                            )


# Create the preprocessor using `make_column_transformer`
preprocessor = make_column_transformer(
                                        (num_transformer, num_features),
                                        (cat_transformer, cat_features),
                                    )

preprocessor

In [32]:
# # Fitting to check result

fully_preprocessed_dataset = pd.DataFrame(preprocessor.fit_transform(X))
fully_preprocessed_dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45
0,-1.063291,0.0,0.227273,-2.033333,0.000,0.347826,-0.272727,1.743470,-0.426521,-0.844782,-2.020417,-0.014566,0.519071,-0.546059,0.346939,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.063291,0.0,0.227273,-2.033333,0.000,0.347826,-0.272727,1.743470,-0.426521,-0.844782,-2.020417,-0.014566,0.519071,-0.546059,0.346939,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.316456,2.0,0.727273,0.600000,0.000,1.282609,-0.454545,0.133509,-0.231513,-0.190566,-0.543527,0.514882,-2.404880,-0.691627,0.346939,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.354430,0.0,-0.250000,0.366667,1.250,0.152174,0.000000,0.938490,0.207256,0.136542,0.235942,-0.420797,-0.517266,-0.109354,0.551020,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.303797,1.0,0.363636,0.366667,-1.250,0.434783,-0.545455,0.938490,0.207256,0.230001,0.235942,0.516807,-0.517266,-1.273900,0.551020,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,1.531646,0.0,0.477273,-0.466667,0.625,0.413043,-0.090909,-1.476452,1.198549,1.398245,0.728239,0.763241,1.666445,-0.400490,0.510204,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
201,1.531646,0.0,0.477273,-0.466667,-0.375,1.413043,-0.454545,-1.476452,1.198549,1.351515,0.728239,0.949992,1.666445,-0.837195,0.469388,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
202,1.531646,2.0,1.204545,-1.400000,-0.250,0.847826,-0.545455,-1.476452,1.198549,1.398245,0.728239,0.878757,0.926204,-1.128332,0.551020,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
203,1.531646,2.0,0.568182,0.366667,17.500,0.239130,0.181818,-1.476452,1.198549,1.398245,0.728239,1.273437,-1.183483,-0.546059,0.265306,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


process is done, now modelling

In [33]:
def df_regression_models(regression_model):
    
    piped_regressor = make_pipeline(preprocessor, regression_model)
    
    return piped_regressor

In [34]:
models_names = ["Ridge",
                "ElasticNet",
                "DecisionTreeRegressor",
                "RandomForestRegressor",
                "AdaBoostRegressor",
                "GradientBoostingRegressor",
]

In [36]:
%%time


random_seed = 42

models = [
    Ridge(alpha=1.0),
    ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=10000),  # Increase max_iter
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
]

# Now, split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)


# Lists to store the test scores and MSE for each model
different_test_scores = []
different_mse_scores = []

for model_name, model in zip(models_names, models):
    # Create the model pipeline
    temp_piped_regressor = df_regression_models(model)

    # Fit the model on the training data
    temp_piped_regressor.fit(X_train, y_train)

    # Predict on the test data
    y_pred = temp_piped_regressor.predict(X_test)

    # Calculate the R-squared score
    test_score = r2_score(y_test, y_pred)

    # Calculate the Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, y_pred)

    # Append the test scores and MSE to the respective lists
    different_test_scores.append({'Model Name': model_name, 'R2': test_score})
    different_mse_scores.append({'Model Name': model_name, 'MSE': mse})

# Create dataframes to store the test scores and MSE
comparing_regression_models_df = pd.DataFrame(different_test_scores)
comparing_regression_models_df_mse = pd.DataFrame(different_mse_scores)

# Sort the dataframes by test_score and MSE in descending order
comparing_regression_models_df = comparing_regression_models_df.sort_values(by='R2', ascending=False)
comparing_regression_models_df_mse = comparing_regression_models_df_mse.sort_values(by='MSE', ascending=True)

# Display the results
print("R-squared scores:")
print(round(comparing_regression_models_df, 2))
print("\nMean Squared Errors:")
print(round(comparing_regression_models_df_mse, 2))

R-squared scores:
                  Model Name    R2
3      RandomForestRegressor  0.96
5  GradientBoostingRegressor  0.93
4          AdaBoostRegressor  0.91
2      DecisionTreeRegressor  0.90
0                      Ridge  0.84
1                 ElasticNet  0.79

Mean Squared Errors:
                  Model Name          MSE
3      RandomForestRegressor   3399224.75
5  GradientBoostingRegressor   5777241.91
4          AdaBoostRegressor   6918812.37
2      DecisionTreeRegressor   7602612.39
0                      Ridge  12742125.31
1                 ElasticNet  16462985.37
CPU times: total: 1.3 s
Wall time: 724 ms


cross-validation

In [37]:
# Cross Validation


# Lists to store the cross-validation scores for each model
different_test_scores_cv = []

# Perform 5-fold cross-validation for each regression model
for model_name, model in zip(models_names, models):
    # Create the model pipeline
    temp_piped_regressor = df_regression_models(model)
    
    # Calculate cross-validation scores using the entire dataset (X, y)
    scores = cross_val_score(temp_piped_regressor, X, y, cv=5, scoring='r2')

    
    # Calculate the mean of cross-validation scores for the current model
    mean_cv_score = scores.mean()
    
    # Append the mean cross-validation score to the list
    different_test_scores_cv.append(mean_cv_score)

# Create a DataFrame to compare the models based on their cross-validation scores
comparing_regression_models_df_cv = pd.DataFrame(list(zip(models_names, different_test_scores_cv)),
                                                  columns=['Model Name', 'cross_val_R2'])

# Sort the DataFrame by cross-validation score in descending order
comparing_regression_models_df_cv = comparing_regression_models_df_cv.sort_values(by="cross_val_R2", ascending=False)

# Display the DataFrame with the results
round(comparing_regression_models_df_cv, 2)

Unnamed: 0,Model Name,cross_val_R2
1,ElasticNet,0.57
5,GradientBoostingRegressor,0.49
3,RandomForestRegressor,0.39
4,AdaBoostRegressor,0.3
0,Ridge,0.28
2,DecisionTreeRegressor,0.21
