# In [2]:
#### Compare Regression Models ####

# Importing necessary libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
from plotnine import ggplot, aes, geom_col, xlab, ylab

# Load the train/test splits, discarding any row that has a missing value
carseats_train = pd.read_csv("carseats_train.csv").dropna()
carseats_test = pd.read_csv("carseats_test.csv").dropna()

# 'Sales' is the regression target. 'Sales_cat' is removed from the predictors
# as well (presumably a categorical version of Sales -- verify against the data).
response = 'Sales'
non_predictors = [response, 'Sales_cat']

# Predictors / response for fitting
X_train = carseats_train.drop(columns=non_predictors)
y_train = carseats_train[response]

# Predictors / response for evaluation
X_test = carseats_test.drop(columns=non_predictors)
y_test = carseats_test[response]

# Preprocessing step shared by the pipelines below: one-hot encode the
# categorical predictors and pass every other column through unchanged
categorical_columns = ["US", "Urban", "Shelf_Location"]
dummifier = OneHotEncoder(sparse_output=False)

ct = ColumnTransformer(
    transformers=[("dummify", dummifier, categorical_columns)],
    remainder="passthrough",
)

## Linear Regression with all explanatory variables
lr_steps = [
    ("preprocessing", ct),
    ("lr", LinearRegression(fit_intercept=True)),
]
lr_pipeline = Pipeline(steps=lr_steps)
# Ask the transformer steps to emit pandas DataFrames; set_output() configures
# the pipeline in place.
lr_pipeline.set_output(transform="pandas")

# Train on the training split
lr_fitted = lr_pipeline.fit(X_train, y_train)

# Score on the held-out test split (R^2)
y_pred = lr_fitted.predict(X_test)
r2_1 = r2_score(y_true=y_test, y_pred=y_pred)

## Decision Tree with Max Depth of 10
# random_state is pinned so the fitted tree -- and therefore the reported R^2 --
# is identical on every run. scikit-learn randomly permutes the features at
# each split, so without a fixed seed the result can change between runs.
tree_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("tree", DecisionTreeRegressor(max_depth=10, random_state=0)),
    ]
).set_output(transform="pandas")

# Train on the training split
tree_fitted = tree_pipeline.fit(X_train, y_train)

# Score on the held-out test split (R^2)
y_pred = tree_fitted.predict(X_test)
r2_2 = r2_score(y_test, y_pred)

## Random Forest with Min n of 15
# min_samples_split=15: a node must hold at least 15 samples to be split.
# random_state is pinned because the forest bootstraps rows and samples
# features at random; without a fixed seed the reported R^2 changes on
# every run and the model comparison is not reproducible.
rf_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("rf", RandomForestRegressor(min_samples_split=15, random_state=0)),
    ]
).set_output(transform="pandas")

# Train on the training split
rf_fitted = rf_pipeline.fit(X_train, y_train)

# Score on the held-out test split (R^2)
y_pred = rf_fitted.predict(X_test)
r2_3 = r2_score(y_test, y_pred)


## kNN with k = 10
# NOTE(review): the shared preprocessing step only one-hot encodes the
# categorical columns; the remaining columns are passed through on their
# original scales. kNN is distance-based, so large-scale predictors will
# dominate the neighbour search -- consider adding a scaling step before
# drawing conclusions from this model's score.
knn_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("knn", KNeighborsRegressor(n_neighbors=10)),
    ]
).set_output(transform="pandas")

# Train on the training split
knn_fitted = knn_pipeline.fit(X_train, y_train)

# Score on the held-out test split (R^2). Predict through the fitted handle,
# consistent with the other model sections.
y_pred = knn_fitted.predict(X_test)
r2_4 = r2_score(y_test, y_pred)

# Collect the test-set R^2 of each model into one comparison table
methods = ["MLR", "DT", "RF", "kNN"]
scores = [r2_1, r2_2, r2_3, r2_4]

d = {'method': methods, 'r2': scores}
results = pd.DataFrame(d)

print(results)

# Output:
#   method        r2
# 0    MLR  0.854617
# 1     DT  0.239659
# 2     RF  0.668748
# 3    kNN  0.121732
