In [1]:
#Imports
import functions as func
import yaml


#Libraries
from sklearn.datasets import  fetch_california_housing
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error



%matplotlib inline

# Look up the data file locations through the project helper
# (presumably parsed from a YAML config -- confirm in functions.py)
db_locations = func.import_yaml()

# Read the cleaned training set into a DataFrame
train_csv_path = db_locations['data_clean']['train']
df = pd.read_csv(train_csv_path)

In [2]:
# Preview the first rows of the freshly loaded training data
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,Automatic,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,Automatic,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,Gasoline,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,Automatic,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Automatic,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,Automatic,Black,Beige,None reported,Yes,97500


In [3]:
# (rows, columns) of the training data as loaded
df.shape

(187765, 13)

In [4]:
# Drop the identifier and the descriptive columns that are not used as model features
columns_to_drop = [
    'id',
    'model',
    'engine',
    'ext_col',
    'int_col',
    'clean_title',
    'accident',
    'transmission',
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Columns to one-hot encode (model_year is handled as a category here, not as a number)
categorical_columns = ['brand', 'model_year', 'fuel_type']

# Replace each categorical column with one indicator column per distinct value
df = pd.get_dummies(df, columns=categorical_columns)

# Cast any boolean indicator columns to integer 1/0
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype(dict.fromkeys(bool_columns, 'int'))

df.head()

Unnamed: 0,milage,price,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,...,model_year_2019,model_year_2020,model_year_2021,model_year_2022,model_year_2023,model_year_2024,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid
0,213000,4200,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,143250,4999,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,136731,13900,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,19500,45000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,7388,97500,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [6]:
# (rows, columns) after one-hot encoding
df.shape

(187765, 97)

<br>
<br>

**Train Test Split**

In [8]:
# Separate the prediction target from the predictors
target = df["price"]                   # value the models learn to predict
features = df.drop("price", axis=1)    # every remaining column is a feature

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Report the dimensions of each piece of the split, in the same order as before
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split_data.shape}")

X_train shape: (150212, 96)
y_train shape: (150212,)
X_test shape: (37553, 96)
y_test shape: (37553,)


<br>
<br>

**Standardization and Normalization**

In [12]:
# Create the min-max scaler used to rescale the features (fitted in the next cell)
normalizer = MinMaxScaler()

In [13]:
# Fit the scaler on the training split only, so the test split does not influence the scaling
normalizer.fit(X_train)

In [14]:
# Apply the scaling learned from the training split to both splits
X_train_norm = normalizer.transform(X_train)

X_test_norm = normalizer.transform(X_test)

In [15]:
# Wrap the scaled values back into a DataFrame. Besides the column names, restore the
# original row labels: without `index=` the frame gets a fresh 0..n-1 index and is no
# longer label-aligned with y_train (any pandas join/concat of the two would mismatch rows).
X_train_norm = pd.DataFrame(X_train_norm, columns=X_train.columns, index=X_train.index)
X_train_norm.head()

Unnamed: 0,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,...,model_year_2019,model_year_2020,model_year_2021,model_year_2022,model_year_2023,model_year_2024,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid
0,0.304767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.188676,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.126451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.217091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.056135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# Wrap the scaled values back into a DataFrame. Besides the column names, restore the
# original row labels: without `index=` the frame gets a fresh 0..n-1 index and is no
# longer label-aligned with y_test (any pandas join/concat of the two would mismatch rows).
X_test_norm = pd.DataFrame(X_test_norm, columns=X_test.columns, index=X_test.index)
X_test_norm.head()

Unnamed: 0,milage,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,...,model_year_2019,model_year_2020,model_year_2021,model_year_2022,model_year_2023,model_year_2024,fuel_type_Diesel,fuel_type_Electric,fuel_type_Gasoline,fuel_type_Hybrid
0,0.602371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.237837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.063966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.208447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.059126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Bagging and Pasting

In [18]:
# Bagging ensemble: 100 trees of depth <= 20, each trained on a sample of 1,000 rows.
# random_state pins the resampling so the reported metrics are reproducible across runs
# (the train/test split above is already seeded with 0).
bagging_reg = BaggingRegressor(
    DecisionTreeRegressor(max_depth=20),
    n_estimators=100,
    max_samples=1000,
    random_state=0,
)

In [None]:
# Train the bagging ensemble on the normalized training features
bagging_reg.fit(X_train_norm, y_train)

Evaluate the model's performance

In [None]:
# Predict once on the held-out test split and reuse the predictions for every metric
pred = bagging_reg.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it raises TypeError on current versions.
print(f"MAE {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
# Same R^2 as bagging_reg.score(X_test_norm, y_test), without predicting a second time
print(f"R2 score {r2_score(y_test, pred): .2f}")

### Random Forest

In [None]:
# Random forest of 100 trees, each limited to depth 20.
# random_state fixes the bootstrap/feature sampling so results are reproducible.
forest = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    random_state=0,
)

In [None]:
# Train the random forest on the normalized training features
forest.fit(X_train_norm, y_train)

In [None]:
# Evaluate the random forest on the held-out test split

pred = forest.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it raises TypeError on current versions.
print(f"MAE, {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE, {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
# Same R^2 as forest.score(X_test_norm, y_test), without predicting a second time
print(f"R2 score, {r2_score(y_test, pred): .2f}")

### Adaptive Boosting

In [None]:
# AdaBoost ensemble of up to 100 boosted trees, each limited to depth 20.
# random_state fixes the per-round resampling so results are reproducible.
ada_reg = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=20),
    n_estimators=100,
    random_state=0,
)

In [None]:
# Train the AdaBoost ensemble on the normalized training features
ada_reg.fit(X_train_norm, y_train)

Evaluate the model

In [None]:
# Predict once on the held-out test split and reuse the predictions for every metric
pred = ada_reg.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it raises TypeError on current versions.
print(f"MAE, {mean_absolute_error(y_test, pred): .2f}")
print(f"RMSE, {np.sqrt(mean_squared_error(y_test, pred)): .2f}")
# Same R^2 as ada_reg.score(X_test_norm, y_test), without predicting a second time
print(f"R2 score, {r2_score(y_test, pred): .2f}")

### Decision Tree

In [None]:
# Single decision tree limited to depth 10.
# random_state makes tie-breaking between equally good splits deterministic,
# so the fitted tree (and its metrics) are reproducible.
tree = DecisionTreeRegressor(max_depth=10, random_state=0)

In [None]:
# Train the decision tree on the normalized training features
tree.fit(X_train_norm, y_train)

In [None]:
# Evaluate the decision tree on the held-out test split

y_pred_test_dt = tree.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it raises TypeError on current versions.
print(f"MAE, {mean_absolute_error(y_test, y_pred_test_dt): .2f}")
print(f"RMSE, {np.sqrt(mean_squared_error(y_test, y_pred_test_dt)): .2f}")
# Same R^2 as tree.score(X_test_norm, y_test), without predicting a second time
print(f"R2 score, {r2_score(y_test, y_pred_test_dt): .2f}")

In [None]:
from sklearn.tree import DecisionTreeRegressor, export_graphviz
import graphviz

# Fit a deliberately shallow tree (depth 2) so the rendered diagram stays readable.
# NOTE: this rebinds `tree`, replacing the depth-10 model evaluated above.
tree = DecisionTreeRegressor(max_depth=2)
tree.fit(X_train_norm, y_train)

# out_file=None makes export_graphviz return the DOT source as a string; with a file
# path it returns None, which left `dot_data` empty. Feature names are taken from
# X_train_norm, the frame the tree was fitted on (the previously referenced
# `X_train_norm_df` is never defined and raised NameError). A plain list is passed
# because some scikit-learn versions reject a pandas Index for `feature_names`.
dot_data = export_graphviz(
    tree,
    out_file=None,
    filled=True,
    rounded=True,
    feature_names=list(X_train_norm.columns),
)

# Still write tree.dot so the file produced by this cell is preserved;
# an explicit encoding avoids depending on the platform default.
with open("tree.dot", "w", encoding="utf-8") as f:
    f.write(dot_data)

# Render the tree inline as the cell output
graphviz.Source(dot_data)

In [None]:
### Gradient Boosting

In [None]:
# Gradient boosting with 100 stages of trees limited to depth 20.
# random_state makes the fitted trees (and the reported metrics) reproducible.
gb_reg = GradientBoostingRegressor(
    max_depth=20,
    n_estimators=100,
    random_state=0,
)


In [None]:
# Train the gradient boosting model on the normalized training features
gb_reg.fit(X_train_norm, y_train)

In [None]:
# Predict once on the held-out test split and reuse the predictions for every metric
pred = gb_reg.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order.
# RMSE is computed as sqrt(MSE): the `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it raises TypeError on current versions.
print("MAE", mean_absolute_error(y_test, pred))
print("RMSE", np.sqrt(mean_squared_error(y_test, pred)))
# Same R^2 as gb_reg.score(X_test_norm, y_test), without predicting a second time
print("R2 score", r2_score(y_test, pred))