In [None]:
# Import Libraries
import pandas as pd
import numpy as np

In [None]:
# Read the BMW CSV file into a DataFrame
csv_path = 'data/bmw.csv'
data = pd.read_csv(csv_path)

In [None]:
# Count the distinct values found in each column (missing values are not counted)
data.nunique(axis=0, dropna=True)

model             24
year              25
price           3777
transmission       3
mileage         8086
fuelType           5
tax               38
mpg              102
engineSize        17
dtype: int64

In [None]:
# Percentage of cells in each column that hold a "missing" placeholder,
# listed from the highest percentage to the lowest.
# Note: data.isna().sum() is a related check, but it returns counts and
# does not flag the empty-string / single-space placeholders listed here.
missing_values = [np.nan, "", " ", None]

missing_mask = data.isin(missing_values)
missing_mask.mean().sort_values(ascending=False) * 100

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

In [None]:
# The goal is to predict the price column:
# y holds the target, X holds every other column as features
y = data["price"]
X = data.drop("price", axis="columns")

In [None]:
# Feature Engineering
# Hand-made lookup that classifies every model name as a car type.
# The models are listed per type and then flattened into a
# model -> type dictionary.
models_by_type = {
    'sedan': ['5 Series', '7 Series', '3 Series'],
    'coupe': ['6 Series', '1 Series', '2 Series', '4 Series', '8 Series'],
    'suv': ['X3', 'X5', 'X4', 'X1', 'X2', 'X6', 'X7'],
    'electric': ['i3', 'i8'],
    'sports': ['M4', 'M5', 'M2', 'M3', 'M6'],
    'convertible': ['Z4', 'Z3'],
}
car_type = {
    model_name: type_name
    for type_name, model_names in models_by_type.items()
    for model_name in model_names
}

# Feature Engineering
# Attach the manually assembled car_type classification to every row.
# Leading/trailing whitespace is stripped from the model names first,
# then each cleaned name is looked up in the car_type dictionary.
stripped_models = X["model"].str.strip()
X["model"] = stripped_models
X["car_type"] = stripped_models.map(car_type)
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,car_type
0,5 Series,2014,Automatic,67068,Diesel,125,57.6,2.0,sedan
1,6 Series,2018,Automatic,14827,Petrol,145,42.8,2.0,coupe
2,5 Series,2016,Automatic,62794,Diesel,160,51.4,3.0,sedan
3,1 Series,2017,Automatic,26676,Diesel,145,72.4,1.5,coupe
4,7 Series,2014,Automatic,39554,Diesel,160,50.4,3.0,sedan
...,...,...,...,...,...,...,...,...,...
10776,X3,2016,Automatic,40818,Diesel,150,54.3,2.0,suv
10777,5 Series,2016,Automatic,42947,Diesel,125,60.1,2.0,sedan
10778,3 Series,2017,Manual,25468,Petrol,200,42.8,2.0,sedan
10779,1 Series,2014,Automatic,45000,Diesel,30,64.2,2.0,coupe


In [None]:
# Data Encoding
# One-hot encode the categorical columns; drop_first=True leaves out the
# first level of each one to avoid the Dummy Variable Trap.
# NOTE(review): car_type is mapped from model, so its dummy columns look
# collinear with the model dummies -- confirm that both are wanted.
X = pd.get_dummies(data=X, drop_first=True)
X

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_2 Series,model_3 Series,model_4 Series,model_5 Series,model_6 Series,...,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,car_type_coupe,car_type_electric,car_type_sedan,car_type_sports,car_type_suv
0,2014,67068,125,57.6,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2018,14827,145,42.8,2.0,0,0,0,0,1,...,0,0,0,0,1,1,0,0,0,0
2,2016,62794,160,51.4,3.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,2017,26676,145,72.4,1.5,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2014,39554,160,50.4,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,40818,150,54.3,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10777,2016,42947,125,60.1,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
10778,2017,25468,200,42.8,2.0,0,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
10779,2014,45000,30,64.2,2.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# MinMaxScaling
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with a MinMaxScaler and rebuild a DataFrame
# that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows, before the train/test
# split further down -- consider fitting it on the training rows only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
feature_names = X.columns

X = pd.DataFrame(scaled_values, columns=feature_names)
X

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_2 Series,model_3 Series,model_4 Series,model_5 Series,model_6 Series,...,transmission_Semi-Auto,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,car_type_coupe,car_type_electric,car_type_sedan,car_type_sports,car_type_suv
0,0.750000,0.313399,0.215517,0.111971,0.303030,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.916667,0.069281,0.250000,0.080163,0.303030,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.833333,0.293427,0.275862,0.098646,0.454545,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.875000,0.124650,0.250000,0.143778,0.227273,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.750000,0.184828,0.275862,0.096497,0.454545,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,0.833333,0.190735,0.258621,0.104879,0.303030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10777,0.833333,0.200683,0.215517,0.117344,0.303030,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10778,0.875000,0.119005,0.344828,0.080163,0.303030,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
10779,0.750000,0.210277,0.051724,0.126155,0.303030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing.
# random_state pins the shuffle so the split -- and every metric computed
# from it -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
### Linear Regression Algorithm Model ###

### Machine learning algorithm
from sklearn.linear_model import LinearRegression
### Evaluation metric
from sklearn.metrics import mean_absolute_error


# Build the model object and train it on the training split in one step
# (fit returns the fitted estimator itself)
linear_regressor = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out test rows
linear_predictions = linear_regressor.predict(X_test)

# Mean absolute error between the true and the predicted test prices
mean_absolute_error(y_true=y_test, y_pred=linear_predictions)

2729.8488173558085

In [None]:
# Mean absolute error expressed as a fraction of the mean test price
linear_mae = mean_absolute_error(y_test, linear_predictions)
linear_mae / y_test.mean()

0.11794887790512538

In [None]:
### Random Forest Algorithm Model ###

### Import our Machine Learning Algorithm
from sklearn.ensemble import RandomForestRegressor
### Import our metric
from sklearn.metrics import mean_absolute_error


# Create a model object with 1000 trees.
# random_state seeds the forest's randomness so that re-running the cell
# on the same split reproduces the same model and the same error.
random_forest_regressor = RandomForestRegressor(n_estimators=1000, random_state=42)

# Fit the object to our data (this is the training phase)
random_forest_regressor.fit(X_train, y_train)

# Create predictions for the held-out test rows with the trained model
random_forest_predictions = random_forest_regressor.predict(X_test)

# Measure the efficacy of the algorithm: mean absolute error on the test rows
mean_absolute_error(y_test, random_forest_predictions)

1556.794370878494

In [None]:
# Mean absolute error expressed as a fraction of the mean test price
random_forest_mae = mean_absolute_error(y_test, random_forest_predictions)
random_forest_mae / y_test.mean()

0.06726458549891215

In [None]:
### XGBoost Algorithm Model ###
! pip install xgboost

### Import our Machine Learning Algorithm
from xgboost import XGBRegressor
### Import our metric
from sklearn.metrics import mean_absolute_error


# Create a model object
boost_model = XGBRegressor()

# Fit the object to our data (this is the training phase)
boost_model.fit(X_train, y_train)

# Create predictions with your newly trained model
boost_predictions = boost_model.predict(X_test)

# Measure the efficacy of your algorithm using your metric
mean_absolute_error(y_test, boost_predictions)

Collecting xgboost
  Downloading xgboost-1.5.1-py3-none-manylinux2014_x86_64.whl (173.5 MB)
[K     |████████████████████████████████| 173.5 MB 22 kB/s 
Installing collected packages: xgboost
Successfully installed xgboost-1.5.1
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


1560.7555908746465

In [None]:
# Mean absolute error expressed as a fraction of the mean test price
boost_mae = mean_absolute_error(y_test, boost_predictions)
boost_mae / y_test.mean()

0.06743573836668673

In [None]:
### Hyperparameter Tuning ###
from sklearn.model_selection import GridSearchCV
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

# Number of trees in the forest
n_estimators = [1500, 1600]
# Number of features to consider at every split.
# 1.0 means "use all features", which is what 'auto' meant for a
# regressor; the 'auto' string itself is rejected by scikit-learn 1.3+.
max_features = [1.0]
# Maximum number of levels in tree
max_depth = [80, 90]
# Minimum number of samples required to split a node
min_samples_split = [5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]
# Method of selecting samples for training each tree
bootstrap = [True]
# Create the parameter grid (every combination of the lists above is a candidate)
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

{'n_estimators': [1500, 1600],
 'max_features': ['auto'],
 'max_depth': [80, 90],
 'min_samples_split': [5],
 'min_samples_leaf': [1],
 'bootstrap': [True]}

In [None]:
# Tune the Random Forest with an exhaustive grid search over random_grid
# First create the base model to tune (seeded so the search is repeatable)
rf = RandomForestRegressor(random_state=42)
# Grid search: try every parameter combination in random_grid using
# 2-fold cross validation, and use all available cores (n_jobs=-1)
rf_random = GridSearchCV(estimator=rf, param_grid=random_grid, cv=2, verbose=2, n_jobs=-1)
# Fit the grid search on the training split
rf_random.fit(X_train, y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


GridSearchCV(cv=2, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90],
                         'max_features': ['auto'], 'min_samples_leaf': [1],
                         'min_samples_split': [5],
                         'n_estimators': [1500, 1600]},
             verbose=2)

In [None]:
# Show the parameter combination that scored best in the grid search
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 1600}

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Train a Random Forest with the best parameters reported by the grid search.
# max_features=1.0 (use every feature) replaces the 'auto' string: it meant
# the same thing for a regressor but is rejected by scikit-learn 1.3+.
# random_state seeds the forest so the reported error is reproducible.
perfect_random_forest = RandomForestRegressor(
    n_estimators=1600,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features=1.0,
    max_depth=90,
    bootstrap=True,
    random_state=42,
)
perfect_random_forest.fit(X_train, y_train)

# Predict prices for the held-out test rows
perfect_random_forest_predictions = perfect_random_forest.predict(X_test)

# Mean absolute error of the tuned model on the test rows
mean_absolute_error(y_test, perfect_random_forest_predictions)

1546.1240275881644

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cc843f04-8de7-4926-8946-57fb094cb8ec' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>