# 7. Putting it all together

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv("../resources/car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
print(data.dtypes)
print(data.isna().sum())

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object
Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64


### Steps (all in one cell):
1. Fill missing data
2. Convert data to numbers
3. Build ML option

In [4]:
# set the seed
np.random.seed(42)

data = data.dropna(subset=["Price"])

# Change Colour and Make to numbers

categorical_features = ["Make", "Colour"]
categorical_trasformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# Clean up the doors column
door_feature = ["Doors"]
door_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=4))
    ]
)

# clean up the odometer columns
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean"))
    ]
)

# Combine previous pipelines into one pre-processing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_trasformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("numeric", numeric_transformer, numeric_features)
    ]
)

# Creating a pre-processing and modelling pipeline
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor())
    ]
)

# Split the data

X = data.drop("Price", axis=1)
y = data["Price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2)

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.20234207803323734

### Adding `GridSearchCV` or `RandomisedCV` with our `Pipeline` and improving our model

In [7]:
pipe_grid = {
    "preprocessor__numeric__imputer__strategy": ["mean", "median"],
    # preprocessor is for the step, __num__ accesses the numerical part, then strategy is changing the strategy for the Imputer
    "model__n_estimators": [100, 500, 1000],
    # model is for the step, __n_estimators is for the n_estimators hyperparameter feature of the model
    "model__max_depth": [None, 5],
    # model is for the step, __max_depth is for the max_depth hyperparameter feature of the model
    "model__max_features": ["auto"],
    # model is for the step, __max_features is for the max_featrures hyperparameter feature of the model
    "model__min_samples_split": [2, 4]
    # model is for the step, __min_samples_split is for the min_samples_split hyperparameter feature of the model
}

gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)
gs_model.score(X_test, y_test)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=a

[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numeric__imputer__strategy=median; total time=   1.5s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numeric__imputer__strategy=median; total time=   1.4s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numeric__imputer__strategy=median; total time=   1.9s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=500, preprocessor__numeric__imputer__strategy=median; total time=   1.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=1000, preprocessor__numeric__imputer__strategy=mean; total time=   3.8s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=4, model__n_estimators=

[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__numeric__imputer__strategy=mean; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__numeric__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__numeric__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__numeric__imputer__strategy=median; total time=   0.2s
[CV] END model__max_depth=5, model__max_features=auto, model__min_samples_split=4, model__n_estimators=100, preprocessor__nu

0.26910411377662813