# Modeling

In [None]:
import pickle
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, validation_curve
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.decomposition import PCA

In [None]:
# Load the pickled train and test frames from the working directory.
# Context managers make sure both file handles are closed again right after
# loading (the bare open() calls used before were never closed explicitly).
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that were written by a trusted step of this project.
with open('df_train_prep.p', 'rb') as train_file:
    df_train = pickle.load(train_file)

with open('df_test_prep.p', 'rb') as test_file:
    df_test = pickle.load(test_file)

In [None]:
# training set: "IsBadBuy" is the target vector, every other column is a feature
target_train = df_train["IsBadBuy"]
features_train = df_train.drop(columns="IsBadBuy")

In [None]:
# test set: "IsBadBuy" is the target vector, every other column is a feature
target_test = df_test["IsBadBuy"]
features_test = df_test.drop(columns="IsBadBuy")

In [None]:
# print a summary of the training feature matrix before modeling
features_train.info()

In [None]:
# Chance-level baseline: DummyClassifier with strategy='uniform' draws its
# predictions uniformly at random (seeded for reproducibility).
model_dummy = DummyClassifier(random_state=42, strategy='uniform')

# mean F1 over 3 cross-validation folds on the training data
dummy_f1_scores = cross_val_score(model_dummy,
                                  features_train,
                                  target_train,
                                  scoring='f1',
                                  cv=3)
dummy_f1_scores.mean()

In [None]:
# number of distinct values in each categorical feature
features_train.select_dtypes(include='category').nunique()

In [None]:
# distinct values found in the Transmission column
features_train['Transmission'].unique()

In [None]:
# Encode Transmission as 0 (AUTO) / 1 (MANUAL) and store it as a categorical
# column, using the same mapping for the train and the test features.
#
# Plain column assignment is used on purpose: since pandas 2.0,
# `df.loc[:, col] = values` first tries to write into the existing column in
# place, so the dtype produced by replace()/astype('category') is not
# guaranteed to be the dtype the column ends up with.
transmission_codes = {'AUTO': 0, 'MANUAL': 1}

for features in (features_train, features_test):
    features['Transmission'] = (features['Transmission']
                                .replace(transmission_codes)
                                .astype('category'))

In [None]:
# names of all categorical feature columns
features_train.select_dtypes(include='category').columns

In [None]:
# columns handed to the one-hot encoder
ohe_cols = [
    'Auction',
    'Make',
    'Trim',
    'Color',
    'WheelType',
    'Nationality',
    'Size',
    'TopThreeAmericanName',
    'BYRNO',
    'VNST',
]

In [None]:
# names of all numeric feature columns
features_train.select_dtypes(include='number').columns

In [None]:
# hand-picked numeric feature columns
num_cols = [
    # vehicle age and odometer reading
    'VehicleAge', 'VehOdo',
    # MMR prices at acquisition (column names spelled exactly as in the data)
    'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
    # current MMR prices
    'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice',
    # costs and purchase year
    'VehBCost', 'WarrantyCost', 'PurchYear',
]

In [None]:
# Collect the names of all categorical feature columns.
# They are stored under `cat_cols`; assigning them to `num_cols` (as before)
# silently replaced the hand-picked numeric column list with category columns.
cat_cols = features_train.select_dtypes('category').columns

In [None]:
# One-hot encode the categorical columns listed in ohe_cols; categories not
# seen during fit are ignored at transform time. All remaining columns are
# passed through unchanged, and sparse_threshold=0 forces a dense output.
col_transformer = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(handle_unknown="ignore"), ohe_cols),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

# Standalone standardiser: the pipelines that include it apply it to the
# complete output of col_transformer, not only to the MMR price columns.
scaler = StandardScaler()

In [None]:
# baseline model: decision tree
from sklearn.tree import DecisionTreeClassifier

# class_weight="balanced" re-weights the classes inversely to their frequency
model_dt = DecisionTreeClassifier(random_state=42, class_weight="balanced")

# encode the features, then fit the tree
model_pipe_dt = Pipeline(steps=[
    ("col_encoder", col_transformer),
    ("model", model_dt),
])

# hyperparameter tuning: tree depths 1 to 4, scored by F1 over 3 folds
search_grid_dt = {"model__max_depth": range(1, 5)}

grid_model_dt = GridSearchCV(model_pipe_dt,
                             search_grid_dt,
                             scoring="f1",
                             cv=3)
grid_model_dt.fit(features_train, target_train)

# best pipeline and its mean cross-validated F1
print(grid_model_dt.best_estimator_, grid_model_dt.best_score_, sep="\n")

In [None]:
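# TODO: this draft would not run as written. grid_model_dt is a GridSearchCV and
# cannot be indexed, so the fitted tree has to be taken from
# grid_model_dt.best_estimator_["model"]. Its importances also refer to the
# columns produced by col_transformer, not to features_train.columns.
# imp_series = pd.Series(data=grid_model_dt["model"].feature_importances_, index=features_train.columns)
# imp_series.sort_values()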

In [None]:
# feature engineering

In [None]:
# feature importances and feature selection

In [None]:
# naive bayes
from sklearn.naive_bayes import GaussianNB

model_nb = GaussianNB()

# encode -> standardise -> Gaussian naive Bayes
pipe_nb = Pipeline(steps=[
    ("col_encoder", col_transformer),
    ("scaler", scaler),
    ("model", model_nb),
])

# mean F1 over 3 cross-validation folds, using all available cores
nb_f1_scores = cross_val_score(pipe_nb,
                               features_train,
                               target_train,
                               scoring='f1',
                               cv=3,
                               n_jobs=-1)
nb_f1_scores.mean()

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression

# max_iter must be an integer: scikit-learn documents it as int and recent
# versions reject a float such as 1e4 during parameter validation.
model_lr = LogisticRegression(class_weight="balanced",
                              random_state=42,
                              max_iter=10000)

# encode -> standardise -> logistic regression
model_pipe_lr = Pipeline([("col_encoder", col_transformer),
                          ("scaler", scaler),
                          ("model", model_lr)])

# tune the inverse regularisation strength C, scored by F1 over 3 folds
search_grid_lr = {"model__C": [80, 90, 100, 110]}

grid_model_lr = GridSearchCV(estimator=model_pipe_lr,
                             param_grid=search_grid_lr,
                             cv=3,
                             scoring="f1")

grid_model_lr.fit(features_train, target_train)

# best pipeline and its mean cross-validated F1
print(grid_model_lr.best_estimator_)
print(grid_model_lr.best_score_)

In [None]:
# knn
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier()

# encode -> standardise -> k-nearest neighbours
model_pipe_knn = Pipeline(steps=[
    ("col_encoder", col_transformer),
    ("scaler", scaler),
    ("model", model_knn),
])

# hyperparameter tuning: 1 to 10 neighbours with uniform or distance-based
# vote weights, scored by F1 over 3 folds
search_grid_knn = {
    "model__n_neighbors": range(1, 11),
    "model__weights": ['uniform', 'distance'],
}

grid_model_knn = GridSearchCV(model_pipe_knn,
                              search_grid_knn,
                              scoring="f1",
                              cv=3)
grid_model_knn.fit(features_train, target_train)

# best pipeline and its mean cross-validated F1
print(grid_model_knn.best_estimator_, grid_model_knn.best_score_, sep="\n")

In [None]:
# svm
from sklearn.svm import SVC

# max_iter caps the solver iterations; it must be an integer because
# scikit-learn documents it as int and recent versions reject a float such
# as 1e4 during parameter validation.
model_svm = SVC(class_weight="balanced",
                random_state=42,
                max_iter=10000)

# encode -> standardise -> SVM
# (the pipeline keeps its original name although the grid covers three kernels)
model_pipe_svm_poly = Pipeline([("col_encoder", col_transformer),
                               ("scaler", scaler),
                               ("model", model_svm)])

# 'rbf' is the radial basis function kernel; the former value 'rgb' is not a
# kernel SVC knows, so every grid candidate using it failed to fit.
search_grid_svm = {"model__C": [0.01, 0.1, 1, 10, 100],
                   "model__kernel": ['linear', 'poly', 'rbf']}

grid_model_svm = GridSearchCV(estimator=model_pipe_svm_poly,
                              param_grid=search_grid_svm,
                              cv=3,
                              scoring="f1")

grid_model_svm.fit(features_train, target_train)

# best pipeline and its mean cross-validated F1
print(grid_model_svm.best_estimator_)
print(grid_model_svm.best_score_)

In [None]:
# random forest
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(random_state=42, class_weight="balanced")

# encode the features, then fit the forest
model_pipe_rf = Pipeline(steps=[
    ("col_encoder", col_transformer),
    ("model", model_rf),
])

# hyperparameter tuning: forest size and tree depth, scored by F1 over 3 folds
search_grid_rf = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": range(1, 5),
}

grid_model_rf = GridSearchCV(model_pipe_rf,
                             search_grid_rf,
                             scoring="f1",
                             cv=3)
grid_model_rf.fit(features_train, target_train)

# best pipeline and its mean cross-validated F1
print(grid_model_rf.best_estimator_, grid_model_rf.best_score_, sep="\n")

In [None]:
# feed-forward ANN

Model selection

In [None]:
# model interpretation

In [None]:
# predictions on aim data