## Import libraries

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive

from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import graphviz
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix,\
        accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,GradientBoostingClassifier,StackingClassifier,ExtraTreesClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB

from time import time
from datetime import timedelta

import seaborn as sns
from matplotlib import pyplot as plt

# Apply seaborn's default theme so every later matplotlib figure shares one style.
sns.set()


## Load dataset

In [None]:
# Mount Google Drive so the preprocessed CSV is reachable from the Colab runtime.
drive.mount('/content/drive')

# Location of the preprocessed dataset; the file is semicolon-separated.
DATASET_PATH = '/content/drive/MyDrive/ML/Machine Learning Project. GALINDO - PAGLIA/kickafterpreprocess.csv'
df = pd.read_csv(DATASET_PATH, sep=";")

Mounted at /content/drive


In [None]:
# List the column names of the loaded frame.
df.columns

Index(['IsBadBuy', 'Auction', 'VehicleAge', 'SubModel', 'Color',
       'Transmission', 'WheelType', 'VehOdo', 'Nationality', 'Size', 'VNST',
       'VehBCost', 'IsOnlineSale', 'WarrantyCost', 'season', 'age_cat',
       'VehOdo_cat', 'VehBCost_cat', 'WarrantyCost_cat'],
      dtype='object')

In [None]:
# Count missing values per column.
df.isna().sum()

IsBadBuy            0
Auction             0
VehicleAge          0
SubModel            0
Color               0
Transmission        0
WheelType           0
VehOdo              0
Nationality         0
Size                0
VNST                0
VehBCost            0
IsOnlineSale        0
WarrantyCost        0
season              0
age_cat             0
VehOdo_cat          0
VehBCost_cat        0
WarrantyCost_cat    0
dtype: int64

In [None]:
# Display the frame (the notebook rendering truncates the middle rows).
df

Unnamed: 0,IsBadBuy,Auction,VehicleAge,SubModel,Color,Transmission,WheelType,VehOdo,Nationality,Size,VNST,VehBCost,IsOnlineSale,WarrantyCost,season,age_cat,VehOdo_cat,VehBCost_cat,WarrantyCost_cat
0,0,ADESA,3,SEDAN,RED,AUTO,Alloy,89046,ASIAN,MEDIUM,EAST,7100.0,no,1113,winter,0to3_years,>82383,6710-7900,837-1155
1,0,ADESA,5,CAB,WHITE,AUTO,Alloy,93593,AMERICAN,TRUCK,EAST,7600.0,no,1053,winter,3to6_years,>82383,6710-7900,837-1155
2,0,ADESA,4,SEDAN,MAROON,AUTO,Covers,73807,AMERICAN,MEDIUM,EAST,4900.0,no,1389,winter,3to6_years,73322-82383,0-5440,1155-1623
3,0,ADESA,5,SEDAN,SILVER,AUTO,Alloy,65617,AMERICAN,COMPACT,EAST,4100.0,no,630,winter,3to6_years,61815-73322,0-5440,0-837
4,0,ADESA,4,COUPE,SILVER,MANUAL,Covers,69367,AMERICAN,COMPACT,EAST,4000.0,no,1020,winter,3to6_years,61815-73322,0-5440,837-1155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69727,0,ADESA,3,SEDAN,GREEN,AUTO,Covers,68127,AMERICAN,OTHER,EAST,4200.0,no,1215,fall,0to3_years,61815-73322,0-5440,1155-1623
69728,0,ADESA,7,CAB,GOLD,AUTO,Alloy,93744,AMERICAN,TRUCK,EAST,6200.0,no,1353,winter,6to9_years,>82383,5440-6710,1155-1623
69729,0,ADESA,2,SEDAN,SILVER,AUTO,Alloy,74407,ASIAN,MEDIUM,EAST,8200.0,no,803,winter,0to3_years,73322-82383,>7900,0-837
69730,0,ADESA,5,SUV,SILVER,AUTO,Alloy,82563,AMERICAN,SUV,EAST,7000.0,no,1243,winter,3to6_years,>82383,6710-7900,1155-1623


In [None]:
# Remove the raw numeric columns. The frame keeps the *_cat columns
# (age_cat, VehOdo_cat, VehBCost_cat, WarrantyCost_cat), which look like the
# binned versions of these values -- TODO confirm against the preprocessing step.
RAW_NUMERIC_COLUMNS = ['VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost']
df = df.drop(columns=RAW_NUMERIC_COLUMNS)
      

In [None]:
# One-hot encode every categorical column; each result is prefixed with the
# name of the column it was derived from.
def _one_hot(column):
    """Return the indicator columns for ``df[column]``, prefixed with ``column``."""
    return pd.get_dummies(df[column], prefix=column)


Auction_dummies = _one_hot('Auction')
SubModel_dummies = _one_hot('SubModel')
Color_dummies = _one_hot('Color')
Transmission_dummies = _one_hot('Transmission')
WheelType_dummies = _one_hot('WheelType')
Nationality_dummies = _one_hot('Nationality')
Size_dummies = _one_hot('Size')
VNST_dummies = _one_hot('VNST')
IsOnlineSale_dummies = _one_hot('IsOnlineSale')
season_dummies = _one_hot('season')
age_cat_dummies = _one_hot('age_cat')
VehOdo_cat_dummies = _one_hot('VehOdo_cat')
VehBCost_cat_dummies = _one_hot('VehBCost_cat')
WarrantyCost_cat_dummies = _one_hot('WarrantyCost_cat')

In [None]:
# Rebuild the modelling frame: the target column first, followed by every
# block of indicator columns, joined side by side.
# NOTE: this rebinds `df`, so the encoding cell cannot be re-run afterwards
# without reloading the data (the original categorical columns are gone).
encoded_parts = [
    df['IsBadBuy'],
    Auction_dummies,
    SubModel_dummies,
    Color_dummies,
    Transmission_dummies,
    WheelType_dummies,
    Nationality_dummies,
    Size_dummies,
    VNST_dummies,
    IsOnlineSale_dummies,
    season_dummies,
    age_cat_dummies,
    VehOdo_cat_dummies,
    VehBCost_cat_dummies,
    WarrantyCost_cat_dummies,
]
df = pd.concat(encoded_parts, axis='columns')
df.columns

Index(['IsBadBuy', 'Auction_ADESA', 'Auction_MANHEIM', 'Auction_OTHER',
       'SubModel_CAB', 'SubModel_COUPE', 'SubModel_CUV', 'SubModel_MINIVAN',
       'SubModel_OTHER', 'SubModel_PASSENGER', 'SubModel_SEDAN',
       'SubModel_SPORT', 'SubModel_SUV', 'SubModel_WAGON', 'Color_BEIGE',
       'Color_BLACK', 'Color_BLUE', 'Color_BROWN', 'Color_GOLD', 'Color_GREEN',
       'Color_GREY', 'Color_MAROON', 'Color_ORANGE', 'Color_OTHER',
       'Color_PURPLE', 'Color_RED', 'Color_SILVER', 'Color_WHITE',
       'Color_YELLOW', 'Transmission_AUTO', 'Transmission_MANUAL',
       'WheelType_Alloy', 'WheelType_Covers', 'WheelType_Special',
       'Nationality_AMERICAN', 'Nationality_ASIAN', 'Nationality_OTHER',
       'Size_COMPACT', 'Size_LARGE', 'Size_MEDIUM', 'Size_OTHER', 'Size_SUV',
       'Size_TRUCK', 'Size_VAN', 'VNST_CENTRAL', 'VNST_EAST', 'VNST_WEST',
       'IsOnlineSale_no', 'IsOnlineSale_yes', 'season_fall', 'season_spring',
       'season_summer', 'season_winter', 'age_cat_0to3_year

In [None]:
# Separate the target from the features, then carve out two hold-out sets:
# 20% of all rows for testing, and 20% of the remainder for validation.
# `train_test_split` is already imported in the notebook's first cell.
HOLDOUT_FRACTION = 0.20
SPLIT_SEED = 2022

y = df['IsBadBuy']
X = df.drop(columns='IsBadBuy')

X_learn, X_test, y_learn, y_test = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_learn, y_learn, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED
)

# Modelling

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
from sklearn.datasets import make_circles
from matplotlib.colors import Normalize


def plot_data(X, y):
    """Scatter-plot a 2-D data set, one colour per class.

    Parameters
    ----------
    X : array indexed as ``X[mask]`` and ``X[:, k]``; only columns 0 and 1
        are drawn (assumes a NumPy array with at least two columns).
    y : array of class labels aligned with the rows of ``X``.

    Draws on the current pylab figure and adds a legend and the title "Data".
    """
    classes = np.unique(y)
    # One rainbow colour per distinct label, spread evenly over the colormap.
    palette = pl.cm.rainbow(np.linspace(0.0, 1.0, classes.size))
    for cls, rgba in zip(classes, palette):
        members = X[y == cls]
        xs = members[:, 0]
        ys = members[:, 1]
        pl.scatter(xs, ys, c=rgba, alpha=0.5, edgecolor='k',
                   label="Class %s" % cls)
    pl.legend(loc="best")
    pl.title("Data")
    
def plot_decision_region(X, pred_fun):
    """Draw the decision surface of a classifier over the extent of ``X``.

    Parameters
    ----------
    X : array indexed as ``X[:, 0]`` and ``X[:, 1]``; only these two columns
        define the plotted region.
    pred_fun : callable ``pred_fun(x1, x2)`` returning a number for one point
        (for example the function built by ``gen_pred_fun``).

    The region is the bounding box of the data widened by 5% of the data
    range on every side, sampled on a 100 x 100 grid. The result is drawn as
    a filled contour plot with a colour bar on the current pylab figure.
    """
    x_lo, x_hi = np.min(X[:, 0]), np.max(X[:, 0])
    y_lo, y_hi = np.min(X[:, 1]), np.max(X[:, 1])
    # Compute each margin once from the raw data range. Deriving the upper
    # margin after the lower bound has been moved would make it larger than
    # the lower one.
    pad_x = (x_hi - x_lo) * 0.05
    pad_y = (y_hi - y_lo) * 0.05
    x_vals = np.linspace(x_lo - pad_x, x_hi + pad_x, 100)
    y_vals = np.linspace(y_lo - pad_y, y_hi + pad_y, 100)
    XX, YY = np.meshgrid(x_vals, y_vals)
    grid_r, grid_c = XX.shape
    ZZ = np.zeros((grid_r, grid_c))
    # pred_fun takes one point at a time, so evaluate the grid cell by cell.
    for i in range(grid_r):
        for j in range(grid_c):
            ZZ[i, j] = pred_fun(XX[i, j], YY[i, j])
    pl.contourf(XX, YY, ZZ, 100, cmap=pl.cm.coolwarm)
    pl.colorbar()
    pl.xlabel("x")
    pl.ylabel("y")
    
class MidpointNormalize(Normalize):
    """Colour normalisation that pins a chosen data value to the middle of the map.

    Values are mapped piecewise-linearly: ``vmin`` to 0, ``midpoint`` to 0.5
    and ``vmax`` to 1.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        # Remember the value that should land on 0.5; the base class stores
        # vmin, vmax and clip.
        self.midpoint = midpoint
        super().__init__(vmin=vmin, vmax=vmax, clip=clip)

    def __call__(self, value, clip=None):
        # `clip` is accepted for interface compatibility but is not used here.
        anchors = [self.vmin, self.midpoint, self.vmax]
        targets = [0, 0.5, 1]
        interpolated = np.interp(value, anchors, targets)
        return np.ma.masked_array(interpolated)
    
def gen_pred_fun(clf):
    """Wrap a fitted classifier as a function of two scalar coordinates.

    Parameters
    ----------
    clf : object with a ``predict`` method that accepts an array of shape (1, 2).

    Returns
    -------
    A function ``pred_fun(x1, x2)`` that returns the first element of
    ``clf.predict`` for the single point ``(x1, x2)``.
    """
    def pred_fun(x1, x2):
        # predict works on batches, so wrap the point as a one-row array.
        predictions = clf.predict(np.array([[x1, x2]]))
        return predictions[0]
    return pred_fun

def plot_labels(n_folds, n_classes, list_labels):
    """Draw a grouped bar chart of how many samples of each class each fold holds.

    Parameters
    ----------
    n_folds : number of folds; one group of bars is drawn per fold.
    n_classes : number of classes; labels are compared with ``0 .. n_classes-1``.
    list_labels : iterable with one label sequence per fold.

    Creates a new pylab figure with a legend and title (text is in Spanish).
    """
    positions = np.arange(n_folds)
    width = 0.15

    fold_arrays = [np.array(fold_labels) for fold_labels in list_labels]
    # heights_by_class[c][f] = number of samples of class c in fold f
    heights_by_class = [
        [np.count_nonzero(fold == cls) for fold in fold_arrays]
        for cls in range(n_classes)
    ]

    fig, ax = pl.subplots()
    for i, heights in enumerate(heights_by_class):
        # Shift each class's bars by one bar width so they sit side by side.
        ax.bar(positions + width*i, heights, width, label='Clase '+str(i))

    ax.set_xticks(positions + 2*width / 3)
    ax.set_xticklabels(['Pliegue {}'.format(k) for k in range(n_folds)])
    pl.legend(loc="best")
    pl.title("Etiquetas")

In [None]:
from sklearn.svm import LinearSVC

# Baseline: a linear SVM with default hyper-parameters, fitted on the
# training split. `fit` returns the estimator itself, so it can be chained.
linear_kick = LinearSVC().fit(X_train, y_train)
linear_kick

LinearSVC()

In [None]:
# `score` is the mean accuracy, so 1 - score is the misclassification rate.
linear_test_error = 1 - linear_kick.score(X_test, y_test)
linear_train_error = 1 - linear_kick.score(X_train, y_train)
print("Test error: {}".format(linear_test_error))
print("Training error: {}".format(linear_train_error))

Test error: 0.09392700939270093
Training error: 0.09706910459801021


In [None]:
from sklearn.svm import SVC

# Kernel SVM with a degree-2 polynomial kernel, fitted on the training split.
# Assigning the chained result keeps the cell output empty.
poly_svm = SVC(kernel='poly', degree=2).fit(X_train, y_train)

In [None]:
# Misclassification rate (1 - mean accuracy) of the polynomial SVM.
# NOTE(review): the recorded values equal the LinearSVC errors to every digit;
# possibly both models predict a single class -- confirm with a confusion matrix.
poly_train_error = 1 - poly_svm.score(X_train, y_train)
poly_test_error = 1 - poly_svm.score(X_test, y_test)
print("Training error: {}".format(poly_train_error))
print("Test error: {}".format(poly_test_error))

Training error: 0.09706910459801021
Test error: 0.09392700939270093


In [None]:
# Search grid: C and gamma both range over the powers of two 2**-10 ... 2**8
# in exponent steps of 3 (seven values each).
# NOTE(review): the outputs recorded below show [0.125, 1, 10, 20] and 16
# candidates, which this code does not produce -- re-run the search or
# reconcile the grid with the recorded results.
grid = list(range(-10, 9, 3))
powers_of_two = [2**exponent for exponent in grid]
param_grid = {'C': list(powers_of_two), 'gamma': list(powers_of_two)}

In [None]:
# Show the candidate values for C.
print(param_grid['C'])

[0.125, 1, 10, 20]


In [None]:
# Show the candidate values for gamma.
print(param_grid['gamma'])

[0.125, 1, 10, 20]


In [None]:
# Cross-validated grid search over C and gamma (GridSearchCV defaults to
# 5 folds). The RBF kernel is used because gamma only affects the 'rbf',
# 'poly' and 'sigmoid' kernels: with kernel='linear' it is ignored, so every
# gamma value refits the same model and yields identical scores.
# NOTE(review): SVC training time grows quickly with sample count; time the
# search on a subsample before running the full grid.
clf = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid, verbose=2)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ...............................C=0.125, gamma=0.125; total time=  19.6s
[CV] END ...............................C=0.125, gamma=0.125; total time=  19.7s
[CV] END ...............................C=0.125, gamma=0.125; total time=  22.5s
[CV] END ...............................C=0.125, gamma=0.125; total time=  20.8s
[CV] END ...............................C=0.125, gamma=0.125; total time=  20.7s
[CV] END ...................................C=0.125, gamma=1; total time=  19.5s
[CV] END ...................................C=0.125, gamma=1; total time=  20.8s
[CV] END ...................................C=0.125, gamma=1; total time=  20.3s
[CV] END ...................................C=0.125, gamma=1; total time=  20.7s
[CV] END ...................................C=0.125, gamma=1; total time=  19.7s
[CV] END ..................................C=0.125, gamma=10; total time=  20.9s
[CV] END ..................................C=0.1

GridSearchCV(estimator=SVC(kernel='linear'),
             param_grid={'C': [0.125, 1, 10, 20], 'gamma': [0.125, 1, 10, 20]},
             verbose=2)

In [None]:
# Tabulate the cross-validation results: one row per parameter combination.
cv_results = pd.DataFrame(clf.cv_results_)
cv_results


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,18.080738,1.01325,2.598407,0.027941,0.125,0.125,"{'C': 0.125, 'gamma': 0.125}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
1,17.605686,0.519328,2.594867,0.011416,0.125,1.0,"{'C': 0.125, 'gamma': 1}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
2,17.803986,0.458709,2.61394,0.019154,0.125,10.0,"{'C': 0.125, 'gamma': 10}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
3,17.805053,0.77003,2.594652,0.011102,0.125,20.0,"{'C': 0.125, 'gamma': 20}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
4,34.029064,1.698352,3.708099,0.003357,1.0,0.125,"{'C': 1, 'gamma': 0.125}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
5,34.591642,1.235484,3.689495,0.011512,1.0,1.0,"{'C': 1, 'gamma': 1}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
6,33.199963,3.109879,3.698997,0.013456,1.0,10.0,"{'C': 1, 'gamma': 10}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
7,33.875837,1.652756,3.686012,0.021399,1.0,20.0,"{'C': 1, 'gamma': 20}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
8,196.724868,15.085455,9.443523,0.509016,10.0,0.125,"{'C': 10, 'gamma': 0.125}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1
9,203.785163,3.705044,10.468474,1.55372,10.0,1.0,"{'C': 10, 'gamma': 1}",0.90298,0.902868,0.902868,0.902969,0.902969,0.902931,5.1e-05,1


In [None]:
# Arrange the mean CV scores as a matrix with one row per C value and one
# column per gamma value (the results list gamma varying fastest within each C).
n_C_values = len(param_grid['C'])
n_gamma_values = len(param_grid['gamma'])
mean_scores = clf.cv_results_['mean_test_score']
scores = mean_scores.reshape(n_C_values, n_gamma_values)

In [None]:
# Keep only the searched parameters and the mean CV score, then show the
# five best-scoring combinations.
SUMMARY_COLUMNS = ['param_C', 'param_gamma', 'mean_test_score']
cv_results = cv_results[SUMMARY_COLUMNS]
ranked_results = cv_results.sort_values(by='mean_test_score', ascending=False)
ranked_results.head()

Unnamed: 0,param_C,param_gamma,mean_test_score
0,0.125,0.125,0.902931
1,0.125,1.0,0.902931
2,0.125,10.0,0.902931
3,0.125,20.0,0.902931
4,1.0,0.125,0.902931


In [None]:
# Parameter combination ranked first by mean cross-validation score.
print(clf.best_params_)

{'C': 0.125, 'gamma': 0.125}


In [None]:
# Mean cross-validation score of the best parameter combination.
print(clf.best_score_)

0.9029308971179779
