In [19]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score

### 1. importing dataset
* separating out x and y
* adding monotonic constraints (+1 for every feature)

In [20]:
# Read the three training splits (in order) and stack them row-wise.
split_files = ['02 data split 0.csv', '02 data split 1.csv', '02 data split 2.csv']
s0, s1, s2 = map(pd.read_csv, split_files)
df = pd.concat([s0, s1, s2])
# Show (rows, columns) of the combined frame.
df.shape

(195, 12)

In [21]:
# Target column, and the feature frame with the target and the row id removed.
y = df['IsBadBuy']
x = df.drop(columns=['IsBadBuy', 'RefId'])
# One entry of 1 per feature column; used as the tree's `monotonic_cst` argument.
c = tuple(1 for _ in x.columns)
c

(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)

### 2. Decision tree (DT)
* identifying optimal hyper-parameters with a 3-fold cross-validated grid search
* performance on the training data

In [22]:
# Candidate values for the two tree-size hyper-parameters (4 x 4 combinations).
parameters = dict(
    max_depth=[2, 5, 10, 20],
    min_samples_leaf=[2, 5, 10, 20],
)
# Base estimator: constraints taken from `c`, fixed random seed.
model = DecisionTreeClassifier(random_state=0, monotonic_cst=c)
# 3-fold cross-validated search scored by ROC AUC, using 6 parallel jobs.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='roc_auc',
    cv=3,
    n_jobs=6,
)
# fit() returns the fitted search object itself, so the name is rebound to the same object.
grid_search = grid_search.fit(x, y)
# Best mean cross-validated score, then the parameter combination that achieved it.
print(grid_search.best_score_, grid_search.best_params_, sep='\n')

0.7696536796536798
{'max_depth': 10, 'min_samples_leaf': 2}


In [23]:
# Refit one tree on all training rows using the hyper-parameters picked by the
# grid search.
best_params = grid_search.best_params_
max_depth1 = best_params['max_depth']
min_samples_leaf1 = best_params['min_samples_leaf']
model = DecisionTreeClassifier(max_depth=max_depth1, min_samples_leaf=min_samples_leaf1, monotonic_cst=c, random_state=0)

model.fit(x, y)
print(x.shape)

# NOTE: the scores below are computed on the same rows the tree was fitted on,
# so they are in-sample figures; compare them with the cross-validated
# `grid_search.best_score_` rather than treating them as generalisation estimates.
pred1 = model.predict_proba(x)[:, 1]
a1 = roc_auc_score(y, pred1)
print('auc roc  :', np.round(a1, 3))

# Probability cut-off: rows scoring strictly above it are labelled 1, others 0.
threshold = 0.30
pred2 = (pred1 > threshold).astype(int).tolist()

# labels=[0, 1] pins the matrix to 2x2 even if one class never appears in
# `y` or `pred2`, so the cell indexing below cannot go out of range.
# (assumes IsBadBuy is coded 0/1 - TODO confirm against the data dictionary)
c1 = confusion_matrix(y, pred2, labels=[0, 1])
tn, fp, fn, tp = c1.ravel()

# Warnings are silenced globally in this notebook, so an unguarded 0/0 would
# quietly print "nan"; report 0.0 instead when a denominator is empty.
p = tp / (fp + tp) if (fp + tp) > 0 else 0.0   # precision
r = tp / (fn + tp) if (fn + tp) > 0 else 0.0   # recall
f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0.0
print('f1 score :', np.round(f1, 3))

(195, 10)
auc roc  : 0.831
f1 score : 0.639


### 3. holdout dataset
* providing predictions
* adding columns

In [24]:
# Load the holdout rows and score them with the fitted tree
# (column 1 of predict_proba is kept).
df = pd.read_csv('06 data holdout.csv')
pred1 = model.predict_proba(df)[:, 1]
# Label a row 1 when its score is strictly above 0.30, otherwise 0.
pred2 = [1 if score > 0.30 else 0 for score in pred1]

In [25]:
# Row identifiers: consecutive integers ending at 999, one per holdout row.
RefId = list(range(1000 - df.shape[0], 1000))
df['RefId'] = RefId
df['IsBadBuy'] = pred2
# Every holdout row is given the same vehicle cost.
df['VehBCost'] = 6500

# Map the 1 / not-1 indicator columns back to text labels.
df['Make'] = np.where(df['Make'] == 1, 'FORD', 'DODGE')
# np.where(cond, np.nan, 'Alloy') would promote the result to a string array and
# store the literal text 'nan'; mapping the boolean mask keeps a real missing
# value, which pandas recognises as NA and writes as an empty CSV field.
df['WheelType'] = df['WheelType'].eq(1).map({True: np.nan, False: 'Alloy'})
df['VNST'] = np.where(df['VNST'] == 1, 'VA', 'FL')

# Each price column is replaced by VehBCost divided by its current value, rounded
# to a whole number (presumably undoing a cost/price ratio feature - TODO confirm
# against the feature-engineering step).
price_columns = [
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
]
for price_col in price_columns:
    df[price_col] = np.round(df['VehBCost'] / df[price_col], 0)

# WarrantyCost is scaled by VehBCost; age and odometer are scaled by fixed factors.
df['WarrantyCost'] = np.round(df['VehBCost'] * df['WarrantyCost'], 0)
df['VehicleAge'] = np.round(df['VehicleAge_decades'] * 10, 0)
df['VehOdo'] = np.round(df['VehOdo_lakhs'] * 100000, 0)

# Keep only the output columns, in export order (this drops the *_decades / *_lakhs inputs).
df = df[['RefId', 'IsBadBuy', 'VehBCost', 'Make', 'WheelType', 'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
         'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'VNST', 'WarrantyCost', 'VehicleAge', 'VehOdo']]
df.shape

(98, 13)

### 4. exporting dataset
* there are 8 numerical variables
* there are 3 categorical variables
* there are 2 other variables: RefId and IsBadBuy

In [26]:
# Write the decoded holdout predictions to CSV without the index column.
output_path = '07 model holdout.csv'
df.to_csv(output_path, index=False)
# Show (rows, columns) of the exported frame.
df.shape

(98, 13)