In [295]:
import pandas as pd
import numpy as np
# Load the modelling table and discard the 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv call -- TODO confirm).
df = (
    pd.read_csv('model_data.csv')
      .drop(columns=['Unnamed: 0'])
)

# Data Preparation

In [296]:
# Standardize the continuous features only. The dummy columns, the label and the
# two time columns are carried through unscaled.
from sklearn.preprocessing import StandardScaler

# Columns that must NOT be scaled, in the order they are appended after the scaled block.
passthrough_cols = ['segmentName_d', 'type_d', 'target', 'createdAt', 'start_time']

# Labels for the scaled block (must match the order of the remaining columns in df).
scaled_cols = ['prev_CW/SA_rate', 'S_create2SA_Create', 'S_Create2Start_Time',
               'SA_Create2Start_Time', 'U_create2now', 'U_approve2now', 'net_pay']

# Select the columns to scale once instead of repeating the drop for fit and transform.
to_scale = df.drop(columns=passthrough_cols)

# NOTE(review): the scaler is fit on every row, including the rows that are later
# held out for validation. Tree-based models are insensitive to per-feature scaling,
# but fit on the training rows only if a scale-sensitive model is added.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(to_scale)

# scaled features -- keep df's own index so the concat below aligns row-for-row
# even if df does not carry a default 0..n-1 index.
X = pd.DataFrame(scaled_features, columns=scaled_cols, index=df.index)

# concat with the unscaled columns
df = pd.concat([X, df[passthrough_cols]], axis=1)

# drop rows with any missing value (rebinding instead of mutating in place)
df = df.dropna()

###  <font color = green> Validation set: the 1000 most recent records</font>

In [297]:
# Hold out the final 1000 rows of the prepared frame as the validation set
# (assumes the rows are ordered oldest to newest -- TODO confirm).
validation = df.iloc[-1000:]

# Features are every column except 'createdAt', 'start_time' and the label.
x_valid = validation.drop(columns=['createdAt', 'start_time', 'target'])
y_valid = validation['target']

# Class counts in the hold-out.
y_valid.value_counts()

0    969
1     31
Name: target, dtype: int64

### Train/test set: main dataset minus the validation set

In [298]:
# Keep everything except the last 1000 rows (the validation hold-out) for train/test.
# NOTE: this rebinds `df`, so re-running the cell removes another 1000 rows.
df = df.iloc[:-1000]

In [299]:
# Build a class-balanced frame for train/test by undersampling the non-target rows
# down to the number of target rows.
import random

BALANCE_SEED = 42  # fixed seed so the same non-target rows are drawn on every run

df_tar = df[df['target'] == 1].reset_index(drop=True)
df_nontar = df[df['target'] == 0].reset_index(drop=True)

number_of_tar = len(df_tar)

# A dedicated generator makes the draw reproducible without touching the global
# `random` state. Raises ValueError if there are fewer non-target than target rows.
rng = random.Random(BALANCE_SEED)
random_indices = rng.sample(range(len(df_nontar)), number_of_tar)

# The index was reset above, so labels equal positions; sorting the drawn positions
# keeps the selected rows in their original order.
df_nontar = df_nontar.iloc[sorted(random_indices)]

# concat
df = pd.concat([df_tar, df_nontar]).reset_index(drop=True)

In [300]:
# Feature matrix and label vector from the balanced frame; 'createdAt' and
# 'start_time' are excluded from the predictors.
X = df.drop(columns=['target', 'createdAt', 'start_time'])
y = df['target']

# 70/30 train/test split. A fixed random_state makes the partition (and therefore
# every metric reported below) reproducible across kernel restarts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Decision Trees

In [301]:
from sklearn.tree import DecisionTreeClassifier

# Default-parameter tree. random_state is pinned because the estimator permutes
# features at each split, so an unseeded fit can differ from run to run.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

DecisionTreeClassifier()

### Train Test result

In [302]:
from sklearn.metrics import classification_report, confusion_matrix

# Decision-tree predictions on the 30% test split.
predictions = dtree.predict(X_test)

# Emit the confusion matrix, a blank gap, then the per-class report in one print call.
print(
    confusion_matrix(y_test, predictions),
    '\n',
    classification_report(y_test, predictions),
    sep='\n',
)

[[760 538]
 [559 747]]


              precision    recall  f1-score   support

           0       0.58      0.59      0.58      1298
           1       0.58      0.57      0.58      1306

    accuracy                           0.58      2604
   macro avg       0.58      0.58      0.58      2604
weighted avg       0.58      0.58      0.58      2604



### <font color = green> Validation result</font>

In [303]:
# Decision-tree predictions on the 1000-row validation hold-out
# (which keeps its original, unbalanced class mix).
predictions = dtree.predict(x_valid)

# Confusion matrix, a blank gap, then the per-class report.
print(
    confusion_matrix(y_valid, predictions),
    '\n',
    classification_report(y_valid, predictions),
    sep='\n',
)

[[558 411]
 [ 10  21]]


              precision    recall  f1-score   support

           0       0.98      0.58      0.73       969
           1       0.05      0.68      0.09        31

    accuracy                           0.58      1000
   macro avg       0.52      0.63      0.41      1000
weighted avg       0.95      0.58      0.71      1000



# Random Forests


In [304]:
# Fit a 100-tree random forest on all features.
from sklearn.ensemble import RandomForestClassifier

# random_state pins the bootstrap samples and per-split feature draws so the
# fitted forest (and its reported metrics) can be reproduced.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [305]:
### Train Test result

In [306]:
# Random-forest predictions on the 30% test split.
rfc_pred = rfc.predict(X_test)

# Confusion matrix, a blank gap, then the per-class report.
print(
    confusion_matrix(y_test, rfc_pred),
    '\n',
    classification_report(y_test, rfc_pred),
    sep='\n',
)

[[844 454]
 [478 828]]


              precision    recall  f1-score   support

           0       0.64      0.65      0.64      1298
           1       0.65      0.63      0.64      1306

    accuracy                           0.64      2604
   macro avg       0.64      0.64      0.64      2604
weighted avg       0.64      0.64      0.64      2604



### <font color = green> Validation result</font>

In [307]:
# Random-forest predictions on the validation hold-out.
predictions = rfc.predict(x_valid)

# Confusion matrix, a blank gap, then the per-class report.
print(
    confusion_matrix(y_valid, predictions),
    '\n',
    classification_report(y_valid, predictions),
    sep='\n',
)

[[611 358]
 [  8  23]]


              precision    recall  f1-score   support

           0       0.99      0.63      0.77       969
           1       0.06      0.74      0.11        31

    accuracy                           0.63      1000
   macro avg       0.52      0.69      0.44      1000
weighted avg       0.96      0.63      0.75      1000



# Random Forests using the variables found significant by LR
 

###  <font color = green> Validation set</font>

In [308]:
# Rebuild the validation inputs restricted to the four selected predictors;
# the label vector is unchanged.
x_valid = validation.loc[:, ['prev_CW/SA_rate', 'type_d', 'segmentName_d', 'net_pay']]
y_valid = validation.loc[:, 'target']

# Class counts in the hold-out.
y_valid.value_counts()

0    969
1     31
Name: target, dtype: int64

### Train Test set

In [309]:
# Restrict the balanced frame to the four selected predictors and re-split 70/30.
X = df[['prev_CW/SA_rate', 'type_d', 'segmentName_d', 'net_pay']]
Y = df['target']

# A fixed random_state makes this partition reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

### Fit

In [310]:
from sklearn.ensemble import RandomForestClassifier

# Refit a 100-tree forest on the reduced feature set. random_state pins the
# bootstrap samples and per-split feature draws so the result is reproducible.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

RandomForestClassifier()

### train test result

In [311]:
# Reduced-feature forest: predictions on the 30% test split.
rfc_pred = rfc.predict(X_test)

# Confusion matrix, a blank gap, then the per-class report.
print(
    confusion_matrix(y_test, rfc_pred),
    '\n',
    classification_report(y_test, rfc_pred),
    sep='\n',
)

[[843 500]
 [500 761]]


              precision    recall  f1-score   support

           0       0.63      0.63      0.63      1343
           1       0.60      0.60      0.60      1261

    accuracy                           0.62      2604
   macro avg       0.62      0.62      0.62      2604
weighted avg       0.62      0.62      0.62      2604



### <font color = green> Validation result</font>

In [312]:
from sklearn.metrics import classification_report, confusion_matrix

# Reduced-feature forest: predictions on the validation hold-out.
predictions = rfc.predict(x_valid)

# Confusion matrix, a blank gap, then the per-class report.
print(
    confusion_matrix(y_valid, predictions),
    '\n',
    classification_report(y_valid, predictions),
    sep='\n',
)

[[595 374]
 [  8  23]]


              precision    recall  f1-score   support

           0       0.99      0.61      0.76       969
           1       0.06      0.74      0.11        31

    accuracy                           0.62      1000
   macro avg       0.52      0.68      0.43      1000
weighted avg       0.96      0.62      0.74      1000



In [313]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_valid,predictions), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')