In [540]:
import pandas as pd
import numpy as np


df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0'])

# Data Prepration

In [541]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['id', 'user_id', 'shift_id', 'target', 'createdAt', 'start_time', 'type_RN', 
                    'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san'], axis=1))
scaled_features = scaler.transform(df.drop(['id', 'user_id', 'shift_id', 'target', 'createdAt',
                                            'start_time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                                            'areaName_houston', 'areaName_no', 'areaName_dfw', 
                                            'areaName_austin', 'areaName_san'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'prev_CW x SA_rate', 'S_create2SA_Create', 
                                             'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now', 
                                             'U_approve2now', 'net_pay'])
# concat with dummy
df = pd.concat([df[['id', 'user_id', 'shift_id', 'target', 'createdAt',
                    'start_time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                    'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san']], X], axis = 1)

# drop nas
df.dropna(inplace = True)

### set future data point as realdata

### note !!!: real data might overlap with train test validation data

In [542]:
df['start_time'] = pd.to_datetime(df['start_time'])
realdata = df[df['start_time'].isin(pd.date_range('2021-3-22', '2021-3-29'))]

###  <font color = green> Validation set: 1000 recently records

In [543]:
# slice
validation = df[-1000:]

y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'createdAt', 'start_time', 'target'], axis = 1)

y_valid.value_counts()

0    984
1     16
Name: target, dtype: int64

### Train test: main dataset - validation set

In [544]:
df = df[:-1000] # slice 

In [545]:
# make a dataset that num of tar = num of non tar, use it for train test
import random
df_tar = df[df['target']==1].reset_index(drop = True)
df_nontar = df[df['target']==0].reset_index(drop = True)

number_of_tar = df_tar.shape[0]
random_indices = random.sample(range(len(df_nontar)), int(number_of_tar))
df_nontar = df_nontar[df_nontar.index.isin(random_indices)]

# concat
df = pd.concat([df_tar, df_nontar]).reset_index(drop = True)

In [546]:
X = df.drop(['id','user_id', 'shift_id', 'target', 'createdAt', 'start_time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Decision Trees

In [547]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

weights = {0:1.0, 1:2}
dtree = DecisionTreeClassifier(class_weight = weights)
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight={0: 1.0, 1: 2})

### Train Test result

In [478]:
from sklearn.metrics import classification_report, confusion_matrix
# predict
predictions = dtree.predict(X_test)

print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[790 530]
 [570 724]]


              precision    recall  f1-score   support

           0       0.58      0.60      0.59      1320
           1       0.58      0.56      0.57      1294

    accuracy                           0.58      2614
   macro avg       0.58      0.58      0.58      2614
weighted avg       0.58      0.58      0.58      2614



### <font color = green> Validation result

In [479]:
# predict
predictions = dtree.predict(x_valid)

print(confusion_matrix(y_valid,predictions))
print('\n')
print(classification_report(y_valid,predictions))

[[619 365]
 [  4  12]]


              precision    recall  f1-score   support

           0       0.99      0.63      0.77       984
           1       0.03      0.75      0.06        16

    accuracy                           0.63      1000
   macro avg       0.51      0.69      0.42      1000
weighted avg       0.98      0.63      0.76      1000



# Random Forests


In [480]:
# fit
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [481]:
### Train Test result

In [482]:
# predict
rfc_pred = rfc.predict(X_test)

print(confusion_matrix(y_test,rfc_pred))
print('\n')
print(classification_report(y_test,rfc_pred))

[[880 440]
 [461 833]]


              precision    recall  f1-score   support

           0       0.66      0.67      0.66      1320
           1       0.65      0.64      0.65      1294

    accuracy                           0.66      2614
   macro avg       0.66      0.66      0.66      2614
weighted avg       0.66      0.66      0.66      2614



### <font color = green> Validation result

In [483]:
# predict
predictions = rfc.predict(x_valid)

print(confusion_matrix(y_valid, predictions))
print('\n')
print(classification_report(y_valid, predictions))

[[600 384]
 [  3  13]]


              precision    recall  f1-score   support

           0       1.00      0.61      0.76       984
           1       0.03      0.81      0.06        16

    accuracy                           0.61      1000
   macro avg       0.51      0.71      0.41      1000
weighted avg       0.98      0.61      0.75      1000



# Random Forests with significant var by LR
 

###  <font color = green> Validation set

In [515]:
y_valid = validation['target']
x_valid = validation[['prev_CW/SA_rate', 'net_pay', 'SA_Create2Start_Time', 'type_RN', 
                      'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no',
                      'areaName_dfw', 'areaName_austin', 'areaName_san']]

### Train Test set

In [516]:
X = df[['prev_CW/SA_rate', 'net_pay', 'SA_Create2Start_Time', 'type_RN', 
        'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no',
        'areaName_dfw', 'areaName_austin', 'areaName_san']]
Y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

### Fit

In [506]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

### train test result

In [331]:
# predict
rfc_pred = rfc.predict(X_test)

print(confusion_matrix(y_test,rfc_pred))
print('\n')
print(classification_report(y_test,rfc_pred))

[[806 502]
 [506 800]]


              precision    recall  f1-score   support

           0       0.61      0.62      0.62      1308
           1       0.61      0.61      0.61      1306

    accuracy                           0.61      2614
   macro avg       0.61      0.61      0.61      2614
weighted avg       0.61      0.61      0.61      2614



### <font color = green> Validation result

In [332]:
# predict
predictions = rfc.predict(x_valid)
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_valid,predictions))
print('\n')
print(classification_report(y_valid,predictions))

[[571 413]
 [  2  14]]


              precision    recall  f1-score   support

           0       1.00      0.58      0.73       984
           1       0.03      0.88      0.06        16

    accuracy                           0.58      1000
   macro avg       0.51      0.73      0.40      1000
weighted avg       0.98      0.58      0.72      1000



In [313]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_valid,predictions), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')