In [1612]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0', 'distance'])

In [1613]:
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'S_create2SA_Create',
       'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now',
       'U_approve2now', 'prev_CW x SA_rate', 'type_RN', 'type_LVN+LPN',
       'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw',
       'areaName_austin', 'areaName_san', 'net_pay', 'target', 'createdAt',
       'Start_Time'],
      dtype='object')

# Data Prepration

In [1614]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['id', 'user_id', 'shift_id', 'target', 'createdAt', 'Start_Time', 'type_RN', 
                    'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san'], axis=1))
scaled_features = scaler.transform(df.drop(['id', 'user_id', 'shift_id', 'target', 'createdAt',
                                            'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                                            'areaName_houston', 'areaName_no', 'areaName_dfw', 
                                            'areaName_austin', 'areaName_san'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'prev_CW x SA_rate', 'S_create2SA_Create', 
                                             'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now', 
                                             'U_approve2now', 'net_pay'])
# concat with dummy
df = pd.concat([df[['id', 'user_id', 'shift_id', 'target', 'createdAt',
                    'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                    'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san']], X], axis = 1)

# drop nas
df.dropna(inplace = True)

### Slice df by the end of this week, for predcition output

In [1615]:
end_of_week = '2021-3-28'

# convert to datetime for conditonal selection
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# sort by start time -> for slicing
df = df.sort_values(by = 'Start_Time') 
# record as realdata
realdata = df[df['Start_Time'].apply(lambda x: x >= pd.to_datetime(end_of_week))]

In [1616]:
# record predction output rows, don't include it in tran test validation
realdata_len = realdata.shape[0]
realdata_len

588

###  <font color = green> Validation set: 1000 recently records

In [1617]:
# slice, dont include realdata
validation = df[-1000-realdata_len:-realdata_len]

y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'createdAt', 'Start_Time', 'target'], axis = 1)

y_valid.value_counts()

0    971
1     29
Name: target, dtype: int64

### Train test: main dataset - validation set

In [1618]:
df = df[:-1000-realdata_len] # slice 

In [1619]:
# # make a dataset that num of tar = num of non tar, use it for train test
# import random
# df_tar = df[df['target']==1].reset_index(drop = True)
# df_nontar = df[df['target']==0].reset_index(drop = True)

# number_of_tar = df_tar.shape[0]
# random_indices = random.sample(range(len(df_nontar)), int(number_of_tar))
# df_nontar = df_nontar[df_nontar.index.isin(random_indices)]

# # concat
# df = pd.concat([df_tar, df_nontar]).reset_index(drop = True)

In [1620]:
X = df.drop(['id','user_id', 'shift_id', 'target', 'createdAt', 'Start_Time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [1621]:
df['target'].value_counts()

0    61941
1     4343
Name: target, dtype: int64

# Logistic Regression 1  

In [1622]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# assign less punlishment for classifying 0 as 1 -> find more 1's
# weights = {0:1, 1:10}
# class_weight = 'balanced': automatically adjust weights inversely proportional to class frequencies in the input data
logit = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced')
logit.fit(X_train,y_train)

LogisticRegression(class_weight='balanced')

### Train Test result

In [1623]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = logit.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[11508  7044]
 [  524   810]]


              precision    recall  f1-score   support

           0       0.96      0.62      0.75     18552
           1       0.10      0.61      0.18      1334

    accuracy                           0.62     19886
   macro avg       0.53      0.61      0.46     19886
weighted avg       0.90      0.62      0.71     19886



In [1624]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [1625]:
# logit summary
import statsmodels.api as sm
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.232729
         Iterations 7


0,1,2,3
Dep. Variable:,target,No. Observations:,46398.0
Model:,Logit,Df Residuals:,46382.0
Method:,MLE,Df Model:,15.0
Date:,"Sun, 28 Mar 2021",Pseudo R-squ.:,0.03076
Time:,14:37:51,Log-Likelihood:,-10798.0
converged:,True,LL-Null:,-11141.0
Covariance Type:,nonrobust,LLR p-value:,2.482e-136

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
type_RN,-2.5262,0.175,-14.469,0.000,-2.868,-2.184
type_LVN+LPN,-1.0138,0.080,-12.610,0.000,-1.171,-0.856
segmentName_d,0.4880,0.100,4.880,0.000,0.292,0.684
areaName_houston,-2.9688,0.095,-31.243,0.000,-3.155,-2.783
areaName_no,-2.8289,0.103,-27.431,0.000,-3.031,-2.627
areaName_dfw,-2.9809,0.110,-27.024,0.000,-3.197,-2.765
areaName_austin,-2.7281,0.128,-21.319,0.000,-2.979,-2.477
areaName_san,-2.6444,0.140,-18.905,0.000,-2.919,-2.370
prev_CW/SA_rate,0.0752,0.014,5.344,0.000,0.048,0.103


### Overfitting? No

In [1626]:
y_pred = logit.predict(X_train)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[26966 16423]
 [ 1187  1822]]


              precision    recall  f1-score   support

           0       0.96      0.62      0.75     43389
           1       0.10      0.61      0.17      3009

    accuracy                           0.62     46398
   macro avg       0.53      0.61      0.46     46398
weighted avg       0.90      0.62      0.72     46398



### <font color = green> Validation result

In [1630]:
# test threshold
limiter = .7

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[945  26]
 [ 24   5]]


              precision    recall  f1-score   support

           0       0.98      0.97      0.97       971
           1       0.16      0.17      0.17        29

    accuracy                           0.95      1000
   macro avg       0.57      0.57      0.57      1000
weighted avg       0.95      0.95      0.95      1000



# Logistic Regression 2 Not good

###  <font color = green> Validation set

In [1584]:
y_valid = validation['target']
x_valid = validation[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 'areaName_dfw',
                      'areaName_austin', 'areaName_san', 'net_pay',]]

### Train Test set

In [1585]:
X = df[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 
        'areaName_dfw','areaName_austin', 'areaName_san', 'net_pay',]]
Y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

# Fit real data in this model

In [1631]:
# set input
real_X = realdata.drop(['id','user_id', 'shift_id', 'target', 'createdAt', 'Start_Time'], axis = 1)

In [1632]:
# concat predicted prob with data
realdata['prob'] = list(logit.predict_proba(real_X)[:,1])

In [1633]:
# record when this prediction is ran
from datetime import date
time = str(date.today().year) + '-' + str(date.today().month) + '-' + str(date.today().day)

realdata.to_csv('pred_{}.csv'.format(time), index = False)

# Check real data in this model

In [1362]:
thisweek = pd.read_csv('download.csv')

In [1363]:
realdata_test = realdata[['id', 'prob']]
thisweek_validation = thisweek.merge(realdata_test, on = 'id', how = 'left')
thisweek_validation.dropna(subset=['prob'],inplace = True)

In [1364]:
thisweek_validation[['prob']].isna().sum()

prob    0
dtype: int64

In [1365]:
thisweek_validation

def CW_by_nurse(row):
    if row['status']=='withdrawn' and row['prevStatus'] == 'confirmed':
        if row['withdrawnInfo_value'] == 'nurse':
            if row['CW_Time2Start_Time'] < 0 and row['CW_Time2Start_Time'] >= -24:
                return 1
            else:
                return 0
        else:
            return 0
    else:
        return 0
    
thisweek_validation['target'] = thisweek_validation.apply (lambda row: CW_by_nurse(row), axis=1)

In [1366]:
thisweek_validation = thisweek_validation[['id', 'prob', 'target', 'net_pay', 'type', 'segmentName', 'areaName']]

# thisweek_validation.to_csv('week322_check.csv', index = False)

In [1367]:
limiter = .5


y_prob = thisweek_validation['prob']
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count += 1
    else:
        y_pred.append(0)
        
print(confusion_matrix(thisweek_validation['target'], y_pred))
print('\n')
print(classification_report(thisweek_validation['target'], y_pred))

[[328 762]
 [  3  17]]


              precision    recall  f1-score   support

           0       0.99      0.30      0.46      1090
           1       0.02      0.85      0.04        20

    accuracy                           0.31      1110
   macro avg       0.51      0.58      0.25      1110
weighted avg       0.97      0.31      0.45      1110



In [1296]:
thisweek_validation.groupby("target")['prob'].mean()

target
0    0.547651
1    0.606011
Name: prob, dtype: float64