In [1828]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0', 'distance'])

In [1829]:
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'status',
       'S_create2SA_Create', 'S_Create2Start_Time', 'SA_Create2Start_Time',
       'U_create2now', 'U_approve2now', 'prev_CW x SA_rate', 'type_RN',
       'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no',
       'areaName_dfw', 'areaName_austin', 'areaName_san', 'net_pay', 'target',
       'createdAt', 'Start_Time'],
      dtype='object')

# Data Prepration

In [1830]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['id', 'user_id', 'shift_id', 'status', 'target', 'createdAt', 'Start_Time', 'type_RN', 
                    'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san'], axis=1))
scaled_features = scaler.transform(df.drop(['id', 'user_id', 'shift_id', 'status', 'target', 'createdAt',
                                            'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                                            'areaName_houston', 'areaName_no', 'areaName_dfw', 
                                            'areaName_austin', 'areaName_san'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'prev_CW x SA_rate', 'S_create2SA_Create', 
                                             'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now', 
                                             'U_approve2now', 'net_pay'])
# concat with dummy
df = pd.concat([df[['id', 'user_id', 'shift_id', 'status', 'target', 'createdAt',
                    'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                    'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san']], X], axis = 1)

# drop nas
df.dropna(inplace = True)

### Slice df by the end of this week, for predcition output

In [1831]:
end_of_week = '2021-3-29'

# convert to datetime for conditonal selection
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# sort by start time -> for slicing
df = df.sort_values(by = 'Start_Time') 
# record as realdata
realdata = df[df['Start_Time'].apply(lambda x: x >= pd.to_datetime(end_of_week))]
# record predction output rows, don't include it in tran test validation
realdata_len = realdata.shape[0]
# only keep status = confirmed
realdata = realdata[realdata['status'] == 'confirmed']

In [1832]:
realdata_len

0

In [1834]:
df

Unnamed: 0,id,user_id,shift_id,status,target,createdAt,Start_Time,type_RN,type_LVN+LPN,segmentName_d,...,areaName_austin,areaName_san,prev_CW/SA_rate,prev_CW x SA_rate,S_create2SA_Create,S_Create2Start_Time,SA_Create2Start_Time,U_create2now,U_approve2now,net_pay
214,29316,1721,12515,confirmed,0,2019-09-12 20:00:22.553699,2019-09-14 06:00:00,0,0,1,...,0,0,-0.536648,-0.415223,-0.855313,-0.796289,1.529298,1.165478,-0.363432,-0.988077
170,29160,4675,12488,confirmed,0,2019-09-11 21:12:59.058392,2019-09-14 06:00:00,0,0,1,...,0,0,-0.536648,-0.553756,-0.845526,-0.654632,0.986908,1.257228,-0.363432,-0.770445
88,28864,531,12299,withdrawn,0,2019-09-10 15:00:22.918030,2019-09-14 06:45:00,1,0,0,...,0,0,-0.536648,-0.487788,-0.661755,-0.462192,2.234790,1.208485,-0.363432,2.167580
87,28864,531,12299,withdrawn,0,2019-09-10 15:00:22.918030,2019-09-14 06:45:00,1,0,0,...,0,0,-0.536648,-0.487788,-0.661755,-0.462192,2.234790,1.208485,-0.363432,2.167580
2,26912,4733,11310,confirmed,0,2019-08-27 19:16:40.669964,2019-09-14 07:00:00,0,0,1,...,0,0,-0.536648,-0.534947,0.798190,1.595090,0.973140,1.177310,-0.363432,-0.661630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65452,170702,15697,124295,confirmed,0,2021-03-22 16:33:39.674771,2021-03-28 22:30:00,0,0,1,...,0,1,0.003954,-0.553019,-0.425221,-0.075601,-1.506883,-1.541047,-0.338120,-0.552814
65659,171179,6723,125251,withdrawn,0,2021-03-23 23:39:12.320638,2021-03-28 23:00:00,0,0,1,...,0,0,0.177157,-0.538898,-0.558124,-0.271970,0.596293,0.795047,0.863483,0.100081
65660,171179,6723,125251,withdrawn,0,2021-03-23 23:39:12.320638,2021-03-28 23:00:00,0,0,1,...,0,0,0.173709,-0.538898,-0.558124,-0.271970,0.596293,0.795047,0.869439,0.100081
65905,172081,993,126324,confirmed,0,2021-03-26 19:51:40.379743,2021-03-28 23:00:00,0,0,1,...,0,0,0.324937,-0.556047,-0.872502,-0.689718,1.894060,0.216398,0.923042,-0.770445


###  <font color = green> Validation set: 1000 recently records

In [1797]:
# slice, dont include realdata
validation = df[-1000-realdata_len : -realdata_len]

y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'status', 'createdAt', 'Start_Time', 'target'], axis = 1)

y_valid.value_counts()

0    973
1     27
Name: target, dtype: int64

### Train test: main dataset - validation set

In [1798]:
df = df[:-1000-realdata_len] # slice 

In [1799]:
# # make a dataset that num of tar = num of non tar, use it for train test
# import random
# df_tar = df[df['target']==1].reset_index(drop = True)
# df_nontar = df[df['target']==0].reset_index(drop = True)

# number_of_tar = df_tar.shape[0]
# random_indices = random.sample(range(len(df_nontar)), int(number_of_tar))
# df_nontar = df_nontar[df_nontar.index.isin(random_indices)]

# # concat
# df = pd.concat([df_tar, df_nontar]).reset_index(drop = True)

In [1800]:
X = df.drop(['id','user_id', 'shift_id', 'status', 'target', 'createdAt', 'Start_Time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [1801]:
df['target'].value_counts()

0    62005
1     4345
Name: target, dtype: int64

# Logistic Regression 1  

In [1802]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# assign less punlishment for classifying 0 as 1 -> find more 1's
# weights = {0:1, 1:10}
# class_weight = 'balanced': automatically adjust weights inversely proportional to class frequencies in the input data
logit = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced')
logit.fit(X_train,y_train)

LogisticRegression(class_weight='balanced')

### Train Test result

In [1803]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = logit.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[11553  7072]
 [  515   765]]


              precision    recall  f1-score   support

           0       0.96      0.62      0.75     18625
           1       0.10      0.60      0.17      1280

    accuracy                           0.62     19905
   macro avg       0.53      0.61      0.46     19905
weighted avg       0.90      0.62      0.72     19905



In [1804]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [1805]:
# logit summary
import statsmodels.api as sm
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.235334
         Iterations 7


0,1,2,3
Dep. Variable:,target,No. Observations:,46445.0
Model:,Logit,Df Residuals:,46429.0
Method:,MLE,Df Model:,15.0
Date:,"Sun, 28 Mar 2021",Pseudo R-squ.:,0.03213
Time:,18:35:57,Log-Likelihood:,-10930.0
converged:,True,LL-Null:,-11293.0
Covariance Type:,nonrobust,LLR p-value:,6.412e-145

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
type_RN,-2.6355,0.174,-15.178,0.000,-2.976,-2.295
type_LVN+LPN,-1.1000,0.080,-13.709,0.000,-1.257,-0.943
segmentName_d,0.5200,0.100,5.202,0.000,0.324,0.716
areaName_houston,-2.9672,0.095,-31.288,0.000,-3.153,-2.781
areaName_no,-2.8247,0.103,-27.418,0.000,-3.027,-2.623
areaName_dfw,-2.9590,0.110,-26.924,0.000,-3.174,-2.744
areaName_austin,-2.7655,0.128,-21.523,0.000,-3.017,-2.514
areaName_san,-2.5709,0.137,-18.766,0.000,-2.839,-2.302
prev_CW/SA_rate,0.0653,0.014,4.595,0.000,0.037,0.093


### Overfitting? No

In [1806]:
y_pred = logit.predict(X_train)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[26901 16479]
 [ 1203  1862]]


              precision    recall  f1-score   support

           0       0.96      0.62      0.75     43380
           1       0.10      0.61      0.17      3065

    accuracy                           0.62     46445
   macro avg       0.53      0.61      0.46     46445
weighted avg       0.90      0.62      0.71     46445



### <font color = green> Validation result

In [1807]:
# test threshold
limiter = .6

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[880  93]
 [ 12  15]]


              precision    recall  f1-score   support

           0       0.99      0.90      0.94       973
           1       0.14      0.56      0.22        27

    accuracy                           0.90      1000
   macro avg       0.56      0.73      0.58      1000
weighted avg       0.96      0.90      0.92      1000



# Logistic Regression 2 Not good

###  <font color = green> Validation set

In [1808]:
# y_valid = validation['target']
# x_valid = validation[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 'areaName_dfw',
#                       'areaName_austin', 'areaName_san', 'net_pay',]]

### Train Test set

In [1809]:
# X = df[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 
#         'areaName_dfw','areaName_austin', 'areaName_san', 'net_pay',]]
# Y = df['target']
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

# Fit real data in this model

In [1810]:
# set input
real_X = realdata.drop(['id','user_id', 'shift_id', 'status', 'target', 'createdAt', 'Start_Time'], axis = 1)

In [1811]:
# concat predicted prob with data
realdata['prob'] = list(logit.predict_proba(real_X)[:,1])

In [1812]:
# record when this prediction is ran
from datetime import date
time = str(date.today().year) + '-' + str(date.today().month) + '-' + str(date.today().day)

realdata[['id', 'Start_Time', 'prob']].to_csv('pred_{}.csv'.format(time), index = False)

# Check real data in this model

In [1787]:
thisweek = pd.read_csv('download.csv')
thisweek = thisweek[thisweek.apply(lambda row: (row['prevStatus'] == 'confirmed') or (row['status'] == 'confirmed'), axis = 1)]

In [1363]:
realdata_test = realdata[['id', 'prob']]
thisweek_validation = thisweek.merge(realdata_test, on = 'id', how = 'left')
thisweek_validation.dropna(subset=['prob'],inplace = True)

In [1364]:
thisweek_validation[['prob']].isna().sum()

prob    0
dtype: int64

In [1365]:
thisweek_validation

def CW_by_nurse(row):
    if row['status']=='withdrawn' and row['prevStatus'] == 'confirmed':
        if row['withdrawnInfo_value'] == 'nurse':
            if row['CW_Time2Start_Time'] < 0 and row['CW_Time2Start_Time'] >= -24:
                return 1
            else:
                return 0
        else:
            return 0
    else:
        return 0
    
thisweek_validation['target'] = thisweek_validation.apply (lambda row: CW_by_nurse(row), axis=1)

In [1366]:
thisweek_validation = thisweek_validation[['id', 'prob', 'target', 'net_pay', 'type', 'segmentName', 'areaName']]

# thisweek_validation.to_csv('week322_check.csv', index = False)

In [1367]:
limiter = .5


y_prob = thisweek_validation['prob']
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count += 1
    else:
        y_pred.append(0)
        
print(confusion_matrix(thisweek_validation['target'], y_pred))
print('\n')
print(classification_report(thisweek_validation['target'], y_pred))

[[328 762]
 [  3  17]]


              precision    recall  f1-score   support

           0       0.99      0.30      0.46      1090
           1       0.02      0.85      0.04        20

    accuracy                           0.31      1110
   macro avg       0.51      0.58      0.25      1110
weighted avg       0.97      0.31      0.45      1110



In [1296]:
thisweek_validation.groupby("target")['prob'].mean()

target
0    0.547651
1    0.606011
Name: prob, dtype: float64