In [1300]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0'])

In [1301]:
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'S_create2SA_Create',
       'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now',
       'U_approve2now', 'prev_CW x SA_rate', 'type_RN', 'type_LVN+LPN',
       'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw',
       'areaName_austin', 'areaName_san', 'net_pay', 'target', 'createdAt',
       'Start_Time'],
      dtype='object')

# Data Prepration

In [1302]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['id', 'user_id', 'shift_id', 'target', 'createdAt', 'Start_Time', 'type_RN', 
                    'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san'], axis=1))
scaled_features = scaler.transform(df.drop(['id', 'user_id', 'shift_id', 'target', 'createdAt',
                                            'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                                            'areaName_houston', 'areaName_no', 'areaName_dfw', 
                                            'areaName_austin', 'areaName_san'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'prev_CW x SA_rate', 'S_create2SA_Create', 
                                             'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now', 
                                             'U_approve2now', 'net_pay'])
# concat with dummy
df = pd.concat([df[['id', 'user_id', 'shift_id', 'target', 'createdAt',
                    'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                    'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san']], X], axis = 1)

# drop nas
df.dropna(inplace = True)

### set future data point as realdata

### note !!!: real data might overlap with train test validation data

In [1303]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
realdata = df[df['Start_Time'].apply(lambda x: x >= pd.to_datetime('2021-3-22') and x < pd.to_datetime('2021-3-28'))]

###  <font color = green> Validation set: 1000 recently records

In [1304]:
# slice
validation = df[-1000:]

y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'createdAt', 'Start_Time', 'target'], axis = 1)

y_valid.value_counts()

0    984
1     16
Name: target, dtype: int64

### Train test: main dataset - validation set

In [1305]:
df = df[:-1000] # slice 

In [1306]:
# make a dataset that num of tar = num of non tar, use it for train test
import random
df_tar = df[df['target']==1].reset_index(drop = True)
df_nontar = df[df['target']==0].reset_index(drop = True)

number_of_tar = df_tar.shape[0]
random_indices = random.sample(range(len(df_nontar)), int(number_of_tar))
df_nontar = df_nontar[df_nontar.index.isin(random_indices)]

# concat
df = pd.concat([df_tar, df_nontar]).reset_index(drop = True)

In [1307]:
X = df.drop(['id','user_id', 'shift_id', 'target', 'createdAt', 'Start_Time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Logistic Regression 1  

In [1308]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

weights = {0:1, 1:1.5}
logit = LogisticRegression(solver = 'lbfgs', class_weight = weights)
logit.fit(X_train,y_train)

LogisticRegression(class_weight={0: 1, 1: 1.5})

### Train Test result

In [1309]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = logit.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[ 558  755]
 [ 230 1071]]


              precision    recall  f1-score   support

           0       0.71      0.42      0.53      1313
           1       0.59      0.82      0.69      1301

    accuracy                           0.62      2614
   macro avg       0.65      0.62      0.61      2614
weighted avg       0.65      0.62      0.61      2614



In [1310]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [1311]:
# logit summary
import statsmodels.api as sm
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.643756
         Iterations 5


0,1,2,3
Dep. Variable:,target,No. Observations:,6098.0
Model:,Logit,Df Residuals:,6082.0
Method:,MLE,Df Model:,15.0
Date:,"Fri, 26 Mar 2021",Pseudo R-squ.:,0.07125
Time:,12:39:24,Log-Likelihood:,-3925.6
converged:,True,LL-Null:,-4226.8
Covariance Type:,nonrobust,LLR p-value:,1.126e-118

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
type_RN,-3.2673,0.282,-11.589,0.000,-3.820,-2.715
type_LVN+LPN,-1.4330,0.132,-10.870,0.000,-1.691,-1.175
segmentName_d,0.6277,0.125,5.035,0.000,0.383,0.872
areaName_houston,-0.5028,0.121,-4.173,0.000,-0.739,-0.267
areaName_no,0.0500,0.134,0.372,0.710,-0.213,0.313
areaName_dfw,-0.4180,0.143,-2.932,0.003,-0.697,-0.139
areaName_austin,-0.1444,0.174,-0.831,0.406,-0.485,0.196
areaName_san,0.2883,0.206,1.401,0.161,-0.115,0.691
prev_CW/SA_rate,0.1566,0.028,5.624,0.000,0.102,0.211


### Overfitting? No

In [1312]:
y_pred = logit.predict(X_train)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[1274 1769]
 [ 565 2490]]


              precision    recall  f1-score   support

           0       0.69      0.42      0.52      3043
           1       0.58      0.82      0.68      3055

    accuracy                           0.62      6098
   macro avg       0.64      0.62      0.60      6098
weighted avg       0.64      0.62      0.60      6098



### <font color = green> Validation result

In [1313]:
# test threshold
limiter = .6

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[640 344]
 [  1  15]]


              precision    recall  f1-score   support

           0       1.00      0.65      0.79       984
           1       0.04      0.94      0.08        16

    accuracy                           0.66      1000
   macro avg       0.52      0.79      0.43      1000
weighted avg       0.98      0.66      0.78      1000



In [1314]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_valid, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

# Logistic Regression 2, pick only significant vars

###  <font color = green> Validation set

In [1315]:
y_valid = validation['target']
x_valid = validation[['net_pay', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 'prev_CW/SA_rate', 'areaName_houston']]

### Train Test set

In [1316]:
X = df[['net_pay', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 'prev_CW/SA_rate', 'areaName_houston']]
Y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

### Fit

In [1317]:

weights = {0:1, 1:1.3}
logit = LogisticRegression(solver = 'lbfgs', class_weight = weights)
logit.fit(X_train,y_train)

# logit summary
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.654068
         Iterations 5


0,1,2,3
Dep. Variable:,target,No. Observations:,6098.0
Model:,Logit,Df Residuals:,6092.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 26 Mar 2021",Pseudo R-squ.:,0.05637
Time:,12:39:24,Log-Likelihood:,-3988.5
converged:,True,LL-Null:,-4226.8
Covariance Type:,nonrobust,LLR p-value:,9.155e-101

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
net_pay,1.1469,0.072,15.976,0.000,1.006,1.288
type_RN,-3.4525,0.251,-13.754,0.000,-3.944,-2.960
type_LVN+LPN,-1.7137,0.129,-13.324,0.000,-1.966,-1.462
segmentName_d,0.5380,0.041,12.988,0.000,0.457,0.619
prev_CW/SA_rate,0.1147,0.025,4.576,0.000,0.066,0.164
areaName_houston,-0.2550,0.056,-4.563,0.000,-0.365,-0.145


In [1267]:
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [1268]:
y_pred= logit.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[ 572  732]
 [ 285 1025]]


              precision    recall  f1-score   support

           0       0.67      0.44      0.53      1304
           1       0.58      0.78      0.67      1310

    accuracy                           0.61      2614
   macro avg       0.63      0.61      0.60      2614
weighted avg       0.63      0.61      0.60      2614



### Overfitting? No

In [1269]:
y_pred= logit.predict(X_train)
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[1312 1740]
 [ 659 2387]]


              precision    recall  f1-score   support

           0       0.67      0.43      0.52      3052
           1       0.58      0.78      0.67      3046

    accuracy                           0.61      6098
   macro avg       0.62      0.61      0.59      6098
weighted avg       0.62      0.61      0.59      6098



### <font color = green> Validation set result

In [1270]:
limiter = .6

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count += 1
    else:
        y_pred.append(0)
        
print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[676 308]
 [  1  15]]


              precision    recall  f1-score   support

           0       1.00      0.69      0.81       984
           1       0.05      0.94      0.09        16

    accuracy                           0.69      1000
   macro avg       0.52      0.81      0.45      1000
weighted avg       0.98      0.69      0.80      1000



In [1271]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_valid, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

# Fit real data in this model

In [1272]:
# set input
real_X = realdata[['net_pay', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 'prev_CW/SA_rate', 'areaName_houston']]

In [1273]:
# concat predicted prob with data
realdata['prob'] = list(logit.predict_proba(real_X)[:,1])

In [1274]:
realdata.to_csv('week322_pred.csv', index = False)

# Check real data in this model

In [1286]:
thisweek = pd.read_csv('download.csv')

In [1287]:
realdata_test = realdata[['id', 'prob']]
thisweek_validation = thisweek.merge(realdata_test, on = 'id', how = 'left')
thisweek_validation.dropna(subset=['prob'],inplace = True)

In [1288]:
thisweek_validation[['prob']].isna().sum()

prob    0
dtype: int64

In [1289]:
thisweek_validation

def CW_by_nurse(row):
    if row['status']=='withdrawn' and row['prevStatus'] == 'confirmed':
        if row['withdrawnInfo_value'] == 'nurse':
            if row['CW_Time2Start_Time'] < 0 and row['CW_Time2Start_Time'] >= -24:
                return 1
            else:
                return 0
        else:
            return 0
    else:
        return 0
    
thisweek_validation['target'] = thisweek_validation.apply (lambda row: CW_by_nurse(row), axis=1)

In [1290]:
thisweek_validation = thisweek_validation[['id', 'prob', 'target', 'net_pay', 'type', 'segmentName', 'areaName']]

# thisweek_validation.to_csv('week322_check.csv', index = False)

In [1299]:
limiter = .5


y_prob = thisweek_validation['prob']
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count += 1
    else:
        y_pred.append(0)
        
print(confusion_matrix(thisweek_validation['target'], y_pred))
print('\n')
print(classification_report(thisweek_validation['target'], y_pred))

[[353 737]
 [  5  15]]


              precision    recall  f1-score   support

           0       0.99      0.32      0.49      1090
           1       0.02      0.75      0.04        20

    accuracy                           0.33      1110
   macro avg       0.50      0.54      0.26      1110
weighted avg       0.97      0.33      0.48      1110



In [1295]:
thisweek_validation[thisweek_validation['target']==1].sort_values(by = 'prob', ascending = False)

Unnamed: 0,id,prob,target,net_pay,type,segmentName,areaName
1244,170132,0.862836,1,35.0,LVN,Senior Living,San Antonio
589,164683,0.827512,1,23.0,CNA,Senior Living,Austin
591,164684,0.821838,1,23.0,CNA,Senior Living,Austin
125,170335,0.818625,1,24.0,STNA,Senior Living,Northeast Ohio
1170,170159,0.707361,1,18.5,CNA,Senior Living,DFW
954,169819,0.674967,1,39.0,LPN,Senior Living,Northeast Ohio
1340,170381,0.614154,1,18.0,CNA,Senior Living,San Antonio
345,170012,0.593766,1,19.5,STNA,Senior Living,Northeast Ohio
1342,170425,0.5788,1,18.5,CNA,Senior Living,San Antonio
603,167986,0.569924,1,21.0,CNA,Senior Living,San Antonio


In [1296]:
thisweek_validation.groupby("target")['prob'].mean()

target
0    0.547651
1    0.606011
Name: prob, dtype: float64