In [82]:
import pandas as pd
import numpy as np
# Load the modelling table; the saved index column and the two facility-rate columns are not used.
_unused_columns = ['Unnamed: 0', 'f_highrate', 'f_lowrate']
df = pd.read_csv('model_data_new.csv').drop(columns=_unused_columns)

In [83]:
# Display the column names of the loaded frame.
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'status',
       'S_create2SA_Create', 'S_Create2Start_Time', 'SA_Create2Start_Time',
       'U_create2now', 'U_approve2now', 'prev_CW x SA_rate', 'type_RN',
       'type_LVN+LPN', 'type_STNA', 'segmentName_d', 'areaName_houston',
       'areaName_no', 'areaName_dfw', 'areaName_austin', 'areaName_san',
       'net_pay', 'target', 'sa_create', 'Start_Time', 'CW_in_a_month',
       'count_prev_SA', 'count_prev_CW', 'type_CNA', 'reliability_score'],
      dtype='object')

# Data Preparation

### Slice df by the end of this week, for prediction output

In [84]:

from datetime import date, timedelta

# Cutoff for the rows we want to predict: midnight of the day after tomorrow.
# The date arithmetic uses timedelta so the cutoff rolls over month and year
# boundaries; building it from `tmrrw.day + 1` produced impossible dates
# (e.g. day 32) near the end of a month.
tmrrw = date.today() + timedelta(days=1)
end_of_week = (tmrrw + timedelta(days=1)).isoformat()

# convert to datetime for conditional selection
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# sort chronologically so the most recent rows sit at the tail -> positional slicing below
df = df.sort_values(by = 'Start_Time')
# rows starting after the cutoff are the prediction rows ("real data")
realdata = df[df['Start_Time'] > pd.to_datetime(end_of_week)]
# number of prediction rows, counted BEFORE the status filter: it is used to
# keep all of them out of the train / test / validation slices
realdata_len = realdata.shape[0]
# only keep status = confirmed; copy so that adding columns to realdata later
# does not write into a slice of df (SettingWithCopyWarning)
realdata = realdata[realdata['status'] == 'confirmed'].copy()

### <font color="green">Validation set: the 1000 most recent records</font>

In [85]:
# Validation set: the 1000 rows that immediately precede the prediction rows
# (df is sorted by Start_Time, so these are the most recent labelled records).
# Explicit end positions are used instead of a negative stop: with
# realdata_len == 0 the slice `[-1000-0 : -0]` is `[-1000:0]`, i.e. empty.
valid_end = len(df) - realdata_len
valid_start = max(valid_end - 1000, 0)
validation = df.iloc[valid_start:valid_end]

# label and feature matrix (identifiers, raw timestamps, status and label removed)
y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'status', 'sa_create', 'Start_Time', 'target'], axis = 1)

# class balance of the validation set
y_valid.value_counts()

0    934
1     66
Name: target, dtype: int64

### Train/test set: main dataset minus the validation set

In [86]:
# Drop the tail (1000 validation rows + prediction rows) so only train/test rows remain.
# NOTE: this cell rebinds df, so running it twice trims the frame twice.
df = df.iloc[:-(1000 + realdata_len)]

In [87]:
# Remove every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [88]:
# Feature matrix: everything except identifiers, raw timestamps, status and the label.
X = df.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)
y = df['target']

# set test, train (70/30). The seed is fixed so that the split, and therefore
# every metric and threshold computed from it, is reproducible on re-run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [89]:
# Build the experiment dataset: rows with areaName_no == 1 and type_STNA == 1.
is_experiment_row = (df['areaName_no'] == 1) & (df['type_STNA'] == 1)
df1 = df[is_experiment_row]
y_exp = df1['target']
X_exp = df1.drop(columns=['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'])

In [90]:
# Class balance of the train/test data (counts per target value).
df['target'].value_counts()

0    69261
1     5742
Name: target, dtype: int64

# Logistic Regression 1  

In [91]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Penalise misclassifying a 0 as a 1 less heavily so that more 1's are found.
# A manual alternative would be: weights = {0:1, 1:10}
# class_weight = 'balanced' adjusts the weights automatically, inversely
# proportional to the class frequencies in the input data.
logit = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', max_iter=100000)
logit.fit(X=X_train, y=y_train)

LogisticRegression(class_weight='balanced', max_iter=100000)

### Train Test result

In [92]:
from sklearn.metrics import classification_report, confusion_matrix

# Hard predictions on the held-out test split (default decision rule of the model)
y_pred = logit.predict(X_test)
# confusion matrix, a blank gap, then the per-class precision / recall report
for section in (confusion_matrix(y_test, y_pred), '\n', classification_report(y_test, y_pred)):
    print(section)

[[15020  5705]
 [  439  1337]]


              precision    recall  f1-score   support

           0       0.97      0.72      0.83     20725
           1       0.19      0.75      0.30      1776

    accuracy                           0.73     22501
   macro avg       0.58      0.74      0.57     22501
weighted avg       0.91      0.73      0.79     22501



In [93]:
from sklearn.metrics import roc_curve
from numpy import sqrt
from numpy import argmax

# Probability of the positive outcome for every experiment row
# NOTE(review): X_exp is a subset of the frame the training split was drawn
# from, so this threshold is tuned partly on rows the model was fitted on.
yhat = logit.predict_proba(X_exp)[:, 1]

# ROC curve over all candidate thresholds
fpr, tpr, thresholds = roc_curve(y_exp,yhat)

# g-mean (geometric mean of sensitivity and specificity) per threshold
gmeans = sqrt(tpr * (1-fpr))

# position of the threshold with the largest g-mean
ix = gmeans.argmax()

lower_limiter = thresholds[ix]
print('Best Threshold=%f' % (lower_limiter))

Best Threshold=0.582050


In [94]:
# search thresholds for imbalanced classification
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
def to_labels(pos_probs, threshold):
    """Turn positive-class probabilities into 0/1 labels.

    An entry becomes 1 when its probability is greater than or equal to
    `threshold`, otherwise 0. `pos_probs` must support elementwise `>=`
    and `.astype` (e.g. a numpy array).
    """
    meets_cutoff = pos_probs >= threshold
    return meets_cutoff.astype('int')

# class probabilities for the experiment rows; column 1 is the positive outcome
yhat = logit.predict_proba(X_exp)
probs = yhat[:, 1]
# candidate thresholds 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)
# F1 score obtained at each candidate threshold
scores = list(map(lambda cutoff: f1_score(y_exp, to_labels(probs, cutoff)), thresholds))
# threshold with the highest F1
ix = argmax(scores)

higher_limiter = thresholds[ix]

print('Best threshold=%.3f' % (higher_limiter))

Best threshold=0.706


In [95]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [96]:
# Coefficient table for the same features, fitted with statsmodels on the training split.
import statsmodels.api as sm
# The feature columns are passed as-is; this cell does not add a constant column.
# NOTE(review): in the rendered summary three duration features share an
# identical, very large std err - presumably they are linearly dependent; confirm.
sm_model = sm.Logit(endog=y_train, exog=X_train)
smlogit = sm_model.fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.222447
         Iterations 33


0,1,2,3
Dep. Variable:,target,No. Observations:,52502.0
Model:,Logit,Df Residuals:,52480.0
Method:,MLE,Df Model:,21.0
Date:,"Mon, 31 May 2021",Pseudo R-squ.:,0.1692
Time:,10:36:19,Log-Likelihood:,-11679.0
converged:,True,LL-Null:,-14057.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
prev_CW/SA_rate,0.5790,0.256,2.260,0.024,0.077,1.081
S_create2SA_Create,0.0004,1264.195,3.29e-07,1.000,-2477.777,2477.778
S_Create2Start_Time,-0.0005,1264.195,-3.6e-07,1.000,-2477.778,2477.777
SA_Create2Start_Time,-0.0019,1264.195,-1.49e-06,1.000,-2477.779,2477.775
U_create2now,-0.0034,0.007,-0.487,0.626,-0.017,0.010
U_approve2now,-0.0071,0.008,-0.908,0.364,-0.022,0.008
prev_CW x SA_rate,0.0001,2.59e-05,4.707,0.000,7.1e-05,0.000
type_RN,-3.5626,0.176,-20.262,0.000,-3.907,-3.218
type_LVN+LPN,-1.3619,0.111,-12.244,0.000,-1.580,-1.144


### Overfitting? No

In [97]:
# Same report on the training split, to compare against the test split for overfitting
y_pred = logit.predict(X_train)

train_matrix = confusion_matrix(y_train, y_pred)
train_report = classification_report(y_train, y_pred)
print(train_matrix, '\n', train_report, sep='\n')

[[35327 13209]
 [  946  3020]]


              precision    recall  f1-score   support

           0       0.97      0.73      0.83     48536
           1       0.19      0.76      0.30      3966

    accuracy                           0.73     52502
   macro avg       0.58      0.74      0.57     52502
weighted avg       0.91      0.73      0.79     52502



In [98]:
# Apply the F1-optimal threshold to the training split
limiter = higher_limiter

y_prob = list(logit.predict_proba(X_train)[:,1])
# label 1 when the positive-class probability reaches the limiter, else 0
y_pred = [1 if prob >= limiter else 0 for prob in y_prob]
# number of rows labelled 1
count = sum(y_pred)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[43626  4910]
 [ 2074  1892]]


              precision    recall  f1-score   support

           0       0.95      0.90      0.93     48536
           1       0.28      0.48      0.35      3966

    accuracy                           0.87     52502
   macro avg       0.62      0.69      0.64     52502
weighted avg       0.90      0.87      0.88     52502



### <font color="green">Validation result</font>

In [99]:
# Apply the F1-optimal threshold to the held-out validation set
limiter = higher_limiter

y_prob = list(logit.predict_proba(x_valid)[:,1])
# label 1 when the positive-class probability reaches the limiter, else 0
# (kept as a plain list: the coverage cell below relies on list.count)
y_pred = [1 if prob >= limiter else 0 for prob in y_prob]
# number of rows labelled 1
count = sum(y_pred)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[647 287]
 [ 17  49]]


              precision    recall  f1-score   support

           0       0.97      0.69      0.81       934
           1       0.15      0.74      0.24        66

    accuracy                           0.70      1000
   macro avg       0.56      0.72      0.53      1000
weighted avg       0.92      0.70      0.77      1000



In [100]:
from sklearn.metrics import recall_score

# share of validation rows that were labelled 1
flagged = sum(1 for label in y_pred if label == 1)
label_coverage = flagged/len(y_pred)
# share of actual positives that were labelled 1 (recall)
UCW_coverage = recall_score(y_valid, y_pred)

print('The limiter we adopt is %.3f' % (limiter))
print('By covering %.3f labeled as high probability of UCW, we have prepared for %.3f of real UCW'
      % (label_coverage,UCW_coverage))

The limiter we adopt is 0.706
By covering 0.336 labeled as high probability of UCW, we have prepared for 0.742 of real UCW


# Fit real data in this model

In [101]:
# Model input for the prediction rows: the same feature columns used for training
non_feature_columns = ['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time']
real_X = realdata.drop(columns=non_feature_columns)

In [102]:
# Attach the predicted positive-class probability to each prediction row.
# `assign` returns a new frame, so nothing is written into a filtered slice of
# the original data (the in-place column assignment raised SettingWithCopyWarning).
realdata = realdata.assign(prob=logit.predict_proba(real_X)[:, 1])

In [103]:
# record when this prediction is run
from datetime import date
# read the clock once so year, month and day always belong to the same day
# (three separate date.today() calls could straddle midnight)
run_day = date.today()
# un-padded year-month-day string, used in the output file names
time = '{}-{}-{}'.format(run_day.year, run_day.month, run_day.day)
# NOTE(review): rounding moves the cutoff away from the value that was
# validated above (e.g. 0.706 becomes 0.71) - confirm this is intended.
limiter = round(limiter,2)
realdata[['id', 'Start_Time', 'prob']].head(5)

Unnamed: 0,id,Start_Time,prob
74570,194405,2021-06-02 05:00:00,0.478321
76905,190008,2021-06-02 05:00:00,0.601106
74944,194471,2021-06-02 05:00:00,0.554104
76950,194394,2021-06-02 05:00:00,0.842791
67242,195550,2021-06-02 05:45:00,0.60969


In [104]:
# Inspect the start times of the prediction rows, to check that the prediction does not include today.
realdata['Start_Time']

74570   2021-06-02 05:00:00
76905   2021-06-02 05:00:00
74944   2021-06-02 05:00:00
76950   2021-06-02 05:00:00
67242   2021-06-02 05:45:00
                ...        
31186   2021-07-10 07:00:00
41149   2021-07-10 17:00:00
19719   2021-07-11 17:00:00
41150   2021-07-11 17:00:00
31187   2021-07-12 07:00:00
Name: Start_Time, Length: 832, dtype: datetime64[ns]

In [105]:
# Prediction rows whose probability is strictly above the (rounded) limiter
# NOTE(review): the evaluation cells label with `>=` and the unrounded limiter.
above_limiter = realdata['prob'] > limiter
high_prob = realdata.loc[above_limiter, ['id', 'Start_Time', 'prob']]

## Append newly processed data to prediction data

In [106]:
import os
from getpass import getpass

import pandas as pd
# specify connection to database
import psycopg2

# Credentials must not be stored in the notebook. Host, database and user can
# be overridden through environment variables; the password is read from
# NURSEDASH_DB_PASSWORD or, if that is unset, prompted for interactively.
# The password that used to be hard-coded in this cell should be rotated.
connection = psycopg2.connect(
    host=os.environ.get("NURSEDASH_DB_HOST", "nursedash-prod.cuzi2kducsnv.us-east-1.rds.amazonaws.com"),
    database=os.environ.get("NURSEDASH_DB_NAME", "nursedash"),
    user=os.environ.get("NURSEDASH_DB_USER", "external_analyst"),
    password=os.environ.get("NURSEDASH_DB_PASSWORD") or getpass("Database password: "))

### <font color="green">All times converted to Chicago time; no withdrawn info</font>

In [107]:
# Pull every shift application joined with its shift, facility and user.
# All timestamps are converted to America/Chicago in SQL.
# NOTE: the result contains two columns named facility_id (f.id AS facility_id
# and s."facility_id"), and this cell rebinds `df`, replacing the modelling frame.
shift_applications_sql = """

SELECT  sa.id, sa.user_id, sa.shift_id, f.id AS facility_id, sa."withdrawnInfo" -> 'initiator' as withdrawnInfo_value,
sa."status", sa."prevStatus", sa."distance", s."facility_id", "s"."description" AS "shift_description",
"s"."assigned_nurse_id", s."net_pay", "s"."unit" AS "s_unit",s."type",
"s"."qualifications" AS "s_qualifications", "s"."breakTime" AS "s_breakTime", sa."withdrawnInfo",
"f"."name" AS "facility_name","f"."short_name" AS "f_short_name", f."segmentName", f."areaName",
timezone('America/Chicago', s."createdAt") as s_create,
timezone('America/Chicago', sa."createdAt") as sa_create,
timezone('America/Chicago', u."approvedAt") as u_approve,
timezone('America/Chicago', u."createdAt") as u_create,
timezone('America/Chicago', sa."statusUpdatedAt") as sa_statusUpdate,
timezone('America/Chicago', timezone('UTC', s.start_time)) AS "Start_Time" 
FROM shifts s
INNER JOIN shift_applications sa ON s.id = sa.shift_id
INNER JOIN facilities f ON s.facility_id = f.id
INNER JOIN users u ON sa.user_id = u.id

"""
df = pd.read_sql_query(shift_applications_sql, con = connection)

In [108]:
# Display the column names returned by the query.
df.columns

Index(['id', 'user_id', 'shift_id', 'facility_id', 'withdrawninfo_value',
       'status', 'prevStatus', 'distance', 'facility_id', 'shift_description',
       'assigned_nurse_id', 'net_pay', 's_unit', 'type', 's_qualifications',
       's_breakTime', 'withdrawnInfo', 'facility_name', 'f_short_name',
       'segmentName', 'areaName', 's_create', 'sa_create', 'u_approve',
       'u_create', 'sa_statusupdate', 'Start_Time'],
      dtype='object')

In [109]:
def get_part_of_day(hour):
    """Map an hour of the day to a coarse part-of-day label.

    Parameters
    ----------
    hour : number
        Hour of the day, expected in the range 0-23.

    Returns
    -------
    str
        "morning" for 5-12, "afternoon" for 13-17, "evening/night" for
        18-22 and "overnight" for everything else (23 and 0-4).
    """
    if 4 < hour <= 12:
        return "morning"
    if 12 < hour <= 17:
        return "afternoon"
    # The lower bound is 17, not 18: with `18 < hour` the hour 18 matched no
    # bucket and was silently labelled "overnight".
    if 17 < hour <= 22:
        return "evening/night"
    return "overnight"

# Label every shift with the part of day of its start hour.
df['Start_time_of_the_day'] = df['Start_Time'].map(lambda start: get_part_of_day(start.hour))

# combine the prediction file with real data

In [110]:
# Take the prediction columns and enrich them with the queried shift details.
# A left join on id keeps every prediction row. Note that `validation` is
# rebound here and no longer refers to the hold-out set.
prediction = realdata[['id', 'Start_Time', 'prob']]
validation = pd.merge(prediction, df, how = 'left', on = 'id')

In [111]:
from datetime import date

today = date.today()

# Start_Time_x is the start time carried over from the prediction frame.
# Make sure it is datetime, then keep only its date part as a string.
start_times = pd.to_datetime(validation['Start_Time_x'])
validation['Start_Time_x'] = start_times.map(lambda start: str(start.date()))

In [112]:
# Columns kept in the exported file; Start_Time is the renamed Start_Time_x date string.
export_columns = ['id','prob','Start_Time','Start_time_of_the_day','status','type','prevStatus','areaName','segmentName','facility_name','user_id']
validation = validation.rename(columns={"Start_Time_x": "Start_Time"})[export_columns]

In [113]:
# Use the shift-application id as the index, then display the remaining columns.
# Re-running this cell on its own fails, because "id" is no longer a column afterwards.
validation = validation.set_index("id")
validation.columns

Index(['prob', 'Start_Time', 'Start_time_of_the_day', 'status', 'type',
       'prevStatus', 'areaName', 'segmentName', 'facility_name', 'user_id'],
      dtype='object')

In [114]:
# Write the prediction table to a CSV named after the run date.
prediction_csv_path = 'pred_{}_Silver_Bullet.csv'.format(time)
validation.to_csv(prediction_csv_path)

In [115]:
# Display the exported prediction table (the rendered output includes user_id values).
validation#.head(30)

Unnamed: 0_level_0,prob,Start_Time,Start_time_of_the_day,status,type,prevStatus,areaName,segmentName,facility_name,user_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
194405,0.478321,2021-06-02,morning,confirmed,STNA,selected,Northeast Ohio,Senior Living,Avenue at Broadview Heights,17061
190008,0.601106,2021-06-02,morning,confirmed,STNA,selected,Northeast Ohio,Senior Living,CareCore at Willowood,18857
194471,0.554104,2021-06-02,morning,confirmed,STNA,selected,Northeast Ohio,Senior Living,Avenue at Broadview Heights,17296
194394,0.842791,2021-06-02,morning,confirmed,STNA,selected,Northeast Ohio,Senior Living,Avenue at Broadview Heights,18996
195550,0.609690,2021-06-02,morning,confirmed,CNA,selected,Houston,Senior Living,Heartis Clear Lake,14488
...,...,...,...,...,...,...,...,...,...,...
191649,0.006786,2021-07-10,morning,confirmed,RN,selected,Houston,Healthcare,Woodlands Specialty Hospital,7894
192531,0.028987,2021-07-10,afternoon,confirmed,STNA,selected,Northeast Ohio,Senior Living,CareCore at Willowood,9428
192145,0.135550,2021-07-11,afternoon,confirmed,STNA,selected,Northeast Ohio,Senior Living,CareCore at Willowood,4733
192532,0.027985,2021-07-11,afternoon,confirmed,STNA,selected,Northeast Ohio,Senior Living,CareCore at Willowood,9428


In [116]:
# Keep only Northeast Ohio STNA rows with prob above 0.55, then count them
# per start date and part of day; the result has a column called count.
# NOTE(review): 0.55 is hard-coded and differs from the limiter used above.
is_flagged_neo_stna = (
    (validation['areaName'] == 'Northeast Ohio')
    & (validation['type'] == 'STNA')
    & (validation['prob'] > 0.55)
)
counts_per_slot = validation[is_flagged_neo_stna].groupby(["Start_Time", "Start_time_of_the_day"]).size()
pivot_table = counts_per_slot.reset_index(name='count').set_index("Start_Time")


In [117]:
# check if the count is above limiter of 5
# pivot_table['above_limiter'] = pivot_table.apply(lambda row: 2 if row['count'] >= 5 else 0, axis =1)

In [118]:
# Preview the first 20 rows of the per-day / part-of-day counts.
pivot_table.head(20)

Unnamed: 0_level_0,Start_time_of_the_day,count
Start_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-06-02,afternoon,3
2021-06-02,morning,4
2021-06-02,overnight,2
2021-06-03,afternoon,3
2021-06-03,morning,2
2021-06-03,overnight,1
2021-06-04,morning,2
2021-06-05,morning,3
2021-06-05,overnight,1
2021-06-06,afternoon,1


In [119]:
# Write the count table to an Excel file named after the run date.
plan_excel_path = "plan2_pred_{}.xlsx".format(time)
pivot_table.to_excel(plan_excel_path)