In [115]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0','f_highrate','f_lowrate'])

In [116]:
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'status',
       'S_create2SA_Create', 'S_Create2Start_Time', 'SA_Create2Start_Time',
       'U_create2now', 'U_approve2now', 'prev_CW x SA_rate', 'type_RN',
       'type_LVN+LPN', 'type_STNA', 'segmentName_d', 'areaName_houston',
       'areaName_no', 'areaName_dfw', 'areaName_austin', 'areaName_san',
       'net_pay', 'target', 'sa_create', 'Start_Time', 'CW_in_a_month',
       'count_prev_SA', 'count_prev_CW'],
      dtype='object')

# Data Prepration

### Slice df by the end of this week, for predcition output

In [117]:

from datetime import date
end_of_week = str(date.today().year) + '-' + str(date.today().month) + '-' + str(date.today().day+1)

# convert to datetime for conditonal selection
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# sort by start time -> for slicing
df = df.sort_values(by = 'Start_Time') 
# record as realdata
realdata = df[df['Start_Time'].apply(lambda x: x > pd.to_datetime(end_of_week))]
# record predction output rows, don't include it in tran test validation
realdata_len = realdata.shape[0]
# only keep status = confirmed
realdata = realdata[realdata['status'] == 'confirmed']

###  <font color = green> Validation set: 1000 recently records

In [118]:
# slice, dont include realdata
validation = df[-1000-realdata_len : -realdata_len]

y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'status', 'sa_create', 'Start_Time', 'target'], axis = 1)

y_valid.value_counts()

0    915
1     85
Name: target, dtype: int64

### Train test: main dataset - validation set

In [119]:
df = df[:-1000-realdata_len] # slice 

In [120]:
df = df.dropna()

In [121]:
X = df.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [122]:
# make experienment dataset
df1 = df[(df['areaName_no'] == 1) & (df['type_STNA']==1)]
X_exp = df1.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)
y_exp = df1['target']

In [123]:
df['target'].value_counts()

0    67463
1     5560
Name: target, dtype: int64

# Logistic Regression 1  

In [124]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# assign less punlishment for classifying 0 as 1 -> find more 1's
# weights = {0:1, 1:10}
# class_weight = 'balanced': automatically adjust weights inversely proportional to class frequencies in the input data
logit = LogisticRegression(solver = 'lbfgs', max_iter=100000, class_weight = 'balanced')
logit.fit(X_train,y_train)

LogisticRegression(class_weight='balanced', max_iter=100000)

### Train Test result

In [125]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = logit.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[13572  6674]
 [  571  1090]]


              precision    recall  f1-score   support

           0       0.96      0.67      0.79     20246
           1       0.14      0.66      0.23      1661

    accuracy                           0.67     21907
   macro avg       0.55      0.66      0.51     21907
weighted avg       0.90      0.67      0.75     21907



In [126]:
from sklearn.metrics import roc_curve
from numpy import sqrt
from numpy import argmax

# predict probabilities
yhat = logit.predict_proba(X_exp)
# keep probabilities for the positive outcome only
yhat = yhat[:, 1]

# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_exp,yhat)

# calculate the g-mean for each threshold
gmeans = sqrt(tpr * (1-fpr))

# locate the index of the largest g-mean
ix = argmax(gmeans)

lower_limiter = thresholds[ix]
print('Best Threshold=%f' % (lower_limiter))

Best Threshold=0.555212


In [127]:
# search thresholds for imbalanced classification
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

# predict probabilities
yhat = logit.predict_proba(X_exp)
# keep probabilities for the positive outcome only
probs = yhat[:, 1]
# define thresholds
thresholds = arange(0, 1, 0.001)
# evaluate each threshold
scores = [f1_score(y_exp, to_labels(probs, t)) for t in thresholds]
# get best threshold
ix = argmax(scores)

higher_limiter = thresholds[ix]

print('Best threshold=%.3f' % (higher_limiter))

Best threshold=0.657


In [128]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [129]:
# logit summary
import statsmodels.api as sm
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

         Current function value: 0.248365
         Iterations: 35




0,1,2,3
Dep. Variable:,target,No. Observations:,51116.0
Model:,Logit,Df Residuals:,51096.0
Method:,MLE,Df Model:,19.0
Date:,"Mon, 24 May 2021",Pseudo R-squ.:,0.07871
Time:,10:58:54,Log-Likelihood:,-12695.0
converged:,False,LL-Null:,-13780.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
prev_CW/SA_rate,2.0787,0.221,9.386,0.000,1.645,2.513
S_create2SA_Create,0.0010,,,,,
S_Create2Start_Time,-0.0010,,,,,
SA_Create2Start_Time,-0.0012,,,,,
U_create2now,0.0057,0.006,0.878,0.380,-0.007,0.018
U_approve2now,-0.0277,0.007,-3.724,0.000,-0.042,-0.013
prev_CW x SA_rate,0.0001,2.66e-05,4.933,0.000,7.92e-05,0.000
type_RN,-3.5285,0.159,-22.237,0.000,-3.840,-3.217
type_LVN+LPN,-1.6249,0.084,-19.411,0.000,-1.789,-1.461


### Overfitting? No

In [130]:
y_pred = logit.predict(X_train)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[31612 15605]
 [ 1278  2621]]


              precision    recall  f1-score   support

           0       0.96      0.67      0.79     47217
           1       0.14      0.67      0.24      3899

    accuracy                           0.67     51116
   macro avg       0.55      0.67      0.51     51116
weighted avg       0.90      0.67      0.75     51116



In [131]:
# test threshold
limiter = higher_limiter

y_prob = list(logit.predict_proba(X_train)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[42030  5187]
 [ 2501  1398]]


              precision    recall  f1-score   support

           0       0.94      0.89      0.92     47217
           1       0.21      0.36      0.27      3899

    accuracy                           0.85     51116
   macro avg       0.58      0.62      0.59     51116
weighted avg       0.89      0.85      0.87     51116



### <font color = green> Validation result

In [132]:
# test threshold
limiter = higher_limiter

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[718 197]
 [ 35  50]]


              precision    recall  f1-score   support

           0       0.95      0.78      0.86       915
           1       0.20      0.59      0.30        85

    accuracy                           0.77      1000
   macro avg       0.58      0.69      0.58      1000
weighted avg       0.89      0.77      0.81      1000



In [133]:
from sklearn.metrics import recall_score

label_coverage = y_pred.count(1)/len(y_pred)
UCW_coverage = recall_score(y_valid, y_pred)

print('The limiter we adopt is %.3f' % (limiter))
print('By covering %.3f labeled as high probability of UCW, we have prepared for %.3f of real UCW' 
      % (label_coverage,UCW_coverage))

The limiter we adopt is 0.657
By covering 0.247 labeled as high probability of UCW, we have prepared for 0.588 of real UCW


# Fit real data in this model

In [134]:
# set input
real_X = realdata.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)

In [135]:
# concat predicted prob with data
realdata['prob'] = list(logit.predict_proba(real_X)[:,1])

In [136]:
# record when this prediction is ran
from datetime import date
time = str(date.today().year) + '-' + str(date.today().month) + '-' + str(date.today().day)
limiter = round(limiter,2)
realdata[['id', 'Start_Time', 'prob']].head(5)

Unnamed: 0,id,Start_Time,prob
55460,190132,2021-05-25 05:00:00,0.631839
74319,191733,2021-05-25 05:00:00,0.680283
74998,188967,2021-05-25 06:00:00,0.572462
59101,190536,2021-05-25 06:00:00,0.696872
61599,190827,2021-05-25 06:00:00,0.374523


In [137]:
# to make the prediction doesn't include today
realdata['Start_Time']

55460   2021-05-25 05:00:00
74319   2021-05-25 05:00:00
74998   2021-05-25 06:00:00
59101   2021-05-25 06:00:00
61599   2021-05-25 06:00:00
                ...        
53778   2021-06-27 18:00:00
55622   2021-06-27 22:30:00
55623   2021-06-28 22:30:00
38381   2021-06-29 18:00:00
66731   2021-06-30 22:30:00
Name: Start_Time, Length: 749, dtype: datetime64[ns]

In [138]:
high_prob = realdata[['id', 'Start_Time', 'prob']][realdata['prob'] > limiter]

## Append newly processed data to prediction data

In [139]:
import pandas as pd
# specify connection to database
import psycopg2
connection = psycopg2.connect(
    host="nursedash-prod.cuzi2kducsnv.us-east-1.rds.amazonaws.com",
    database="nursedash",
    user="external_analyst",
    password="uDps8APganhSLc3K2xe7NtMPq")

### <font color = green> all time to chicago time, No withdrawn info

In [140]:
df = pd.read_sql_query("""

SELECT  sa.id, sa.user_id, sa.shift_id, f.id AS facility_id, sa."withdrawnInfo" -> 'initiator' as withdrawnInfo_value,
sa."status", sa."prevStatus", sa."distance", s."facility_id", "s"."description" AS "shift_description",
"s"."assigned_nurse_id", s."net_pay", "s"."unit" AS "s_unit",s."type",
"s"."qualifications" AS "s_qualifications", "s"."breakTime" AS "s_breakTime", sa."withdrawnInfo",
"f"."name" AS "facility_name","f"."short_name" AS "f_short_name", f."segmentName", f."areaName",
timezone('America/Chicago', s."createdAt") as s_create,
timezone('America/Chicago', sa."createdAt") as sa_create,
timezone('America/Chicago', u."approvedAt") as u_approve,
timezone('America/Chicago', u."createdAt") as u_create,
timezone('America/Chicago', sa."statusUpdatedAt") as sa_statusUpdate,
timezone('America/Chicago', timezone('UTC', s.start_time)) AS "Start_Time" 
FROM shifts s
INNER JOIN shift_applications sa ON s.id = sa.shift_id
INNER JOIN facilities f ON s.facility_id = f.id
INNER JOIN users u ON sa.user_id = u.id

""", con = connection)

In [141]:
df.columns

Index(['id', 'user_id', 'shift_id', 'facility_id', 'withdrawninfo_value',
       'status', 'prevStatus', 'distance', 'facility_id', 'shift_description',
       'assigned_nurse_id', 'net_pay', 's_unit', 'type', 's_qualifications',
       's_breakTime', 'withdrawnInfo', 'facility_name', 'f_short_name',
       'segmentName', 'areaName', 's_create', 'sa_create', 'u_approve',
       'u_create', 'sa_statusupdate', 'Start_Time'],
      dtype='object')

In [142]:
def get_part_of_day(hour):
    return (
        "morning" if 4 < hour <= 12
        else
        "afternoon" if 12 < hour <= 17
        else
        "evening/night" if 18 < hour <= 22
        else
        "overnight"

    )

df['Start_time_of_the_day'] = df.apply(lambda row: get_part_of_day(row['Start_Time'].hour), axis =1)

# combine the prediction file with real data

In [143]:
# read the prediction file
prediction = realdata[['id', 'Start_Time', 'prob']]
validation = prediction.merge(df, on = 'id', how = 'left')

In [144]:
from datetime import date

today = date.today()

# convert to datetime for conditonal selection
validation['Start_Time_x'] = pd.to_datetime(validation['Start_Time_x'])

# only select date part of the time
validation['Start_Time_x'] = validation.apply(lambda row: str(row['Start_Time_x'].date()), axis = 1)

In [145]:
# rename start time
validation = validation.rename(columns={"Start_Time_x": "Start_Time"})

# limit our result to what we want as validation file
validation = validation[['id','prob','Start_Time','Start_time_of_the_day','status','type','prevStatus','areaName','segmentName','facility_name','user_id']]

In [146]:
validation = validation.set_index("id")
validation.columns

Index(['prob', 'Start_Time', 'Start_time_of_the_day', 'status', 'type',
       'prevStatus', 'areaName', 'segmentName', 'facility_name', 'user_id'],
      dtype='object')

In [147]:
validation.to_csv('pred_{}_Silver_Bullet.csv'.format(time))

In [148]:
validation

Unnamed: 0_level_0,prob,Start_Time,Start_time_of_the_day,status,type,prevStatus,areaName,segmentName,facility_name,user_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
190132,0.631839,2021-05-25,morning,confirmed,LPN,selected,Northeast Ohio,Senior Living,Avenue at Medina,12134
191733,0.680283,2021-05-25,morning,cancelled,LPN,confirmed,Northeast Ohio,Senior Living,Avenue at Broadview Heights,17949
188967,0.572462,2021-05-25,morning,confirmed,STNA,selected,Northeast Ohio,Senior Living,The Weils,18857
190536,0.696872,2021-05-25,morning,confirmed,LVN,selected,Austin,Senior Living,Austin Retirement and Nursing Center,12807
190827,0.374523,2021-05-25,morning,confirmed,CNA,selected,Houston,Senior Living,Carriage Inn Katy,13273
...,...,...,...,...,...,...,...,...,...,...
191350,0.100093,2021-06-27,overnight,confirmed,LVN,selected,Austin,Senior Living,Brookdale Northwest Hills,11726
187244,0.125801,2021-06-27,evening/night,confirmed,LVN,selected,DFW,Senior Living,The Hillcrest of North Dallas,12153
187245,0.120247,2021-06-28,evening/night,confirmed,LVN,selected,DFW,Senior Living,The Hillcrest of North Dallas,12153
188097,0.147520,2021-06-29,overnight,confirmed,LPN,selected,Northeast Ohio,Senior Living,Ohio Living Rockynol,9113


In [149]:
# select only northeast ohio and stna, and make a pivot table
# create a column called count
pivot_table = validation[(validation['areaName'] == 'Northeast Ohio') & 
           (validation['type'] == 'STNA') & (validation['prob'] > 0.55)].groupby(["Start_Time",
                                    "Start_time_of_the_day"]).size().reset_index(name='count').set_index("Start_Time")


In [150]:
# check if the count is above limiter of 5
# pivot_table['above_limiter'] = pivot_table.apply(lambda row: 2 if row['count'] >= 5 else 0, axis =1)

In [151]:
pivot_table.head(20)

Unnamed: 0_level_0,Start_time_of_the_day,count
Start_Time,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-05-25,afternoon,1
2021-05-25,morning,4
2021-05-25,overnight,5
2021-05-26,afternoon,2
2021-05-26,morning,5
2021-05-26,overnight,3
2021-05-27,afternoon,2
2021-05-27,morning,4
2021-05-27,overnight,2
2021-05-28,afternoon,1


In [152]:
pivot_table.to_excel("plan2_pred_{}.xlsx".format(time))