In [22]:
import pandas as pd
import numpy as np
# Read the modelling table, then discard the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export -- TODO confirm).
raw_table = pd.read_csv('model_data.csv')
df = raw_table.drop(columns = ['Unnamed: 0'])

In [23]:
# Display the column labels of the loaded frame.
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'status',
       'S_create2SA_Create', 'S_Create2Start_Time', 'SA_Create2Start_Time',
       'U_create2now', 'U_approve2now', 'prev_CW x SA_rate', 'type_RN',
       'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no',
       'areaName_dfw', 'areaName_austin', 'areaName_san', 'net_pay', 'target',
       'sa_create', 'Start_Time', 'CW_in_a_month', 'count_prev_SA',
       'count_prev_CW', 'f_highrate', 'f_lowrate'],
      dtype='object')

# Data Preparation

In [24]:
# Standardize the continuous features only; dummy / indicator columns must NOT be standardized.
from sklearn.preprocessing import StandardScaler

# Columns carried through unchanged: identifiers, label, timestamps and dummy variables.
passthrough_cols = ['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create',
                    'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d',
                    'areaName_houston', 'areaName_no', 'areaName_dfw',
                    'areaName_austin', 'areaName_san', 'CW_in_a_month',
                    'f_highrate', 'f_lowrate']

# Continuous columns to standardize. The columns are selected explicitly, in this
# order, so the labels given to the scaled array always match the data. (Labelling
# the output of `df.drop(...)` with a hand-typed list is unsafe: `drop` keeps the
# frame's own column order, which differs from this list.)
numeric_cols = ['prev_CW/SA_rate', 'prev_CW x SA_rate', 'S_create2SA_Create',
                'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now',
                'U_approve2now', 'net_pay', 'count_prev_SA', 'count_prev_CW']

# NOTE(review): the scaler is fitted on every row, including the rows later used
# for validation and for the real prediction output -- confirm this is intended.
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
scaled_features = scaler.transform(df[numeric_cols])

# scaled features, aligned on df's index so the concat below pairs the right rows
X = pd.DataFrame(scaled_features, columns = numeric_cols, index = df.index)

# concat with dummy
df = pd.concat([df[passthrough_cols], X], axis = 1)

# drop nas
df = df.dropna()

### Slice df by the end of this week, for prediction output

In [25]:
end_of_week = '2021-4-14'
# Parse the cut-off once instead of once per row.
cutoff = pd.to_datetime(end_of_week)

# convert to datetime for conditional selection
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# sort by start time -> the rows after the cut-off end up at the tail of df
df = df.sort_values(by = 'Start_Time')

# rows starting after the cut-off are the "real" data to predict on
# (vectorized comparison; same result as a row-wise apply)
realdata = df[df['Start_Time'] > cutoff]
# number of prediction-output rows, counted BEFORE the status filter because the
# later cells use it to cut these rows off the tail of df by position
realdata_len = realdata.shape[0]
# only keep status = confirmed; .copy() makes realdata an independent frame so
# adding columns to it later does not raise SettingWithCopyWarning
realdata = realdata[realdata['status'] == 'confirmed'].copy()

### <font color="green">Validation set: the 1000 most recent records</font>

In [26]:
# Take the 1000 rows immediately before the real-data tail as the validation set.
# Explicit positional bounds are used because a negative-slice form such as
# df[-1000-k : -k] silently returns an EMPTY frame when k == 0 (the stop becomes 0).
valid_stop = len(df) - realdata_len
valid_start = max(valid_stop - 1000, 0)
validation = df.iloc[valid_start:valid_stop]

y_valid = validation['target']
# features = everything except identifiers, timestamps and the label
x_valid = validation.drop(['id','user_id', 'shift_id', 'status', 'sa_create', 'Start_Time', 'target'], axis = 1)

# class balance of the validation labels
y_valid.value_counts()

0    902
1     98
Name: target, dtype: int64

### Train/test set: main dataset minus the validation set

In [27]:
# Drop the last (1000 + realdata_len) rows by position, leaving only train/test rows.
# NOTE: df is rebound from itself, so re-running this cell trims it again.
df = df.iloc[:-(1000 + realdata_len)]

In [28]:
# Feature matrix: every column except the identifiers, timestamps and the label.
X = df.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
# A fixed random_state makes the 70/30 split -- and every metric and threshold
# derived from it below -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Logistic Regression 1  

In [29]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Weight errors on class 1 ten times heavier than errors on class 0, i.e. be
# lenient about classifying a 0 as a 1 so that more of the 1's are found.
# (Alternative: class_weight = 'balanced' automatically adjusts the weights
# inversely proportional to the class frequencies in the input data.)
weights = {0:1, 1:10}

logit = LogisticRegression(
    solver = 'lbfgs',
    max_iter = 100000,
    class_weight = weights,
)
logit.fit(X_train, y_train)

LogisticRegression(class_weight={0: 1, 1: 10}, max_iter=100000)

### Find the optimal limiters (thresholds) immediately after we create the model

In [30]:
from sklearn.metrics import roc_curve
from numpy import sqrt
from numpy import argmax

# Probability of the positive class for every test row.
yhat = logit.predict_proba(X_test)[:, 1]

# ROC operating points over the test labels.
fpr, tpr, thresholds = roc_curve(y_test, yhat)

# G-mean of each operating point: sqrt(TPR * (1 - FPR)).
true_negative_rate = 1 - fpr
gmeans = sqrt(tpr * true_negative_rate)

# The threshold with the largest G-mean becomes the lower limiter.
ix = argmax(gmeans)
lower_limiter = thresholds[ix]

print('Best Threshold=%f' % (lower_limiter))

Best Threshold=0.441296


In [31]:
# search thresholds for imbalanced classification
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
def to_labels(pos_probs, threshold):
    """Turn positive-class probabilities into 0/1 labels.

    An entry becomes 1 when its probability is greater than or equal to
    `threshold`, otherwise 0. `pos_probs` must support elementwise `>=`
    and `.astype` (e.g. a numpy array).
    """
    meets_threshold = pos_probs >= threshold
    return meets_threshold.astype('int')

# Class probabilities on the test split; column 1 is the positive outcome.
yhat = logit.predict_proba(X_test)
probs = yhat[:, 1]

# Candidate thresholds: 0.000, 0.001, ..., 0.999
thresholds = arange(0, 1, 0.001)

# F1 score of the labels produced at each candidate threshold
scores = [f1_score(y_test, to_labels(probs, cutoff)) for cutoff in thresholds]

# The threshold with the highest F1 becomes the higher limiter
ix = argmax(scores)
higher_limiter = thresholds[ix]

print('Best threshold=%.3f' % (higher_limiter))

Best threshold=0.574


### Train Test result

In [32]:
from sklearn.metrics import classification_report, confusion_matrix

# Labels from logit.predict (no custom limiter) on the test split.
y_pred = logit.predict(X_test)

test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(test_confusion)
print('\n')
print(test_report)

[[13766  4218]
 [  625   784]]


              precision    recall  f1-score   support

           0       0.96      0.77      0.85     17984
           1       0.16      0.56      0.24      1409

    accuracy                           0.75     19393
   macro avg       0.56      0.66      0.55     19393
weighted avg       0.90      0.75      0.81     19393



In [33]:
# Coefficient table for the same training data, via statsmodels
import statsmodels.api as sm

# Build the Logit model first, then fit it; the summary is the cell's output.
sm_model = sm.Logit(y_train, X_train)
smlogit = sm_model.fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.240541
         Iterations 9


0,1,2,3
Dep. Variable:,target,No. Observations:,45250.0
Model:,Logit,Df Residuals:,45230.0
Method:,MLE,Df Model:,19.0
Date:,"Wed, 14 Apr 2021",Pseudo R-squ.:,0.08089
Time:,18:18:31,Log-Likelihood:,-10884.0
converged:,True,LL-Null:,-11842.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
type_RN,-3.5155,0.171,-20.502,0.000,-3.852,-3.179
type_LVN+LPN,-1.4427,0.078,-18.460,0.000,-1.596,-1.289
segmentName_d,0.6055,0.098,6.191,0.000,0.414,0.797
areaName_houston,-2.9285,0.092,-31.868,0.000,-3.109,-2.748
areaName_no,-2.9129,0.102,-28.625,0.000,-3.112,-2.713
areaName_dfw,-2.8712,0.106,-26.997,0.000,-3.080,-2.663
areaName_austin,-2.7844,0.125,-22.206,0.000,-3.030,-2.539
areaName_san,-2.6568,0.131,-20.320,0.000,-2.913,-2.401
CW_in_a_month,0.1780,0.045,3.923,0.000,0.089,0.267


### Overfitting? No

In [34]:
# Same report as for the test split, but on the training split, so the two
# can be compared for signs of overfitting.
y_pred = logit.predict(X_train)

train_confusion = confusion_matrix(y_train, y_pred)
train_report = classification_report(y_train, y_pred)

print(train_confusion)
print('\n')
print(train_report)

[[31986  9954]
 [ 1454  1856]]


              precision    recall  f1-score   support

           0       0.96      0.76      0.85     41940
           1       0.16      0.56      0.25      3310

    accuracy                           0.75     45250
   macro avg       0.56      0.66      0.55     45250
weighted avg       0.90      0.75      0.80     45250



In [35]:
# Apply the F1-based limiter to the training split
limiter = higher_limiter

# positive-class probability for each training row
y_prob = list(logit.predict_proba(X_train)[:,1])
# label 1 when the probability reaches the limiter, else 0
y_pred = [1 if prob >= limiter else 0 for prob in y_prob]
# how many rows were labelled 1
count = sum(y_pred)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[36192  5748]
 [ 1991  1319]]


              precision    recall  f1-score   support

           0       0.95      0.86      0.90     41940
           1       0.19      0.40      0.25      3310

    accuracy                           0.83     45250
   macro avg       0.57      0.63      0.58     45250
weighted avg       0.89      0.83      0.86     45250



### <font color="green">Validation result</font>

In [48]:
# Apply the F1-based limiter to the validation set
limiter = higher_limiter

# positive-class probability for each validation row
y_prob = list(logit.predict_proba(x_valid)[:,1])
# 1 when the probability reaches the limiter, 0 otherwise (kept as a plain list)
y_pred = [int(prob >= limiter) for prob in y_prob]
# how many rows were labelled 1
count = sum(y_pred)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[671 231]
 [ 37  61]]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83       902
           1       0.21      0.62      0.31        98

    accuracy                           0.73      1000
   macro avg       0.58      0.68      0.57      1000
weighted avg       0.88      0.73      0.78      1000



In [49]:
from sklearn.metrics import recall_score

# Share of predictions labelled 1 ...
flagged = sum(1 for label in y_pred if label == 1)
label_coverage = flagged / len(y_pred)
# ... and the recall of those labels against the validation targets.
UCW_coverage = recall_score(y_valid, y_pred)

print('The limiter we adopt is %.2f' % (limiter))
print('By covering %.3f labeled as high probability of UCW, we have prepared for %.3f of real UCW'
      % (label_coverage, UCW_coverage))

The limiter we adopt is 0.57
By covering 0.292 labeled as high probability of UCW, we have prepared for 0.622 of real UCW


## Logistic Regression (trained for Houston / Northeast Ohio)

### <font color="green">Validation set</font>

In [38]:
# y_valid = validation['target']
# x_valid = validation[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 'areaName_dfw',
#                       'areaName_austin', 'areaName_san', 'net_pay',]]

### Train Test set

In [39]:
# X = df[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 
#         'areaName_dfw','areaName_austin', 'areaName_san', 'net_pay',]]
# Y = df['target']
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

# Fit real data in this model

In [40]:
# Model input for the real data: remove the identifier, label and timestamp columns.
non_feature_cols = ['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time']
real_X = realdata.drop(columns = non_feature_cols)

In [41]:
# Display the column labels of X (the feature frame the train/test split was taken from).
X.columns

Index(['type_RN', 'type_LVN+LPN', 'segmentName_d', 'areaName_houston',
       'areaName_no', 'areaName_dfw', 'areaName_austin', 'areaName_san',
       'CW_in_a_month', 'f_highrate', 'f_lowrate', 'prev_CW/SA_rate',
       'prev_CW x SA_rate', 'S_create2SA_Create', 'S_Create2Start_Time',
       'SA_Create2Start_Time', 'U_create2now', 'U_approve2now', 'net_pay',
       'count_prev_SA', 'count_prev_CW'],
      dtype='object')

In [42]:
# Display the column labels of real_X, for comparison with X.columns.
real_X.columns

Index(['type_RN', 'type_LVN+LPN', 'segmentName_d', 'areaName_houston',
       'areaName_no', 'areaName_dfw', 'areaName_austin', 'areaName_san',
       'CW_in_a_month', 'f_highrate', 'f_lowrate', 'prev_CW/SA_rate',
       'prev_CW x SA_rate', 'S_create2SA_Create', 'S_Create2Start_Time',
       'SA_Create2Start_Time', 'U_create2now', 'U_approve2now', 'net_pay',
       'count_prev_SA', 'count_prev_CW'],
      dtype='object')

In [43]:
# Attach the predicted positive-class probability to the real data.
# `assign` returns a new frame, so nothing is written into a filtered slice of
# df (which would trigger pandas' SettingWithCopyWarning). The values are
# assigned by position, one per row of real_X.
realdata = realdata.assign(prob = logit.predict_proba(real_X)[:, 1])

In [44]:
# record when this prediction is run
from datetime import date

# Read the clock once: three separate date.today() calls could straddle
# midnight and mix parts of two different dates.
run_date = date.today()
# Non-zero-padded 'YYYY-M-D', same format as before, so file names stay compatible.
time = '{}-{}-{}'.format(run_date.year, run_date.month, run_date.day)

# Write id, start time and predicted probability to a date-stamped CSV.
realdata[['id', 'Start_Time', 'prob']].to_csv('pred_{}_Golden_Bullet.csv'.format(time), index = False)

In [45]:
# Display the start times of the predicted rows, to make sure the prediction doesn't include today.
realdata['Start_Time']

66040   2021-04-14 05:00:00
66152   2021-04-14 05:00:00
63565   2021-04-14 05:00:00
66503   2021-04-14 05:00:00
52308   2021-04-14 05:00:00
                ...        
66809   2021-05-22 06:30:00
42945   2021-05-23 06:30:00
42946   2021-05-24 06:30:00
66810   2021-05-25 06:30:00
42947   2021-06-01 06:30:00
Name: Start_Time, Length: 812, dtype: datetime64[ns]