In [1876]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0'])

In [1877]:
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'status',
       'S_create2SA_Create', 'S_Create2Start_Time', 'SA_Create2Start_Time',
       'U_create2now', 'U_approve2now', 'prev_CW x SA_rate', 'type_RN',
       'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no',
       'areaName_dfw', 'areaName_austin', 'areaName_san', 'net_pay', 'target',
       'sa_create', 'Start_Time', 'CW_in_a_month', 'count_prev_SA',
       'count_prev_CW'],
      dtype='object')

# Data Prepration

In [1878]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time', 'type_RN', 
                    'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san', 'CW_in_a_month'], axis=1))
scaled_features = scaler.transform(df.drop(['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create',
                                            'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                                            'areaName_houston', 'areaName_no', 'areaName_dfw', 
                                            'areaName_austin', 'areaName_san', 'CW_in_a_month'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'prev_CW x SA_rate', 'S_create2SA_Create', 
                                             'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now', 
                                             'U_approve2now', 'net_pay', 'count_prev_SA', 'count_prev_CW'])
# concat with dummy
df = pd.concat([df[['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create',
                    'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                    'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san', 'CW_in_a_month']], X], axis = 1)

# drop nas
df.dropna(inplace = True)

### Slice df by the end of this week, for predcition output

In [1879]:
end_of_week = '2021-4-4'

# convert to datetime for conditonal selection
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# sort by start time -> for slicing
df = df.sort_values(by = 'Start_Time') 
# record as realdata
realdata = df[df['Start_Time'].apply(lambda x: x >= pd.to_datetime(end_of_week))]
# record predction output rows, don't include it in tran test validation
realdata_len = realdata.shape[0]
# only keep status = confirmed
realdata = realdata[realdata['status'] == 'confirmed']

###  <font color = green> Validation set: 1000 recently records

In [1880]:
# slice, dont include realdata
validation = df[-1000-realdata_len : -realdata_len]

y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'status', 'sa_create', 'Start_Time', 'target'], axis = 1)

y_valid.value_counts()

0    895
1    105
Name: target, dtype: int64

### Train test: main dataset - validation set

In [1881]:
df = df[:-1000-realdata_len] # slice 

In [1882]:
# # make a dataset that num of tar = num of non tar, use it for train test
# import random
# df_tar = df[df['target']==1].reset_index(drop = True)
# df_nontar = df[df['target']==0].reset_index(drop = True)

# number_of_tar = df_tar.shape[0]
# random_indices = random.sample(range(len(df_nontar)), int(number_of_tar))
# df_nontar = df_nontar[df_nontar.index.isin(random_indices)]

# # concat
# df = pd.concat([df_tar, df_nontar]).reset_index(drop = True)

In [1883]:
X = df.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [1884]:
df['target'].value_counts()

0    58245
1     4748
Name: target, dtype: int64

# Logistic Regression 1  

In [1885]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# assign less punlishment for classifying 0 as 1 -> find more 1's
# weights = {0:1, 1:10}
# class_weight = 'balanced': automatically adjust weights inversely proportional to class frequencies in the input data
logit = LogisticRegression(solver = 'lbfgs', class_weight = 'balanced')
logit.fit(X_train,y_train)

LogisticRegression(class_weight='balanced')

### Train Test result

In [1886]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = logit.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[11298  6186]
 [  505   909]]


              precision    recall  f1-score   support

           0       0.96      0.65      0.77     17484
           1       0.13      0.64      0.21      1414

    accuracy                           0.65     18898
   macro avg       0.54      0.64      0.49     18898
weighted avg       0.90      0.65      0.73     18898



In [1887]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [1888]:
# logit summary
import statsmodels.api as sm
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.252635
         Iterations 7


0,1,2,3
Dep. Variable:,target,No. Observations:,44095.0
Model:,Logit,Df Residuals:,44077.0
Method:,MLE,Df Model:,17.0
Date:,"Sun, 04 Apr 2021",Pseudo R-squ.:,0.05703
Time:,20:36:51,Log-Likelihood:,-11140.0
converged:,True,LL-Null:,-11814.0
Covariance Type:,nonrobust,LLR p-value:,3.0689999999999996e-276

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
type_RN,-2.9122,0.163,-17.825,0.000,-3.232,-2.592
type_LVN+LPN,-1.2572,0.079,-15.956,0.000,-1.412,-1.103
segmentName_d,0.7219,0.100,7.250,0.000,0.527,0.917
areaName_houston,-2.9681,0.094,-31.489,0.000,-3.153,-2.783
areaName_no,-2.9771,0.103,-28.853,0.000,-3.179,-2.775
areaName_dfw,-2.9529,0.109,-27.204,0.000,-3.166,-2.740
areaName_austin,-2.8356,0.127,-22.369,0.000,-3.084,-2.587
areaName_san,-2.7099,0.136,-19.913,0.000,-2.977,-2.443
CW_in_a_month,0.1521,0.044,3.424,0.001,0.065,0.239


### Overfitting? No

In [1889]:
y_pred = logit.predict(X_train)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[26384 14377]
 [ 1239  2095]]


              precision    recall  f1-score   support

           0       0.96      0.65      0.77     40761
           1       0.13      0.63      0.21      3334

    accuracy                           0.65     44095
   macro avg       0.54      0.64      0.49     44095
weighted avg       0.89      0.65      0.73     44095



### <font color = green> Validation result

In [1891]:
# test threshold
limiter = .8

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[881  14]
 [ 85  20]]


              precision    recall  f1-score   support

           0       0.91      0.98      0.95       895
           1       0.59      0.19      0.29       105

    accuracy                           0.90      1000
   macro avg       0.75      0.59      0.62      1000
weighted avg       0.88      0.90      0.88      1000



# Logistic Regression 2 Not good

###  <font color = green> Validation set

In [1808]:
# y_valid = validation['target']
# x_valid = validation[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 'areaName_dfw',
#                       'areaName_austin', 'areaName_san', 'net_pay',]]

### Train Test set

In [1809]:
# X = df[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 
#         'areaName_dfw','areaName_austin', 'areaName_san', 'net_pay',]]
# Y = df['target']
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

# Fit real data in this model

In [1894]:
# set input
real_X = realdata.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)

In [1895]:
# concat predicted prob with data
realdata['prob'] = list(logit.predict_proba(real_X)[:,1])

In [1896]:
# record when this prediction is ran
from datetime import date
time = str(date.today().year) + '-' + str(date.today().month) + '-' + str(date.today().day)

realdata[['id', 'Start_Time', 'prob']].to_csv('pred_{}.csv'.format(time), index = False)

# Check real data in this model

In [1787]:
thisweek = pd.read_csv('download.csv')
thisweek = thisweek[thisweek.apply(lambda row: (row['prevStatus'] == 'confirmed') or (row['status'] == 'confirmed'), axis = 1)]

In [1363]:
realdata_test = realdata[['id', 'prob']]
thisweek_validation = thisweek.merge(realdata_test, on = 'id', how = 'left')
thisweek_validation.dropna(subset=['prob'],inplace = True)

In [1364]:
thisweek_validation[['prob']].isna().sum()

prob    0
dtype: int64

In [1365]:
thisweek_validation

def CW_by_nurse(row):
    if row['status']=='withdrawn' and row['prevStatus'] == 'confirmed':
        if row['withdrawnInfo_value'] == 'nurse':
            if row['CW_Time2Start_Time'] < 0 and row['CW_Time2Start_Time'] >= -24:
                return 1
            else:
                return 0
        else:
            return 0
    else:
        return 0
    
thisweek_validation['target'] = thisweek_validation.apply (lambda row: CW_by_nurse(row), axis=1)

In [1366]:
thisweek_validation = thisweek_validation[['id', 'prob', 'target', 'net_pay', 'type', 'segmentName', 'areaName']]

# thisweek_validation.to_csv('week322_check.csv', index = False)

In [1367]:
limiter = .5


y_prob = thisweek_validation['prob']
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count += 1
    else:
        y_pred.append(0)
        
print(confusion_matrix(thisweek_validation['target'], y_pred))
print('\n')
print(classification_report(thisweek_validation['target'], y_pred))

[[328 762]
 [  3  17]]


              precision    recall  f1-score   support

           0       0.99      0.30      0.46      1090
           1       0.02      0.85      0.04        20

    accuracy                           0.31      1110
   macro avg       0.51      0.58      0.25      1110
weighted avg       0.97      0.31      0.45      1110



In [1296]:
thisweek_validation.groupby("target")['prob'].mean()

target
0    0.547651
1    0.606011
Name: prob, dtype: float64