In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0','f_highrate','f_lowrate'])

In [2]:
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'status',
       'S_create2SA_Create', 'S_Create2Start_Time', 'SA_Create2Start_Time',
       'U_create2now', 'U_approve2now', 'prev_CW x SA_rate', 'type_RN',
       'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no',
       'areaName_dfw', 'areaName_austin', 'areaName_san', 'net_pay', 'target',
       'sa_create', 'Start_Time', 'CW_in_a_month', 'count_prev_SA',
       'count_prev_CW'],
      dtype='object')

# Data Prepration

In [3]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time', 'type_RN', 
                    'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san', 'CW_in_a_month'], axis=1))

scaled_features = scaler.transform(df.drop(['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create',
                                            'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                                            'areaName_houston', 'areaName_no', 'areaName_dfw', 
                                            'areaName_austin', 'areaName_san', 'CW_in_a_month'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'prev_CW x SA_rate', 'S_create2SA_Create', 
                                             'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now', 
                                             'U_approve2now', 'net_pay', 'count_prev_SA', 'count_prev_CW'])
# concat with dummy
df = pd.concat([df[['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create',
                    'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                    'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san', 'CW_in_a_month']], X], axis = 1)

# drop nas
df.dropna(inplace = True)

### Slice df by the end of this week, for predcition output

In [4]:
end_of_week = '2021-4-15'

# convert to datetime for conditonal selection
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# sort by start time -> for slicing
df = df.sort_values(by = 'Start_Time') 
# record as realdata
realdata = df[df['Start_Time'].apply(lambda x: x > pd.to_datetime(end_of_week))]
# record predction output rows, don't include it in tran test validation
realdata_len = realdata.shape[0]
# only keep status = confirmed
realdata = realdata[realdata['status'] == 'confirmed']

###  <font color = green> Validation set: 1000 recently records

In [5]:
# slice, dont include realdata
validation = df[-1000-realdata_len : -realdata_len]

y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'status', 'sa_create', 'Start_Time', 'target'], axis = 1)

y_valid.value_counts()

0    917
1     83
Name: target, dtype: int64

### Train test: main dataset - validation set

In [6]:
df = df[:-1000-realdata_len] # slice 

In [7]:
# # make a dataset that num of tar = num of non tar, use it for train test
# import random
# df_tar = df[df['target']==1].reset_index(drop = True)
# df_nontar = df[df['target']==0].reset_index(drop = True)

# number_of_tar = df_tar.shape[0]
# random_indices = random.sample(range(len(df_nontar)), int(number_of_tar))
# df_nontar = df_nontar[df_nontar.index.isin(random_indices)]

# # concat
# df = pd.concat([df_tar, df_nontar]).reset_index(drop = True)

In [8]:
X = df.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [9]:
df['target'].value_counts()

0    60067
1     4738
Name: target, dtype: int64

# Logistic Regression 1  

In [10]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# assign less punlishment for classifying 0 as 1 -> find more 1's
# weights = {0:1, 1:10}
# class_weight = 'balanced': automatically adjust weights inversely proportional to class frequencies in the input data
logit = LogisticRegression(solver = 'lbfgs', max_iter=100000, class_weight = 'balanced')
logit.fit(X_train,y_train)

LogisticRegression(class_weight='balanced', max_iter=100000)

### Train Test result

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = logit.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[11960  6070]
 [  484   928]]


              precision    recall  f1-score   support

           0       0.96      0.66      0.78     18030
           1       0.13      0.66      0.22      1412

    accuracy                           0.66     19442
   macro avg       0.55      0.66      0.50     19442
weighted avg       0.90      0.66      0.74     19442



In [12]:
from sklearn.metrics import roc_curve
from numpy import sqrt
from numpy import argmax

# predict probabilities
yhat = logit.predict_proba(X_test)
# keep probabilities for the positive outcome only
yhat = yhat[:, 1]

# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_test,yhat)

# calculate the g-mean for each threshold
gmeans = sqrt(tpr * (1-fpr))

# locate the index of the largest g-mean
ix = argmax(gmeans)

lower_limiter = thresholds[ix]
print('Best Threshold=%f' % (lower_limiter))

Best Threshold=0.498499


In [13]:
# search thresholds for imbalanced classification
from numpy import arange
from numpy import argmax
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
    return (pos_probs >= threshold).astype('int')

# predict probabilities
yhat = logit.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs = yhat[:, 1]
# define thresholds
thresholds = arange(0, 1, 0.001)
# evaluate each threshold
scores = [f1_score(y_test, to_labels(probs, t)) for t in thresholds]
# get best threshold
ix = argmax(scores)

higher_limiter = thresholds[ix]

print('Best threshold=%.3f' % (higher_limiter))

Best threshold=0.661


In [14]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [15]:
# logit summary
import statsmodels.api as sm
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.241496
         Iterations 8


0,1,2,3
Dep. Variable:,target,No. Observations:,45363.0
Model:,Logit,Df Residuals:,45345.0
Method:,MLE,Df Model:,17.0
Date:,"Wed, 14 Apr 2021",Pseudo R-squ.:,0.07876
Time:,17:32:50,Log-Likelihood:,-10955.0
converged:,True,LL-Null:,-11892.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
type_RN,-3.5668,0.173,-20.571,0.000,-3.907,-3.227
type_LVN+LPN,-1.4144,0.078,-18.234,0.000,-1.566,-1.262
segmentName_d,0.6272,0.098,6.396,0.000,0.435,0.819
areaName_houston,-2.9538,0.092,-32.018,0.000,-3.135,-2.773
areaName_no,-2.9029,0.102,-28.503,0.000,-3.103,-2.703
areaName_dfw,-2.9773,0.107,-27.834,0.000,-3.187,-2.768
areaName_austin,-2.8073,0.124,-22.591,0.000,-3.051,-2.564
areaName_san,-2.6938,0.129,-20.834,0.000,-2.947,-2.440
CW_in_a_month,0.2395,0.045,5.302,0.000,0.151,0.328


### Overfitting? No

In [16]:
y_pred = logit.predict(X_train)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[27818 14219]
 [ 1120  2206]]


              precision    recall  f1-score   support

           0       0.96      0.66      0.78     42037
           1       0.13      0.66      0.22      3326

    accuracy                           0.66     45363
   macro avg       0.55      0.66      0.50     45363
weighted avg       0.90      0.66      0.74     45363



In [17]:
# test threshold
limiter = higher_limiter

y_prob = list(logit.predict_proba(X_train)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[37489  4548]
 [ 2230  1096]]


              precision    recall  f1-score   support

           0       0.94      0.89      0.92     42037
           1       0.19      0.33      0.24      3326

    accuracy                           0.85     45363
   macro avg       0.57      0.61      0.58     45363
weighted avg       0.89      0.85      0.87     45363



### <font color = green> Validation result

In [46]:
# test threshold
limiter = higher_limiter

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[720 197]
 [ 39  44]]


              precision    recall  f1-score   support

           0       0.95      0.79      0.86       917
           1       0.18      0.53      0.27        83

    accuracy                           0.76      1000
   macro avg       0.57      0.66      0.57      1000
weighted avg       0.89      0.76      0.81      1000



In [48]:
from sklearn.metrics import recall_score

label_coverage = y_pred.count(1)/len(y_pred)
UCW_coverage = recall_score(y_valid, y_pred)

print('The limiter we adopt is %.3f' % (limiter))
print('By covering %.3f labeled as high probability of UCW, we have prepared for %.3f of real UCW' 
      % (label_coverage,UCW_coverage))

The limiter we adopt is 0.661
By covering 0.241 labeled as high probability of UCW, we have prepared for 0.530 of real UCW


# Fit real data in this model

In [19]:
# set input
real_X = realdata.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)

In [20]:
# concat predicted prob with data
realdata['prob'] = list(logit.predict_proba(real_X)[:,1])

In [21]:
# record when this prediction is ran
from datetime import date
time = str(date.today().year) + '-' + str(date.today().month) + '-' + str(date.today().day)

realdata[['id', 'Start_Time', 'prob']].to_csv('pred_{}_Silver_Bullet.csv'.format(time), index = False)

In [22]:
# to make the prediction doesn't include today
realdata['Start_Time']

66153   2021-04-15 05:00:00
55548   2021-04-15 05:00:00
66041   2021-04-15 05:00:00
22690   2021-04-15 05:45:00
66873   2021-04-15 06:00:00
                ...        
66809   2021-05-22 06:30:00
42945   2021-05-23 06:30:00
42946   2021-05-24 06:30:00
66810   2021-05-25 06:30:00
42947   2021-06-01 06:30:00
Name: Start_Time, Length: 688, dtype: datetime64[ns]