In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0'])

In [2]:
df.columns

Index(['id', 'user_id', 'shift_id', 'prev_CW/SA_rate', 'status',
       'S_create2SA_Create', 'S_Create2Start_Time', 'SA_Create2Start_Time',
       'U_create2now', 'U_approve2now', 'prev_CW x SA_rate', 'type_RN',
       'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no',
       'areaName_dfw', 'areaName_austin', 'areaName_san', 'net_pay', 'target',
       'sa_create', 'Start_Time', 'CW_in_a_month', 'count_prev_SA',
       'count_prev_CW', 'f_highrate', 'f_lowrate'],
      dtype='object')

# Data Prepration

In [3]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time', 'type_RN', 
                    'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san', 'CW_in_a_month','f_highrate','f_lowrate'], axis=1))

scaled_features = scaler.transform(df.drop(['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create',
                                            'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                                            'areaName_houston', 'areaName_no', 'areaName_dfw', 
                                            'areaName_austin', 'areaName_san', 'CW_in_a_month',
                                           'f_highrate','f_lowrate'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'prev_CW x SA_rate', 'S_create2SA_Create', 
                                             'S_Create2Start_Time', 'SA_Create2Start_Time', 'U_create2now', 
                                             'U_approve2now', 'net_pay', 'count_prev_SA', 'count_prev_CW'])
# concat with dummy
df = pd.concat([df[['id', 'user_id', 'shift_id', 'status', 'target', 'sa_create',
                    'Start_Time', 'type_RN', 'type_LVN+LPN', 'segmentName_d', 
                    'areaName_houston', 'areaName_no', 'areaName_dfw', 
                    'areaName_austin', 'areaName_san', 'CW_in_a_month',
                   'f_highrate','f_lowrate']], X], axis = 1)

# drop nas
df.dropna(inplace = True)

### Slice df by the end of this week, for predcition output

In [4]:
end_of_week = '2021-4-12'

# convert to datetime for conditonal selection
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# sort by start time -> for slicing
df = df.sort_values(by = 'Start_Time') 
# record as realdata
realdata = df[df['Start_Time'].apply(lambda x: x > pd.to_datetime(end_of_week))]
# record predction output rows, don't include it in tran test validation
realdata_len = realdata.shape[0]
# only keep status = confirmed
realdata = realdata[realdata['status'] == 'confirmed']

###  <font color = green> Validation set: 1000 recently records

In [5]:
# slice, dont include realdata
validation = df[-1000-realdata_len : -realdata_len]

y_valid = validation['target']
x_valid = validation.drop(['id','user_id', 'shift_id', 'status', 'sa_create', 'Start_Time', 'target'], axis = 1)

y_valid.value_counts()

0    906
1     94
Name: target, dtype: int64

### Train test: main dataset - validation set

In [6]:
df = df[:-1000-realdata_len] # slice 

In [7]:
X = df.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [8]:
df['target'].value_counts()

0    59611
1     4695
Name: target, dtype: int64

# Logistic Regression 1  

In [9]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# assign less punlishment for classifying 0 as 1 -> find more 1's
# weights = {0:1, 1:10}
# class_weight = 'balanced': automatically adjust weights inversely proportional to class frequencies in the input data

logit = LogisticRegression(solver = 'lbfgs', max_iter=100000, class_weight = 'balanced')
logit.fit(X_train,y_train)

LogisticRegression(class_weight='balanced', max_iter=100000)

### Train Test result

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = logit.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[11745  6147]
 [  445   955]]


              precision    recall  f1-score   support

           0       0.96      0.66      0.78     17892
           1       0.13      0.68      0.22      1400

    accuracy                           0.66     19292
   macro avg       0.55      0.67      0.50     19292
weighted avg       0.90      0.66      0.74     19292



In [11]:
# logit summary
import statsmodels.api as sm
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.241521
         Iterations 12


0,1,2,3
Dep. Variable:,target,No. Observations:,45014.0
Model:,Logit,Df Residuals:,44994.0
Method:,MLE,Df Model:,19.0
Date:,"Sun, 11 Apr 2021",Pseudo R-squ.:,0.07759
Time:,22:26:12,Log-Likelihood:,-10872.0
converged:,True,LL-Null:,-11786.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
type_RN,-3.2658,0.168,-19.437,0.000,-3.595,-2.936
type_LVN+LPN,-1.3014,0.077,-16.976,0.000,-1.452,-1.151
segmentName_d,0.5461,0.096,5.704,0.000,0.358,0.734
areaName_houston,-2.9046,0.090,-32.216,0.000,-3.081,-2.728
areaName_no,-2.9225,0.100,-29.179,0.000,-3.119,-2.726
areaName_dfw,-2.8424,0.105,-27.126,0.000,-3.048,-2.637
areaName_austin,-2.6749,0.123,-21.715,0.000,-2.916,-2.433
areaName_san,-2.6203,0.130,-20.183,0.000,-2.875,-2.366
CW_in_a_month,0.1857,0.045,4.104,0.000,0.097,0.274


### Overfitting? No

In [12]:
y_pred = logit.predict(X_train)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[27283 14436]
 [ 1122  2173]]


              precision    recall  f1-score   support

           0       0.96      0.65      0.78     41719
           1       0.13      0.66      0.22      3295

    accuracy                           0.65     45014
   macro avg       0.55      0.66      0.50     45014
weighted avg       0.90      0.65      0.74     45014



In [13]:
# test threshold
limiter = .65

y_prob = list(logit.predict_proba(X_train)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[36891  4828]
 [ 2151  1144]]


              precision    recall  f1-score   support

           0       0.94      0.88      0.91     41719
           1       0.19      0.35      0.25      3295

    accuracy                           0.84     45014
   macro avg       0.57      0.62      0.58     45014
weighted avg       0.89      0.84      0.86     45014



### <font color = green> Validation result

In [14]:
# test threshold
limiter = .65

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[699 207]
 [ 44  50]]


              precision    recall  f1-score   support

           0       0.94      0.77      0.85       906
           1       0.19      0.53      0.28        94

    accuracy                           0.75      1000
   macro avg       0.57      0.65      0.57      1000
weighted avg       0.87      0.75      0.79      1000



# Logistic Regression 2 Not good

###  <font color = green> Validation set

In [15]:
# y_valid = validation['target']
# x_valid = validation[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 'areaName_dfw',
#                       'areaName_austin', 'areaName_san', 'net_pay',]]

### Train Test set

In [16]:
# X = df[['type_RN', 'type_LVN+LPN', 'areaName_houston', 'areaName_no', 
#         'areaName_dfw','areaName_austin', 'areaName_san', 'net_pay',]]
# Y = df['target']
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

# Fit real data in this model

In [17]:
# set input
real_X = realdata.drop(['id','user_id', 'shift_id', 'status', 'target', 'sa_create', 'Start_Time'], axis = 1)

In [18]:
# concat predicted prob with data
realdata['prob'] = list(logit.predict_proba(real_X)[:,1])

In [19]:
# record when this prediction is ran
from datetime import date
time = str(date.today().year) + '-' + str(date.today().month) + '-' + str(date.today().day)

realdata[['id', 'Start_Time', 'prob']].to_csv('pred_{}_Golden_Bullet.csv'.format(time), index = False)

In [20]:
# to make the prediction doesn't include today
realdata['Start_Time']

41768   2021-04-12 05:00:00
46540   2021-04-12 05:00:00
55352   2021-04-12 05:00:00
66259   2021-04-12 05:00:00
65792   2021-04-12 05:00:00
                ...        
66399   2021-05-08 06:30:00
64668   2021-05-08 18:00:00
66400   2021-05-11 06:30:00
66401   2021-05-15 06:30:00
66402   2021-05-22 06:30:00
Name: Start_Time, Length: 770, dtype: datetime64[ns]