In [472]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0'])


# Data Prepration

In [473]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['target', 'segmentName_d', 'type_d', 'createdAt', 'start_time'], axis=1))
scaled_features = scaler.transform(df.drop(['target', 'segmentName_d', 'type_d', 'createdAt', 'start_time'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'S_create2SA_Create', 'S_Create2Start_Time', 
                                             'SA_Create2Start_Time', 'U_create2now', 'U_approve2now', 'net_pay'])
# concat with dummy
df = pd.concat([X, df[['segmentName_d', 'type_d', 'target', 'createdAt', 'start_time']]], axis = 1)

# drop nas
df.dropna(inplace = True)

### set future data point as realdata

### note !!!: real data might overlap with train test validation data

In [474]:
df['start_time'] = pd.to_datetime(df['start_time'])
realdata = df[df['start_time'].isin(pd.date_range('2021-3-22', '2021-3-29'))]

###  <font color = green> Validation set: 1000 recently records

In [475]:
# slice
validation = df[-1000:]

y_valid = validation['target']
x_valid = validation.drop(['createdAt', 'start_time', 'target'], axis = 1)

y_valid.value_counts()

0    984
1     16
Name: target, dtype: int64

### Train test: main dataset - validation set

In [476]:
df = df[:-1000] # slice 

In [477]:
# make a dataset that num of tar = num of non tar, use it for train test
import random
df_tar = df[df['target']==1].reset_index(drop = True)
df_nontar = df[df['target']==0].reset_index(drop = True)

number_of_tar = df_tar.shape[0]
random_indices = random.sample(range(len(df_nontar)), int(number_of_tar))
df_nontar = df_nontar[df_nontar.index.isin(random_indices)]

# concat
df = pd.concat([df_tar, df_nontar]).reset_index(drop = True)

In [478]:
X = df.drop(['target', 'createdAt', 'start_time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Logistic Regression 1  

In [479]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

logit = LogisticRegressionCV(scoring='roc_auc',max_iter=100000)
logit.fit(X_train,y_train)

LogisticRegressionCV(max_iter=100000, scoring='roc_auc')

### Train Test result

In [480]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = logit.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[869 410]
 [527 808]]


              precision    recall  f1-score   support

           0       0.62      0.68      0.65      1279
           1       0.66      0.61      0.63      1335

    accuracy                           0.64      2614
   macro avg       0.64      0.64      0.64      2614
weighted avg       0.64      0.64      0.64      2614



In [481]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [482]:
# logit summary
import statsmodels.api as sm
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.649908
         Iterations 5


0,1,2,3
Dep. Variable:,target,No. Observations:,6098.0
Model:,Logit,Df Residuals:,6089.0
Method:,MLE,Df Model:,8.0
Date:,"Sun, 21 Mar 2021",Pseudo R-squ.:,0.06232
Time:,15:55:43,Log-Likelihood:,-3963.1
converged:,True,LL-Null:,-4226.6
Covariance Type:,nonrobust,LLR p-value:,1.2280000000000001e-108

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
prev_CW/SA_rate,0.2595,0.030,8.773,0.000,0.202,0.317
S_create2SA_Create,1.2315,1.162,1.059,0.289,-1.047,3.510
S_Create2Start_Time,-1.9168,1.761,-1.088,0.276,-5.369,1.535
SA_Create2Start_Time,1.1024,1.273,0.866,0.387,-1.393,3.598
U_create2now,0.0145,0.078,0.185,0.853,-0.138,0.167
U_approve2now,-0.0132,0.078,-0.169,0.865,-0.166,0.140
net_pay,1.1402,0.073,15.659,0.000,0.997,1.283
segmentName_d,0.4228,0.040,10.580,0.000,0.345,0.501
type_d,-1.8379,0.115,-15.959,0.000,-2.064,-1.612


### Overfitting? No

In [483]:
y_pred = logit.predict(X_train)

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[2030 1047]
 [1190 1831]]


              precision    recall  f1-score   support

           0       0.63      0.66      0.64      3077
           1       0.64      0.61      0.62      3021

    accuracy                           0.63      6098
   macro avg       0.63      0.63      0.63      6098
weighted avg       0.63      0.63      0.63      6098



### <font color = green> Validation result

In [484]:
# test threshold
limiter = .6

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count+=1
    else:
        y_pred.append(0)

print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[790 194]
 [  1  15]]


              precision    recall  f1-score   support

           0       1.00      0.80      0.89       984
           1       0.07      0.94      0.13        16

    accuracy                           0.81      1000
   macro avg       0.54      0.87      0.51      1000
weighted avg       0.98      0.81      0.88      1000



In [485]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_valid, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

# Logistic Regression 2, pick only significant vars

###  <font color = green> Validation set

In [486]:
y_valid = validation['target']
x_valid = validation[['prev_CW/SA_rate', 'type_d', 'segmentName_d', 'net_pay']]

### Train Test set

In [487]:
X = df[['prev_CW/SA_rate', 'type_d', 'segmentName_d', 'net_pay']]
Y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

### Fit

In [488]:
logit = LogisticRegressionCV(scoring='roc_auc')
logit.fit(X_train,y_train)

# logit summary
smlogit = sm.Logit(y_train,X_train).fit()
smlogit.summary()

Optimization terminated successfully.
         Current function value: 0.655628
         Iterations 5


0,1,2,3
Dep. Variable:,target,No. Observations:,6098.0
Model:,Logit,Df Residuals:,6094.0
Method:,MLE,Df Model:,3.0
Date:,"Sun, 21 Mar 2021",Pseudo R-squ.:,0.0541
Time:,15:55:44,Log-Likelihood:,-3998.0
converged:,True,LL-Null:,-4226.7
Covariance Type:,nonrobust,LLR p-value:,8.256e-99

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
prev_CW/SA_rate,0.2041,0.028,7.228,0.000,0.149,0.259
type_d,-1.8505,0.110,-16.890,0.000,-2.065,-1.636
segmentName_d,0.4914,0.038,12.788,0.000,0.416,0.567
net_pay,1.1639,0.068,17.087,0.000,1.030,1.297


In [489]:
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_test, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

In [490]:
y_pred= logit.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

[[853 472]
 [501 788]]


              precision    recall  f1-score   support

           0       0.63      0.64      0.64      1325
           1       0.63      0.61      0.62      1289

    accuracy                           0.63      2614
   macro avg       0.63      0.63      0.63      2614
weighted avg       0.63      0.63      0.63      2614



### Overfitting? No

In [491]:
y_pred= logit.predict(X_train)
from sklearn.metrics import classification_report,confusion_matrix

print(confusion_matrix(y_train, y_pred))
print('\n')
print(classification_report(y_train, y_pred))

[[1899 1132]
 [1128 1939]]


              precision    recall  f1-score   support

           0       0.63      0.63      0.63      3031
           1       0.63      0.63      0.63      3067

    accuracy                           0.63      6098
   macro avg       0.63      0.63      0.63      6098
weighted avg       0.63      0.63      0.63      6098



### <font color = green> Validation set result

In [492]:
# use same threshold
limiter = .8

y_prob = list(logit.predict_proba(x_valid)[:,1])
y_pred = []
count =0
for prob in y_prob:
    if prob >= limiter:
        y_pred.append(1)
        count += 1
    else:
        y_pred.append(0)
        
print(confusion_matrix(y_valid, y_pred))
print('\n')
print(classification_report(y_valid, y_pred))

[[976   8]
 [ 16   0]]


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       984
           1       0.00      0.00      0.00        16

    accuracy                           0.98      1000
   macro avg       0.49      0.50      0.49      1000
weighted avg       0.97      0.98      0.97      1000



In [493]:
# from cf_matrix import make_confusion_matrix
# labels = ['True Neg','False Pos','False Neg','True Pos']
# categories = ['Zero', 'One']
# make_confusion_matrix(confusion_matrix(y_valid, y_pred), 
#                       group_names=labels,
#                       categories=categories, 
#                       cmap='Blues')

# Fit real data in this model

In [509]:
# set input
real_X = realdata[['prev_CW/SA_rate', 'type_d', 'segmentName_d', 'net_pay']]

In [510]:
# concat predicted prob with data
realdata['prob'] = list(logit.predict_proba(real_X)[:,1])

In [513]:
realdata.reset_index().to_csv('week322_pred.csv')