In [176]:
import pandas as pd
import numpy as np
# Load the modelling table, then discard the 'Unnamed: 0' column.
df = pd.read_csv('model_data.csv')
df = df.drop(columns=['Unnamed: 0'])

# Data Preparation

In [177]:
# Standardize only the remaining (non-listed) feature columns. Identifier,
# timestamp, target and dummy columns are passed through unscaled.
from sklearn.preprocessing import StandardScaler

# Single source of truth for the columns that must NOT be scaled.
passthrough_cols = ['id', 'user_id', 'shift_id', 'target', 'createdAt',
                    'start_time', 'type_RN', 'type_LVN+LPN', 'segmentName_d',
                    'areaName_houston', 'areaName_no', 'areaName_dfw',
                    'areaName_austin', 'areaName_san']
features_to_scale = df.drop(columns=passthrough_cols)

# NOTE(review): the scaler is fitted on every row, including the rows that are
# later held out as the validation set, so validation statistics leak into the
# scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_to_scale)

# Label the scaled array with the columns it was actually computed from and
# reuse df's index. A hand-typed name list would silently mislabel features if
# the CSV column order ever changed, and a fresh RangeIndex would misalign the
# concat below whenever df's index is not 0..n-1.
X = pd.DataFrame(scaled_features,
                 columns=features_to_scale.columns,
                 index=df.index)

# Passthrough columns first, scaled features after (same layout as before).
df = pd.concat([df[passthrough_cols], X], axis=1)

# Drop rows with any missing value.
df = df.dropna()

### Set aside future data points as `realdata`

### Note: `realdata` might overlap with the train, test and validation data

In [178]:
# Parse start_time and keep the rows whose start date falls on one of the
# calendar days 2021-03-22 .. 2021-03-29 (inclusive).
df['start_time'] = pd.to_datetime(df['start_time'])
realdata_days = pd.date_range('2021-3-22', '2021-3-29')
# date_range yields one midnight timestamp per day, so the time of day must be
# stripped before the membership test; otherwise only rows that start at
# exactly 00:00:00 would match.
# NOTE(review): assumes start_time is timezone-naive like realdata_days - confirm.
realdata = df[df['start_time'].dt.normalize().isin(realdata_days)]

###  <font color = green> Validation set: the 1000 most recent records</font>

In [179]:
# Hold out the last 1000 rows of df as the validation set.
validation = df.tail(1000)

# Features are everything except the identifier, timestamp and label columns.
non_feature_cols = ['id', 'user_id', 'shift_id', 'createdAt', 'start_time', 'target']
x_valid = validation.drop(columns=non_feature_cols)
y_valid = validation['target']

# Class balance of the validation labels.
y_valid.value_counts()

0    984
1     16
Name: target, dtype: int64

### Train test: main dataset - validation set

In [180]:
# Remove the 1000 validation rows from the working frame.
# Caution: this rebinds df, so re-running the cell removes another 1000 rows.
df = df.iloc[:-1000]

In [181]:
# Build a class-balanced frame for train/test: keep every target==1 row and
# down-sample the target==0 rows to the same count.
import random

df_tar = df[df['target'] == 1].reset_index(drop=True)
df_nontar = df[df['target'] == 0].reset_index(drop=True)

number_of_tar = df_tar.shape[0]

# Use a locally seeded generator so the same negatives are drawn on every run
# (the unseeded global `random` made every run's results differ).
# Raises ValueError if there are fewer negatives than positives.
sampler = random.Random(42)
random_indices = sampler.sample(range(len(df_nontar)), number_of_tar)

# The index was reset above, so positions equal labels; sorting keeps the
# sampled rows in their original relative order.
df_nontar = df_nontar.iloc[sorted(random_indices)]

# concat
df = pd.concat([df_tar, df_nontar]).reset_index(drop=True)

In [182]:
from sklearn.model_selection import train_test_split

# Features: every column except the identifier, timestamp and label columns.
X = df.drop(columns=['id', 'user_id', 'shift_id', 'target', 'createdAt', 'start_time'])
y = df['target']

# 70/30 train/test split. random_state is pinned so the split, and every
# metric computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Train the Support Vector Classifier

In [183]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings,
# fitted on the training split.
svm = SVC()
svm.fit(X=X_train, y=y_train)

SVC()

### Train Test result

In [184]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out test split with the fitted classifier.
predictions = svm.predict(X_test)

# Confusion matrix first, then the per-class precision/recall report.
test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions)
print(test_confusion)
print('\n')
print(test_report)

[[812 491]
 [435 876]]


              precision    recall  f1-score   support

           0       0.65      0.62      0.64      1303
           1       0.64      0.67      0.65      1311

    accuracy                           0.65      2614
   macro avg       0.65      0.65      0.65      2614
weighted avg       0.65      0.65      0.65      2614



### <font color = green> Validation result</font>

In [185]:
# Score the validation hold-out with the same fitted classifier.
predictions = svm.predict(x_valid)

valid_confusion = confusion_matrix(y_valid, predictions)
valid_report = classification_report(y_valid, predictions)
print(valid_confusion)
print('\n')
print(valid_report)

[[540 444]
 [  4  12]]


              precision    recall  f1-score   support

           0       0.99      0.55      0.71       984
           1       0.03      0.75      0.05        16

    accuracy                           0.55      1000
   macro avg       0.51      0.65      0.38      1000
weighted avg       0.98      0.55      0.70      1000



# Train the Support Vector Classifier with the variables found significant by LR

In [186]:
# Rebuild the validation inputs using only the selected feature subset.
selected_valid_cols = [
    'prev_CW/SA_rate', 'net_pay', 'SA_Create2Start_Time',
    'type_RN', 'type_LVN+LPN', 'segmentName_d',
    'areaName_houston', 'areaName_no', 'areaName_dfw',
    'areaName_austin', 'areaName_san',
]
x_valid = validation.loc[:, selected_valid_cols]
y_valid = validation['target']

### Train Test set

In [187]:
# Training inputs restricted to the selected feature subset.
X = df[['prev_CW/SA_rate', 'net_pay', 'SA_Create2Start_Time', 'type_RN',
        'type_LVN+LPN', 'segmentName_d', 'areaName_houston', 'areaName_no',
        'areaName_dfw', 'areaName_austin', 'areaName_san']]
Y = df['target']

# 70/30 train/test split. random_state is pinned so this split, which also
# feeds the grid search further down, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42
)

### Fit

In [188]:
from sklearn.svm import SVC

# Fresh default SVC, fitted on the reduced-feature training split.
svm = SVC()
svm.fit(X=X_train, y=y_train)

SVC()

### Train Test result

In [189]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the reduced-feature test split.
predictions = svm.predict(X_test)

# Confusion matrix first, then the per-class precision/recall report.
test_confusion = confusion_matrix(y_test, predictions)
test_report = classification_report(y_test, predictions)
print(test_confusion)
print('\n')
print(test_report)

[[871 457]
 [491 795]]


              precision    recall  f1-score   support

           0       0.64      0.66      0.65      1328
           1       0.63      0.62      0.63      1286

    accuracy                           0.64      2614
   macro avg       0.64      0.64      0.64      2614
weighted avg       0.64      0.64      0.64      2614



### <font color = green> Validation result</font>

In [191]:
# Score the reduced-feature validation hold-out.
predictions = svm.predict(x_valid)

valid_confusion = confusion_matrix(y_valid, predictions)
valid_report = classification_report(y_valid, predictions)
print(valid_confusion)
print('\n')
print(valid_report)

[[626 358]
 [  3  13]]


              precision    recall  f1-score   support

           0       1.00      0.64      0.78       984
           1       0.04      0.81      0.07        16

    accuracy                           0.64      1000
   macro avg       0.52      0.72      0.42      1000
weighted avg       0.98      0.64      0.76      1000



# Gridsearch

Finding the right parameters (like what C or gamma values to use) is a tricky task! But luckily, we can be a little lazy and just try a bunch of combinations and see what works best! This idea of creating a 'grid' of parameters and just trying out all the possible combinations is called a Gridsearch. This method is common enough that Scikit-learn has this functionality built in with GridSearchCV! The CV stands for cross-validation, which is the procedure used to score each parameter combination: the training data is split into several folds, and the model is repeatedly fitted on all but one fold and evaluated on the fold that was left out.

GridSearchCV takes a dictionary that describes the parameters that should be tried and a model to train. The grid of parameters is defined as a dictionary, where the keys are the parameters and the values are the settings to be tested. 

In [192]:
from sklearn.model_selection import GridSearchCV

# Candidate values to try for an RBF-kernel SVC: 5 C values x 5 gamma values.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# refit=True retrains the best combination on the full training data;
# verbose=3 logs each individual fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

In [193]:
# Fits every parameter combination on each cross-validation fold, so this
# cell may take a while to finish.
grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.643, total=   1.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.630, total=   1.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.635, total=   1.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.639, total=   1.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.642, total=   1.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.607, total=   1.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.593, total=   1.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.617, total=   1.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.615, total=   1.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.622, total=   1.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.597, total=   1.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.607, total=   1.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.611, total=   1.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.623, total=   1.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.607, total=   1.1s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.583, total=   1.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.617, total=   1.4s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.630, total=   1.5s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.607, total=   1.3s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.583, total=   1.5s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.593, total=   1.3s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.600, total=   1.2s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.601, total=   1.2s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:  5.5min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [194]:
# Show the winning parameter combination. A bare expression is only displayed
# when it is the last statement of the cell, so it is printed explicitly.
print(grid.best_params_)

# The estimator refitted with those parameters (displayed as the cell output).
grid.best_estimator_

SVC(C=1, gamma=1)

### Train Test result

In [195]:
# Predict the test split with the refitted best estimator from the grid search.
grid_predictions = grid.predict(X_test)

grid_test_confusion = confusion_matrix(y_test, grid_predictions)
grid_test_report = classification_report(y_test, grid_predictions)
print(grid_test_confusion)
print('\n')
print(grid_test_report)

[[870 458]
 [456 830]]


              precision    recall  f1-score   support

           0       0.66      0.66      0.66      1328
           1       0.64      0.65      0.64      1286

    accuracy                           0.65      2614
   macro avg       0.65      0.65      0.65      2614
weighted avg       0.65      0.65      0.65      2614



### <font color = green> Validation result</font>

In [196]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the validation hold-out with the refitted best estimator.
predictions = grid.predict(x_valid)

valid_confusion = confusion_matrix(y_valid, predictions)
valid_report = classification_report(y_valid, predictions)
print(valid_confusion)
print('\n')
print(valid_report)

[[603 381]
 [  5  11]]


              precision    recall  f1-score   support

           0       0.99      0.61      0.76       984
           1       0.03      0.69      0.05        16

    accuracy                           0.61      1000
   macro avg       0.51      0.65      0.41      1000
weighted avg       0.98      0.61      0.75      1000

