In [156]:
import pandas as pd
import numpy as np
df = pd.read_csv('model_data.csv').drop(columns = ['Unnamed: 0'])

# Data Prepration

In [157]:
# standardlize, dont standardlize dummy! 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop(['target', 'segmentName_d', 'type_d', 'createdAt', 'start_time'], axis=1))
scaled_features = scaler.transform(df.drop(['target', 'segmentName_d', 'type_d', 'createdAt', 'start_time'], axis=1))

# scaled features
X = pd.DataFrame(scaled_features, columns = ['prev_CW/SA_rate', 'S_create2SA_Create', 'S_Create2Start_Time', 
                                             'SA_Create2Start_Time', 'U_create2now', 'U_approve2now', 'net_pay'])
# concat with dummy
df = pd.concat([X, df[['segmentName_d', 'type_d', 'target', 'createdAt', 'start_time']]], axis = 1)

# drop nas
df.dropna(inplace = True)

###  <font color = green> Validation set: 1000 recently records

In [158]:
# slice
validation = df[-1000:]

y_valid = validation['target']
x_valid = validation.drop(['createdAt', 'start_time', 'target'], axis = 1)

y_valid.value_counts()

0    969
1     31
Name: target, dtype: int64

### Train test: main dataset - validation set

In [159]:
df = df[:-1000] # slice 

In [160]:
# make a dataset that num of tar = num of non tar, use it for train test
import random
df_tar = df[df['target']==1].reset_index(drop = True)
df_nontar = df[df['target']==0].reset_index(drop = True)

number_of_tar = df_tar.shape[0]
random_indices = random.sample(range(len(df_nontar)), int(number_of_tar))
df_nontar = df_nontar[df_nontar.index.isin(random_indices)]

# concat
df = pd.concat([df_tar, df_nontar]).reset_index(drop = True)

In [161]:
X = df.drop(['target', 'createdAt', 'start_time'], axis = 1)
y = df['target']

# set test, train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Train the Support Vector Classifier

In [162]:
from sklearn.svm import SVC
svm = SVC()

svm.fit(X_train,y_train)

SVC()

### Train Test result

In [163]:
# pred
predictions = svm.predict(X_test)

# result
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[807 517]
 [432 848]]


              precision    recall  f1-score   support

           0       0.65      0.61      0.63      1324
           1       0.62      0.66      0.64      1280

    accuracy                           0.64      2604
   macro avg       0.64      0.64      0.64      2604
weighted avg       0.64      0.64      0.64      2604



### <font color = green> Validation result

In [164]:
# predict
predictions = svm.predict(x_valid)

print(confusion_matrix(y_valid,predictions))
print('\n')
print(classification_report(y_valid,predictions))

[[484 485]
 [  8  23]]


              precision    recall  f1-score   support

           0       0.98      0.50      0.66       969
           1       0.05      0.74      0.09        31

    accuracy                           0.51      1000
   macro avg       0.51      0.62      0.37      1000
weighted avg       0.95      0.51      0.64      1000



# Train the Support Vector Classifier with significant var by LR

###  <font color = green> Validation set

In [165]:
y_valid = validation['target']
x_valid = validation[['prev_CW/SA_rate', 'type_d', 'segmentName_d', 'net_pay']]

y_valid.value_counts()

0    969
1     31
Name: target, dtype: int64

### Train Test set

In [166]:
X = df[['prev_CW/SA_rate', 'type_d', 'segmentName_d', 'net_pay']]
Y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30)

In [167]:
from sklearn.svm import SVC
svm = SVC()

svm.fit(X_train,y_train)

SVC()

### Train Test result

In [169]:
# pred
predictions = svm.predict(X_test)

# result
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[838 439]
 [552 775]]


              precision    recall  f1-score   support

           0       0.60      0.66      0.63      1277
           1       0.64      0.58      0.61      1327

    accuracy                           0.62      2604
   macro avg       0.62      0.62      0.62      2604
weighted avg       0.62      0.62      0.62      2604



### <font color = green> Validation result

In [170]:
# predict
predictions = svm.predict(x_valid)

print(confusion_matrix(y_valid,predictions))
print('\n')
print(classification_report(y_valid,predictions))

[[553 416]
 [  7  24]]


              precision    recall  f1-score   support

           0       0.99      0.57      0.72       969
           1       0.05      0.77      0.10        31

    accuracy                           0.58      1000
   macro avg       0.52      0.67      0.41      1000
weighted avg       0.96      0.58      0.70      1000



# Gridsearch

Finding the right parameters (like what C or gamma values to use) is a tricky task! But luckily, we can be a little lazy and just try a bunch of combinations and see what works best! This idea of creating a 'grid' of parameters and just trying out all the possible combinations is called a Gridsearch, this method is common enough that Scikit-learn has this functionality built in with GridSearchCV! The CV stands for cross-validation which is the

GridSearchCV takes a dictionary that describes the parameters that should be tried and a model to train. The grid of parameters is defined as a dictionary, where the keys are the parameters and the values are the settings to be tested. 

In [171]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [172]:
# May take awhile!
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.630, total=   0.9s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.630, total=   0.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.7s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.628, total=   0.8s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.641, total=   0.9s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.642, total=   0.8s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.593, total=   0.8s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.627, total=   0.9s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.625, total=   0.9s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.632, total=   0.9s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.612, total=   1.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.634, total=   1.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.610, total=   1.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.631, total=   1.0s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.627, total=   1.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.600, total=   1.0s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] ....... C=10, gamma=0.001, kernel=rbf, score=0.630, total=   0.9s
[CV] C=10, gamma=0.001, kernel=rbf ...................................
[CV] .

[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.639, total=   1.1s
[CV] C=1000, gamma=0.001, kernel=rbf .................................
[CV] ..... C=1000, gamma=0.001, kernel=rbf, score=0.631, total=   1.1s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.608, total=   0.9s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.630, total=   0.9s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.612, total=   0.9s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.632, total=   1.0s
[CV] C=1000, gamma=0.0001, kernel=rbf ................................
[CV] .... C=1000, gamma=0.0001, kernel=rbf, score=0.625, total=   0.9s


[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:  4.1min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [173]:
grid.best_params_

grid.best_estimator_

SVC(C=1000, gamma=1)

### Train Test result

In [174]:
grid_predictions = grid.predict(X_test)

print(confusion_matrix(y_test,grid_predictions))
print('\n')
print(classification_report(y_test,grid_predictions))

[[867 410]
 [543 784]]


              precision    recall  f1-score   support

           0       0.61      0.68      0.65      1277
           1       0.66      0.59      0.62      1327

    accuracy                           0.63      2604
   macro avg       0.64      0.63      0.63      2604
weighted avg       0.64      0.63      0.63      2604



### <font color = green> Validation result

In [175]:
# predict
predictions = grid.predict(x_valid)
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_valid,predictions))
print('\n')
print(classification_report(y_valid,predictions))

[[607 362]
 [  8  23]]


              precision    recall  f1-score   support

           0       0.99      0.63      0.77       969
           1       0.06      0.74      0.11        31

    accuracy                           0.63      1000
   macro avg       0.52      0.68      0.44      1000
weighted avg       0.96      0.63      0.75      1000

