# Task 3: Predictive Analysis

In [41]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [42]:
customer_profile = pd.read_csv('../dataset/customer_profile_kmeans.csv', sep='\t', index_col=0)
customer_profile

Unnamed: 0,I,Iu,Imax,E,Savg,R,F,M,label
0,0.870894,0.017546,-0.038273,0.554834,0.336561,-0.500054,0.512319,1.268486,high
1,3.043268,3.259964,2.254020,0.477900,-0.675782,-1.680426,3.536461,2.969024,high
2,1.207900,1.196320,1.546322,0.487575,1.414379,-0.500054,1.134077,1.463197,high
3,0.746231,0.533552,0.549481,0.286241,0.292491,-0.043259,0.425503,0.587988,high
4,-1.026156,-1.441491,-0.802301,-0.908068,-1.156208,0.473247,-1.441955,-1.426696,low
...,...,...,...,...,...,...,...,...,...
3677,-1.436179,-1.051384,-1.363713,-1.031710,-0.385789,0.821730,-1.083852,-0.949103,low
3678,-1.233447,-1.441491,-1.086128,-1.070447,-1.778062,-1.680426,-1.441955,-1.812192,low
3679,-1.248452,-1.051384,-1.819487,0.028752,-1.535158,1.026516,-1.083852,-1.103305,low
3680,1.283552,1.831664,0.536392,-0.058993,-0.507053,-1.064892,2.339313,1.208956,high


In [43]:
customer_profile = customer_profile.to_numpy()

In [51]:
def highlight_equal(s, value, column):
    is_max = pd.Series(data=False, index=s.index)
    is_max[column] = s.loc[column] == value
    return ['background-color: lightyellow' if is_max.any() else '' for v in is_max]

## SVC

In [52]:
from sklearn.svm import SVC

In [53]:
X, y = customer_profile[:, :-1], customer_profile[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

gamma_range = ['auto', 'scale']
C_range = [0.1, 1, 10, 100, 1000]

tuned_parameters = {'kernel': ('linear', 'poly', 'rbf'),
                    'gamma': gamma_range,
                    'C': C_range}

grid = GridSearchCV(SVC(),
                    param_grid=tuned_parameters,
                    scoring='accuracy',
                    cv=5,  # 5 fold cross validation
                    n_jobs=-1,  # use all processors
                    refit=True,  # refit the best model on the full development set
                    return_train_score=True,
                    verbose=True).fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:    1.4s finished


In [54]:
df = pd.DataFrame(grid.cv_results_)[['params', 'mean_train_score', 'mean_test_score', 'rank_test_score']].sort_values(by='rank_test_score')
df.rename(columns={'mean_test_score': 'mean_val_score', 
                   'rank_test_score': 'rank_val_score'}, inplace=True)

df.head(15).style.apply(highlight_equal, value=1, column=['rank_val_score'], axis=1).hide_index()

params,mean_train_score,mean_val_score,rank_val_score
"{'C': 1000, 'gamma': 'scale', 'kernel': 'linear'}",0.999819,0.997464,1
"{'C': 1000, 'gamma': 'auto', 'kernel': 'linear'}",0.999819,0.997464,1
"{'C': 100, 'gamma': 'scale', 'kernel': 'linear'}",0.998642,0.995653,3
"{'C': 100, 'gamma': 'auto', 'kernel': 'linear'}",0.998642,0.995653,3
"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",0.998189,0.99493,5
"{'C': 10, 'gamma': 'auto', 'kernel': 'linear'}",0.998189,0.99493,5
"{'C': 1, 'gamma': 'auto', 'kernel': 'linear'}",0.995382,0.99203,7
"{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}",0.995382,0.99203,7
"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",0.99665,0.991669,9
"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.99665,0.991307,10


In [55]:
accuracy_score(grid.best_estimator_.predict(X_test), y_test)

0.998914223669924