# Importy

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
# Load the dataset. `header=0` together with `names=` discards the header row
# stored in the file and labels the six columns A-F instead; `index_col=False`
# stops pandas from using the first column as the index.
# Column F is used as the classification target further down in the notebook.
df = pd.read_csv(
    "Data_User_Modeling_Dataset.csv",
    names=['A', 'B', 'C', 'D', 'E', 'F'],
    header=0,
    index_col=False,
)
df

Unnamed: 0,A,B,C,D,E,F
0,0.08,0.08,0.10,0.24,0.90,3
1,0.06,0.06,0.05,0.25,0.33,1
2,0.10,0.10,0.15,0.65,0.30,2
3,0.08,0.08,0.08,0.98,0.24,1
4,0.09,0.15,0.40,0.10,0.66,2
...,...,...,...,...,...,...
252,0.61,0.78,0.69,0.92,0.58,3
253,0.78,0.61,0.71,0.19,0.60,2
254,0.54,0.82,0.71,0.29,0.77,3
255,0.50,0.75,0.81,0.61,0.26,2


### Podział na zbiór treningowy i testowy

In [2]:
# Hold out 30% of the rows for testing. Features are every column except F;
# F is the target. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='F'),
    df['F'],
    test_size=0.3,
    random_state=2,
)
X_train

Unnamed: 0,A,B,C,D,E
134,0.410,0.180,0.33,0.31,0.50
229,0.730,0.430,0.32,0.12,0.65
151,0.360,0.290,0.37,0.48,0.13
90,0.320,0.255,0.55,0.78,0.34
120,0.255,0.750,0.35,0.72,0.25
...,...,...,...,...,...
22,0.180,0.310,0.32,0.42,0.28
72,0.300,0.200,0.52,0.30,0.53
237,0.620,0.370,0.81,0.13,0.64
15,0.050,0.070,0.70,0.01,0.05


### Podstawowy model i jego skuteczność

In [3]:
# Baseline: a linear-kernel SVC with C=1.0 trained on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
base_model = svm.SVC(C=1.0, kernel='linear').fit(X_train, y_train)
# Mean accuracy on the held-out test split (displayed as the cell's result).
base_model.score(X_test, y_test)

0.7948717948717948

### Testowanie różnych parametrów

In [4]:
# Broad hyper-parameter search for an SVC, run two ways on the same grid:
# exhaustively (GridSearchCV) and by random sampling (RandomizedSearchCV).
svc = svm.SVC()
params = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': np.linspace(.1, 10, 30),
    'gamma': np.linspace(.1, 1, 30),
    # SVC documents `degree` as an int. np.linspace(1, 5, 5) produced the floats
    # 1.0 ... 5.0, which recent scikit-learn parameter validation rejects, so
    # every fit fails. Plain ints keep the same five candidate values.
    'degree': [1, 2, 3, 4, 5],
}
# NOTE(review): per the SVC docs `degree` is ignored by every kernel except
# 'poly' and `gamma` is not used by 'linear', so this flat grid
# (3 * 30 * 30 * 5 = 13500 candidates) refits many identical models.
# A list of per-kernel grids would shrink the exhaustive search considerably.

# Exhaustive search with 4-fold cross-validation on the training split.
svcGS = GridSearchCV(svc, params, cv=4, verbose=1, n_jobs=-1)
svcGS.fit(X_train, y_train)
print("Best params:", svcGS.best_params_)
print("Best score", svcGS.best_score_)  # mean cross-validated score of the best candidate
# Score of the refitted best estimator on the held-out test split (computed once).
print(svcGS.score(X_test, y_test))

# Random search over the same grid; with the default n_iter only 10 candidates
# are sampled. random_state fixes which ones.
svcRS = RandomizedSearchCV(svc, params, cv=4, random_state=2, verbose=1, n_jobs=-1)
svcRS.fit(X_train, y_train)
print('Best params: ' + str(svcRS.best_params_))
print('Best score: ' + str(svcRS.best_score_))
print(svcRS.score(X_test, y_test))

Fitting 4 folds for each of 13500 candidates, totalling 54000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 2260 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 6260 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 11860 tasks      | elapsed:   47.2s
[Parallel(n_jobs=-1)]: Done 19060 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 27860 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 38260 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 50260 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 54000 out of 54000 | elapsed:  3.0min finished


Best params: {'C': 9.658620689655173, 'degree': 1.0, 'gamma': 1.0, 'kernel': 'rbf'}
Best score 0.9555555555555556
0.8974358974358975
Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best params: {'kernel': 'rbf', 'gamma': 0.5655172413793104, 'degree': 2.0, 'C': 5.562068965517241}
Best score: 0.921969696969697
0.9102564102564102


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.2s finished


Skuteczność: GridSearchCV ~0.90, RandomizedSearchCV ~0.91

### Testowanie parametrów z mniejszymi zakresami

In [5]:
# Second search over narrower C / gamma ranges (no `degree`, so 'poly' uses the
# SVC default degree). Reuses the `svc` estimator created for the broad search.
# NOTE(review): the recorded output of the broad search reported best C values
# of about 9.66 (grid search) and 5.56 (random search); the range [2, 3] below
# contains neither - confirm this is the intended region to refine.
params2 = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': np.linspace(2, 3, 5),
    'gamma': np.linspace(0.9, 1.0, 10),
}

# Exhaustive search: 3 * 5 * 10 = 150 candidates, 4-fold cross-validation.
svcGS = GridSearchCV(svc, params2, cv=4, verbose=1, n_jobs=-1)
svcGS.fit(X_train, y_train)
print("Best params:", svcGS.best_params_)
print("Best score:", svcGS.best_score_)  # mean cross-validated score of the best candidate
# Score of the refitted best estimator on the held-out test split (computed once).
print(svcGS.score(X_test, y_test))

# Random search over the same narrowed grid (default n_iter: 10 sampled candidates).
svcRS = RandomizedSearchCV(svc, params2, cv=4, random_state=2, verbose=1, n_jobs=-1)
svcRS.fit(X_train, y_train)
print('Best params: ' + str(svcRS.best_params_))
print('Best score: ' + str(svcRS.best_score_))
print(svcRS.score(X_test, y_test))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 4 folds for each of 150 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    2.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best params: {'C': 3.0, 'gamma': 0.9666666666666667, 'kernel': 'rbf'}
Best score: 0.921969696969697
0.9102564102564102
Fitting 4 folds for each of 10 candidates, totalling 40 fits
Best params: {'kernel': 'linear', 'gamma': 0.9333333333333333, 'C': 3.0}
Best score: 0.899621212121212
0.8333333333333334


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.2s finished


Skuteczność: GridSearchCV ~0.91 (+), RandomizedSearchCV ~0.83 (-) (w przypadku GridSearchCV: szybciej i lepiej)

### Wnioski

Najlepsze parametry zostały znalezione za pomocą GridSearchCV (na zawężonych zakresach); skuteczność modelu na zbiorze testowym została podniesiona z 79.5% (model podstawowy) do 91%.