In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time
import warnings
warnings.filterwarnings("ignore")

## SVC modeling

In [2]:
df_train = pd.read_csv('data/fashion_train.csv')

In [3]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,7927,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5212,0,0,0,0,0,0,0,0,0,...,23,0,0,0,0,0,0,0,0,0
2,28581,6,0,0,0,0,0,0,0,0,...,141,73,0,0,107,63,0,0,0,0
3,39089,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,52059,4,0,0,0,0,0,0,0,0,...,0,0,0,19,35,7,14,0,0,0


In [4]:
df_train_X = df_train.iloc[:, 2:]
df_train_y = df_train.iloc[:,[1]]

In [5]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
df_train_X_std = sc.fit_transform(df_train_X)

In [6]:
from sklearn.decomposition import PCA
pca = PCA(n_components=100)
X_pca = pca.fit_transform(df_train_X_std)
X_pca

array([[ -1.54304021e+01,   5.54573032e-01,   4.35174122e+00, ...,
         -1.43991037e+00,   6.69215306e-01,  -7.54842145e-01],
       [  1.01256088e+01,  -9.70560802e+00,  -3.17121165e+00, ...,
          7.90448536e-01,  -1.98638966e-01,   3.36346706e-01],
       [  5.39428233e+00,  -1.64470091e+00,   8.38882864e+00, ...,
         -1.50610299e+00,   8.15034156e-01,  -9.84435028e-01],
       ..., 
       [ -7.92706124e+00,  -6.98684019e+00,   9.08083572e+00, ...,
          1.04845399e+00,  -1.86891974e+00,  -1.71130417e+00],
       [ -1.31043089e+01,   1.36278064e+00,  -4.55973087e-03, ...,
          9.31614366e-01,  -4.93993479e-01,  -3.76727822e-01],
       [ -8.78617306e+00,  -1.44585526e+01,  -3.71263998e-01, ...,
         -2.58283756e+00,   3.00606511e-01,  -7.93967288e-02]])

In [7]:
from sklearn.model_selection import train_test_split , KFold , cross_val_score
from sklearn.svm import SVC

In [8]:
def cv_score(model, X, y):
    cv = KFold(n_splits = 10, random_state = 0)
    print('{}'.format(model))
    print('cv mean accuracy score: {}'.format(np.mean(cross_val_score(model, X, y, scoring="accuracy", cv=cv))))

In [9]:
X_train_svc, X_test_svc, y_train_svc, y_test_svc = train_test_split(X_pca, df_train_y, test_size = 0.3, random_state = 0)

In [10]:
%%time
linersvc = SVC(kernel="linear",probability=True).fit(X_train_svc, y_train_svc)

CPU times: user 9min 57s, sys: 4 ms, total: 9min 57s
Wall time: 9min 57s


In [11]:
%%time
rbfsvc = SVC(kernel="rbf",probability=True).fit(X_train_svc, y_train_svc)

CPU times: user 10min 40s, sys: 20 ms, total: 10min 40s
Wall time: 10min 40s


In [12]:
%%time
sigmoidsvc = SVC(kernel="sigmoid",probability=True).fit(X_train_svc, y_train_svc)

CPU times: user 7min 31s, sys: 4 ms, total: 7min 31s
Wall time: 7min 31s


In [18]:
%%time
ploysvc = SVC(kernel="poly",probability=True).fit(X_train_svc, y_train_svc)

CPU times: user 2min 28s, sys: 28 ms, total: 2min 28s
Wall time: 2min 28s


In [13]:
%%time
cv_score(linersvc, X_test_svc, y_test_svc)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
cv mean accuracy score: 0.8392063492063491
CPU times: user 15min 55s, sys: 12 ms, total: 15min 55s
Wall time: 15min 55s


In [14]:
%%time
cv_score(rbfsvc, X_test_svc, y_test_svc)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
cv mean accuracy score: 0.8178571428571428
CPU times: user 18min 26s, sys: 12 ms, total: 18min 26s
Wall time: 18min 26s


In [15]:
%%time
cv_score(sigmoidsvc, X_test_svc, y_test_svc)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
cv mean accuracy score: 0.3715873015873016
CPU times: user 9min 12s, sys: 8 ms, total: 9min 12s
Wall time: 9min 12s


In [20]:
%%time
cv_score(ploysvc, X_test_svc, y_test_svc)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
cv mean accuracy score: 0.856904761904762
CPU times: user 3min 59s, sys: 12 ms, total: 3min 59s
Wall time: 3min 59s


## 주성분 수(n_components = 9)

In [10]:
%%time
rbfsvc = SVC(kernel="rbf",probability=True).fit(X_train_svc, y_train_svc)

CPU times: user 6min 12s, sys: 0 ns, total: 6min 12s
Wall time: 6min 12s


In [11]:
%%time
sigmoidsvc = SVC(kernel="sigmoid",probability=True).fit(X_train_svc, y_train_svc)

CPU times: user 1min 51s, sys: 8 ms, total: 1min 51s
Wall time: 1min 51s


In [13]:
cv_score(rbfsvc, X_test_svc, y_test_svc)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
kfold accuracy score: 0.7117460317460317


In [14]:
cv_score(sigmoidsvc, X_test_svc, y_test_svc)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
kfold accuracy score: 0.23444444444444446
