# SVM

In [1]:
# data from: http://archive.ics.uci.edu/ml/machine-learning-databases/wine/
import pandas as pd 
import numpy as np

### Importing Data

In [2]:
'''
1) Alcohol
2) Malic acid
3) Ash
4) Alcalinity of ash  
5) Magnesium
6) Total phenols
7) Flavanoids
8) Nonflavanoid phenols
9) Proanthocyanins
10) Color intensity
11) Hue
12) OD280/OD315 of diluted wines
13) Proline
'''

'\n1) Alcohol\n2) Malic acid\n3) Ash\n4) Alcalinity of ash  \n5) Magnesium\n6) Total phenols\n7) Flavanoids\n8) Nonflavanoid phenols\n9) Proanthocyanins\n10) Color intensity\n11) Hue\n12) OD280/OD315 of diluted wines\n13) Proline\n'

### Data Cleasing and Preparation

In [3]:
# Creating header
colnames = ['Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoid',\
            'Nonflavanoid phenols','Proanthocyanis','Color Intensity', 'Hue','OD280/OD315','Proline']

In [4]:
# Reading csv and concatenating the header
wineData = pd.read_csv('wine.data.csv', names=colnames, index_col=False)

In [5]:
# Glimpse on the data
wineData.head(4)

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoid,Nonflavanoid phenols,Proanthocyanis,Color Intensity,Hue,OD280/OD315,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45


In [6]:
# Summary of the dataset
wineData.describe()

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoid,Nonflavanoid phenols,Proanthocyanis,Color Intensity,Hue,OD280/OD315,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685
std,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999
min,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27
25%,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375
50%,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78
75%,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17
max,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0


In [7]:
# Importing library
from sklearn import svm, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import normalize

In [8]:
# Normalizing 
# df_norm = (df - df.mean()) / (df.max() - df.min())
wineData_norm_d = wineData.Alcohol
wineData_norm = (wineData.iloc[:,1:] - wineData.iloc[:,1:].mean()) / (wineData.iloc[:,1:].max() - wineData.iloc[:,1:].min())
wineData_norm['Alcohol'] = wineData_norm_d

In [9]:
wineData_norm.head(4)

Unnamed: 0,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoid,Nonflavanoid phenols,Proanthocyanis,Color Intensity,Hue,OD280/OD315,Proline,Alcohol
0,0.323522,-0.123784,0.033948,-0.20077,0.296287,0.174099,0.217454,-0.154441,0.220537,0.049651,0.067114,0.479236,1
1,0.052469,-0.10995,-0.121132,-0.427574,0.002809,0.122375,0.154163,-0.192177,-0.098075,-0.057857,0.075244,0.28876,1
2,0.041943,0.004674,0.16229,-0.046131,0.013679,0.174099,0.255428,-0.116706,0.384574,0.053064,0.058984,0.204511,1
3,0.360364,-0.076353,0.071381,-0.138915,0.144113,0.536168,0.308171,-0.229913,0.185836,0.233951,-0.079227,0.307075,1


#### Modeling

In [52]:
# Instantiating the classification object 

# Linear
def SVM(kernel, c, gamma):
    trainData, testData = train_test_split(wineData_norm, test_size = 0.3)
    
    # Transforming the data into a readable format for sklearn

    # Loading csv file into a readable format for sklearn
    #data =  np.loadtxt(fname = 'wine.data.csv', delimiter = ',')

    ### Train ###

    yTrain = trainData.Alcohol.tolist()

    
    # Getting the predictors to have the right format in arrays
    X_Train = trainData.iloc[:,:-1].as_matrix()
    
    ### Test ###

    yTest = testData.Alcohol.tolist()

    # Gettin the predictors to have the right format in arrays
    X_Test = testData.iloc[:,:-1].as_matrix()
    
    ### Modeling ###

    res = svm.SVC(kernel=kernel, C=c, gamma=gamma, decision_function_shape='ovr')

    res.fit(X_Train, yTrain)

    # Coefficients
    #w = res.coef_[0]
    #print(w)

    predict = np.array(res.predict(X_Test))

    # Accuracy
    AC = accuracy_score(yTest, predict, normalize=True)
    
    # Score of individual classes
    F1 = f1_score(yTest, predict, average=None)
    
    return AC

In [11]:
help(svm.SVC)

Help on class SVC in module sklearn.svm.classes:

class SVC(sklearn.svm.base.BaseSVC)
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time complexity
 |  is more than quadratic with the number of samples which makes it hard
 |  to scale to dataset with more than a couple of 10000 samples.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `gamma`, `coef0` and `degree` affect each
 |  other, see the corresponding section in the narrative documentation:
 |  :ref:`svm_kernels`.
 |  
 |  Read more in the :ref:`User Guide <svm_classification>`.
 |  
 |  Parameters
 |  ----------
 |  C : float, optional (default=1.0)
 |      Penalty parameter C of the error term.
 |  
 |  kernel : string, optional (default='rbf')
 |       Specifies the kernel type to be used in the algorithm.
 |       It must be one of 'linear', 'poly

### Finding the best value for gamma and weights

##### TODO: Divide the set into training/test/validation to find the model/hyperparameters

In [12]:
'''gamma = [2**-5 ,2**-4 ,2**-3 ,2**-2 ,2**-1, 2, 2**1, 2**2, 2**3, 2**4]
c = [2**-15, 2**-10, 2**-5, 2**-4, 2**-3, 2**-2, 2**-2, 2, 2**1, 2**2, 2**3]
ACTotal = []
ACMeans = []

d = []

for g_ in gamma:
    for c_ in c:
        ACMeans.append(np.mean(ACTotal))
        ACTotal = []
        for x in range(100):
            ACTotal.append(SVM('rbf', c_, g_))
'''

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-12-b3f00552794d>, line 14)

In [None]:
#np.unique(bestG)

In [None]:
#np.unique(bestC)

In [None]:
# Double Testing

In [26]:
trainData, testData = train_test_split(wineData_norm, test_size = 0.3)

yTest = testData.Alcohol.tolist()

# Gettin the predictors to have the right format in arrays
X_Test = testData.iloc[:,:-1].as_matrix()

In [34]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {
    'C':            np.arange( 1, 100+1, 1 ).tolist(),
    'kernel':       ['linear', 'rbf'],                   # precomputed,'poly', 'sigmoid'
    'degree':       np.arange( 0, 100+0, 1 ).tolist(),
    'gamma':        np.arange( 0.0, 10.0+0.0, 0.1 ).tolist(),
    'coef0':        np.arange( 0.0, 10.0+0.0, 0.1 ).tolist(),
    'shrinking':    [True],
    'probability':  [False],
    'tol':          np.arange( 0.001, 0.01+0.001, 0.001 ).tolist(),
    'cache_size':   [2000],
    'class_weight': [None],
    'verbose':      [False],
    'max_iter':     [-1],
    'random_state': [None],
    }

res = RandomizedSearchCV( n_iter = 500,
                            estimator = svm.SVC(),
                            param_distributions = parameters,
                            n_jobs              = 4,
                            iid                 = True,
                            refit               = True,
                            cv                  = 5,
                            verbose             = 1,
                            pre_dispatch        = '2*n_jobs'
                            )

res.fit(X_Test, yTest)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=4)]: Done 2500 out of 2500 | elapsed:    2.5s finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params={}, iid=True, n_iter=500, n_jobs=4,
          param_distributions={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71...size': [2000], 'class_weight': [None], 'verbose': [False], 'max_iter': [-1], 'random_state': [None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)

In [41]:
res.best_estimator_

SVC(C=56, cache_size=2000, class_weight=None, coef0=9.5,
  decision_function_shape=None, degree=98, gamma=1.9000000000000001,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.005, verbose=False)

In [39]:
res.best_score_

0.98148148148148151

In [40]:
res.best_params_

{'C': 56,
 'cache_size': 2000,
 'class_weight': None,
 'coef0': 9.5,
 'degree': 98,
 'gamma': 1.9000000000000001,
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.005,
 'verbose': False}

### Running SVM Itself

In [56]:
ACTotal = []
for x in range(100):
    ACTotal.append(SVM('rbf', c=56, gamma=1.9000000000000001))

In [57]:
np.mean(ACTotal)

0.96129629629629632

In [58]:
np.std(ACTotal)

0.026201576932407263