In [1]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import datasets

### Getting and exploring data

The purpose is to use chemical analysis to determine the origin of wines.

In [2]:
# Load scikit-learn's bundled wine dataset; later cells read its data, target,
# feature_names and target_names entries.
wine = datasets.load_wine()

In [3]:
# List the names of the chemical measurements used as input features.
print("Features: ", wine['feature_names'])

Features:  ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [4]:
# List the class names the model has to tell apart.
print("Labels: ", wine['target_names'])

Labels:  ['class_0' 'class_1' 'class_2']


Convert to a DataFrame

In [5]:
# Assemble one table: the feature columns followed by the class label.
# NOTE: because the labels are stacked next to float features, the `target`
# column is stored as float64 (0.0, 1.0, 2.0) rather than as integers.
data = pd.DataFrame(
    np.column_stack((wine['data'], wine['target'])),
    columns=[*wine['feature_names'], 'target'],
)

Exploring the dataset

In [6]:
# Preview the first rows of the assembled table.
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0


In [7]:
# Preview the last rows of the assembled table.
data.tail()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740.0,2.0
174,13.4,3.91,2.48,23.0,102.0,1.8,0.75,0.43,1.41,7.3,0.7,1.56,750.0,2.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835.0,2.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840.0,2.0
177,14.13,4.1,2.74,24.5,96.0,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560.0,2.0


In [8]:
# Column names as a plain Python list (features first, `target` last).
data.columns.tolist()

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline',
 'target']

### Setting train and test sets

In [9]:
# Separate the label column from the feature matrix.
y = data['target']
X = data.drop(labels='target', axis='columns')

Split and shuffle

In [10]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Sanity-check the split sizes: train/test features, then train/test labels.
for subset in (X_train, X_test, y_train, y_test):
    print(subset.shape)

(133, 13)
(45, 13)
(133,)
(45,)


In [12]:
# Peek at the training labels; the index shows the rows were shuffled by the split.
y_train.head()

2      0.0
100    1.0
122    1.0
154    2.0
51     0.0
Name: target, dtype: float64

### Linear Kernel

With C = 1 -> high tolerance for margin violations (stronger regularization)

In [13]:
# Linear-kernel SVM with the regularization parameter C set to 1.
svclassifier = SVC(kernel='linear', C=1.0)
# Keep fit() as the last expression so the fitted estimator is displayed.
svclassifier.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

Testing model

In [14]:
# Predict class labels for the held-out test rows with the C=1 linear model.
y_pred = svclassifier.predict(X_test)  

The following confusion matrix shows that class_0 and class_2 were classified 100% correctly, whereas for class_1 only 17 out of 18 samples were classified correctly. This means that 1 out of the 45 test samples was misclassified. That's a good performance!

NOTE: The overfitting is probably due to the small number of samples.

In [15]:
# Confusion matrix for the C=1 linear model (rows: true class, columns: predicted class).
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

[[15  0  0]
 [ 0 17  1]
 [ 0  0 12]]


In [16]:
# Fraction of test rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.9777777777777777


With C = 10 -> low tolerance for margin violations (weaker regularization)

In [17]:
# Linear-kernel SVM again, this time with C raised to 10.
# NOTE: this rebinds `svclassifier`, replacing the C=1 model fitted earlier.
svclassifier = SVC(kernel='linear', C=10.0)
# Keep fit() as the last expression so the fitted estimator is displayed.
svclassifier.fit(X=X_train, y=y_train)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

Testing model

In [18]:
# Predict test labels with the C=10 linear model (overwrites the earlier y_pred).
y_pred = svclassifier.predict(X_test)  

The following confusion matrix shows that all classes were classified correctly.

In [19]:
# Confusion matrix for the C=10 linear model (rows: true class, columns: predicted class).
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

[[15  0  0]
 [ 0 18  0]
 [ 0  0 12]]


In [20]:
# Test-set accuracy of the C=10 linear model.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 1.0


### Polynomial Kernel

In [21]:
# Degree-3 polynomial-kernel SVM; C is not passed, so the library default applies.
svclassifierP = SVC(kernel='poly', gamma="auto", degree=3)
# Keep fit() as the last expression so the fitted estimator is displayed.
svclassifierP.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [22]:
# Predict test labels with the polynomial-kernel model.
y_predP = svclassifierP.predict(X_test)  

In [23]:
# Confusion matrix for the polynomial-kernel model (rows: true class, columns: predicted class).
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_predP))

[[15  0  0]
 [ 0 18  0]
 [ 0  0 12]]


In [24]:
# Test-set accuracy of the polynomial-kernel model.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_predP))

Accuracy: 1.0


### Gaussian Kernel

In [25]:
# Gaussian (RBF) kernel SVM with C=10.
# NOTE(review): the features are fed in unscaled (e.g. proline is in the
# hundreds/thousands while hue is around 1). With gamma="auto" the RBF kernel
# is distance-based, so the large-magnitude columns probably dominate; this is
# the likely cause of the poor accuracy reported for this model. Consider
# standardizing the features (e.g. StandardScaler in a Pipeline) and
# re-running -- TODO confirm.
svclassifierG = SVC(kernel='rbf', C=10.0, gamma="auto")
# Keep fit() as the last expression so the fitted estimator is displayed.
svclassifierG.fit(X=X_train, y=y_train)

SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
# Predict test labels with the RBF-kernel model.
y_predG = svclassifierG.predict(X_test)  

In [27]:
# Confusion matrix for the RBF-kernel model (rows: true class, columns: predicted class).
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_predG))

[[ 3 12  0]
 [ 1 17  0]
 [ 0 11  1]]


In [28]:
# Test-set accuracy of the RBF-kernel model.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_predG))

Accuracy: 0.4666666666666667


### References

https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python

https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn/

