<a href="https://colab.research.google.com/github/couqdev/MachineLearning/blob/main/Lab_5_20130376_TranDangQuoc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab is to deal with **SVM** to classification tasks and compare its performance with other competitive algorithms. In general, **SVM** is one of the most popular and widely used supervised machine learning algorithms.

*   **Deadline: 23:59, 17/03/2023**



# Import libraries

In [17]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/MachineLeaning'

Mounted at /content/gdrive
/content/gdrive/MyDrive/MachineLeaning


In [2]:
# code
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import train_test_split
from prettytable import PrettyTable
import sklearn.metrics as metrics
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from keras.datasets import mnist

import pandas as pd

#Task 1. 
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   1.1.	Apply SVM algorithm to above dataset using linear kernel.
*   1.2.	Compare the obtained results with other competitive algorithms (Logistic Regression, Decision Tree, kNN) based on metrics: accuracy, precision, recall, f1 measures.



In [None]:
cancer  = datasets.load_breast_cancer()

X = cancer.data
y = cancer.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1)
# SVM
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc1=  metrics.accuracy_score(y_test, y_pred)
pre1 = metrics.precision_score(y_test, y_pred, average='macro')
rec1 = metrics.recall_score(y_test, y_pred, average='macro')
f1_1 = metrics.f1_score(y_test, y_pred, average='macro')
print("Accuracy:",acc1)
print("Precision:", pre1)
print("Recall:", rec1)
print("F1:", f1_1)

print(metrics.classification_report(y_test, y_pred))


Accuracy: 0.9532163742690059
Precision: 0.9562651331719128
Recall: 0.9431216931216931
F1: 0.9490312965722802
              precision    recall  f1-score   support

           0       0.97      0.90      0.93        63
           1       0.95      0.98      0.96       108

    accuracy                           0.95       171
   macro avg       0.96      0.94      0.95       171
weighted avg       0.95      0.95      0.95       171



In [None]:
# Logistic Regression

classifier = LogisticRegression(random_state = 1,solver='lbfgs',max_iter=10000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

cm = metrics.confusion_matrix(y_test, y_pred)

acc2 = metrics.accuracy_score(y_test, y_pred)
pre2 = metrics.precision_score(y_test, y_pred, average='macro')
rec2 = metrics.recall_score(y_test, y_pred, average='macro')
f1_2 = metrics.f1_score(y_test, y_pred, average='macro')
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.90      0.93        63
           1       0.95      0.97      0.96       108

    accuracy                           0.95       171
   macro avg       0.95      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171



In [None]:
# KNN
KNN = KNeighborsClassifier(28)
KNN.fit(X_train, y_train)
y_predKNN = KNN.predict(X_test)
pre3 = metrics.precision_score(y_test, y_predKNN, average='macro')
rec3 = metrics.recall_score(y_test, y_predKNN, average='macro')
f1_3 = metrics.f1_score(y_test, y_predKNN, average='macro')
acc3  = metrics.accuracy_score(y_test, y_predKNN)
print(metrics.classification_report(y_test, y_predKNN))

              precision    recall  f1-score   support

           0       0.93      0.83      0.87        63
           1       0.90      0.96      0.93       108

    accuracy                           0.91       171
   macro avg       0.92      0.89      0.90       171
weighted avg       0.91      0.91      0.91       171



In [None]:
# Decision Tree
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
pre4 = metrics.precision_score(y_test, y_pred, average='macro')
rec4 = metrics.recall_score(y_test, y_pred, average='macro')
f1_4 = metrics.f1_score(y_test, y_pred, average='macro')
acc4  = metrics.accuracy_score(y_test, y_pred)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92        63
           1       0.94      0.97      0.95       108

    accuracy                           0.94       171
   macro avg       0.94      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171



In [None]:
t = PrettyTable(['','accuracy', 'precision', 'recall', 'f1'])
t.add_row(['SVM',round(acc1, 3),round(pre1,3),round(rec1, 3),round(f1_1,3)])
t.add_row(['Logistic Regression',round(acc2, 3),round(pre2,3),round(rec2, 3),round(f1_2,3)])
t.add_row(['KNN',round(acc3, 3),round(pre3,3),round(rec3, 3),round(f1_3,3)])
t.add_row(['Decision Tree',round(acc4, 3),round(pre4,3),round(rec4, 3),round(f1_4,3)])
print(t)



+---------------------+----------+-----------+--------+-------+
|                     | accuracy | precision | recall |   f1  |
+---------------------+----------+-----------+--------+-------+
|         SVM         |  0.953   |   0.956   | 0.943  | 0.949 |
| Logistic Regression |  0.947   |   0.948   | 0.938  | 0.943 |
|         KNN         |  0.912   |   0.916   | 0.894  | 0.903 |
|    Decision Tree    |  0.924   |   0.926   |  0.91  | 0.917 |
+---------------------+----------+-----------+--------+-------+


#Task 2. 

*   1.1.	Perform SVM algorithm to **Iris dataset** using **linear kernel**.
*   1.2.	Compare the obtained results in 1.1 with SVM using other kernels (**Polynomial Kernel, Gaussian Kernel, Sigmoid Kernel, Radial Basis Function Kernel**). Some metrics could be used: accuracy, precision, recall, f1 measures





In [None]:
data = datasets.load_iris()

X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1)
# SVM Linear Kernel:
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc1=  metrics.accuracy_score(y_test, y_pred)
pre1 = metrics.precision_score(y_test, y_pred, average='macro')
rec1 = metrics.recall_score(y_test, y_pred, average='macro')
f1_1 = metrics.f1_score(y_test, y_pred, average='macro')
print("Accuracy:",acc1)
print("Precision:", pre1)
print("Recall:", rec1)
print("F1:", f1_1)

print(metrics.classification_report(y_test, y_pred))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [None]:
# SVM Polynomial Kernel:
clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc2=  metrics.accuracy_score(y_test, y_pred)
pre2 = metrics.precision_score(y_test, y_pred, average='macro')
rec2 = metrics.recall_score(y_test, y_pred, average='macro')
f1_2 = metrics.f1_score(y_test, y_pred, average='macro')
print("Accuracy:",acc1)
print("Precision:", pre1)
print("Recall:", rec1)
print("F1:", f1_1)

print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9777777777777777
Precision: 0.9761904761904763
Recall: 0.9814814814814815
F1: 0.9781305114638448
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.94      0.97        18
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [None]:
# SVM sigmoid Kernel:
clf = svm.SVC(kernel='sigmoid')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc3=  metrics.accuracy_score(y_test, y_pred)
pre3 = metrics.precision_score(y_test, y_pred, average='macro')
rec3 = metrics.recall_score(y_test, y_pred, average='macro')
f1_3 = metrics.f1_score(y_test, y_pred, average='macro')
print("Accuracy:",acc1)
print("Precision:", pre1)
print("Recall:", rec1)
print("F1:", f1_1)

print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.28888888888888886
Precision: 0.09629629629629628
Recall: 0.3333333333333333
F1: 0.14942528735632185
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.00      0.00      0.00        18
           2       0.29      1.00      0.45        13

    accuracy                           0.29        45
   macro avg       0.10      0.33      0.15        45
weighted avg       0.08      0.29      0.13        45



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# SVM RBF Kernel:
clf = svm.SVC(kernel='rbf')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc4=  metrics.accuracy_score(y_test, y_pred)
pre4 = metrics.precision_score(y_test, y_pred, average='macro')
rec4 = metrics.recall_score(y_test, y_pred, average='macro')
f1_4 = metrics.f1_score(y_test, y_pred, average='macro')
print("Accuracy:",acc1)
print("Precision:", pre1)
print("Recall:", rec1)
print("F1:", f1_1)

print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9777777777777777
Precision: 0.9761904761904763
Recall: 0.9814814814814815
F1: 0.9781305114638448
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.94      0.97        18
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



In [None]:
t = PrettyTable(['','accuracy', 'precision', 'recall', 'f1'])
t.add_row(['linear',round(acc1, 3),round(pre1,3),round(rec1, 3),round(f1_1,3)])
t.add_row(['Poly',round(acc2, 3),round(pre2,3),round(rec2, 3),round(f1_2,3)])
t.add_row(['sigmoid',round(acc3, 3),round(pre3,3),round(rec3, 3),round(f1_3,3)])
t.add_row(['rbf',round(acc4, 3),round(pre4,3),round(rec4, 3),round(f1_4,3)])
print(t)


+---------+----------+-----------+--------+-------+
|         | accuracy | precision | recall |   f1  |
+---------+----------+-----------+--------+-------+
|  linear |  0.978   |   0.976   | 0.981  | 0.978 |
|   Poly  |  0.978   |   0.976   | 0.981  | 0.978 |
| sigmoid |  0.978   |   0.976   | 0.981  | 0.978 |
|   rbf   |  0.956   |   0.956   | 0.956  | 0.956 |
+---------+----------+-----------+--------+-------+


#Task 3. 
Compare the performance of selected classification algorithms (Decision Tree, kNN, Logistic Regression) and SVM (using different kernels) with mnist dataset based on accuracy, precision, recall, f1 measures.


In [11]:
data_mnist = datasets.load_digits()
X = data_mnist.data
y = data_mnist.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)
clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
acc1 = metrics.accuracy_score(y_test, y_pred)
precision1 = metrics.precision_score(y_test, y_pred, average='micro')
recall1 = metrics.recall_score(y_test, y_pred, average='micro')
f1_1 = metrics.f1_score(y_test, y_pred, average='micro')


In [12]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
y_pred = dtree.predict(X_test)
acc_2 = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_2 = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_2 = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_2 = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)



In [13]:
KNN = KNeighborsClassifier(n_neighbors = 29)
KNN.fit(X_train, y_train)
y_pred = KNN.predict(X_test)

acc_3 = round(metrics.accuracy_score(y_test, y_pred),4)
precision_3 = round(metrics.precision_score(y_test, y_pred, average = 'micro'),4)
f1_3 = round(metrics.f1_score(y_test, y_pred, average = 'micro'),4)
recall_3 = round(metrics.recall_score(y_test, y_pred, average = 'micro'),4)



In [14]:
classifier = LogisticRegression(random_state = 0, max_iter=10000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
acc_4 = round((metrics.accuracy_score(y_test, y_pred)),4)
precision_4 = round((metrics.precision_score(y_test, y_pred, average = 'macro')),4)
f1_4 = round((metrics.f1_score(y_test, y_pred, average = 'macro')),4)
recall_4 = round((metrics.recall_score(y_test, y_pred, average = 'macro')),4)



In [15]:
t = PrettyTable(['','acc','precision','recall','f1'])
t.add_row(['SMV',acc1,precision1,recall1,f1_1])
t.add_row(['Decision Tree',acc_2,precision_2,recall_2,f1_2])
t.add_row(['kNN',acc_3,precision_3,recall_3,f1_3])
t.add_row(['Logistic Regression',acc_4,precision_4,recall_4,f1_4])

print(t)

+---------------------+--------------------+--------------------+--------------------+--------------------+
|                     |        acc         |     precision      |       recall       |         f1         |
+---------------------+--------------------+--------------------+--------------------+--------------------+
|         SMV         | 0.9851851851851852 | 0.9851851851851852 | 0.9851851851851852 | 0.9851851851851852 |
|    Decision Tree    |       0.8574       |        0.86        |       0.8613       |       0.8587       |
|         kNN         |       0.9667       |       0.9667       |       0.9667       |       0.9667       |
| Logistic Regression |       0.9685       |       0.9673       |       0.9681       |       0.9674       |
+---------------------+--------------------+--------------------+--------------------+--------------------+


#Task 4. 
Compare the performance of selected classification algorithms (Decision Tree, kNN, Logistic Regression) and SVM (using different kernels) with **credit card dataset** based on accuracy, precision, recall, f1 measures.

*   Give some comments on the obtained results
*   Identify issues with dataset, and propose the solutions to these issues



In [18]:
# code
dataset1 = pd.read_csv("creditcard.csv")
dataset1.columns
X = dataset1.head(5000)[['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']]
y = dataset1.head(5000)[['Class']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=3)

In [20]:
# SVM Linear Kernel:
clf = svm.SVC(kernel='poly')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc1=  metrics.accuracy_score(y_test, y_pred)
pre1 = metrics.precision_score(y_test, y_pred, average='macro')
rec1 = metrics.recall_score(y_test, y_pred, average='macro')
f1_1 = metrics.f1_score(y_test, y_pred, average='macro')
print("Accuracy:",acc1)
print("Precision:", pre1)
print("Recall:", rec1)
print("F1:", f1_1)

print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9993333333333333
Precision: 0.49966666666666665
Recall: 0.5
F1: 0.49983327775925307
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1499
           1       0.00      0.00      0.00         1

    accuracy                           1.00      1500
   macro avg       0.50      0.50      0.50      1500
weighted avg       1.00      1.00      1.00      1500



  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Logistic Regression

classifier = LogisticRegression(random_state = 1,solver='lbfgs',max_iter=10000)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

cm = metrics.confusion_matrix(y_test, y_pred)

acc2 = metrics.accuracy_score(y_test, y_pred)
pre2 = metrics.precision_score(y_test, y_pred, average='macro')
rec2 = metrics.recall_score(y_test, y_pred, average='macro')
f1_2 = metrics.f1_score(y_test, y_pred, average='macro')
print(metrics.classification_report(y_test, y_pred))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1499
           1       0.00      0.00      0.00         1

    accuracy                           1.00      1500
   macro avg       0.50      0.50      0.50      1500
weighted avg       1.00      1.00      1.00      1500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# KNN
KNN = KNeighborsClassifier(28)
KNN.fit(X_train, y_train)
y_predKNN = KNN.predict(X_test)
pre3 = metrics.precision_score(y_test, y_predKNN, average='macro')
rec3 = metrics.recall_score(y_test, y_predKNN, average='macro')
f1_3 = metrics.f1_score(y_test, y_predKNN, average='macro')
acc3  = metrics.accuracy_score(y_test, y_predKNN)
print(metrics.classification_report(y_test, y_predKNN))

  return self._fit(X, y)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1499
           1       0.00      0.00      0.00         1

    accuracy                           1.00      1500
   macro avg       0.50      0.50      0.50      1500
weighted avg       1.00      1.00      1.00      1500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Decision Tree
clf = DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
pre4 = metrics.precision_score(y_test, y_pred, average='macro')
rec4 = metrics.recall_score(y_test, y_pred, average='macro')
f1_4 = metrics.f1_score(y_test, y_pred, average='macro')
acc4  = metrics.accuracy_score(y_test, y_pred)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1499
           1       0.00      0.00      0.00         1

    accuracy                           1.00      1500
   macro avg       0.50      0.50      0.50      1500
weighted avg       1.00      1.00      1.00      1500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
t = PrettyTable(['','accuracy', 'precision', 'recall', 'f1'])
t.add_row(['SVM',round(acc1, 4),round(pre1,4),round(rec1, 3),round(f1_1,3)])
t.add_row(['Logistic Regression',round(acc2, 3),round(pre2,3),round(rec2, 3),round(f1_2,3)])
t.add_row(['KNN',round(acc3, 3),round(pre3,3),round(rec3, 3),round(f1_3,3)])
t.add_row(['Decision Tree',round(acc4, 3),round(pre4,3),round(rec4, 3),round(f1_4,3)])
print(t)


+---------------------+----------+-----------+--------+-----+
|                     | accuracy | precision | recall |  f1 |
+---------------------+----------+-----------+--------+-----+
|         SVM         |  0.9993  |   0.4997  |  0.5   | 0.5 |
| Logistic Regression |  0.999   |    0.5    |  0.5   | 0.5 |
|         KNN         |  0.999   |    0.5    |  0.5   | 0.5 |
|    Decision Tree    |  0.999   |    0.5    |  0.5   | 0.5 |
+---------------------+----------+-----------+--------+-----+


#Finally,
Save a copy in your Github. Remember renaming the notebook.