# Classifier: Support Vector Machine (SVM)

In [1]:
# Import Python Libraries
import os
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn import datasets
from sklearn.svm import SVC

### Load & Inspect Cleaned Data

In [2]:
# Read cleaned data into pandas & create dataframe
df = pd.read_csv(os.path.join(".", "Cleaned_Data", "default.csv"))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,Latitude,Longitude,Country,Region,Sub_Region
0,7.161286,7.835325,2.911583,0.984049,-1.499546,-2.094097,0.576,-1.205671,1.849122,-0.425598,...,-0.174878,-1.089543,-0.66884,-0.914772,-0.83625,-15.75,-47.95,Brazil,South America,South America
1,0.225763,-0.094169,-0.603646,0.497745,0.874036,0.29028,-0.077659,-0.887385,0.432062,-0.093963,...,-0.157189,0.380951,1.088478,-0.123595,1.391141,14.91,-23.51,Cabo Verde,Africa,Western Africa
2,-0.692525,-0.517801,-0.788035,1.214351,-0.907214,0.880213,0.406899,-0.694895,-0.901869,-1.701574,...,2.718442,0.972919,2.081069,1.375763,1.063847,12.65,-8.0,Mali,Africa,Western Africa
3,-0.735562,-0.684055,2.058215,0.716328,-0.011393,0.805396,1.497982,0.114752,0.692847,0.052377,...,-1.020687,-0.75138,-0.385005,-0.012326,-0.392197,9.03,38.74,Ethiopia,Africa,Eastern Africa
4,0.570272,0.273157,-0.279214,0.083456,1.049331,-0.869295,-0.265858,-0.401676,-0.872639,1.147483,...,-0.190488,0.306974,0.119658,0.271838,1.289783,34.03,-6.85,Morocco,Africa,Northern Africa


In [3]:
# Determine data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 73 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   0           1059 non-null   float64
 1   1           1059 non-null   float64
 2   2           1059 non-null   float64
 3   3           1059 non-null   float64
 4   4           1059 non-null   float64
 5   5           1059 non-null   float64
 6   6           1059 non-null   float64
 7   7           1059 non-null   float64
 8   8           1059 non-null   float64
 9   9           1059 non-null   float64
 10  10          1059 non-null   float64
 11  11          1059 non-null   float64
 12  12          1059 non-null   float64
 13  13          1059 non-null   float64
 14  14          1059 non-null   float64
 15  15          1059 non-null   float64
 16  16          1059 non-null   float64
 17  17          1059 non-null   float64
 18  18          1059 non-null   float64
 19  19          1059 non-null  

In [4]:
# Drop all columns except inputs and desired output column
dropped_column_list = ['Latitude', 'Longitude', 'Country', 'Region']
df.drop(dropped_column_list, axis=1, inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59,60,61,62,63,64,65,66,67,Sub_Region
0,7.161286,7.835325,2.911583,0.984049,-1.499546,-2.094097,0.576,-1.205671,1.849122,-0.425598,...,-0.04361,-1.504263,0.351267,-1.018726,-0.174878,-1.089543,-0.66884,-0.914772,-0.83625,South America
1,0.225763,-0.094169,-0.603646,0.497745,0.874036,0.29028,-0.077659,-0.887385,0.432062,-0.093963,...,-0.947933,-0.495712,-0.465077,-0.157861,-0.157189,0.380951,1.088478,-0.123595,1.391141,Western Africa
2,-0.692525,-0.517801,-0.788035,1.214351,-0.907214,0.880213,0.406899,-0.694895,-0.901869,-1.701574,...,-0.556109,-0.637167,0.14726,0.217914,2.718442,0.972919,2.081069,1.375763,1.063847,Western Africa
3,-0.735562,-0.684055,2.058215,0.716328,-0.011393,0.805396,1.497982,0.114752,0.692847,0.052377,...,0.166616,-0.178325,-0.065059,-0.724247,-1.020687,-0.75138,-0.385005,-0.012326,-0.392197,Eastern Africa
4,0.570272,0.273157,-0.279214,0.083456,1.049331,-0.869295,-0.265858,-0.401676,-0.872639,1.147483,...,-0.500785,-0.919463,-0.667912,-0.820172,-0.190488,0.306974,0.119658,0.271838,1.289783,Northern Africa


In [5]:
# Count number of unique entries in target column (this will be our y variable)
df['Sub_Region'].nunique()

15

In [6]:
# Determine number of entries per unique entry in target column
df['Sub_Region'].value_counts()

Southern Asia                139
Western Africa               124
Southern Europe              118
Northern Africa               99
Western Asia                  93
South-eastern Asia            91
Eastern Asia                  84
Eastern Africa                82
Northern Europe               65
Central Asia                  62
South America                 36
Caribbean                     22
Eastern Europe                19
Australia and New Zealand     14
Central America               11
Name: Sub_Region, dtype: int64

### Define X and y

In [7]:
# Reformat data
data = df.values
X = data[:, 0:115]

y = df['Sub_Region']
y.value_counts()

Southern Asia                139
Western Africa               124
Southern Europe              118
Northern Africa               99
Western Asia                  93
South-eastern Asia            91
Eastern Asia                  84
Eastern Africa                82
Northern Europe               65
Central Asia                  62
South America                 36
Caribbean                     22
Eastern Europe                19
Australia and New Zealand     14
Central America               11
Name: Sub_Region, dtype: int64

### Label Encode Data

In [8]:
# Label-encode data set
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)
print(encoded_y)

[ 9 13 13 ... 12  8  8]


### Create Training and Testing Sets

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# NOTE: Random state ensures that the splits that we generate are reproducible.Scikit-learn uses random permutations to 
# generate the splits.The random state that you provide is used as a seed to the random number generator. This ensures
# that the random numbers are generated in the same order.

### Inspect the Shape of the Data

In [10]:
print('X_train Shape:', X_train.shape)
print('y_train Shape:', y_train.shape)
print('X_test Shape:', X_test.shape)
print('y_test Shape:', y_test.shape)

X_train Shape: (794, 69)
y_train Shape: (794,)
X_test Shape: (265, 69)
y_test Shape: (265,)


### Begin Support Vector Machine (SVM) linear classifier

In [11]:
# SVM or Support Vector Machine is a linear model for classification and regression problems. It can solve linear and 
# non-linear problems and work well for many practical problems. The idea of SVM is simple: The algorithm creates a line
# or a hyperplane which separates the data into classes.

In [12]:
# Import SVM
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'Southern Asia'

In [None]:
# Model Accuracy
print('Test Acc: %.3f' % model.score(X_test, y_test))

In [None]:
# HOW TO INTERPRET CLASSIFICATION REPORT

# PRECISION
# Precision is a measure of a classifier’s exactness. It is the fraction of predicted positives events that are actually
# positive. For each class, it is defined as the ratio of true positives to the sum of true & false positives. Said 
# another way, “for all instances classified positive, what percent was correct?

# RECALL
# Recall (also known as sensitivity) is the fraction of positives events that you predicted correctly.
# Recall is a measure of the classifier’s completeness; the ability of a classifier to correctly find all positive 
# instances. For each class, it is defined as the ratio of true positives to the sum of true positives & false 
# negatives. Said another way, “for all instances that were actually positive, what percent was classified correctly?”

# F1 SCORE
# The f1 score is the harmonic mean of recall and precision, with a higher score as a better model.
# The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0.
# Generally speaking, F1 scores are lower than accuracy measures as they embed precision and recall into their 
# computation. As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global 
# accuracy.

# SUPPORT
# Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training
# data may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for 
# stratified sampling or rebalancing. Support doesn’t change between models, instead it diagnoses the evaluation process.

# ACCURACY
# The most common metric for classification is Accuracy, which is the fraction of samples predicted correctly.  
# But Accuracy is not always the best metric to use to assess classification models. 

# MACRO AVERAGE
# Macro Average takes the function to compute f1 for each label, and returns the average without considering the 
# proportion for each label in the dataset. 

# WEIGHTED AVERAGE
# Weighted Average takes the function to compute f1 for each label, and returns the average considering the proportion
# for each label in the dataset.

In [None]:
# Calculate & print classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))

In [None]:
import seaborn as sns
from sklearn.metrics import classification_report

true = np.random.randint(0, 15, size=100)
pred = np.random.randint(0, 15, size=100)
labels = np.arange(15)
target_names = list("ABCDEFGHIJKLMNO")

clf_report = classification_report(true,
                                   pred,
                                   labels=labels,
                                   target_names=target_names,
                                   output_dict=True)

# .iloc[:-1, :] to exclude support
sns.heatmap(pd.DataFrame(clf_report).iloc[:-1, :].T, annot=True)
visualizer.show(outpath='./images/Chromatc Sub-Region - Parallel Coordinates for 115 Features.png')

In [None]:
from yellowbrick.features import ParallelCoordinates

visualizer = ParallelCoordinates()
visualizer.fit_transform(X, y)
visualizer.show()
visualizer.show(outpath='./images/Chromatc Sub-Region - Parallel Coordinates for 115 Features.png')

In [None]:
from sklearn import svm

In [None]:
# Initialize SVM classifier
clf = svm.SVC(kernel='linear')

In [None]:
clf = clf.fit(X_train, y_train)

In [None]:
# Get support vectors
support_vectors = clf.support_vectors_

# Visualize support vectors
plt.scatter(X_train[:,0], X_train[:,1])
plt.scatter(support_vectors[:,0], support_vectors[:,1], color='red')
plt.title('Linearly separable data with support vectors')
plt.xlabel('X1')
plt.ylabel('X2')
plt.savefig('./images/Chromatc Sub-Region - Linearly Separable Data with Support Vectors.png')
plt.show()

# Prediction Error Plot

In [None]:
# A prediction error plot shows the actual targets from the dataset against the predicted values generated by our model.
# This allows us to see how much variance is in the model. We can diagnose the regression models using this 
# plot by comparing against the 45 degree line, where the prediction exactly matches the model.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import PredictionError

# Load a regression dataset
X, y = load_concrete()

# Create the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the linear model and visualizer
model = Lasso()
visualizer = PredictionError(model)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()                 # Finalize and render the figure
visualizer.show(outpath='./images/Chromatc Sub-Region - Prediction Error.png')

# Threshold Plot for Logistic Regression

In [None]:
# Linear regression is used for predicting the continuous dependent variable using a given set of independent features
# whereas Logistic Regression is used to predict the categorical. Linear regression is used to solve regression problems
# whereas logistic regression is used to solve classification problems.

In [None]:
from sklearn.linear_model import LogisticRegression

from yellowbrick.classifier import DiscriminationThreshold
from yellowbrick.datasets import load_spam

# Load a binary classification dataset
X, y = load_spam()

# Instantiate the classification model and visualizer
model = LogisticRegression(multi_class="auto", solver="liblinear")
visualizer = DiscriminationThreshold(model)

visualizer.fit(X, y)        # Fit the data to the visualizer
visualizer.show()           # Finalize and render the figure
visualizer.show(outpath='./images/Chromatc Sub-Region - Threshold Plot for LogisticRegression.png')

In [None]:
from sklearn import metrics
from neupy import algorithms
from sklearn.base import BaseEstimator
from yellowbrick.datasets import load_occupancy
from yellowbrick.classifier import ClassificationReport
from sklearn.model_selection import train_test_split


class PNNWrapper(algorithms.PNN, BaseEstimator):
    """
    The PNN wrapper implements BaseEstimator and allows the classification
    report to score the model and understand the learned classes.
    """

    @property
    def classes_(self):
        return self.classes

    def score(self, X_test, y_test):
        y_hat = self.predict(X_test)
        return metrics.accuracy_score(y_test, y_hat)


# Load the binary classification dataset 
X, y = load_occupancy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create and train the PNN model using the sklearn wrapper
model = PNNWrapper(std=0.1, verbose=True, batch_size=1059)
model.train(X_train, y_train)

# Create the classification report
viz = ClassificationReport(
    model, 
    support=True, 
    classes=["not occupied", "occupied"], 
    is_fitted=True, 
    force_model=True, 
    title="PNN"
)

# Score the report and show it
viz.score(X_test, y_test)
viz.show()
viz.show(outpath='./images/Chromatc Sub-Region - PNN Wrapper.png')