In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import load_iris, load_wine, fetch_california_housing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC, SVC, SVR
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, mean_absolute_error, root_mean_squared_error

# LinearSVC, SVC, SGDClassifier on Iris dataset

- Train a LinearSVC on a linearly separable dataset.
- Then train an SVC and a SGDClassifier on the same dataset to see if i can get them to produce roughly the same model.

In [None]:
iris = load_iris(as_frame=True)
iris.data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [None]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [None]:
# X containing Petal length and width
X = iris.data[['petal length (cm)', 'petal width (cm)']]

# y containing True or False labels for Setosa
y = (iris.target == 0)

In [None]:
# Pipeline for scaling and creating LinearSVC model
linsvm_clf = make_pipeline(
    StandardScaler(),
    LinearSVC(random_state=42)
)

linsvm_clf.fit(X, y)

In [None]:
# Pipeline for scaling and creating LinearSVC model
svm_clf = make_pipeline(
    StandardScaler(),
    SVC(random_state=42)
)

svm_clf.fit(X, y)

In [None]:
# Pipeline for scaling and creating LinearSVC model
sgd_clf = make_pipeline(
    StandardScaler(),
    SGDClassifier(random_state=42)
)

sgd_clf.fit(X, y)

In [None]:
linsvm_clf_pred = linsvm_clf.predict(X)
svm_clf_pred = svm_clf.predict(X)
sgd_clf_pred = sgd_clf.predict(X)

print(f'LinearSVC accuracy: {round(accuracy_score(y, linsvm_clf_pred), 2)}')
print(f'SVC accuracy: {round(accuracy_score(y, svm_clf_pred), 2)}')
print(f'SGDClassifier accuracy: {round(accuracy_score(y, sgd_clf_pred), 2)}')

LinearSVC accuracy: 1.0
SVC accuracy: 1.0
SGDClassifier accuracy: 1.0


All models performed great, it means all three of them are capable of setting correct decision boundary for linear dataset. (Iris dataset with binary output in this case)

# SVM Classifier on Wine dataset

Train an SVM classifier on the wine dataset, which you can load using
sklearn.datasets.load_wine(). This dataset contains the chemical
analyses of 178 wine samples produced by 3 different cultivators: the
goal is to train a classification model capable of predicting the cultivator
based on the wine’s chemical analysis. Since SVM classifiers are binary
classifiers, you will need to use one-versus-all to classify all three
classes. What accuracy can you reach?

In [None]:
wine = load_wine(as_frame=True)
X = wine.data
y = wine.target

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

In [None]:
y.unique()

array([0, 1, 2])

In [None]:
# Splitting dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Creating model and training it
svm_clf = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', decision_function_shape='ovr',random_state=42) # One-versus-Rest
)

svm_clf.fit(X_train, y_train)

In [None]:
# Using model to predict y_test
y_pred = svm_clf.predict(X_test)

print(f'Accuracy: {round(accuracy_score(y_test, y_pred), 2)}')

Accuracy: 1.0


Model shows perfect score on test dataset.

SVM classifiers are binary, but inputting several classes will automatically make it one-v-all classifier or we can explicitly specify it using 'decision_function_shape = ovr'.

# SVM Regression fine-tuned on California Housing dataset

Train and fine-tune an SVM regressor on the California housing dataset.
You can use the original dataset, which you can load using
sklearn.datasets.fetch_california_housing(). The targets represent
hundreds of thousands of dollars. Since there are over 20,000 instances,
SVMs can be slow, so for hyperparameter tuning you should use far
fewer instances (e.g., 2,000) to test many more hyperparameter
combinations. What is your best model’s RMSE?


In [3]:
housing = fetch_california_housing(as_frame=True)

X = housing.data
y = housing.target

In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [5]:
print(y.head())

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: MedHouseVal, dtype: float64


In [6]:
# Leaving only 2000 instances, so we can fine-tune SVMRegressor faster
X_short = X[:2000]
y_short = y[:2000]

# Creating object for Standard Scaling and manually scaling X
scaler = StandardScaler()
X_short_scaled = scaler.fit_transform(X_short)

In [11]:
svr = SVR()

# Fine-tuning hyperparameters for SVR
param_grid = {
    'kernel': ('poly', 'rbf'),
    'gamma': ('auto', 'scale'),
    'epsilon': (0.01, 0.1, 1),
    'C': (0.1, 1, 10)
}

grid = GridSearchCV(svr, param_grid, scoring='neg_root_mean_squared_error', cv=5)
grid.fit(X_short_scaled, y_short)

In [12]:
print(f'Best parameters: {grid.best_params_}, \nBest RMSE: {grid.best_score_}')

Best parameters: {'C': 1, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}, 
Best RMSE: -0.48856349386080655


In [13]:
# Splitting dataset to train and test on all instances
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.05, random_state=42)

In [15]:
# Recreating the best model
best_svr = SVR(C=1, epsilon=0.1, gamma='scale', kernel='rbf')
best_svr.fit(X_train, y_train)

In [16]:
# Predicting using the best model
pred = best_svr.predict(X_test)

print(f'MAE: {round(mean_absolute_error(y_test, pred), 2)}')
print(f'RMSE: {round(root_mean_squared_error(y_test, pred), 2)}')

MAE: 0.4
RMSE: 0.59


Mean absolute error is around 40.000 U.S. dollars, while root mean squared error is about 60,000 U.S. dollars which is reasonable for this type of dataset and thus can be considered acceptable.