# [9660] Support Vector Machine 2
Data file:
* https://raw.githubusercontent.com/vjavaly/Baruch-CIS-9660/main/data/Skin_NonSkin.txt

In [None]:
from datetime import datetime

# Portable spelling of the POSIX-only "%D %T" directives (MM/DD/YY HH:MM:SS).
# "%D" and "%T" are passed straight to the platform C library and are not
# supported everywhere (e.g. Windows), so the explicit form is used instead.
RUN_TIME_FORMAT = "%m/%d/%y %H:%M:%S"

# Record when the notebook was executed
print(f'Run time: {datetime.now().strftime(RUN_TIME_FORMAT)}')

Run time: 11/03/24 14:18:32


### Import libraries

In [None]:
import pprint as pp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### Load data

Dataset: UCI Skin Segmentation Data Set: http://archive.ics.uci.edu/ml/datasets/Skin+Segmentation#

The skin dataset was collected by randomly sampling B, G, R values from face images of various age groups (young, middle, and old), race groups (white, black, and asian), and genders, obtained from the FERET database and the PAL database.  
The total learning sample size is 245057, of which 50859 are skin samples and 194198 are non-skin samples.

Color FERET Image Database: [Web Link], PAL Face Database from Productive Aging Laboratory, The University of Texas at Dallas: [Web Link].

In [None]:
# The file is tab-separated and has no header row, so column names are
# supplied explicitly: three colour channels followed by the class label.
df = pd.read_csv(
    'https://raw.githubusercontent.com/vjavaly/Baruch-CIS-9660/main/data/Skin_NonSkin.txt',
    sep='\t',
    header=None,
    names=['B', 'G', 'R', 'Skin'],
)

### Examine data

In [None]:
# Summarize the frame: index range, per-column non-null counts and dtypes,
# and total memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 245057 entries, 0 to 245056
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   B       245057 non-null  int64
 1   G       245057 non-null  int64
 2   R       245057 non-null  int64
 3   Skin    245057 non-null  int64
dtypes: int64(4)
memory usage: 7.5 MB


In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

(245057, 4)

In [None]:
# Show 10 randomly chosen rows; the fixed seed makes the displayed rows
# identical on every Restart & Run All
df.sample(n=10, random_state=42)

Unnamed: 0,B,G,R,Skin
131072,121,121,75,2
113399,175,173,125,2
27014,120,145,209,1
35892,167,182,231,1
106385,2,0,0,2
110654,173,170,125,2
85059,101,102,46,2
170963,236,135,221,2
143963,51,52,18,2
191545,255,250,187,2


In [None]:
# Review distribution of target values
#  1=skin, 2=non-skin
# Counts are returned in descending order; per the recorded output the classes
# are imbalanced (194198 non-skin vs 50859 skin)
df['Skin'].value_counts()

Unnamed: 0_level_0,count
Skin,Unnamed: 1_level_1
2,194198
1,50859


### Separate independent and dependent variables

In [None]:
# Dependent variable: the class label column
y = df['Skin']

# Independent variables: every remaining column (B, G, R)
X = df.drop(columns='Skin')

### Scale the features
Since SVM is very sensitive to features with different ranges, we need to scale the features.  
We standardize the feature values so that each is centered around 0 with a standard deviation of 1.

In [None]:
# Instantiate StandardScaler
sc = StandardScaler()

# Standardize the independent variables
X = sc.fit_transform(X)

### Split data into training and test sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    test_size=0.3,
                                                    random_state=42)

### Train model with default hyperparameters

In [None]:
# Instantiate SVC classifier with default radial basis function (rbf) kernel
# No arguments are passed, so every hyperparameter keeps its library default
classifier = SVC()

In [None]:
# Pretty-print the hyperparameters the classifier is currently configured with
pp.pprint(classifier.get_params())

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}


In [None]:
%%time

# Fit the model
# Trains the classifier on the training partition; the %%time cell magic
# reports the CPU and wall-clock time the fit took
classifier.fit(X_train, y_train)

CPU times: user 17 s, sys: 455 ms, total: 17.5 s
Wall time: 28.1 s


### Evaluate model

In [None]:
%%time

model_preds = classifier.predict(X_test)
model_accuracy = accuracy_score(y_test, model_preds)
print(f"SVC (with default RBF kernel) score: {round((model_accuracy * 100), 3)}%")

SVC (with default RBF kernel) score: 99.835%
CPU times: user 9.02 s, sys: 16.9 ms, total: 9.04 s
Wall time: 9.09 s


### Train model with the linear kernel

In [None]:
# Instantiate SVC classifier with the linear kernel
# Rebinds `classifier`, replacing the previously fitted model; all other
# hyperparameters are left unspecified
classifier = SVC(kernel='linear')

In [None]:
%%time

# Fit the model
# NOTE(review): the recorded run of this cell took almost 5 minutes, versus
# well under a minute for the RBF kernel. If the exact SVC formulation is not
# required, a dedicated linear solver (e.g. sklearn's LinearSVC) may be much
# faster on this many rows — confirm before swapping, results can differ.
classifier.fit(X_train, y_train)

CPU times: user 4min 50s, sys: 476 ms, total: 4min 51s
Wall time: 4min 52s


### Evaluate updated model

In [None]:
%%time

model_preds = classifier.predict(X_test)
model_accuracy = accuracy_score(y_test, model_preds)
print(f"SVC (with linear kernel) score: {round((model_accuracy * 100), 3)}%")

SVC (with linear kernel) score: 92.859%
CPU times: user 52.4 s, sys: 45.4 ms, total: 52.5 s
Wall time: 52.6 s


### Train model with the polynomial kernel

In [None]:
# Instantiate SVC classifier with the polynomial (poly) kernel
# Rebinds `classifier`, replacing the previously fitted model; only the kernel
# is specified, so the remaining hyperparameters are left unspecified
classifier = SVC(kernel='poly')

In [None]:
%%time

# Fit the model
# Slowest fit in the notebook: the recorded run took close to 10 minutes
classifier.fit(X_train, y_train)

CPU times: user 9min 41s, sys: 601 ms, total: 9min 41s
Wall time: 9min 50s


### Evaluate updated model

In [None]:
%%time

model_preds = classifier.predict(X_test)
model_accuracy = accuracy_score(y_test, model_preds)
print(f"SVC (with polynomial kernel) score: {round((model_accuracy * 100), 3)}%")

SVC (with polynomial kernel) score: 94.921%
CPU times: user 50.4 s, sys: 49 ms, total: 50.5 s
Wall time: 50.6 s
