# [9660] Support Vector Machine 1
Data source:
* scikit-learn built-in dataset: `load_breast_cancer`

In [None]:
from datetime import datetime

# "%D %T" are C-library shorthands that are not among Python's documented
# strftime codes and are rejected on some platforms (e.g. Windows).
# The explicit form below is portable and yields the same MM/DD/YY HH:MM:SS text.
RUN_TIME_FORMAT = "%m/%d/%y %H:%M:%S"
print(f'Run time: {datetime.now().strftime(RUN_TIME_FORMAT)}')

Run time: 11/03/24 14:18:03


### Import libraries

In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.metrics import accuracy_score

### Set global variable

In [None]:
# Seed shared by the train/test split and every estimator so runs are repeatable.
# (A `global` statement is a no-op at notebook/module level, so it is omitted.)
RANDOM_STATE = 42

### Load data

In [None]:
# Load the dataset bundle, then build one frame holding the feature columns
# plus a trailing 'target' column with the class labels.
bc = load_breast_cancer()
df = pd.DataFrame(bc['data'], columns=bc['feature_names']).assign(target=bc['target'])

### Examine data

In [None]:
# (rows, columns); the column count includes the appended 'target' column
df.shape

(569, 31)

In [None]:
# Lift the column cap so wide frames are displayed without truncated columns
pd.options.display.max_columns = None

In [None]:
# Preview the first rows to sanity-check feature names and value ranges
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [None]:
# Review distribution of target values.
# The two classes are not evenly represented, which is why the later
# train/test split is stratified on y.
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,357
0,212


### Separate independent and dependent variables

In [None]:
# Independent variables: every column except the label
X = df.drop(columns='target')
# Dependent variable: the label column on its own
y = df['target']

### Scale the features
SVMs are sensitive to features measured on different scales, so the features must be scaled before training.  
Standardize each feature to a mean of 0 and a standard deviation of 1.

In [None]:
# Instantiate StandardScaler
sc = StandardScaler()

# Standardize the independent variables
# NOTE(review): this fit sees every row of X, including rows that are later
# assigned to the test set, so X_scaled is computed with test-set statistics.
# For model evaluation the scaler should be fitted on the training split only.
X_scaled = sc.fit_transform(X)

### Split data into training and test sets

In [None]:
# Split the *unscaled* features first so that no test-set statistics can leak
# into preprocessing. The row partition is the same as splitting X_scaled,
# because it depends only on y, test_size and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    stratify=y,                  # keep the class ratio identical in both splits
    test_size=0.25,
    random_state=RANDOM_STATE,
)

# Re-fit the scaler on the training rows only, then apply those same
# training-derived means/variances to the held-out test rows.
X_train = sc.fit_transform(X_train_raw)
X_test = sc.transform(X_test_raw)

### Train various SVC models
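* SVC uses libsvm; its fit time scales at least quadratically with the number of training samples, so it becomes impractical on very large datasets
* LinearSVC uses liblinear (linear kernel only) and scales better to large training sets
* NuSVC is similar to SVC but uses the hyperparameter `nu` (an upper bound on the fraction of margin errors and a lower bound on the fraction of support vectors) to control the number of support vectors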

In [None]:
classifiers = [
    SVC(C=0.5,class_weight='balanced',  kernel='poly', probability=True,
        break_ties=True, random_state=RANDOM_STATE),
    LinearSVC(dual=False, C=0.5, multi_class='ovr', class_weight='balanced',
              random_state=RANDOM_STATE),
    NuSVC(nu=0.5, kernel='sigmoid', probability=True, class_weight='balanced',
          break_ties=True, random_state=RANDOM_STATE)
    ]

for classifier in classifiers:
    print()
    print(classifier)
    %time model = classifier.fit(X_train, y_train)
    model_preds = model.predict(X_test)
    model_accuracy = accuracy_score(y_test, model_preds)
    print(f"Model score: {round((model_accuracy * 100), 2)}%")


SVC(C=0.5, break_ties=True, class_weight='balanced', kernel='poly',
    probability=True, random_state=42)
CPU times: user 45.2 ms, sys: 294 µs, total: 45.5 ms
Wall time: 108 ms
Model score: 92.31%

LinearSVC(C=0.5, class_weight='balanced', dual=False, random_state=42)
CPU times: user 6.54 ms, sys: 721 µs, total: 7.26 ms
Wall time: 12.2 ms
Model score: 96.5%

NuSVC(break_ties=True, class_weight='balanced', kernel='sigmoid',
      probability=True, random_state=42)
CPU times: user 96.5 ms, sys: 1.62 ms, total: 98.1 ms
Wall time: 263 ms
Model score: 94.41%
