# 9.4 Dimensionality Reduction and Hyperparameter Tuning with Scikit-Learn

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast cancer dataset; the cells below read its
# `data`, `target`, `feature_names` and `target_names` fields.
cancer = load_breast_cancer()

In [2]:
# Feature matrix and label vector taken from the loaded dataset
X, y = cancer.data, cancer.target

In [3]:
# Show the feature column names, then the class label names, one per line
print(cancer.feature_names, cancer.target_names, sep="\n")

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
['malignant' 'benign']


In [4]:
# Peek at the first sample: its feature vector followed by its label
print(X[0], y[0], sep="\n")

[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
0


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for the final evaluation; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Pipeline without PCA

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Named (label, estimator) steps; later cells build on these definitions.
scale_step = ('scaler', StandardScaler())
SVM_step = ('SVM', SVC(random_state=0))

# Baseline pipeline: standardise the features, then classify with an SVM
# (no dimensionality reduction yet).
pipeline = Pipeline(steps=[scale_step, SVM_step])

# Fit the scaler and the classifier on the training split
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('SVM',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=0, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [7]:
from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validated accuracy of the baseline pipeline on the training split
scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
print(scores.mean())

0.9758241758241759


## 9.4.1 Dimensionality Reduction with PCA

In [8]:
from sklearn.base import clone
from sklearn.decomposition import PCA

# Keep only the first 5 principal components of the standardised features
pca_step = ('pca', PCA(n_components=5))

# Pipeline does not copy its steps, so passing scale_step / SVM_step directly
# would make this pipeline share (and refit in place) the very same
# StandardScaler and SVC objects held by the baseline `pipeline`, leaving that
# one with an SVM trained on 5 PCA components. Clone them to get unfitted
# copies with identical hyperparameters instead.
pca_pipeline = Pipeline([
    (scale_step[0], clone(scale_step[1])),
    pca_step,
    (SVM_step[0], clone(SVM_step[1])),
])
pca_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=5,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('SVM',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=0, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [9]:
# Same 5-fold accuracy estimate as for the baseline, now with the PCA step included
scores = cross_val_score(
    estimator=pca_pipeline,
    X=X_train,
    y=y_train,
    scoring="accuracy",
    cv=5,
)
print(scores.mean())

0.9692307692307693


In [10]:
# Pull the PCA transformer back out of the pipeline via its step name
pca_obj = pca_pipeline.named_steps.pca

In [11]:
# Share of the total variance attributed to each of the retained components
variance_ratios = pca_obj.explained_variance_ratio_
print(variance_ratios)

[0.43430767 0.19740115 0.09351771 0.06677661 0.05642452]


## 9.4.2 Hyperparameter Tuning

In [12]:
from sklearn.model_selection import GridSearchCV

# Search space, written as two independent sub-grids. Keys follow the
# `<step name>__<parameter>` convention and target the 'pca' and 'SVM' steps.
params = [
    # Sub-grid 1: vary the PCA size (5 components, or 0.95 -- see the PCA docs
    # for float values), the SVM regularisation strength C and the kernel.
    {
        'pca__n_components': [5, 0.95],
        'SVM__C': [0.1, 1, 2],
        'SVM__kernel': ['linear', 'rbf'],
    },
    # Sub-grid 2: vary gamma across the kernels listed here; parameters not
    # named in this sub-grid keep whatever the pipeline was built with.
    {
        'SVM__gamma': [0.1, 1, 2],
        'SVM__kernel': ['poly', 'rbf', 'sigmoid'],
    },
]

In [13]:
# Exhaustive search over `params`, scoring each candidate with 3-fold cross-validation
grid_search = GridSearchCV(
    estimator=pca_pipeline,
    param_grid=params,
    cv=3,
)

In [14]:
# Run the search on the training split only; the test split stays untouched
grid_search.fit(X=X_train, y=y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('pca',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=5, random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('SVM',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_...
        

In [15]:
# Keep the best pipeline found by the search for the final evaluation
final_model = grid_search.best_estimator_

# Report the winning cross-validation score and the parameters that produced it
print('Best Score = ', grid_search.best_score_)
print('Best Parameters = ', grid_search.best_params_)

Best Score =  0.9714767050075519
Best Parameters =  {'SVM__C': 1, 'SVM__kernel': 'linear', 'pca__n_components': 0.95}


In [16]:
from sklearn.metrics import accuracy_score

# Score the tuned pipeline on the held-out test split
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare last expression so the notebook displays the value
accuracy

0.9736842105263158