## Dimensionality Reduction and SVM

### Libraries

In [1]:
from importnb import imports
from sklearn.decomposition import PCA, KernelPCA
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import PowerTransformer, StandardScaler, MaxAbsScaler, QuantileTransformer
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

Random State and n_jobs:

In [2]:
seed = 235
from joblib import parallel_backend
parallel_backend("threading", n_jobs=-1)

<joblib.parallel.parallel_backend at 0x26dbe0dda50>

---
### Imports

In [3]:
with imports("ipynb"):
    import data
    import classifier

---
#### Data Processing

Splitting Raw data into Labelled Test and Train dataset using [`process_raw()`](data.ipynb)

In [4]:
# data.process_raw(section="SmartS", seed=seed)
# data.process_raw(section="DropSeq", seed=seed)

Select which `file=["MCF7","HCC1806"]` and `section=["SmartS","DropSeq"]` we want to consider

In [5]:
file = "MCF7"
section = "SmartS"

Saves the datasets into `pandas.DataFrame` along with their true labels

In [6]:
dataset = data.data_split(file=file, section=section)
X_train = dataset["train"]
X_test = dataset["test"]
y_train = dataset["y train"]
y_test = dataset["y test"]
max_dim = dataset["max dim"]

---
#### Classifier

We use [`clf()`](classifier.ipynb) to performs a Pipeline containing `scaler`, `reduction` and `svc` 
( accepts `scaler`=`None`, `reduction`=`None` )

In [7]:
steps = [None, PCA(n_components=30, random_state=seed), LinearSVC(random_state=seed)]

classifier.clf(dataset, steps=steps, seed=seed)

1.0000


---
#### Tuning

We tune the Hyperparameters in `param_grid` by running a Gridsearch with Cross Validation using [`CVsearch()`](classifier.ipynb).

##### SmartS

For MCF in SmartS it is easy to obtain perfect scores by simply taking a linear kernel, even if sigmoid works just as well.\
We keep C=1 since decreasing it too much would negatively affect the performance.

In [8]:
dataset = data.data_split(file="MCF7", section="SmartS")
max_dim = dataset["max dim"]

steps=[StandardScaler(),PCA(random_state=seed),SVC(random_state=seed)]
fold = 10
dim=(max_dim//fold)*(fold-1)
param_grid ={
            "dim_reduction__n_components": [i for i in range(2,11)],
            "clf__C": [1,2,3],
            "clf__kernel": ["linear"]
        }
print("> MCF | SmartS:")
clf_A, table_A = classifier.CVsearch(dataset, steps, cv_inner=fold, param_grid=param_grid, verbose=2)

> MCF | SmartS:
best parameters: {'clf__C': 1, 'clf__kernel': 'linear', 'dim_reduction__n_components': 3}
best score: 1.000
prediction score: 1.000
F1 score: 1.000


Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__kernel,param_dim_reduction__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.120606,0.212078,0.240439,0.109362,1,linear,3,"{'clf__C': 1, 'clf__kernel': 'linear', 'dim_re...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,1.386165,0.074486,0.178743,0.033641,2,linear,6,"{'clf__C': 2, 'clf__kernel': 'linear', 'dim_re...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,1.354344,0.129831,0.249717,0.035366,3,linear,6,"{'clf__C': 3, 'clf__kernel': 'linear', 'dim_re...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,1.432917,0.093413,0.184086,0.069025,3,linear,4,"{'clf__C': 3, 'clf__kernel': 'linear', 'dim_re...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [12]:
dataset = data.data_split(file="HCC1806", section="SmartS")
max_dim = dataset["max dim"]

steps=[MaxAbsScaler(),KernelPCA(random_state=seed),SVC(random_state=seed)]
fold = 10
dim=(max_dim//fold)*(fold-1)
param_grid ={
            "dim_reduction__n_components": [i for i in range(73,80)],
            "dim_reduction__kernel": ["sigmoid", "cosine", "rbf", "linear", "poly"],
            "dim_reduction__coef0": [1],
            "clf__coef0": [0.17, 0.18],
            "clf__C": [0.1,1,2,3],
            "clf__kernel": ["sigmoid", "rbf", "linear", "poly"]
        }
print("> HCC | SmartS:")
clf_B, table_B = classifier.CVsearch(dataset, steps, cv_inner=fold, param_grid=param_grid, verbose=2)

> HCC | SmartS:
best parameters: {'clf__C': 1, 'clf__coef0': 0.18, 'clf__kernel': 'sigmoid', 'dim_reduction__coef0': 1, 'dim_reduction__kernel': 'cosine', 'dim_reduction__n_components': 79}
best score: 0.986
prediction score: 0.973
F1 score: 0.971


Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__coef0,param_clf__kernel,param_dim_reduction__coef0,param_dim_reduction__kernel,param_dim_reduction__n_components,...,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.40425,0.417612,1.141176,0.306751,1,0.18,sigmoid,1,cosine,79,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.98619,0.02764
2,2.314444,0.375531,1.20622,0.283099,1,0.18,sigmoid,1,cosine,76,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.315923,0.382806,1.136377,0.317503,2,0.18,linear,1,linear,76,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.375897,0.304973,1.209875,0.239515,2,0.18,linear,1,linear,75,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302


In [27]:
display(table_B.head(10))

Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__coef0,param_clf__kernel,param_dim_reduction__coef0,param_dim_reduction__kernel,param_dim_reduction__n_components,...,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.40425,0.417612,1.141176,0.306751,1,0.18,sigmoid,1,cosine,79,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.98619,0.02764
2,2.314444,0.375531,1.20622,0.283099,1,0.18,sigmoid,1,cosine,76,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.315923,0.382806,1.136377,0.317503,2,0.18,linear,1,linear,76,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.375897,0.304973,1.209875,0.239515,2,0.18,linear,1,linear,75,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.485885,0.285953,1.084605,0.289934,2,0.18,linear,1,linear,74,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.47013,0.409435,1.068174,0.321673,2,0.18,linear,1,cosine,79,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.247811,0.271345,1.132223,0.266879,2,0.18,linear,1,cosine,78,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.4981,0.4983,1.168026,0.232881,2,0.17,linear,1,linear,76,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.25799,0.305141,1.00187,0.233771,2,0.18,linear,1,cosine,77,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302
2,2.401966,0.329213,1.19499,0.197488,2,0.17,linear,1,linear,75,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302


In [17]:
param_grid ={
            "dim_reduction__n_components": [i for i in range(5,dim, 10)],
            "dim_reduction__kernel": ["cosine"],
            # "dim_reduction__coef0": [0.5,1,2],
            "clf__coef0": [0.18,0.5,1],
            "clf__C": [0.1,1,2],
            "clf__kernel": ["sigmoid"]
        }
print("> HCC | SmartS:")
clf_X, table_X = classifier.CVsearch(dataset, steps, cv_inner=fold, param_grid=param_grid, verbose=2)

> HCC | SmartS:
best parameters: {'clf__C': 1, 'clf__coef0': 0.18, 'clf__kernel': 'sigmoid', 'dim_reduction__coef0': 0.5, 'dim_reduction__kernel': 'cosine', 'dim_reduction__n_components': 125}
best score: 0.986
prediction score: 1.000
F1 score: 1.000


Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__coef0,param_clf__kernel,param_dim_reduction__coef0,param_dim_reduction__kernel,param_dim_reduction__n_components,...,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.963536,0.119878,0.586501,0.19416,1,0.18,sigmoid,0.5,cosine,125,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.98619,0.02764
1,1.006933,0.120066,0.605266,0.098909,1,0.18,sigmoid,2.0,cosine,125,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.98619,0.02764
1,0.960026,0.132393,0.581392,0.129409,1,0.18,sigmoid,1.0,cosine,125,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.98619,0.02764
4,1.015137,0.133481,0.512658,0.096383,2,0.5,sigmoid,2.0,cosine,75,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.979524,0.031302


In [22]:
param_grid ={
            "dim_reduction__n_components": [i for i in range(120, dim+1)],
            "dim_reduction__kernel": ["cosine"],
            "dim_reduction__coef0": [1],
            "clf__coef0": [0.18],
            "clf__C": [0.5,1,2],
            "clf__kernel": ["sigmoid"]
        }
print("> HCC | SmartS:")
clf_X, table_X = classifier.CVsearch(dataset, steps, cv_inner=fold, param_grid=param_grid, verbose=2)

> HCC | SmartS:
best parameters: {'clf__C': 1, 'clf__coef0': 0.18, 'clf__kernel': 'sigmoid', 'dim_reduction__coef0': 1, 'dim_reduction__kernel': 'cosine', 'dim_reduction__n_components': 125}
best score: 0.986
prediction score: 1.000
F1 score: 1.000


Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__coef0,param_clf__kernel,param_dim_reduction__coef0,param_dim_reduction__kernel,param_dim_reduction__n_components,...,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.250011,0.099463,0.601241,0.179735,1,0.18,sigmoid,1,cosine,125,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.98619,0.02764
1,1.200794,0.166884,0.690499,0.158181,1,0.18,sigmoid,1,cosine,126,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,1.0,0.98619,0.02764
3,1.456683,0.219284,0.688588,0.129679,1,0.18,sigmoid,1,cosine,122,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,0.928571,0.979048,0.032029
3,1.304959,0.23215,0.539916,0.169218,1,0.18,sigmoid,1,cosine,120,...,1.0,1.0,1.0,1.0,0.928571,1.0,1.0,0.928571,0.979048,0.032029


In [10]:
dataset = data.data_split(file="MCF7", section="DropSeq")
max_dim = dataset["max dim"]

steps=[MaxAbsScaler(),KernelPCA(random_state=seed),SVC(random_state=seed)]
fold = 10
dim=(max_dim//fold)*(fold-1)
param_grid ={
            "dim_reduction__n_components": [700],
            "dim_reduction__kernel": ["cosine"],
            "clf__C": [2],
            "clf__kernel": ["rbf"]
        }
print("> MCF | DropSeq:")
clf_C, table_C = classifier.CVsearch(dataset, steps, cv_inner=fold, param_grid=param_grid, verbose=2)

> MCF | DropSeq:
best parameters: {'clf__C': 2, 'clf__kernel': 'rbf', 'dim_reduction__kernel': 'cosine', 'dim_reduction__n_components': 700}
best score: 0.979
prediction score: 0.983
F1 score: 0.986


Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__kernel,param_dim_reduction__kernel,param_dim_reduction__n_components,params,split0_test_score,...,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2194.926734,8.222752,20.195446,3.984447,2,rbf,cosine,700,"{'clf__C': 2, 'clf__kernel': 'rbf', 'dim_reduc...",0.972254,...,0.982081,0.978035,0.982081,0.976879,0.983237,0.984393,0.974566,0.976301,0.979191,0.003912


In [11]:
dataset = data.data_split(file="HCC1806", section="DropSeq")
max_dim = dataset["max dim"]

steps=[MaxAbsScaler(),KernelPCA(random_state=seed),SVC(random_state=seed)]
fold = 10
dim=(max_dim//fold)*(fold-1)
param_grid ={
            "dim_reduction__n_components": [510],
            "dim_reduction__kernel": ["sigmoid"],
            "clf__C": [2],
            "clf__kernel": ["rbf"]
        }
print("> HCC | DropSeq:")
clf_D, table_D = classifier.CVsearch(dataset, steps, cv_inner=fold, param_grid=param_grid, verbose=2)

> HCC | DropSeq:
best parameters: {'clf__C': 2, 'clf__kernel': 'rbf', 'dim_reduction__kernel': 'sigmoid', 'dim_reduction__n_components': 510}
best score: 0.959
prediction score: 0.966
F1 score: 0.956


Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__kernel,param_dim_reduction__kernel,param_dim_reduction__n_components,params,split0_test_score,...,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,609.191376,2.760273,13.729789,5.683921,2,rbf,sigmoid,510,"{'clf__C': 2, 'clf__kernel': 'rbf', 'dim_reduc...",0.954043,...,0.954894,0.96,0.95234,0.965928,0.963373,0.967632,0.949744,0.954855,0.958707,0.005984


In [51]:
dataset = data.data_split(file="HCC1806", section="DropSeq")
max_dim = dataset["max dim"]

steps=[MaxAbsScaler(),KernelPCA(random_state=seed),SVC(random_state=seed)]
fold = 4
dim=(max_dim//fold)*(fold-1)
print(dim)
param_grid ={
            "dim_reduction__n_components": [100, dim, 100],
            "dim_reduction__kernel": ["cosine"],
            "clf__coef0": [0.18],
            "clf__C": [1],
            "clf__kernel": ["sigmoid"]
        }
print("> HCC | DropSeq:")
clf_C, table_C = classifier.CVsearch(dataset, steps, cv_inner=fold, param_grid=param_grid, verbose=2)

2250
> HCC | DropSeq:
best parameters: {'clf__C': 1, 'clf__coef0': 0.18, 'clf__kernel': 'sigmoid', 'dim_reduction__kernel': 'cosine', 'dim_reduction__n_components': 2250}
best score: 0.949
prediction score: 0.951
F1 score: 0.937


Unnamed: 0_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__coef0,param_clf__kernel,param_dim_reduction__kernel,param_dim_reduction__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,555.05804,45.048807,169.563334,78.304528,1,0.18,sigmoid,cosine,2250,"{'clf__C': 1, 'clf__coef0': 0.18, 'clf__kernel...",0.947225,0.948229,0.955381,0.945845,0.94917,0.003685
2,423.513968,108.69318,264.244712,12.449766,1,0.18,sigmoid,cosine,100,"{'clf__C': 1, 'clf__coef0': 0.18, 'clf__kernel...",0.873681,0.883174,0.892371,0.882834,0.883015,0.006609
2,485.105706,0.794678,207.858453,76.558243,1,0.18,sigmoid,cosine,100,"{'clf__C': 1, 'clf__coef0': 0.18, 'clf__kernel...",0.873681,0.883174,0.892371,0.882834,0.883015,0.006609
