# Performs a dimensionality reduction before running SVM

## Libraries

In [1]:
%matplotlib inline
# from ipywidgets import AppLayout, TwoByTwoLayout, IntSlider, FloatSlider
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
# from sklearn.preprocessing import MinMaxScaler
# from IPython.display import clear_output
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
# import ipywidgets as widgets
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import time

from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# import umap

---
## Code

### ```make_pipe()```

Creates a 3-step `Pipeline` from a list containing, in this order:
-   `scaler`: method for preprocessing the data, or `None`
-   `dim_reduction`: dimensionality reduction method, or `None`
-   `clf`: classifier (required)

Returns the pipeline.

In [None]:
def make_pipe(steps, verbose=0):
    """Build a three-step scikit-learn ``Pipeline`` from ``steps``.

    Parameters
    ----------
    steps : sequence of length 3
        The pipeline components in the order ``(scaler, dim_reduction, clf)``.
        Only the last entry (the classifier) is checked against ``None``.
    verbose : int, default 0
        Caller's verbosity level. The pipeline itself is created with
        ``max(0, verbose - 1)``, so it stays silent unless ``verbose >= 2``.

    Returns
    -------
    Pipeline
        A pipeline whose steps are named ``"scaler"``, ``"dim_reduction"``
        and ``"clf"``.

    Raises
    ------
    ValueError
        If ``steps`` does not contain exactly 3 elements, or if its last
        element is ``None``.
    """
    names = ("scaler", "dim_reduction", "clf")

    # Validate the length first: indexing steps[-1] on an empty sequence
    # would otherwise raise IndexError instead of the intended ValueError.
    if len(steps) != len(names):
        raise ValueError("number of steps must be 3")
    if steps[-1] is None:
        raise ValueError("A model for SVC is needed")

    return Pipeline(steps=list(zip(names, steps)), verbose=max(0, verbose - 1))

### ```clf()```

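Fits a pipeline on `data`, built from the 3 elements in the argument `steps` (when `steps` is `None`, a default `StandardScaler` &rarr; `PCA` &rarr; `LinearSVC` pipeline is used).\
Returns the fitted pipeline and prints the score on the test set if `verbose`&ge;1.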

In [None]:
@ignore_warnings(category=RuntimeWarning)
def clf(data, steps=None, verbose=True, seed=42):
    """Fit a scaler / dimensionality-reduction / classifier pipeline on ``data``.

    Parameters
    ----------
    data : mapping
        Must provide the keys ``"train"``, ``"test"``, ``"y train"`` and
        ``"y test"``. The two feature matrices are transposed before being
        handed to scikit-learn (presumably they are stored as
        features x samples -- confirm against the data loader).
    steps : sequence of length 3 or None, default None
        Components forwarded to ``make_pipe``. When ``None``, a default
        ``StandardScaler`` -> ``PCA(n_components=30)`` -> ``LinearSVC``
        pipeline is used.
    verbose : bool or int, default True
        When truthy, the score on the test split is printed. The value is
        also forwarded to the pipeline (directly for the default pipeline,
        through ``make_pipe`` otherwise).
    seed : int, default 42
        ``random_state`` of the default pipeline's PCA and LinearSVC steps.
        Not used when ``steps`` is given.

    Returns
    -------
    Pipeline
        The fitted pipeline.
    """
#>> Extracting data
    X_train = data["train"]
    X_test = data["test"]
    y_train = data["y train"]
    y_test = data["y test"]

#>> Pipeline
    # Only build the default pipeline when it is actually needed, and test
    # for None by identity so array-like `steps` cannot make the check ambiguous.
    if steps is None:
        pipeline = Pipeline(
            steps=[
                ("scaler", StandardScaler()),
                ("dim_reduction", PCA(n_components=30, random_state=seed)),
                ("svc", LinearSVC(random_state=seed)),
            ],
            verbose=verbose,
        )
    else:
        pipeline = make_pipe(steps, verbose=verbose)

    pipeline.fit(X_train.T, y_train)

    if verbose:
        print(f"{pipeline.score(X_test.T,y_test):.4f}")

    return pipeline

### ```CVsearch()```

Performs a grid search with cross-validation to find the best parameters.\
Returns the fitted search object and a DataFrame with information about each parameter combination (best one first).

In [None]:
@ignore_warnings(category=(RuntimeWarning, ConvergenceWarning))
def CVsearch(data, steps, cv_inner, param_grid=None, verbose=1):
    """Grid-search the pipeline's hyper-parameters with cross-validation.

    Parameters
    ----------
    data : mapping
        Must provide ``"train"``, ``"test"``, ``"y train"`` and ``"y test"``.
        ``"max dim"`` is additionally read when ``param_grid`` is ``None``.
        Feature matrices are transposed before being passed to scikit-learn.
    steps : sequence of length 3
        ``(scaler, dim_reduction, clf)`` components forwarded to ``make_pipe``.
    cv_inner : int
        Passed as ``cv`` to ``GridSearchCV``. Also used arithmetically to
        bound the default ``n_components`` grid, so it must be an integer
        whenever ``param_grid`` is ``None``.
    param_grid : dict or None, default None
        Grid handed to ``GridSearchCV``. When ``None``, ``n_components`` runs
        from 100 in steps of 100 up to (excluding)
        ``(max_dim // cv_inner) * (cv_inner - 1)`` and ``C`` takes the values
        ``[0.001, 0.01, 0.1, 1]``.
    verbose : int, default 1
        ``0``: silent; ``>= 1``: print best parameters and scores;
        ``>= 2``: also display the top rows of the results table;
        ``>= 3``: ``GridSearchCV`` itself becomes verbose.

    Returns
    -------
    search : GridSearchCV
        The fitted search object (refitted on the full training split).
    cv_results : pandas.DataFrame
        ``cv_results_`` indexed by ``rank_test_score``, with the row at
        ``best_index_`` first and the remaining rows sorted by rank.
    """
#>> Extracting data
    X_train = data["train"]
    X_test = data["test"]
    y_train = data["y train"]
    y_test = data["y test"]

#>> Pipeline
    pipe = make_pipe(steps)

#>> Search and CV
    if param_grid is None:
        max_dim = data["max dim"]
        # Upper bound for n_components: (cv_inner - 1) folds' worth of
        # max_dim -- presumably the size of one CV training split.
        # NOTE(review): if this bound is <= 100 the n_components list is
        # empty and GridSearchCV will reject the grid.
        upper = (max_dim // cv_inner) * (cv_inner - 1)
        param_grid = {
            "dim_reduction__n_components": list(range(100, upper, 100)),
            "clf__C": [0.001, 0.01, 0.1, 1],
        }

    # Clamp at 0: recent scikit-learn versions validate `verbose` as a
    # non-negative integer, and the default verbose=1 would give -1 here.
    search = GridSearchCV(
        pipe, param_grid, cv=cv_inner, verbose=max(0, verbose - 2), refit=True
    )
    search.fit(X_train.T, y_train)

#>> Output
    table = pd.DataFrame(search.cv_results_)
    best_idx = search.best_index_
    best_row = table.iloc[[best_idx]]
    other_rows = table.drop(index=best_idx)
    # Best candidate first, everything else ordered by rank.
    cv_results = pd.concat(
        (
            best_row.set_index('rank_test_score'),
            other_rows.set_index('rank_test_score').sort_index(),
        )
    )

    if verbose:
        print(f"best parameters: {search.best_params_}")
        print(f"best score: {search.best_score_:.3f}")
        print(f"prediction score: {search.score(X_test.T, y_test):.3f}")
        # f1_score is called with its default settings.
        print(f"F1 score: {f1_score(y_test, search.predict(X_test.T)):.3f}")
        if verbose > 1:
            # `display` is the IPython/Jupyter builtin; it is not imported here.
            display(cv_results.head(4))

    return search, cv_results


In [None]:
# @ignore_warnings(category=(RuntimeWarning, ConvergenceWarning))
# def CVsearch(X, y, steps, cv_inner, cv_outer=5, param_grid=None, verbose=1):
# #>> Pipeline
#     pipe = make_pipe(steps)

# #>> Search and CV
#     max_dim = min(X.shape)
#     dim=(max_dim//cv_inner)*(cv_inner-1)

#     if param_grid==None:
#         param_grid ={
#                     "dim_reduction__n_components": [dim for dim in range(100, dim, 100)],
#                     "clf__C": [0.001, 0.01, 0.1, 1],
#                 } 
    
#     clf = GridSearchCV(pipe, param_grid, cv=cv_inner, verbose=verbose-1, refit=True)
#     clf.fit(X.T, y)

# #>> Output
#     table = pd.DataFrame(clf.cv_results_)
#     i = clf.best_index_
#     best = table[i:i+1]
#     cv_results = pd.concat((best.set_index('rank_test_score'),table.drop(index=i).set_index('rank_test_score').sort_index()))

#     if verbose:
#         print(f"best parameters: {clf.best_params_}")
#         print(f"best score: {clf.best_score_:.3f}")
#         print(f"prediction score: {cross_val_score(clf, X=X.T, y=y, cv=cv_outer).mean():.3f}")
#         display(cv_results.head(4))

#     return cv_results
