# Data preprocessing

## MinMaxScaler

Usage: MinMaxScaler(feature_range=(min, max))
### 
Transformation:

X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

X_scaled = X_std * (max - min) + min

In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler

# iris: the Iris genus of flowering plants

# Load the bundled iris dataset and unpack its features and labels together.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [None]:
# Peek at the first 10 rows of the feature matrix.

X[:10]

In [None]:
# The labels that belong to those same first 10 rows.
y[:10]

In [None]:
# Names of the features stored in X.
# sepal: the outer, leaf-like part of the flower; petal: the inner, usually colored part
iris.feature_names

In [None]:
# Names of the target classes encoded in y.
iris.target_names

### 

In [None]:
# With no arguments the scaler uses its default feature_range, i.e. it maps to (0, 1).
scaler = MinMaxScaler()

In [None]:
# Learn the per-column minimum and maximum of X that the transformation formula needs.
scaler.fit(X)

In [None]:
# Apply the scaling learned by fit() to the same data.
X_transformed = scaler.transform(X)

In [None]:
# Sanity check: column-wise minimum of the scaled data (expected to be 0 for every feature).
X_transformed.min(axis=0)

In [None]:
# Sanity check: column-wise maximum of the scaled data (expected to be 1 for every feature).
X_transformed.max(axis=0)

Combining fit and transform in one line

In [None]:
# fit_transform() fits the scaler on X and returns the scaled X in a single call.
# NOTE(review): the result is stored as X_transform, while the two-step version earlier
# in this notebook used X_transformed; confirm whether the different name is intentional.
X_transform = scaler.fit_transform(X)

Fit one part of the data, transform another part of the data

In [None]:
# The first 100 rows are used to fit the scaler; the remaining rows are only transformed.
X_a, X_b = X[:100], X[100:]

# Only the fitted state matters here, so the scaled X_a is thrown away;
# calling scaler.fit(X_a) instead would leave the scaler in the same state.
_ = scaler.fit_transform(X_a)

# X_b is scaled with the minimum/maximum learned from X_a, not with its own.
X_b_transformed = scaler.transform(X_b)

In [None]:
# Column-wise minimum of the scaled X_b; the scaler was fitted on X_a, so this need not be 0.
X_b_transformed.min(axis=0)

In [None]:
# Column-wise maximum of the scaled X_b; for the same reason this need not be 1.
X_b_transformed.max(axis=0)

inverse_transform: 逆變換

In [None]:
# Map the scaled values back to the original units and show the first 10 rows.
scaler.inverse_transform(X_b_transformed)[:10]

In [None]:
# Check whether X_b and scaler.inverse_transform(X_b_transformed) are identical:
# these are the first 10 original rows to compare with the output of the previous cell.

X_b[:10]

## StandardScaler

Usage: StandardScaler(with_mean=True, with_std=True)
### 
Transformation:

z = (x - u) / s

where u is the mean of the training samples or zero if with_mean=False, and s is the standard deviation of the training samples or one if with_std=False.

In [None]:
from sklearn.preprocessing import StandardScaler

# Rebinds `scaler`, replacing the MinMaxScaler instance used in the cells above.
scaler = StandardScaler()

# Exercise: how to fit and transform on the iris dataset?

## Dimensionality reduction: 

1. Principal component analysis
2. Singular value decomposition

作用: 

    1. 避免過擬合(overfitting)和加快訓練過程。
    
    2. 畫圖。

### PCA

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 
# Display the PCA illustration hosted on Wikimedia Commons (the image is referenced by URL).
Image(url= "https://upload.wikimedia.org/wikipedia/commons/f/f5/GaussianScatterPCA.svg")

PCA 搭配 iris 實際操作

In [None]:
from sklearn.decomposition import PCA

# Keep two principal components; n_components cannot exceed min(n_samples, n_features).
pca = PCA(n_components=2)

# sklearn's PCA centers the data itself (it subtracts the per-feature mean) before decomposing.
pca.fit(X)

In [None]:
# Variance of the data along each of the new (principal) axes.
pca.explained_variance_

In [None]:
# Fraction of the total variance captured by each component; one component may be sufficient.
pca.explained_variance_ratio_

In [None]:
# Exercise: how to apply PCA when it is fitted on one part of the data and used on another?

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; random_state pins the shuffle so the split is repeatable.
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)

###

The iris data set is too small, so let us use a bigger one: MNIST.

784 features (28 * 28)

In [None]:
# Display the illustration for the MNIST section (remote image referenced by URL).
Image(url= "https://camo.githubusercontent.com/01c057a753e92a9bc70b8c45d62b295431851c09cffadf53106fc0aea7e2843f/687474703a2f2f692e7974696d672e636f6d2f76692f3051493378675875422d512f687164656661756c742e6a7067")

In [None]:
from sklearn.datasets import fetch_openml
# Download MNIST from OpenML. as_frame=True is requested explicitly because later cells
# rely on pandas objects (data.head(), mnist.data.values, mnist.target.values).
mnist = fetch_openml('mnist_784', as_frame=True)

In [None]:
# Feature table of MNIST (the 784 features mentioned above).
data = mnist.data

In [None]:
# Show the first few rows.
data.head()

In [None]:
# Count the non-missing entries of every column.
data.count()

In [None]:
# Rebind `pca`: compress the features down to 50 principal components.
pca = PCA(n_components=50)

pca.fit(data)

In [None]:
# `pca` was already fitted on `data` in the previous cell, so project with transform()
# instead of paying for a second fit on the same data via fit_transform().
data_pca = pca.transform(data)

In [None]:
# Shape of the projected data, which includes the new feature dimensionality.
data_pca.shape

In [None]:
# Variance along each of the 50 principal axes.
pca.explained_variance_

In [None]:
# Fraction of the total variance captured by each component.
pca.explained_variance_ratio_

In [None]:
import numpy as np

# Running total of the ratios: how much variance the first k components retain together.
np.cumsum(pca.explained_variance_ratio_)

In [None]:
# Hold out 20% of the MNIST rows, using the same fixed seed as the iris split.
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

### 操作練習

### SVD

In [None]:
# Display the illustration for the SVD section (remote image referenced by URL).
Image(url='https://www.oreilly.com/library/view/mastering-numerical-computing/9781788993357/assets/7afa6d97-21e5-40bf-8138-43a5c61c5f69.png')

SVD + iris 實際操作

In [None]:
from sklearn.decomposition import TruncatedSVD

# Same workflow as the PCA cells: keep 2 components, this time with TruncatedSVD.
svd = TruncatedSVD(n_components=2)

# X is still the iris feature matrix at this point.
svd.fit(X)

In [None]:
# Variance associated with each of the 2 SVD components.
svd.explained_variance_

In [None]:
# Fraction of the total variance associated with each component.
svd.explained_variance_ratio_

You can also do this on mnist...

## Machine learning

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with scikit-learn's default hyperparameters.
lr = LogisticRegression()

# Reload iris so this section starts from the original features and labels.
iris = datasets.load_iris()

X = iris.data
y = iris.target

# Hold out 20% of the samples as unseen data, like the earlier splits in this notebook.
# stratify=y keeps the class proportions equal in both parts, and random_state makes
# the split (and therefore the reported scores) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

lr.fit(X_train, y_train)

# Predictions for the held-out samples only.
y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

# Summarise the held-out predictions per class (true labels vs. predicted labels).
confusion_matrix(y_test, y_pred)

In [None]:
# recall_score(y_test, y_pred, average='macro')
# recall_score(y_test, y_pred, average='micro')

Apply PCA on machine learning

In [None]:
# Rebind `pca`: keep only two principal components as classifier inputs.
pca = PCA(n_components=2)

# Fit the projection on the training part only, then reuse it unchanged on the test part,
# so nothing about the test samples influences the learned components.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Refit the same `lr` estimator, now on the reduced features.
lr.fit(X_train_pca, y_train)

y_pred = lr.predict(X_test_pca)

confusion_matrix(y_test, y_pred)

In [None]:
# A single held-out split may just have been lucky; cross-validation rotates the
# test part so that every sample is used as unseen data exactly once.

from sklearn.model_selection import KFold, StratifiedKFold

# KFold: divides all the samples into k groups of samples, called folds, of equal sizes
# (if possible); if k = n, this is equivalent to the Leave One Out strategy. The
# prediction function is learned using k - 1 folds, and the fold left out is used for test.
#
# StratifiedKFold: a variation of k-fold which returns stratified folds: each set contains
# approximately the same percentage of samples of each target class as the complete set.

kf = KFold(n_splits=5, shuffle=True)

# Show the training indices of each of the 5 splits.
for train_index, test_index in kf.split(X):
    print(train_index)

In [None]:
# Collect out-of-fold predictions: every sample is predicted once, by the model
# that did not see it during training.
y_true = y
y_pred = np.zeros_like(y)

for train_index, test_index in kf.split(X):
    X_train = X[train_index]
    y_train = y[train_index]

    # PCA is refitted on the training rows of this fold only.
    X_train_pca = pca.fit_transform(X_train)

    lr.fit(X_train_pca, y_train)

    # update y_pred: project the held-out rows with the fold's PCA and store
    # their predictions at the matching positions
    X_test_pca = pca.transform(X[test_index])
    y_pred[test_index] = lr.predict(X_test_pca)

In [None]:
# Confusion matrix over all samples, built from the out-of-fold predictions.
confusion_matrix(y_true, y_pred)

In [None]:
# Same procedure as the KFold cell, but the splitter also receives y so that it can
# stratify the folds by class.
skf = StratifiedKFold(n_splits=5, shuffle=True)

y_true = y
y_pred = np.zeros_like(y)

for train_index, test_index in skf.split(X, y):
    X_train = X[train_index]
    y_train = y[train_index]

    # PCA is refitted on the training rows of this fold only.
    X_train_pca = pca.fit_transform(X_train)

    lr.fit(X_train_pca, y_train)

    # update y_pred: store the predictions for the held-out rows of this fold
    X_test_pca = pca.transform(X[test_index])
    y_pred[test_index] = lr.predict(X_test_pca)

# Confusion matrix over all samples, built from the out-of-fold predictions.
confusion_matrix(y_true, y_pred)

### 

model_selection: what is the connection between model_selection and kfold?

A machine learning algorithm usually involves multiple hyperparameters (sometimes a lot of them). Here is an example from logistic regression:

class sklearn.linear_model.LogisticRegression(penalty='l2', * , dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)

The n_components parameter of SVD and PCA shown above is also a hyperparameter.

We can use kfold to select the best combination of hyperparameters.

### 
Iris is too simple, so let us use MNIST as our example.

In [None]:
from sklearn.datasets import fetch_openml
# The dataset is not downloaded again: `mnist` from the earlier fetch_openml cell is reused.
# mnist = fetch_openml('mnist_784') 

# Rebind X and y (previously iris) to the MNIST features and labels via `.values`.
X = mnist.data.values
y = mnist.target.values

In [None]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
from sklearn.ensemble import RandomForestClassifier

# Manual grid search over (PCA n_components, random-forest max_depth), scored by the
# accuracy of 5-fold stratified out-of-fold predictions.
acc_max = 0
best_params = {}

list_of_n_components = [50, 100, 150, 200, 250, 300, 350, 400]
list_of_max_depth = [3, 7, 9, 11]

rf = RandomForestClassifier(random_state=0, n_jobs=-1)

# A fixed random_state yields the same folds for every hyperparameter combination,
# so the accuracies being compared come from identical train/test partitions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

y_true = y

for n_components in list_of_n_components:

    pca = PCA(n_components=n_components)

    for max_depth in list_of_max_depth:

        # Out-of-fold predictions for this combination, filled in fold by fold.
        y_pred = np.zeros_like(y)

        rf.set_params(max_depth=max_depth)

        for train_index, test_index in skf.split(X, y):
            X_train = X[train_index]
            y_train = y[train_index]

            # PCA is refitted on the training rows of this fold only.
            X_train_pca = pca.fit_transform(X_train)

            # Train the estimator whose max_depth is being tuned.
            rf.fit(X_train_pca, y_train)

            # update y_pred: predict the held-out rows with that same estimator
            X_test_pca = pca.transform(X[test_index])
            y_pred[test_index] = rf.predict(X_test_pca)

        acc = accuracy_score(y_true, y_pred)

        # Report and remember every new best combination.
        if acc > acc_max:
            print(f"accuracy = {acc}, max_depth = {max_depth}, n_components = {n_components}")
            acc_max = acc
            best_params['accuracy'] = acc
            best_params['max_depth'] = max_depth
            best_params['n_components'] = n_components