In [None]:
import mglearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.decomposition import PCA
from sklearn.datasets import load_breast_cancer, load_digits, load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_validate
from sklearn.neural_network import MLPClassifier

In [None]:
# 1. Supervised_Learning_1_Basics
# 1. Supervised_Learning_2_k_Nearest_Neighbors
# 1. Supervised_Learning_3_Linear_Models
# 1. Supervised_Learning_4_Decision_Trees_Ensembles

In [None]:
# 1. Supervised_Learning_5_Support_Vector_Machines

########## SVR

# Fixed: the original line ended with a stray "]" which made the whole cell a SyntaxError.
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit the feature scaler on the training split only, then reuse the same scaling
# for the test split (exploring the test dataset is prohibited).
scalerX = StandardScaler()
scalerX.fit(X_train)
X_train_scaled = scalerX.transform(X_train)
X_test_scaled = scalerX.transform(X_test)

# The slack variables in SVR are affected by the target value, so the target must be scaled as well.
# (Prof. Seokho Kang) When training a predictive model for a regression problem, scaling the
# target label is recommended (except for models such as Decision Tree and k-NN, which are
# invariant to target-label scaling).
# StandardScaler expects 2-D input, hence the reshape to a single column.
scalerY = StandardScaler()
scalerY.fit(y_train.reshape(-1, 1))
y_train_scaled = scalerY.transform(y_train.reshape(-1, 1))
y_test_scaled = scalerY.transform(y_test.reshape(-1, 1))

reg = SVR()
# SVR.fit expects a 1-D target; passing the (n, 1) column triggers a DataConversionWarning.
reg.fit(X_train_scaled, y_train_scaled.ravel())

# For evaluation, predictions are inverse-transformed back to the original target scale
# and flattened so they have the same shape as y_train / y_test.
y_train_hat_scaled = reg.predict(X_train_scaled)
y_train_hat = scalerY.inverse_transform(y_train_hat_scaled.reshape(-1, 1)).ravel()

# Training metrics: MAE, RMSE, R^2
print(mean_absolute_error(y_train, y_train_hat))
print(mean_squared_error(y_train, y_train_hat) ** 0.5)  # square root of MSE gives RMSE
print(r2_score(y_train, y_train_hat))


y_test_hat_scaled = reg.predict(X_test_scaled)
y_test_hat = scalerY.inverse_transform(y_test_hat_scaled.reshape(-1, 1)).ravel()

# Test metrics: MAE, RMSE, R^2 (the original computed y_test_hat but never scored it)
print(mean_absolute_error(y_test, y_test_hat))
print(mean_squared_error(y_test, y_test_hat) ** 0.5)
print(r2_score(y_test, y_test_hat))

In [None]:
# 1. Supervised_Learning_6_Neural_Networks

# Like SVR, MLPRegressor needs the target (y) values to be scaled.

In [None]:
# 2. Unsupervised_Learning_1_Basics

In [None]:
# 2. Unsupervised_Learning_2_PCA

########## PCA as a preprocessing step for supervised learning (with k-NN)

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=42
)

# Standardize both splits using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Keep two principal components; if n_components is omitted, PCA keeps every
# component and no dimensionality reduction takes place.
pca = PCA(n_components=2).fit(X_train_scaled)
X_train_pca, X_test_pca = pca.transform(X_train_scaled), pca.transform(X_test_scaled)

# 3-nearest-neighbour classifier trained in the 2-D PCA space.
clf = KNeighborsClassifier(n_neighbors=3).fit(X_train_pca, y_train)
y_train_hat, y_test_hat = clf.predict(X_train_pca), clf.predict(X_test_pca)

# PCA provides an inverse transformation, so projected data can be mapped back
# (reconstructed) into the scaled feature space. Note that the input here is the
# PCA-projected data X_test_pca, NOT X_test_scaled.
X_test_rec = pca.inverse_transform(X_test_pca)

In [None]:
# 2. Unsupervised_Learning_3_tSNE

# In contrast to PCA, t-SNE cannot project new (test) data after fitting.

digits = load_digits()

tsne = TSNE(random_state=42)
# t-SNE exposes no transform method, so the embedding is obtained with
# fit_transform on the very data it is fitted to.
digits_tsne = tsne.fit_transform(
    digits.data
)

In [None]:
# 2. Unsupervised_Learning_4_kMeans_HC (lower priority)

# K-means can assign clusters to new, unseen data (it has a predict method).

iris = load_iris()
X_train, y_train = iris.data, iris.target

scaler = StandardScaler()
# Fixed: the original called fit_transform on X_train_scaled, a stale variable left over
# from an earlier cell, instead of on the X_train defined just above.
# For new data, call transform only (never re-fit the scaler).
X_train_scaled = scaler.fit_transform(X_train)

# random_state is fixed so that the clustering is reproducible across runs.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_train_scaled)

# Cluster centers mapped back to the original (unscaled) feature space.
print(scaler.inverse_transform(kmeans.cluster_centers_))

assignments_X_train_scaled = kmeans.labels_

# Fixed: X_new was never defined. These are illustrative samples with the four iris
# features (sepal length, sepal width, petal length, petal width); replace with real data.
X_new = [[5.0, 3.4, 1.5, 0.2],
         [6.5, 3.0, 5.2, 2.0]]
# New data must go through the SAME fitted scaler before predict, because the
# cluster centers live in the scaled feature space.
X_new_scaled = scaler.transform(X_new)
assignments_X_new = kmeans.predict(X_new_scaled)


# Agglomerative clustering, the representative hierarchical clustering method,
# cannot handle new data (it has no predict method).
agg = AgglomerativeClustering(n_clusters=3, linkage="ward")
agg.fit(X_train)

assignments_X_train = agg.labels_

In [None]:
# 2. Unsupervised_Learning_5_DBC
# NOTE: X_train is not defined in this cell; it is expected to exist already from
# an earlier cell of the notebook.

# Rescale every feature into the range [-1, 1] before density-based clustering.
scaler = MinMaxScaler(feature_range=(-1, 1))
X_train_scaled = scaler.fit(X_train).transform(X_train)

# DBSCAN with its default hyperparameters; fit returns the estimator itself.
dbscan = DBSCAN().fit(X_train_scaled)

# Cluster label of every training sample, as stored by the fitted estimator.
assignments_X_train_scaled = dbscan.labels_

In [None]:
# 3. Representing_Data_and_Engineering_Features

In [None]:
# 4. Model_Evaluation_and_Improvement (the most important part starts here!)

# Stratified k-Fold Cross-Validation with data scaling
iris = load_iris()
scaler = StandardScaler()
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # random state fixed

# Per-fold accuracy on the training part and on the held-out part.
score_train = []
score_test = []

for train_idx, test_idx in kfold.split(iris.data, iris.target):
    X_train = iris.data[train_idx]
    y_train = iris.target[train_idx]
    X_test = iris.data[test_idx]
    # Fixed: the original took y_test from iris.data (the features) instead of iris.target.
    y_test = iris.target[test_idx]

    # Re-fit the scaler inside every fold, on the fold's training part only,
    # so that no information from the held-out part leaks into preprocessing.
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(max_iter=1000, random_state=0)
    clf.fit(X_train_scaled, y_train)

    y_train_hat = clf.predict(X_train_scaled)
    y_test_hat = clf.predict(X_test_scaled)

    # Record the fold's scores (the original created the lists but never filled them).
    score_train.append(accuracy_score(y_train, y_train_hat))
    score_test.append(accuracy_score(y_test, y_test_hat))

In [None]:
# Grid Search with validation set

iris = load_iris()
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)
# Split the train+val set obtained above (NOT iris.data) into train and validation.
X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=1)

scaler = StandardScaler()
# During hyperparameter selection the model is compared against the validation set,
# so the scaler is fitted on X_train only.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
# The validation set plays the role of test data here, so it is scaled with the
# scaler fitted on X_train.
X_valid_scaled = scaler.transform(X_valid)

best_score = 0
# Start from an empty dict so the final SVC(**best_hyperparameters) falls back to the
# defaults instead of raising NameError if no combination scores above 0.
best_hyperparameters = {}

for gamma in [0.001, 0.01, 1, 10, 100]:
    for C in [0.001, 0.01, 1, 10, 100]:
        clf = SVC(gamma=gamma, C=C)
        clf.fit(X_train_scaled, y_train)

        # Never use X_test here: hyperparameters are chosen on the validation set only.
        y_valid_hat = clf.predict(X_valid_scaled)
        score = accuracy_score(y_valid, y_valid_hat)

        if score > best_score:
            best_score = score
            best_hyperparameters = {'C': C, "gamma": gamma}

# The validation set has done its job; merge it back into the training data and
# re-fit the scaler on the full train+val set.
scaler.fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

# Fixed: the original referenced the undefined name best_best_hyperparameters.
clf = SVC(**best_hyperparameters)
# Fixed: the original fitted on the unscaled X_trainval but predicted on scaled test data.
clf.fit(X_trainval_scaled, y_trainval)

y_test_hat = clf.predict(X_test_scaled)
test_score = accuracy_score(y_test, y_test_hat)

In [None]:
# Grid Search with Cross-Validation (with data scaling, without pipelines) -- the WRONG way
#
# This cell intentionally shows the leaky approach: the scaler is fitted once on the whole
# train+val set, so every validation fold inside the cross-validation has already
# influenced the scaling statistics.

iris = load_iris()
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)

scaler = StandardScaler()
# Strictly speaking the scaler should be fitted on the training part of each fold only...
scaler.fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
hyperparam_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(SVC(), hyperparam_grid, scoring="accuracy", refit=True, cv=kfold)
# Fixed: the original fitted on X_train_scaled / y_train_scaled, which are stale variables
# from other cells; the data prepared in this cell is X_trainval_scaled / y_trainval.
grid_search.fit(X_trainval_scaled, y_trainval)

In [None]:
# Nested Cross-Validation

iris = load_iris()

# Two sub-grids: an RBF kernel searched over C and gamma, and a linear kernel
# searched over C only.
hyperparam_grid = [
    dict(kernel=["rbf"],
         C=[0.001, 0.01, 0.1, 1, 10, 100],
         gamma=[0.001, 0.01, 0.1, 1, 10, 100]),
    dict(kernel=["linear"],
         C=[0.001, 0.01, 0.1, 1, 10, 100]),
]

# Separate but identically configured splitters for the inner (hyperparameter search)
# and outer (performance estimation) loops.
inner_kfold, outer_kfold = (
    StratifiedKFold(n_splits=5, shuffle=True, random_state=2) for _ in range(2)
)

# Inner loop: the grid search uses inner_kfold.
grid_search = GridSearchCV(
    SVC(),
    hyperparam_grid,
    scoring="accuracy",
    refit=True,
    cv=inner_kfold,
)

# Outer loop: the whole grid search is cross-validated as a single estimator.
scores = cross_validate(
    grid_search,
    iris.data,
    iris.target,
    scoring="accuracy",
    cv=outer_kfold,
    return_estimator=True,
    return_train_score=True,
)

In [None]:
# Nested Cross-validation with data scaling
# NOTE: this does NOT solve the leakage problem of the non-pipeline grid search: the scaler
# is still fitted on the whole train+val part of each outer fold, so the inner CV folds of
# GridSearchCV see scaling statistics computed from their own validation data. Solving it
# without a Pipeline would mean not using GridSearchCV for the inner loop.
iris = load_iris()
hyperparam_grid = [{"kernel": ["rbf"],
                    "C": [0.001, 0.01, 0.1, 1, 10, 100],
                    "gamma": [0.001, 0.01, 0.1, 1, 10, 100]},
                   {"kernel": ["linear"],
                    "C": [0.001, 0.01, 0.1, 1, 10, 100]}]
inner_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
outer_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

scaler = StandardScaler()
# Accuracy on the held-out part of every outer fold.
score_test = []

for trainval_idx, test_idx in outer_kfold.split(iris.data, iris.target):
    X_trainval = iris.data[trainval_idx]
    y_trainval = iris.target[trainval_idx]

    X_test = iris.data[test_idx]
    y_test = iris.target[test_idx]

    scaler.fit(X_trainval)
    # Fixed: the original wrote scaler.transform[X_trainval] (subscript instead of call),
    # which raises TypeError.
    X_trainval_scaled = scaler.transform(X_trainval)
    X_test_scaled = scaler.transform(X_test)

    grid_search = GridSearchCV(SVC(), hyperparam_grid, scoring="accuracy", refit=True, cv=inner_kfold)
    # Fixed: the original fitted on the unscaled X_trainval but predicted on scaled test data.
    grid_search.fit(X_trainval_scaled, y_trainval)

    y_test_hat = grid_search.predict(X_test_scaled)
    # Record the outer-fold score (the original created score_test but never filled it).
    score_test.append(accuracy_score(y_test, y_test_hat))

In [None]:
# Grid Search with Cross-Validation (with data scaling using pipelines)
from sklearn.pipeline import Pipeline

iris = load_iris()
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)

# Scaler and classifier are bundled, so inside GridSearchCV the scaler is re-fitted on the
# training part of every CV fold and the validation part never leaks into preprocessing.
pipe = Pipeline([("scaler", StandardScaler()), ("svm", SVC())])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
# Parameter keys are: step name + "__" (DOUBLE underscore) + hyperparameter name.
# Fixed: the original used "svm_gamma" (single underscore), which GridSearchCV rejects
# as an invalid parameter of the pipeline.
hyperparam_grid = {"svm__C": [0.001, 0.01, 0.1, 1, 10, 100],
                   "svm__gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(pipe, hyperparam_grid, scoring="accuracy", refit=True, cv=kfold)
grid_search.fit(X_trainval, y_trainval)

# Fixed: the original called the builtin format(value, format_spec) with a dict as the
# format spec, which raises TypeError and would not display anything useful.
print("Best CV accuracy: {:.3f}, best params: {}".format(grid_search.best_score_, grid_search.best_params_))

# With refit=True the best pipeline is re-fitted on all of X_trainval, so it can be scored
# directly on the raw (unscaled) test split; the pipeline applies the scaling itself.
print("Test accuracy: {:.3f}".format(grid_search.score(X_test, y_test)))

