# 1. Metric Learning
--------------
## Mahalanobis Metric for Clustering (MMC)

In [None]:
# Basic setting
import numpy as np
import matplotlib.pyplot as plt
import metric_learn
from metric_learn import MMC
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Generate a small two-cluster data set.
# n_samples and random_state are passed explicitly so that the same data is
# produced on every run (40 samples in total, split across the 2 centers).
n_samples = 40
random_state = 170
X, y = make_blobs(n_samples=n_samples, centers=2, random_state=random_state)

# First, we obtain labels (0 or 1) for this data set.
# KMeans is seeded as well so the cluster assignment is stable between runs.
kmeans = KMeans(n_clusters=2, random_state=random_state)
y_pred = kmeans.fit_predict(X)

# Visualization of the KMeans clustering (points colored by predicted cluster)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.show()

## [문제] 주어진 데이터로 MMC를 학습시킨 후, 학습된 MMC로 데이터를 변환하시오.

In [None]:
# Cluster assignment (0 or 1) of every sample from the fitted KMeans model
labels = kmeans.labels_

# Positions of the samples belonging to each cluster.
# np.nonzero on a 1-D mask returns a 1-tuple holding the index array.
A = np.nonzero(labels == 1)
B = np.nonzero(labels == 0)

(label_1,) = A
(label_0,) = B

# Build index pairs (ind_1[i], ind_2[i]):
#   first four pairs : both members taken from cluster 1       -> SIMILAR
#   last four pairs  : one member from cluster 1, one from 0   -> DISSIMILAR
# (assumes each cluster has enough members for the slices to be full)
ind_1 = np.concatenate([label_1[:4], label_1[10:14]])
ind_2 = np.concatenate([label_1[5:9], label_0[:4]])

# Stack the two point sets along a new axis 1, so pair[i] holds both
# points of the i-th pair: shape (n_pairs, 2, n_features).
pair = np.stack((X[ind_1], X[ind_2]), axis=1)

In [None]:
# Fit an MMC metric learner on labeled pairs, then transform the data with it.
# NOTE: this cell is an exercise template. It raises AttributeError until the
# "None" placeholders are replaced, because `mmc` is None when .fit is called.
### YOUR CODE HERE (Fill in the "None")
# Hint: define an MMC object that runs for 200 iterations
mmc = None
# Second argument: one label per pair -- 1 for the first four pairs and
# -1 for the last four.
mmc.fit(None, [1]*4+[-1]*4)

# Transform the data using the MMC we just fitted
X_transformed = None
### END OF YOUR CODE

In [None]:
# Scatter-plot the first two columns of the transformed data
first_dim, second_dim = X_transformed[:, 0], X_transformed[:, 1]
plt.scatter(first_dim, second_dim)
plt.show()

# 2. Ensemble Learning
--------------
## AdaBoostClassifier (Boosting)

In [None]:
# Basic library imports
from itertools import product
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split, KFold, cross_val_score
# For votingclassifier ensemble
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
# Support vector machine classifier - details in https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC
# StandardScaler for scaling the dataset
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier)
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Relative path of the CSV data set that is loaded with pd.read_csv below
Dataset_path = 'dataset/datasets_diabetes.csv'

In [None]:
# Read the CSV file at Dataset_path into a DataFrame
df = pd.read_csv(Dataset_path)
# Show the first rows to take a look at the data
# (kept as the last expression so the notebook displays it)
df.head()

In [None]:
# X: feature columns (everything except 'Outcome'), y: the 'Outcome' label
X = df.drop(columns=['Outcome'])
y = df['Outcome']
# Standardize the features for training
X = StandardScaler().fit_transform(X)
# Split into train and test sets at a 4:1 ratio (4 for training, 1 for test).
# stratify=y keeps the label proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

## [문제] 주어진 Training data를 AdaBoostClassifier로 학습하고 가장 높은 정확도를 가지는 estimator number를 출력하시오. 그리고 최적의 estimator 수를 사용한 AdaBoostClassifier의 feature importance를 그래프로 그리시오.
1. estimator의 수에 따른 prediction accuracy의 변화를 그리시오. 
2. Best estimator number와 Best prediction accuracy를 출력하시오. 
3. Best estimator number를 사용한 AdaBoostClassifier의 feature importance를 그래프로 그리시오. 

In [None]:
# Candidate numbers of estimators to evaluate: 1, 2, ..., 100
estimator = list(range(1, 101))
# Track the best number of estimators, its mean cross-validation score,
# and the corresponding model object
best_estimator = -1
max_score = -1
best_model = None
# Mean cross-validation score of every candidate, used for the trend plot
score_list = []
# The fold splitter does not depend on n_estimators, so it is built once
# instead of on every loop iteration
k_folds = KFold(n_splits=20)
for n_estimators in estimator:
    # Create AdaBoostClassifier with n_estimators (exercise: replace "None").
    # Hint: the number of estimators is configured by the 'n_estimators' parameter.
    # Hint: use 0 as the seed given to each base estimator at each boosting iteration.
    AdaBoost_clf = None
    results = cross_val_score(AdaBoost_clf, X_train, y_train, cv=k_folds)
    # Average the per-fold scores into a single score for this candidate
    score = results.mean()
    score_list.append(score)
    # Strict '>' keeps the smallest n_estimators when several candidates tie
    if score > max_score:
        max_score = score
        best_estimator = n_estimators
        best_model = AdaBoost_clf
plt.plot(estimator, score_list)
plt.xlabel('Number of estimators')
# Scores are plotted unscaled (not multiplied by 100), so no '%' in the label
plt.ylabel('Prediction accuracy')
plt.show()

In [None]:
# Print the best estimator number and the maximum score
# (exercise: replace the two "None" placeholders with the values found by
# the search loop)
print("Best Estimator number : {} / Best prediction accuracy : {}".format(None, None))

In [None]:
# Create AdaBoostClassifier (exercise: replace "None")
# Hint: the number of estimators giving the best performance is best_estimator.
AdaBoost_clf = None
AdaBoost_clf.fit(X_train, y_train)
fig, ax = plt.subplots()
# Feature names, selected the same way as the training features
# (every column except 'Outcome'), so the order matches the model's inputs
x = list(df.drop(columns=['Outcome']).columns)
# Plot AdaBoost classifier's feature importances (exercise: replace "None").
# The x values are feature names, not dates, so a plain categorical plot is
# used here rather than plot_date.
ax.plot(x, None, 'bo')
# Slant the x tick labels so the feature names stay readable
fig.autofmt_xdate()
plt.show()