In [None]:
import sklearn.datasets
# conda install -c conda-forge scikit-learn
from sklearn import neighbors, tree, cluster
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt 
import numpy as np
import itertools as it

In [None]:
def draw_confusion_matrix(y_test, y_pred, display_labels=None):
    """Plot the confusion matrix of true labels against predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels; forwarded as the first argument of
        ``confusion_matrix``.
    y_pred : array-like
        Predicted labels; forwarded as the second argument of
        ``confusion_matrix``.
    display_labels : sequence, optional
        Tick labels handed to ``ConfusionMatrixDisplay``.
    """
    display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, y_pred),
        display_labels=display_labels,
    )
    display.plot()
    plt.show()

In [None]:
def draw_scatter(data, labels, x_feature, y_feature, feature_names=None):
    """Scatter-plot two feature columns of ``data``, colored by ``labels``.

    Parameters
    ----------
    data : 2-D array
        Feature matrix, indexed as ``data[:, column]``.
    labels : array-like
        One color value per row; passed to ``plt.scatter`` as ``c``.
    x_feature, y_feature : int
        Column indices plotted on the x and y axes.
    feature_names : sequence of str, optional
        Axis-label names indexed by column. When omitted, the notebook-level
        ``iris.feature_names`` is used, which keeps existing calls working.
    """
    if feature_names is None:
        # Backward-compatible default: relies on the global `iris` having been
        # loaded before the first call.
        feature_names = iris.feature_names
    plt.scatter(data[:, x_feature], data[:, y_feature], c=labels, alpha=0.5, s=100)
    plt.xlabel(feature_names[x_feature])
    plt.ylabel(feature_names[y_feature])
    plt.show()

## Loading example data: Iris plants dataset
https://scikit-learn.org/stable/datasets/toy_dataset.html#iris-dataset

In [None]:
# Load the bundled Iris dataset; later cells read its .data, .target,
# .feature_names and .target_names attributes.
iris = sklearn.datasets.load_iris()

In [None]:
# Show which container type load_iris() returned.
type(iris)

In [None]:
# Shape of the feature matrix, followed by its first five rows.
iris.data.shape, iris.data[:5]

In [None]:
# Shape of the label vector, plus the distinct label values and how many
# samples carry each one (return_counts=True).
iris.target.shape, np.unique(iris.target, return_counts=True)

### Plot feature combinations + labels

In [None]:
# One scatter plot per unordered pair of features, colored by the true labels.
indexed_names = enumerate(iris.feature_names)
for (x_idx, x_name), (y_idx, y_name) in it.combinations(indexed_names, 2):
    print((x_idx, x_name), (y_idx, y_name))
    draw_scatter(iris.data, iris.target, x_idx, y_idx)

## ML Technique 1: Supervised Learning
https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html  
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html  

In [None]:
### Instantiate K-NN Classifier
# Note: k-NN does not construct an internal model; it simply stores the training instances.
# The class is computed from a simple majority vote of the nearest neighbors of each point.
# clf = neighbors.KNeighborsClassifier(n_neighbors=1)

In [None]:
# Instantiate a Decision Tree classifier with scikit-learn's default
# hyperparameters (no arguments are passed). It is not fitted in this cell.
clf = tree.DecisionTreeClassifier()

In [None]:
# Display the classifier's repr as the cell output.
clf

In [None]:
# Print the feature names, the class names and the integer label vector,
# one captioned line each.
metadata_rows = (
    ('feature names : ', iris.feature_names),
    ('label names: ', iris.target_names),
    ('labels : ', iris.target),
)
for caption, values in metadata_rows:
    print(caption, values)

### Cross-validation and model evaluation
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html  
https://scikit-learn.org/stable/modules/model_evaluation.html  

In [None]:
# 5-fold splitter. shuffle=True with a fixed random_state shuffles the samples
# before splitting while keeping the folds repeatable between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=123)

In [None]:
# K-fold construction (fixed seed so the shuffled folds are the same on every run)
kf = KFold(n_splits=5, shuffle=True, random_state=123)

# One entry per max_depth: the mean accuracy over the folds.
cv_scores = []

# K-fold cross validation and performance evaluation
for max_depth in [5, 10, 15]:
    # Re-created for every depth, so after the loop it holds the folds of the
    # last depth only.
    accuracy_scores = []
    for train_index, test_index in kf.split(iris.data):
        print("TRAIN:", len(train_index), "TEST:", len(test_index))
        X_train, X_test = iris.data[train_index], iris.data[test_index]
        y_train, y_test = iris.target[train_index], iris.target[test_index]

        # Fresh tree per fold. random_state is pinned because the tree permutes
        # features at each split, so equally good splits could otherwise be
        # resolved differently between runs.
        clf = tree.DecisionTreeClassifier(max_depth=max_depth, random_state=123)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # scikit-learn metrics take (y_true, y_pred), in that order.
        acc = accuracy_score(y_test, y_pred)

        print ('Prediction: ', y_pred)
        print ('Correct:    ', y_test)
        print ('Accuracy:', acc)
        print (classification_report(y_test, y_pred, target_names=iris.target_names))
        # True labels go first: the helper's signature is (y_test, y_pred), and
        # swapping them transposes the plotted matrix.
        draw_confusion_matrix(y_test, y_pred, iris.target_names)
        accuracy_scores.append(acc)
    cv_scores.append({'acc': np.mean(accuracy_scores), 'max_depth' : max_depth})
    
        

In [None]:
# Mean cross-validated accuracy recorded for each max_depth tried.
cv_scores

In [None]:
# Mean +/- standard deviation of the fold accuracies currently in accuracy_scores.
# NOTE(review): accuracy_scores is re-created for every max_depth in the
# cross-validation loop, so this summarizes only the last depth evaluated;
# confirm that is intended.
'{:.2f} +/- {:.3f}'.format(np.mean(accuracy_scores), np.std(accuracy_scores))

## ML Technique 2: Unsupervised Learning
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
# Unsupervised k-means with k = 2.
# Centroid initialization is random, so random_state is fixed to keep the
# cluster assignments (and the plots built from them) reproducible.
k_means = cluster.KMeans(n_clusters=2, random_state=123)
k_means.fit(iris.data)
# Cluster index assigned to each sample.
k_means_label = k_means.labels_

### Plot feature combinations + clusters (k = 2)

In [None]:
# One scatter plot per unordered pair of features, colored by k-means cluster.
indexed_names = enumerate(iris.feature_names)
for (x_idx, x_name), (y_idx, y_name) in it.combinations(indexed_names, 2):
    print((x_idx, x_name), (y_idx, y_name))
    draw_scatter(iris.data, k_means_label, x_idx, y_idx)

In [None]:
# Unsupervised k-means with k = 4.
# Centroid initialization is random, so random_state is fixed to keep the
# cluster assignments (and the plots built from them) reproducible.
k_means = cluster.KMeans(n_clusters=4, random_state=123)
k_means.fit(iris.data)
# Cluster index assigned to each sample.
k_means_label = k_means.labels_

### Plot feature combinations + clusters (k = 4)

In [None]:
# One scatter plot per unordered pair of features, colored by k-means cluster.
indexed_names = enumerate(iris.feature_names)
for (x_idx, x_name), (y_idx, y_name) in it.combinations(indexed_names, 2):
    print((x_idx, x_name), (y_idx, y_name))
    draw_scatter(iris.data, k_means_label, x_idx, y_idx)

### Unsupervised learning evaluation: elbow plot

In [None]:
# Elbow plot: fit k-means for each candidate k and plot the resulting inertia
# (the fitted model's inertia_ attribute) against k.
k_values = list(range(2, 15))

# random_state is fixed so the curve is the same on every run; fit() returns
# the fitted estimator, so inertia_ can be read directly from the call chain.
inertias = [
    cluster.KMeans(n_clusters=k, random_state=123).fit(iris.data).inertia_
    for k in k_values
]

fig = plt.figure(figsize=(15, 5))
plt.plot(k_values, inertias)
plt.grid(True)
plt.title('Elbow curve')
plt.xticks(k_values)
plt.xlabel('K')
# Assigning to _ suppresses the Text repr in the cell output.
_ = plt.ylabel('Inertia')

In [None]:
# Sources:
# http://www.astro.washington.edu/users/vanderplas/Astr599/notebooks/18_IntermediateSklearn
# http://matplotlib.org/examples/pylab_examples/scatter_star_poly.html
# http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html
# http://scikit-learn.org/stable/auto_examples/plot_confusion_matrix.html#example-plot-confusion-matrix-py
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html