In [1]:
%matplotlib inline 

import pandas as pd
import numpy as np
from sklearn import cluster, metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt
import matplotlib
# Apply the 'ggplot' stylesheet to every figure drawn in this notebook.
plt.style.use('ggplot')

## 1. Format the Data

In [2]:
# Read the airport dataset from disk and preview its first five rows.
ap = pd.read_csv(filepath_or_buffer='../../assets/datasets/airport2.csv')
ap.head(n=5)

Unnamed: 0,Airport,Year,Departure Cancellations,Arrival Cancellations,Departure Diversions,Arrival Diversions
0,ABQ,2004,242,235,71,46
1,ABQ,2005,221,190,61,33
2,ABQ,2006,392,329,71,124
3,ABQ,2007,366,304,107,45
4,ABQ,2008,333,300,79,42


Convert the class labels, in this case the airports, to numeric. *Remember*, we've truncated the airport dataset for simplicity to make this part easy!

In [None]:
def air_to_numeric(x):
    """Translate an airport code into its integer class label.

    'ABQ' maps to 1, 'ANC' to 2 and 'ATL' to 3. Any other value falls
    through every comparison and the function returns None.
    """
    code_labels = (('ABQ', 1), ('ANC', 2), ('ATL', 3))
    for code, label in code_labels:
        if x == code:
            return label
    return None

In [None]:
# Replace each airport code with its numeric label, element by element.
# NOTE(review): air_to_numeric returns None for anything other than the three
# known codes, so re-running this cell on already-converted values would
# blank the column — reload `ap` before re-running.
ap['Airport'] = ap['Airport'].map(air_to_numeric)

## 2. Plot the data

In [None]:
# Scatter of departure cancellations against arrival cancellations.
ap.plot.scatter(x='Departure Cancellations', y='Arrival Cancellations')

In [None]:
# Scatter of departure cancellations against departure diversions.
ap.plot.scatter(x='Departure Cancellations', y='Departure Diversions')

**Question**: Based on the graphs, how many clusters do you estimate that we have?

## 3.  Perform a K-Means Clustering 

#### 3.1 Split the dataset into the attribute data and class labels

In [None]:
# Split the frame by column position: columns 1-4 become the attribute data
# and column 0 becomes the class labels.
# `.ix` was removed in pandas 1.0; `.iloc` is the explicit positional indexer
# and selects the same columns `.ix` did here (the column labels are strings,
# so `.ix` fell back to positional lookup).
# NOTE(review): assumes column 0 is 'Airport' and columns 1-4 are the intended
# features — confirm against `ap.columns`.
x = ap.iloc[:, 1:5]
y = ap.iloc[:, 0]

In [None]:
# Convert the attribute DataFrame to a plain NumPy array for the estimators.
# `DataFrame.as_matrix` was removed in pandas 1.0; `.values` returns the same
# array and is available in every pandas version.
X = x.values

#### 3.2 Conduct the k-means clustering

In [None]:
# Fit k-means with one cluster per airport class.
k = 3
# Fix the seed: k-means initialisation is random, so without it the cluster
# assignments (and every metric computed from them below) change between runs.
kmeans = cluster.KMeans(n_clusters=k, random_state=0)
kmeans.fit(X)

#### 3.3 Compute the labels and centroids

In [None]:
# Pull the per-row cluster assignments and the cluster centres off the fitted model.
labels, centroids = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Show the cluster assignments followed by the centroid coordinates.
print(labels, centroids, sep='\n')

#### 3.4 Format the resulting predicted "y" variables and check the accuracy score, classification report, and confusion matrix

In [None]:
# Fraction of rows whose k-means cluster id equals the numeric airport label.
# NOTE(review): cluster ids are arbitrary and the airport labels are 1..3, so
# this score is only meaningful once clusters are mapped to classes.
metrics.accuracy_score(y_true=y, y_pred=labels)

In [None]:
# Per-class precision/recall/F1 of the cluster ids against the airport labels.
print(metrics.classification_report(y_true=y, y_pred=labels))

In [None]:
# Cross-tabulation of airport labels (rows) against cluster ids (columns).
print(metrics.confusion_matrix(y_true=y, y_pred=labels))

**Question**: What can we understand from these performance metrics?

## 4. Extra Trees Classifier

In [None]:
# Fit an extra-trees classifier that predicts the k-means cluster id from the
# attributes, so its feature importances can be inspected afterwards.
# Seeded because the trees are randomised; unseeded, the importances differ
# on every run.
trees = ExtraTreesClassifier(random_state=0)
trees.fit(X, labels)

In [None]:
# Relative importance of each attribute column for predicting the k-means cluster.
kmeans_importances = trees.feature_importances_
print(kmeans_importances)

## 5. KNN Classifier

In [None]:
# Train a default k-nearest-neighbours classifier to reproduce the k-means labels.
knn = KNeighborsClassifier()
knn.fit(X=X, y=labels)

In [None]:
# Expected values are the k-means labels; predictions come from the KNN model.
exp = labels
# Predict on the same NumPy array the model was fitted on. The original passed
# the DataFrame `x`, which is inconsistent with the fit call and makes newer
# scikit-learn versions warn about feature names.
predicted = knn.predict(X)

In [None]:
# Print the classification report, then the confusion matrix, for the KNN
# predictions against the k-means labels.
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(exp, predicted))

## 6. Hierarchical Clustering

#### 1. Create the linkage for clustering

In [None]:
# Build the hierarchical-clustering linkage matrix using Ward's method.
Z = linkage(X, method='ward')

In [None]:
# Cophenetic correlation: compares the linkage's cophenetic distances with the
# original pairwise distances between rows of X.
pairwise_dists = pdist(X)
c, coph_dists = cophenet(Z, pairwise_dists)
c

In [None]:
# Draw the complete dendrogram for the linkage matrix on a 30x10 figure.
plt.figure(figsize=(30, 10))
plt.title('Dendrogram')
plt.xlabel('Index Numbers')
plt.ylabel('Distance')
# Leaf labels are rotated to vertical and drawn in a small font.
full_tree_opts = dict(leaf_rotation=90., leaf_font_size=8.)
dendrogram(Z, **full_tree_opts)
plt.show()

In [None]:
# Draw a condensed dendrogram of the same linkage matrix.
plt.title('Truncated Dendrogram')
plt.xlabel('Index Numbers')
plt.ylabel('Distance')
truncated_opts = {
    'truncate_mode': 'lastp',   # truncate using the 'lastp' mode ...
    'p': 15,                    # ... with p set to 15
    'show_leaf_counts': False,
    'leaf_rotation': 90.,
    'leaf_font_size': 12.,
    'show_contracted': True,
}
dendrogram(Z, **truncated_opts)
plt.show()

In [None]:
# Cut the hierarchy into flat clusters using a distance threshold.
max_d = 30
clusters = fcluster(Z, t=max_d, criterion='distance')
clusters

In [None]:
# Scatter the first attribute column against the third, coloured by flat cluster id.
first_col, third_col = X[:, 0], X[:, 2]
plt.scatter(first_col, third_col, c=clusters, cmap='prism')
plt.show()

## Extra Trees Classifier

In [None]:
# Fit an extra-trees classifier that predicts the hierarchical cluster id from
# the attributes, so its feature importances can be inspected afterwards.
# Seeded because the trees are randomised; unseeded, the importances differ
# on every run.
trees2 = ExtraTreesClassifier(random_state=0)
trees2.fit(X, clusters)

In [None]:
# Relative importance of each attribute column for predicting the hierarchical cluster.
hierarchical_importances = trees2.feature_importances_
print(hierarchical_importances)

## KNN Classifier

In [None]:
# Train a default k-nearest-neighbours classifier to reproduce the hierarchical clusters.
knn2 = KNeighborsClassifier()
knn2.fit(X=X, y=clusters)

In [None]:
# KNN predictions on the training array, alongside the hierarchical cluster
# ids they are to be compared with.
predicted2 = knn2.predict(X)
exp2 = clusters

In [None]:
# Evaluate the second KNN model against the hierarchical cluster ids.
# The original cell reused `exp`/`predicted` from the k-means section, so it
# reprinted the earlier report instead of scoring `knn2`.
print(metrics.classification_report(exp2, predicted2))
print(metrics.confusion_matrix(exp2, predicted2))