In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import sklearn.metrics as metrics
import sklearn.cluster as cluster

import seaborn as sbn
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics.cluster import pair_confusion_matrix


# Echo the module docstring. No docstring is defined in the visible cells, so
# this prints whatever the interactive environment has bound to __doc__.
print(__doc__)

In [2]:
# #############################################################################
# Fetch the LFW faces (downloaded only if not already on disk), keeping people
# with at least 70 images and resizing every image by a factor of 0.2.
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.2)

# The image stack is (n_samples, height, width); h and w are reused for plotting.
n_samples, h, w = lfw_people.images.shape

# Feature matrix: the pixel data is used directly, so the relative position of
# pixels inside the image is ignored by the model.
X = lfw_people.data
n_features = X.shape[1]

# Ground truth: the id of the person on each image, plus the display names.
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("Sample size: %d" % n_samples, h, w)
print("Features: %d" % n_features)
print("Total Labels: %d" % n_classes)
print("")
print("Target Name and no of sample images:")
# One line per person: name and number of images carrying that person's id.
for person_id, person_name in enumerate(lfw_people.target_names):
    print("{} has {} samples".format(person_name, (y == person_id).sum()))

# #############################################################################
# Preview: draw the first images of the dataset on a 4x4 grid, each one
# captioned with the name of the person it shows.
fig, ax = plt.subplots(4, 4)
plt.subplots_adjust(wspace=0.8, hspace=0.5)

for idx, panel in enumerate(ax.flat):
    person_id = lfw_people.target[idx]
    panel.imshow(lfw_people.images[idx], cmap='gray')
    # Hide the tick marks; the name goes below the picture as the x label.
    panel.set_xticks([])
    panel.set_yticks([])
    panel.set_xlabel(lfw_people.target_names[person_id])
plt.show()

# #############################################################################
# Scaling the data between 0 and 1
#
# NOTE(review): the pixel range returned by fetch_lfw_people appears to depend
# on the scikit-learn version (0-255 in some releases, already 0-1 in others)
# - confirm for the installed version. Dividing only when values exceed 1
# keeps X in [0, 1] either way and makes this step safe to apply repeatedly.
if X.max() > 1:
    X = X / 255

In [3]:
# Tabular view of the data: one row per sample, one column per feature of X,
# plus the ground-truth person id in the "target" column.
df = pd.DataFrame(X).assign(target=y)
df

In [4]:
#  Instantiate Kmeans and fit it once
#
# fit() returns the estimator itself, so `y_kmeans` is the fitted model (the
# name is kept for compatibility); the per-sample cluster ids live in
# `labels_`, which is what fit_predict() would return for the same fit.
kmeans = KMeans(n_clusters = 7, init = "k-means++", random_state = 10)
y_kmeans = kmeans.fit(X)
clus = kmeans.cluster_centers_
y_km = kmeans.labels_

# Only two features can be shown in a 2-D scatter. The SAME pair of feature
# columns is used for the samples and for the centroids so that the stars are
# drawn in the coordinate system of the points.
feat_x, feat_y = 0, 2
plt.scatter(X[:, feat_x], X[:, feat_y], c=y_km, cmap='rainbow')
plt.scatter(clus[:, feat_x], clus[:, feat_y], marker='*', s=200, color='black')
plt.xlabel("Feature %d" % feat_x)
plt.ylabel("Feature %d" % feat_y)
plt.show()

In [5]:
# Put the cluster id that the fitted `kmeans` model assigned to every sample
# into a "Cluster" column, next to the ground-truth "target" column.
df = df.assign(Cluster=kmeans.labels_)
df

In [6]:
# Per-person breakdown: for every true identity, how many of that person's
# images ended up in each KMeans cluster.
# The target ids are hard-coded and assumed to follow the order of
# `target_names` - verify against the "Target Name" listing printed earlier.
ariel_sharon = df[df["target"] == 0]
colin_powell = df[df["target"] == 1]
donald_rumsfeld = df[df["target"] == 2]
george_bush = df[df["target"] == 3]
gerhard_schroeder = df[df["target"] == 4]
hugo_chavez = df[df["target"] == 5]
tony_blair = df[df["target"] == 6]

# Cluster histogram of every subset above (all seven persons).
count_of_ariel_sharon = ariel_sharon['Cluster'].value_counts()
count_of_colin_powell = colin_powell['Cluster'].value_counts()
count_of_donald_rumsfeld = donald_rumsfeld['Cluster'].value_counts()
count_of_george_bush = george_bush['Cluster'].value_counts()
count_of_gerhard_schroeder = gerhard_schroeder['Cluster'].value_counts()
count_of_hugo_chavez = hugo_chavez['Cluster'].value_counts()
count_of_tony_blair = tony_blair['Cluster'].value_counts()

# One report per person, in target-id order.
for heading, cluster_counts in [
    ("Ariel Sharon", count_of_ariel_sharon),
    ("Colin Powell", count_of_colin_powell),
    ("Donald Rumsfeld", count_of_donald_rumsfeld),
    ("George Bush", count_of_george_bush),
    ("Gerhard Schroeder", count_of_gerhard_schroeder),
    ("Hugo_Chavez", count_of_hugo_chavez),
    ("Tony Blair", count_of_tony_blair),
]:
    print(heading + ":\nClust Count\n", cluster_counts)


## Confusion Matrix 
clus_con_matrix = confusion_matrix(y,y_km)

%matplotlib inline
plt.figure(figsize=(10,10))
plt.title('Confusion Matrix for CNN')
sbn.heatmap(clus_con_matrix, cmap="OrRd", annot=True,
            cbar_kws={"label":"Color Bar"}, fmt='d',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted Value')
plt.ylabel('True Value')
plt.show()

print(classification_report(y, y_km, target_names=target_names))

# #############################################################################
# Qualitative evaluation of the predictions using matplotlib

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits.

    Opens a new figure and fills an ``n_row`` x ``n_col`` grid, one panel per
    image, in order. When fewer images or titles than grid cells are given,
    the remaining cells are left empty.

    Parameters
    ----------
    images : sequence of arrays
        Each item is reshaped to ``(h, w)`` before being displayed.
    titles : sequence of str
        Caption for the image at the same position.
    h, w : int
        Height and width used to reshape every image.
    n_row, n_col : int, optional
        Grid dimensions (default 3 x 4).
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Never index past the data that was actually supplied.
    n_panels = min(n_row * n_col, len(images), len(titles))
    for i in range(n_panels):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        # Empty tuples remove the tick marks on both axes.
        plt.xticks(())
        plt.yticks(())


# plot the result

def title(y_km, y, target_names, i):
    """Build the two-line caption for sample ``i``: predicted vs. true name.

    ``y_km[i]`` and ``y[i]`` are used as indices into ``target_names``; only
    the part of each name after its last space is shown.
    """
    def last_word(full_name):
        return full_name.rsplit(' ', 1)[-1]

    predicted = last_word(target_names[y_km[i]])
    actual = last_word(target_names[y[i]])
    return 'predicted: %s\ntrue:      %s' % (predicted, actual)

# KMeans cluster ids are arbitrary numbers, not person ids, so they cannot be
# looked up in target_names directly. Each cluster is first translated into
# the person that is most frequent inside it (majority vote).
# Assumes y and y_km both use ids 0..k-1, so confusion-matrix indices equal
# the ids themselves.
cluster_to_person = confusion_matrix(y, y_km).argmax(axis=0)
y_km_as_person = cluster_to_person[y_km]

prediction_titles = [title(y_km_as_person, y, target_names, i)
                     for i in range(y_km.shape[0])]

plot_gallery(X, prediction_titles, h, w)



In [7]:
# Elbow curve to determine optimal clusters
#
# For every candidate cluster count in K, fit KMeans and record the
# within-cluster sum of squares (inertia_). The estimator inside the loop has
# its own name so that the fitted `kmeans` model, which the "Cluster" column
# was derived from, is not replaced by the last model of this sweep.

wcss=[]
K=range(1,7)
for k in K:
    elbow_model = KMeans(n_clusters=k, init='k-means++', random_state=42)
    elbow_model.fit(X)
    wcss.append(elbow_model.inertia_)

# Plot against K itself so the x-axis always matches the sweep range.
plt.plot(list(K), wcss)
plt.xticks(list(K))
plt.title('The Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

# Ground-truth person id of every sample, for reference.
print(df.target)

In [8]:
# Tabulate the elbow sweep: one row per candidate cluster count together with
# the WCSS recorded for it.
elbow_table = {'Clusters' : K, 'WCSS' : wcss}
mycenters = pd.DataFrame(data=elbow_table)
mycenters

In [9]:
# Silhouette score is used to evaluate the quality of clusters
#
# For each cluster count from 2 to 7: fit KMeans, then draw (left) the sorted
# per-sample silhouette values grouped by cluster and (right) the samples and
# centroids projected on the first two features of X.

for n_clusters in range(2,8):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot.
    # Silhouette coefficients range over [-1, 1]; the x-axis is restricted to
    # [-0.1, 1], so values below -0.1 are not visible in this plot.
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # Compute the silhouette scores for each sample.
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    # The average over all samples is what silhouette_score(X, labels) returns
    # with its default arguments, so it is derived from the per-sample values
    # instead of computing all pairwise distances a second time.
    # This gives a perspective into the density and separation of the formed
    # clusters.
    silhouette_avg = np.mean(sample_silhouette_values)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot: leave a gap of 10 rows
        # between consecutive clusters.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed (first two features only)
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='red')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=250, edgecolor='red')

    # Write each cluster's number inside its white circle.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='red')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()

In [10]:
#  Instantiate Kmeans with 3 clusters and fit it once
#
# fit() returns the estimator itself, so `y_kmeans` is the fitted model (the
# name is kept for compatibility); the per-sample cluster ids live in
# `labels_`, which is what fit_predict() would return for the same fit.
kmeans = KMeans(n_clusters = 3, init = "k-means++", random_state = 10)
y_kmeans = kmeans.fit(X)
clus = kmeans.cluster_centers_
y_km = kmeans.labels_

# Only two features can be shown in a 2-D scatter. The SAME pair of feature
# columns is used for the samples and for the centroids so that the stars are
# drawn in the coordinate system of the points.
feat_x, feat_y = 0, 2
plt.scatter(X[:, feat_x], X[:, feat_y], c=y_km, cmap='Wistia')
plt.scatter(clus[:, feat_x], clus[:, feat_y], marker='*', s=200, color='black')
plt.xlabel("Feature %d" % feat_x)
plt.ylabel("Feature %d" % feat_y)
plt.show()