Importing libraries and data

In [52]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#for scaling
from sklearn.preprocessing import StandardScaler

#for clustering. we will use scipy today
from sklearn.cluster import KMeans  #using scikit-learn
from scipy.cluster.vq import kmeans, vq  #using scipy


# Read the lecture dataset from disk; the 'Country' column becomes the row index.
data = pd.read_csv(
    filepath_or_buffer='Lecture8.csv',
    index_col='Country',
)

Display all columns and rows, adjust image size

In [67]:
# Lift pandas' truncation limits so whole DataFrames are shown, and widen text output.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Default size and resolution applied to every figure drawn after this cell.
from pylab import rcParams
rcParams.update({
    'figure.figsize': (12, 6),
    'figure.dpi': 300,
})

Standard scaling of data: turn values into z-scores

In [None]:
# Standardise every feature with StandardScaler so that all columns are on a
# comparable scale before distances are computed.
scaler = StandardScaler()

# Fit and transform exactly once (an earlier duplicate call threw its result away).
# fit_transform returns a bare array, so wrap it again to keep the column names
# and the country index.
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)


Visualization with PCA - how likely will we get a good clustering outcome?


In [None]:
#Reduce data to 2D with PCA
from sklearn.decomposition import PCA as sklearnPCA

pca = sklearnPCA(n_components=2) #specify no. of components

#fit once and, for convenience, save the reduced data to a new dataframe
reduced_data = pd.DataFrame(pca.fit_transform(data), columns=['Dim_1','Dim_2'], index=data.index)

# A notebook cell only echoes its LAST expression, so the diagnostics are printed
# explicitly; otherwise they would be computed and silently discarded.
explained = pca.explained_variance_ratio_
#how much variance each component explains
print('Variance explained by each component:', explained)
#cumulative variance explained by all components
print('Cumulative variance explained:', explained.cumsum())
#information loss = share of variance not captured by the retained components
print('Information loss:', 1 - explained.cumsum()[-1])

#plot reduced data (data passed by keyword, like the t-SNE plot in this notebook)
sns.scatterplot(data=reduced_data, x='Dim_1', y='Dim_2')
plt.title('Countries along 2D (reduced from 4D)')

#access linear combinations of principal components (echoed as the cell output)
pca.components_


Try the t-SNE method (t-distributed Stochastic Neighbor Embedding)

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=42)  #t-SNE is stochastic and involves randomization, so set random_state for reproducibility
X_tsne = tsne.fit_transform(data)

# Reuse the embedding computed above rather than running t-SNE a second time:
# with a fixed random_state the second run only repeated the same work.
reduced_t = pd.DataFrame(X_tsne, columns=['Dim_1','Dim_2'], index=data.index)

sns.scatterplot(x='Dim_1', y='Dim_2', data=reduced_t, s=30)
plt.title('Countries along 2D (reduced from 4D) [t-SNE]')


Try 3D visualization (Reduce data to 3D)

In [None]:
# Project the data onto three principal components for a 3D scatter plot.
pca = sklearnPCA(n_components=3)
data_3D = pca.fit_transform(data)

from mpl_toolkits.mplot3d import Axes3D

# 3D axes, viewed from 20 degrees elevation and 10 degrees azimuth.
ax = plt.axes(projection='3d')
ax.view_init(elev=20, azim=10)

# One coordinate array per reduced dimension.
dim_1, dim_2, dim_3 = data_3D.T
ax.scatter(dim_1, dim_2, dim_3)

for set_label, label_text in (
    (ax.set_xlabel, 'Dim_1'),
    (ax.set_ylabel, 'Dim_2'),
    (ax.set_zlabel, 'Dim_3'),
):
    set_label(label_text)
ax.set_title('Countries along 3D (reduced from 4D)')

# Information loss: share of variance not captured by the three components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
1 - cumulative_variance[2]

Find OPTIMAL K (no. of clusters) WITH ELBOW METHOD

In [None]:
def elbow_plot(observations, k_values, tick_fontsize=15, tick_rotation=0, seed=42):
    """Run SciPy k-means once per candidate k and draw the elbow plot.

    Parameters
    ----------
    observations : array-like of shape (n_samples, n_features)
        The (already scaled) data to cluster.
    k_values : sequence of int
        Candidate numbers of clusters, plotted on the x-axis.
    tick_fontsize, tick_rotation : number
        Styling of the x tick labels (useful when there are many values of k).
    seed : int
        Seed handed to ``scipy.cluster.vq.kmeans``. k-means starts from randomly
        chosen centroids, so without a seed the curve changes on every run.

    Returns
    -------
    list of float
        The distortion value reported by ``kmeans`` for each k, in order.
    """
    obs = np.asarray(observations, dtype=float)
    # kmeans returns (cluster_centers, distortion); only the distortion is needed here
    distortions = [kmeans(obs, int(k), seed=seed)[1] for k in k_values]

    plt.plot(k_values, distortions, '-o')
    plt.xlabel('number of clusters, k',fontsize=15)
    plt.ylabel('distortion value',fontsize=15)
    plt.title('Elbow plot',fontsize=15)
    plt.xticks(k_values, fontsize=tick_fontsize, rotation=tick_rotation)
    plt.show()
    return distortions


#determine optimal number of clusters with the 'elbow method'
num_clusters = np.arange(1, 11)
distortion_values = elbow_plot(data, num_clusters)


#what happens when no. of clusters = sample size?
num_clusters = np.arange(1, len(data)+1)
distortion_values = elbow_plot(data, num_clusters, tick_fontsize=6, tick_rotation=60)



Find OPTIMAL k WITH SILHOUETTE SCORES

In [None]:
from sklearn.metrics import silhouette_score

# Same as above, loop through number of clusters, but start with 2 since silhouette scores need at least 2 clusters to be calculated
num_clusters = np.arange(2, 11)

silhouette_scores = []

for i in num_clusters:
    # Iterate over each k, train model, and calculate silhouette score.
    # k-means starts from random centroids, so seed it to make the plot reproducible.
    cluster_centers, distortion = kmeans(data, i, seed=42)
    # vq assigns every observation to its nearest center; element [0] holds the labels
    labels = vq(data, cluster_centers)[0]
    silhouette_scores.append(silhouette_score(data, labels))

#generate silhouette plot
plt.plot(num_clusters,  silhouette_scores, '-o')
plt.xlabel('number of clusters, k',fontsize=15)
plt.ylabel('silhouette scores',fontsize=15)
plt.title('Silhouette plot',fontsize=15)
plt.xticks(num_clusters, fontsize=15)
plt.show()



Generate CLUSTER CENTRES and CLUSTER LABELS

In [None]:
#generate cluster centers and labels using the optimal number of clusters
# Cluster on the feature columns only. Dropping 'cluster' (if it already exists)
# keeps this cell safe to re-run: otherwise a second run would feed the previous
# labels to k-means as if they were a feature.
feature_values = data.drop(columns='cluster', errors='ignore').to_numpy()

# Seeded so that the same countries land in the same clusters on every run.
cluster_centers, distortion = kmeans(feature_values, 3, seed=42)    #distortion = distortion value for specified k
data['cluster'], distances = vq(feature_values, cluster_centers)  #distances = distance of each datapoint to its cluster center


#group countries by cluster and view
# NOTE: a notebook cell only echoes its last expression, so run these lines in
# their own cell to inspect a single group; the loop below prints every group.
data.groupby('cluster').get_group(0)
data.groupby('cluster').get_group(1)
data.groupby('cluster').get_group(2)


#a loop to do so
groups = data.groupby('cluster')
for group_name, group_data in groups:
    print(f"Group {group_name}:")
    print(group_data)
    print()


#verify that cluster centers are the mean feature scores of all its members
# (printed explicitly so that both tables appear in the output and can be compared)
print(cluster_centers)
print(data.groupby('cluster').mean())


#view cluster information in terms of original, non-scaled features
data2=pd.read_csv('Lecture8.csv',index_col='Country')  #reload original data and call it data2
data2['cluster'] = data['cluster']  #reattach cluster labels to data2 (aligned on the Country index)
data2.groupby('cluster').mean()  #show cluster centers in terms of original features


Visualize clustering outcome

In [None]:
#reduce the data AND the cluster centers to two dimensions with the SAME fitted PCA.
# Fitting a separate PCA on the centers alone would place them in a different
# coordinate system from the data points, so the centroid markers would not line
# up with their clusters.
feature_values = data.drop(columns='cluster').to_numpy()  #exclude the label column
pca = sklearnPCA(n_components=2)
reduced_data = pd.DataFrame(pca.fit_transform(feature_values), columns=['Dim_1','Dim_2'], index=data.index)
center_2D = pca.transform(cluster_centers).T    #T = transpose so that row 0 = x values, row 1 = y values

#generate scatterplot with cluster centres
sns.scatterplot(data=reduced_data, x='Dim_1', y='Dim_2', hue=data['cluster'], palette='bright', s=30)
plt.plot(center_2D[0],center_2D[1],'rX',markersize=12)   #'rX' = red cross

#include this block of code to annotate each object
# Iterate over (country, coordinates) pairs instead of Series[i]: integer lookups
# on a string-labelled index rely on a deprecated positional fallback in pandas.
for country, (x_pos, y_pos) in zip(reduced_data.index, reduced_data[['Dim_1','Dim_2']].to_numpy()):
    plt.text(x=x_pos+0.05, y=y_pos+0.05, s=country, size=8)
plt.legend(title='Cluster')
plt.xlabel("Dim_1")
plt.ylabel("Dim_2")
plt.title("Annotated clustering outcome")
plt.show()


Visualize clustering outcome in 3D

In [None]:
#reduce the data and the cluster centers to three dimensions
# Fit the PCA on the feature columns only ('cluster' is a label, not a feature)
# and project the centers with that SAME fitted model, so that points and
# centroids share one coordinate system.
feature_values = data.drop(columns='cluster').to_numpy()
pca = sklearnPCA(n_components=3)
data_3D = pca.fit_transform(feature_values)
center_3D = pca.transform(cluster_centers).T   #transpose: one row per plotted axis


#generate plot with cluster centres
ax = plt.axes(projection='3d')
ax.view_init(elev=20, azim=10)

colors = ['purple', 'orange', 'green']
cluster_labels = data['cluster'].to_numpy()
# Plot each cluster in 3D
for i, color in enumerate(colors):
    # Boolean mask selecting only the data points that belong to the current cluster
    members = cluster_labels == i
    # Legend uses the same 0-based cluster numbers as the labels stored in data['cluster']
    ax.scatter(data_3D[members, 0], data_3D[members, 1], data_3D[members, 2], c=color, label=f'Cluster {i}')

# Plot cluster centers
ax.scatter(center_3D[0], center_3D[1], center_3D[2], c='red', marker='x', s=150, label='Centroids')

ax.legend()
ax.set_xlabel('Dim_1')
ax.set_ylabel('Dim_2')
ax.set_zlabel('Dim_3')
ax.set_title('3D clustering outcome',fontsize=15)


Evaluate clustering outcome

In [None]:
#Calculate intra-cluster similarity (WCSS)
from scipy.spatial.distance import cdist

# Results are printed explicitly: a notebook cell only echoes its last expression,
# so bare expressions in the middle of the cell would be computed and discarded.
features = data.drop(columns='cluster')  #remove cluster labels before measuring distances
distances = cdist(features, cluster_centers, 'euclidean')  #pairwise distance between all points and all centers
wcss = np.sum(np.min(distances, axis=1)**2)  #for each point, pick the minimum distance; i.e. its cluster center. Then square and sum up all points
print('WCSS:', wcss)


#Calculate inter-cluster dissimilarity
inter_cluster_distances = cdist(cluster_centers, cluster_centers, 'euclidean')  #calculate pairwise distance between all centers
#the maximum distance between any two clusters denotes inter-cluster dissimilarity
print('Maximum inter-cluster distance:', np.max(inter_cluster_distances))


#Visualize cluster centers for distinctiveness
data.groupby('cluster').mean().plot(kind='bar', rot=0)
plt.legend(loc='best')



#OPTIONAL: Confirm with ANOVA / regression if desired
import statsmodels.api as sm
from statsmodels.formula.api import ols


# Relabel clusters with letters so the formula treats 'cluster' as categorical.
# The relabelling is done on a copy: overwriting data['cluster'] would break
# earlier cells that compare the labels with 0, 1, 2 if they are run again.
mapping = {0: 'A', 1: 'B', 2: 'C'}
anova_data = data.assign(cluster=data['cluster'].replace(mapping))

model = ols('GDP_pc ~ cluster', anova_data).fit()  #try different features

print(model.summary()) #check p-value
sm.stats.anova_lm(model)  #alternative ANOVA table (echoed as the cell output)


SEMINAR 8 CODE: Use only if you're REALLY stuck

In [None]:
# Load the seminar dataset with 'Country' as the row index.
df = pd.read_csv(filepath_or_buffer='Seminar8.csv', index_col='Country')

# Keep every column except the last one (presumably the 'Regime' outcome used
# further down -- confirm against the CSV layout).
predictor_frame = df.iloc[:, :-1]

# Re-fit the scaler on the seminar predictors and restore labels on the result.
scaled_values = scaler.fit_transform(predictor_frame)
df2 = pd.DataFrame(
    scaled_values,
    index=predictor_frame.index,
    columns=predictor_frame.columns,
)

sns.scatterplot(data=df2, x='Happiness', y='GDP_log')


"""
Find optimal k with elbow method
"""
num_clusters = np.arange(1, 11)
distortion_values = []

for i in num_clusters:
    #Iterate over each k, train model, and calculate distortion.
    #k-means starts from random centroids; the seed makes the curve reproducible.
    cluster_centers, distortion = kmeans(df2, i, seed=42)
    distortion_values.append(distortion)

#generate 'elbow plot'
plt.plot(num_clusters, distortion_values, '-o')
plt.xlabel('number of clusters, k',fontsize=15)
plt.ylabel('distortion value',fontsize=15)
plt.title('Elbow plot',fontsize=15)
plt.xticks(num_clusters, fontsize=15)
plt.show()


"""
Find optimal k with silhouette scores
"""
# Imported here as well so this section does not depend on an earlier section
# of the notebook having been run.
from sklearn.metrics import silhouette_score

num_clusters = np.arange(2, 11)  #silhouette scores need at least 2 clusters
silhouette_scores = []

for i in num_clusters:
    # Iterate over each k, train model, and calculate silhouette score.
    # Seeded so that the scores are the same on every run.
    cluster_centers, distortion = kmeans(df2, i, seed=42)
    # vq assigns each observation to its nearest center; element [0] holds the labels
    labels = vq(df2, cluster_centers)[0]
    silhouette_scores.append(silhouette_score(df2, labels))

#generate silhouette plot
plt.plot(num_clusters,  silhouette_scores, '-o')
plt.xlabel('number of clusters, k',fontsize=15)
plt.ylabel('silhouette scores',fontsize=15)
plt.title('Silhouette plot',fontsize=15)
plt.xticks(num_clusters, fontsize=15)
plt.show()



"""
Generate cluster centres and cluster labels
"""
# Cluster on the feature columns only. Dropping 'cluster' (if present) keeps this
# step safe to re-run: otherwise a second run would treat the old labels as a feature.
seminar_features = df2.drop(columns='cluster', errors='ignore').to_numpy()

# Seeded so that cluster membership is the same on every run.
cluster_centers, distortion = kmeans(seminar_features, 3, seed=42)    #distortion = distortion value for specified k
df2['cluster'], distances = vq(seminar_features, cluster_centers)  #distances = distance of each datapoint to its cluster center

sns.scatterplot(data=df2, x='Happiness', y='GDP_log', hue='cluster')



"""
Evaluate clustering outcome
"""
# Imported here as well so this section does not depend on an earlier section
# of the notebook having been run.
from scipy.spatial.distance import cdist

#Calculate intra-cluster similarity (WCSS)
# Results are printed explicitly: a notebook cell only echoes its last expression.
distances = cdist(df2.drop(columns='cluster'), cluster_centers, 'euclidean')  #pairwise distance between all points and all centers, with the cluster labels removed first
wcss = np.sum(np.min(distances, axis=1)**2)  #for each point, pick the minimum distance; i.e. its cluster center. Then square and sum up all points
print('WCSS:', wcss)

#Calculate inter-cluster dissimilarity
inter_cluster_distances = cdist(cluster_centers, cluster_centers, 'euclidean')  #calculate pairwise distance between all centers
#the maximum distance between any two clusters denotes inter-cluster dissimilarity
print('Maximum inter-cluster distance:', np.max(inter_cluster_distances))

#Visualize cluster centers for distinctiveness
df2.groupby('cluster').mean().plot(kind='bar', rot=0)
plt.legend(loc='best')





"""
Train K-NN classifier
"""
# KNeighborsClassifier was used below without ever being imported, which raised
# a NameError on a fresh kernel.
from sklearn.neighbors import KNeighborsClassifier

# Define outcome and predictors
y = df['Regime']
x = df2[['Happiness','GDP_log', 'cluster']]

"""
Find optimal k
"""
# Setup arrays to store accuracy values
neighbors = np.arange(1, 16)
accuracy = np.empty(len(neighbors))

# Loop over different values of k, fit model, and compute accuracy.
# NOTE(review): accuracy is measured on the same rows the model was fitted on,
# so small k will look best by construction; consider a held-out split.
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x,y)
    accuracy[i] = knn.score(x, y)

# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(neighbors, accuracy)
plt.xticks(neighbors)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

# Create a kNN classifier and fit it to data. If n_neighbors is not specified, the default value=5
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(x,y)
knn.score(x,y)
