## 1. Load and Explore Data

In [None]:
# Load data handling, PCA and preprocessing libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Load visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Load clustering libraries
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Read the World Happiness Report 2021 dataset and print its shape.
# NOTE(review): the path is an absolute location on one machine; adjust it
# when running elsewhere.
dataset_path = r'C:\Users\hp\Desktop\happiness_score_dataset.csv'
raw_data = pd.read_csv(dataset_path)
print(raw_data.shape)

In [None]:
# Preview the first rows of the raw dataframe
raw_data.head()

In [None]:
# Show the data type of each column of the raw dataframe
raw_data.dtypes

In [None]:
## 2. Correlation Analysis
# Split the raw data in two: `header` keeps the identifying columns
# (country name and region), `X_data` keeps every other column.
id_columns = ["Country name", "Regional indicator"]
is_id_column = raw_data.columns.isin(id_columns)

header = raw_data.loc[:, is_id_column].copy()
X_data = raw_data.loc[:, ~is_id_column].copy()

In [None]:
# Standardize the feature columns with scikit-learn's StandardScaler
scaler = StandardScaler()
X_std = scaler.fit_transform(X_data)

# Wrap the standardized values in a dataframe that reuses the original
# column names, then preview it
X_std_df = pd.DataFrame(X_std, columns=X_data.columns)
X_std_df.head()

In [None]:
# Calculate pairwise correlations between the standardized numerical columns
corr_data = X_std_df.corr()

In [None]:
# Generate a boolean mask that hides the upper triangle (diagonal included).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr_data, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(14, 14))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(10, 240, n=9)

# Draw the heatmap with the mask and correct aspect ratio; `ax` is passed
# explicitly so the tick-label and title calls below act on the same axes
sns.heatmap(corr_data, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

# Add title
ax.set_title("Correlation Triangle", fontsize=16)
plt.show()

In [None]:
## 3. Dimensionality Reduction
# Principal Component Analysis: fit a 5-component PCA on the standardized
# features and project the data onto those components
pca = PCA(n_components=5)
pca_data = pca.fit_transform(X_std_df)

In [None]:
# Build the principal components dataframe: one column per component plus
# the country name taken from `header`
component_names = ["PC1", "PC2", "PC3", "PC4", "PC5"]
components = pd.DataFrame(pca_data, columns=component_names)
country_names = header[["Country name"]]
pca_df = pd.concat([components, country_names], axis=1)

# Keep only the rows that actually have a PC1 value, then display the result
pca_df = pca_df.loc[pca_df["PC1"].notna()]
pca_df

In [None]:
# Create 3D scatter plot of the first three principal components
fig = plt.figure(figsize=(16, 16))
ax = fig.add_subplot(111, projection="3d")

# Get (x, y, z) axis values
xx = pca_df["PC1"].values
yy = pca_df["PC2"].values
zz = pca_df["PC3"].values

# Plot values
ax.scatter(xx, yy, zz, c="#1f77b4", marker="o", s=75)

# Annotate every point with its country name. The coordinates and the names
# are walked in lockstep over pca_df's own rows, so this keeps working when
# pca_df has fewer rows than X_std_df or a non-contiguous index (rows without
# a PC1 value are filtered out when pca_df is built).
for px, py, pz, country in zip(xx, yy, zz, pca_df["Country name"]):
    ax.text(float(px), float(py), float(pz), country,
            horizontalalignment="left", size="medium", color="black", weight="normal")

# Plot setup
ax.set_xlabel("PC 1", fontsize=12)
ax.set_ylabel("PC 2", fontsize=12)
ax.set_zlabel("PC 3", fontsize=12)
ax.set_title("PCA 3D Data", fontsize=16)
ax.grid()

In [None]:
### 3.2. PCs Dependencies
# Show correlation between components.
# pca_df also carries the non-numeric "Country name" column, so only the
# numeric columns are correlated: from pandas 2.0 on, DataFrame.corr() no
# longer drops non-numeric columns by default and raises on them instead.
fig, ax = plt.subplots(figsize=(8, 8))
component_corr = pca_df.select_dtypes(include="number").corr()
sns.heatmap(component_corr, square=True, annot=True, ax=ax)
ax.set_title("Correlation between Components", fontsize=16)
plt.show()

In [None]:
# Matshow plot of the principal component loadings: one row per component,
# one column per original (standardized) feature
component_labels = ["PC1", "PC2", "PC3", "PC4", "PC5"]
feature_names = X_std_df.columns

fig = plt.figure(figsize=(16, 2))
plt.matshow(pca.components_, cmap="viridis", fignum=fig.number, aspect="auto")
plt.yticks(range(len(component_labels)), component_labels, fontsize=10)
plt.colorbar()
plt.xticks(range(len(feature_names)), feature_names, rotation=65, ha="left")
plt.show()

In [None]:
# The explained variance ratio tells us what share of the total variance
# (information) can be attributed to each of the principal components
list(pca.explained_variance_ratio_)

In [None]:
# Prepare the data for the explained-variance bar chart:
# one bar per component, values in percent, plus their running total
bars = ("PC1", "PC2", "PC3", "PC4", "PC5")
y_pos = np.arange(len(bars))
values = 100 * pca.explained_variance_ratio_
cum = values.cumsum()

In [None]:
# Set up the matplotlib figure
fig, ax2 = plt.subplots(figsize=(12, 10))

# Bars show the variance explained by each component, the line shows the
# cumulative total. Each artist carries its own label so the legend does not
# depend on the order in which matplotlib happens to collect the artists.
ax2.bar(y_pos, values, align="center", alpha=0.7, label="Var")
plt.xticks(y_pos, bars)
ax2.plot(y_pos, cum, color="orange", linewidth=2, marker="o", label="Cum")
ax2.set_title("Variance Ratio By Component", fontsize=16)

# Annotate each point of the cumulative line with its percentage
for i, v in enumerate(cum):
    ax2.text(i - .15, v + 1, (str(round(v, 1)) + "%"), color="black", fontweight="normal", fontsize=11)

# Plot setup
ax2.set_xlabel("Components", fontsize=12)
ax2.set_ylabel("Explained variance in percent", fontsize=12)
ax2.legend(loc="best")
plt.show()

In [None]:
## 4. Clustering: apply 3 different approaches
# Number of clusters used by the clustering cells below
k = 3

# Training data: one (PC1, PC2) row per observation.
# (Alternative kept from the original notebook: use all components by
# setting train_data = pca_data.)
x = pca_df['PC1'].values
y = pca_df['PC2'].values
train_data = np.array([(pc1, pc2) for pc1, pc2 in zip(x, y)])

# Number of observations in the training data
n_data = train_data.shape[0]

In [None]:
def plot_clusters(method, data, labels, centroids=None):
    """Draw a 2-D scatter plot of `data`, colored by cluster label.

    The clusters to draw are taken from the distinct values found in
    `labels`, so the function does not rely on any module-level cluster or
    row count.

    Parameters
    ----------
    method : str
        Name of the clustering method; appended to the plot title.
    data : array-like
        Points to plot, one row per observation. Only the first two columns
        are drawn, on axes labelled "PC 1" and "PC 2".
    labels : array-like
        One cluster label per row of `data`. Non-negative labels pick a color
        from a fixed 10-color palette (cycled if there are more clusters);
        negative labels (e.g. a noise marker) are drawn in light grey.
    centroids : array-like, optional
        Cluster centers; their first two columns are drawn as black diamonds.

    Raises
    ------
    ValueError
        If `data` and `labels` do not have the same number of entries.
    """
    points = np.asarray(data)
    labels = np.asarray(labels)
    if len(points) != len(labels):
        raise ValueError(
            "data and labels must have the same length, got "
            + str(len(points)) + " and " + str(len(labels))
        )

    _, ax = plt.subplots(figsize=(10, 10))

    # Plotting vars
    colors = ["#1f77b4", "#d62728", "#2ca02c", "#ff7f0e", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
    noise_color = "#c7c7c7"

    # Create scatter plot: one call per label that actually occurs, so an
    # empty cluster can never produce an empty (un-indexable) point array
    for cluster_id in np.unique(labels):
        members = points[labels == cluster_id]
        if cluster_id < 0:
            color = noise_color
        else:
            color = colors[int(cluster_id) % len(colors)]
        # NOTE(review): `size` is seaborn's semantic size variable, not the
        # marker size (`s`) — confirm which one was intended.
        sns.scatterplot(ax=ax, x=members[:, 0], y=members[:, 1], size=5, color=color)

    if centroids is not None:
        centers = np.asarray(centroids)
        ax.scatter(centers[:, 0], centers[:, 1], color="black", marker="D")

    # Plot setup
    ax.set_xlabel("PC 1", fontsize=12)
    ax.set_ylabel("PC 2", fontsize=12)
    ax.set_title("Clustering by " + method, fontsize=16)
    # Presumably meant to blank the legend entries added by the `size`
    # mapping above — confirm.
    ax.legend("")
    ax.grid()

In [None]:
### 4.1. By partitioning: K-means
# Fit k-Means (Elkan variant, fixed seed) on the training data
kmeans = KMeans(n_clusters=k, algorithm="elkan", random_state=0).fit(train_data)

# Cluster centers found by the fit
centroids = kmeans.cluster_centers_

# Cluster label assigned to each training point; displayed as cell output
clusters = kmeans.predict(train_data)
clusters

In [None]:
# Plot the K-Means assignments; the fitted cluster centers are passed along
# so they are drawn as markers on top of the clusters
plot_clusters("K-Means", train_data, clusters, centroids=centroids)

In [None]:
### 4.2. By Hierarchy: Hierarchical Agglomerative Clustering
# Leaf labels for the dendrogram: observations numbered from 1
labelList = range(1, n_data + 1)

# Single-linkage hierarchy of the training data
linked = linkage(train_data, method='single')

In [None]:
# Draw the dendrogram of the linkage computed above
plt.figure(figsize=(16, 8))
dendrogram(
    linked,
    labels=labelList,
    orientation='top',
    distance_sort='descending',
    show_leaf_counts=True,
)
plt.show()

In [None]:
# Apply Hierarchical Agglomerative Clustering with Ward linkage.
# The `affinity` argument is omitted on purpose: it was deprecated in
# scikit-learn 1.2 and removed in 1.4, and Ward linkage only works with the
# Euclidean distance, which is the default anyway.
# NOTE(review): this clusters on all components in pca_data, while the plot
# and the dendrogram use the 2-D train_data — confirm this is intended.
hac = AgglomerativeClustering(n_clusters=k, linkage='ward')
cluster = hac.fit_predict(pca_data)
cluster

In [None]:
# Plot the hierarchical cluster assignments over the training data points
plot_clusters("Hierarchy", train_data, cluster)

In [None]:
### 4.3. By density: DBSCAN
# NOTE(review): this clusters on all components in pca_data, while K-Means
# uses the 2-D train_data — confirm this is intended.
dbscan = DBSCAN(eps=1.2, min_samples=6).fit(pca_data)
clusters = dbscan.labels_

# DBSCAN labels noise points with -1. Taking abs() would silently merge them
# into cluster 1, so they are moved to a dedicated non-negative id instead
# (one past the largest cluster id). Labels stay unchanged when there is no
# noise.
noise_id = clusters.max() + 1
clusters = np.where(clusters == -1, noise_id, clusters)
clusters