In [3]:
import numpy as np
import pandas as pd
import sklearn.preprocessing, sklearn.cluster, sklearn.metrics
import scipy.spatial
import matplotlib.pyplot as plt
import seaborn as sns

# Data description

Data source: https://www.kaggle.com/datasets/subhajitnayak/country-data?resource=download&select=Country-data.csv

|Attribute|Description|
|:-------|:-------|
|country | Name of the country |
|child_mort | Death of children under 5 years of age per 1000 live births |
|exports | Exports of goods and services. Given as %age of the Total GDP |
|health | Total health spending as %age of Total GDP |
|imports | Imports of goods and services. Given as %age of the Total GDP |
|income | Net income per person |
|inflation | The measurement of the annual growth rate of the Total GDP |
|life_expec | The average number of years a new born child would live if the current mortality patterns are to remain the same |
|total_fer | The number of children that would be born to each woman if the current age-fertility rates remain the same |
|gdpp | The GDP per capita. Calculated as the Total GDP divided by the total population. |

# Data loading

In [None]:
# Load the country dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data_clustering/country-data.csv')
# Bare last expression: the loaded frame is rendered as the cell output.
df

In [None]:
# Summary statistics of the loaded frame, for a quick look at feature ranges.
df.describe()

In [None]:
# Keep only the numeric feature columns (the country name is not numeric).
# A `cluster_id` column is added to `df` further down in this notebook; dropping
# it here (if present) keeps this cell idempotent, so re-running it never leaks
# previously computed cluster labels into the feature matrix.
df_only_numeric = (
    df
    .drop(columns=['cluster_id'], errors='ignore')
    .select_dtypes(np.number)
)

# One histogram per numeric column, all in a single row.
# squeeze=False keeps `axes` two-dimensional even for a single column, so the
# first row can always be iterated as a sequence of Axes.
n_cols = df_only_numeric.shape[1]
fig, axes = plt.subplots(nrows=1, ncols=n_cols, figsize=(40, 6), squeeze=False)
for ax, column in zip(axes[0], df_only_numeric.columns):
    sns.histplot(data=df_only_numeric, x=column, ax=ax).set_title(column)

# Data preprocessing
- Why do we need to preprocess the data?

In [None]:
# Plain NumPy feature matrix (rows = countries, columns = numeric attributes).
X = df_only_numeric.to_numpy()

In [None]:
# Min-max scale every feature column so that all attributes share a comparable range.
# The scaler is fitted first and then applied to the same matrix.
scaler = sklearn.preprocessing.MinMaxScaler()
scaler.fit(X)
X_min_max_scaled = scaler.transform(X)
X_min_max_scaled

# Clustering

## KMeans + clustering quality
- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
- Do you know any technique to determine the best number of clusters?

In [None]:
# Fit k-means with 5 clusters on the min-max scaled features.
# random_state pins the centroid initialisation so that the labels, inertia and
# silhouette score inspected in the following cells are reproducible across runs
# (same seed as the k-means model in the "Clustering interpretation" section).
clustering = sklearn.cluster.KMeans(n_clusters=5, random_state=13)
clustering.fit(X_min_max_scaled)

In [None]:
# Cluster label assigned to each row of X_min_max_scaled by the fitted model.
clustering.labels_

In [None]:
# Number of rows per cluster label.
pd.Series(clustering.labels_).value_counts()

In [None]:
# Inertia of the fitted k-means model (used as the SSE criterion in the elbow section below).
clustering.inertia_

In [None]:
# Silhouette score of the current labelling, computed on the same scaled features used for fitting.
sklearn.metrics.silhouette_score(X_min_max_scaled, clustering.labels_)

## Elbow method

#### Calculate SSE and Silhouette for different numbers of clusters $k \in \langle 2, 15 \rangle$. Create a visualization for both clustering quality criteria.

In [None]:
# Evaluate k-means for every k in <2, 15> (both bounds inclusive), matching the
# range stated in the task heading of this section.
K_MIN, K_MAX = 2, 15

clustering_scores = []
for k in range(K_MIN, K_MAX + 1):
    # A fixed seed keeps the SSE / silhouette curves identical between runs.
    # A dedicated name is used so the loop does not overwrite the `clustering`
    # model fitted earlier in the notebook.
    kmeans_k = sklearn.cluster.KMeans(n_clusters=k, random_state=13).fit(X_min_max_scaled)
    clustering_scores.append({
        'k': k,
        'sse': kmeans_k.inertia_,
        'silhouette': sklearn.metrics.silhouette_score(X_min_max_scaled, kmeans_k.labels_)
    })

# One row per evaluated k, with columns 'k', 'sse' and 'silhouette'.
df_clustering_scores = pd.DataFrame(clustering_scores)
df_clustering_scores

In [None]:
# Elbow plot: SSE as a function of the number of clusters k.
sns.lineplot(data=df_clustering_scores, x='k', y='sse')

In [None]:
# Silhouette score as a function of the number of clusters k.
sns.lineplot(data=df_clustering_scores, x='k', y='silhouette')

#### How many clusters do you see?

#### Can you think of other quality criteria to use on a real dataset? Do you know the difference between external and internal criteria?

## Clustering interpretation

In [None]:
# k-means with 4 clusters. The fixed random_state keeps the fitted labels repeatable,
# which matters because the interpretation text below refers to specific cluster ids.
clustering = sklearn.cluster.KMeans(n_clusters=4, random_state=13)
clustering.fit(X_min_max_scaled)

In [None]:
# Attach each row's k-means label to df. This adds a column to df in place;
# the DBSCAN section later overwrites the same column with its own labels.
df['cluster_id'] = clustering.labels_

In [None]:
# Bar chart of the number of countries in each cluster.
sns.countplot(data=df, x='cluster_id')

#### Use describe() for quick inspection of numeric values in dataset.

In [None]:
# Per-cluster summary statistics. The option context lifts the displayed-column
# limit only for this output, so the wide describe() table is not truncated.
with pd.option_context('display.max_columns', None):
    display(df.groupby('cluster_id').describe())

In [None]:
# Distribution of income within each cluster.
sns.boxplot(data=df, y='income', x='cluster_id')

#### Are there any differences in the attributes between clusters?
Clusters with id 1 and 3 contain countries with lower income. High-income countries are mostly placed in the small cluster with cluster_id 0; other countries with relatively high income are in cluster 2.

#### Continue with description of detected clusters

## DBSCAN + distance analysis
- https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
- https://www.kdnuggets.com/2020/04/dbscan-clustering-algorithm-machine-learning.html

In [None]:
# DBSCAN with scikit-learn's default parameters (neither eps nor min_samples is set here),
# fitted on the same min-max scaled features as k-means.
clustering = sklearn.cluster.DBSCAN()
clustering.fit(X_min_max_scaled)

In [None]:
# Number of rows per DBSCAN label.
pd.Series(clustering.labels_).value_counts()

What does the label -1 mean?
#### Find better values for the parameters *eps* and *min_pts* (`min_samples` in scikit-learn) and visualize your result

In [None]:
# Second DBSCAN attempt with a smaller neighbourhood radius (eps=0.2) and min_samples=5.
clustering = sklearn.cluster.DBSCAN(eps=0.2, min_samples=5)
clustering.fit(X_min_max_scaled)

# Size of every resulting label group.
pd.Series(clustering.labels_).value_counts()

#### Let's get insight into the distances in our dataset so that we can configure the DBSCAN parameters

In [None]:
# Pairwise distances between all rows of the scaled feature matrix
# (entry [i, j] is the distance between row i and row j).
distance_matrix = scipy.spatial.distance_matrix(X_min_max_scaled, X_min_max_scaled)
distance_matrix

In [None]:
# Histogram of the pairwise distances. Only the strict upper triangle is used:
# the matrix compares the data with itself, so it is symmetric and its diagonal
# holds the zero distance of every point to itself. Flattening the whole matrix
# would count each pair twice and add an artificial spike at 0.
pair_idx = np.triu_indices_from(distance_matrix, k=1)
sns.histplot(distance_matrix[pair_idx])

In [None]:
# Recompute the pairwise distances and order every row ascending, so that
# column j holds each point's j-th smallest distance. Column 0 is therefore the
# zero distance of a point to itself.
distance_matrix = np.sort(
    scipy.spatial.distance_matrix(X_min_max_scaled, X_min_max_scaled),
    axis=1,
)
distance_matrix

In [None]:
# Distribution of the second-smallest distance in each row, i.e. the distance to the
# nearest other point (assumes the rows were sorted ascending in the preceding cell).
sns.histplot(distance_matrix[:, 1])

In [None]:
# Distribution of the distance to each point's 10th closest other point, in 50 bins
# (assumes the rows were sorted ascending in the preceding cell; column 0 is the point itself).
# NOTE(review): scikit-learn's DBSCAN counts the point itself towards min_samples, so
# column `min_samples - 1` would be the matching k-distance — confirm which is intended.
sns.histplot(distance_matrix[:, 10], bins=50)

#### What can we notice in previous visualizations?

In [None]:
# DBSCAN with eps=0.25 and min_samples=10, presumably chosen from the distance
# histograms above; the second line shows the size of each label group.
clustering = sklearn.cluster.DBSCAN(eps=0.25, min_samples=10).fit(X_min_max_scaled)
pd.Series(clustering.labels_).value_counts()

#### Description of clusters (and probably noise points too)

In [None]:
# Store the DBSCAN labels in df. This overwrites the cluster_id column that held
# the k-means labels earlier in the notebook.
df['cluster_id'] = clustering.labels_

In [None]:
# Countries that DBSCAN labelled -1 (noise points, per the scikit-learn DBSCAN documentation).
df[df.cluster_id == -1].country.values

In [None]:
# Countries assigned to DBSCAN cluster 0.
df[df.cluster_id == 0].country.values

In [None]:
# Countries assigned to DBSCAN cluster 1 (an empty array if no such label was produced).
df[df.cluster_id == 1].country.values

In [None]:
# Distribution of income within each DBSCAN label group (label -1 included).
sns.boxplot(data=df, y='income', x='cluster_id')

### How would you treat non-numerical attributes?

### For more info about Sklearn clustering, take a look into documentation https://scikit-learn.org/stable/modules/clustering.html#clustering

# Task (2p)

Continue with clustering analysis:
1. Let's add more information about the already detected clusters (code above). Choose one of the already used methods (k-means or DBSCAN) and add more description of the clusters.

MinMax scaling is not the only option for scaling numerical attributes.

2. Take a look into documentation (https://scikit-learn.org/stable/modules/classes.html?highlight=preprocessing#module-sklearn.preprocessing) and choose other method for scaling your features.

3. Apply clustering method of your choice (https://scikit-learn.org/stable/modules/clustering.html) for differently scaled data.

4. Describe detected clusters.

As a bonus, it may be interesting to combine differently scaled features - e.g. feature1 scaled using MinMaxScaler, feature2 scaled using PowerTransformer etc. - and use the resulting dataset for clustering.