In [None]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import set_config
set_config(transform_output="pandas")
from preprocessing_uta import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.simplefilter('ignore')
# Seed passed as `random_state` to the data splits and to KMeans further down,
# so repeated runs produce the same partitions and cluster assignments.
RSEED = 5



## Preprocessing

In [None]:
# Load the AQUASTAT table; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer="data/AQUASTAT_complete.csv", index_col=0)

In [None]:
# Build the processor that is applied to the complete table before it is split
# into train / validation / test sets.
# NOTE(review): `get_pre_split_processor` is presumably provided by the star
# import from `preprocessing_uta` — confirm.
pre_split_processor = get_pre_split_processor()
# Bare expression on the last line so the notebook renders the processor.
pre_split_processor

In [None]:
# Apply the pre-split processing to the raw table. Only `transform` is called
# here (no `fit`).
# NOTE(review): presumably the processor is stateless or already fitted —
# confirm in `preprocessing_uta`.
df_pre = pre_split_processor.transform(df)

In [None]:
# Restrict the data to rows whose year is later than 2010.
recent_years_filter = "year > 2010"
df_pre = df_pre.query(recent_years_filter)

In [None]:
# Columns to predict; every remaining column is treated as a feature.
targets = [
    'gdp_per_capita',
    'water_stress',
    'total_population_with_access_to_safe_drinking_water',
]

X = df_pre.drop(columns=targets)
Y = df_pre[targets]

# First split: 70 % training, 30 % held out. Second split: the held-out part
# is halved into a test set and a validation set. Both splits are stratified
# by country.
X_train, X_pretest, Y_train, Y_pretest = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=RSEED,
    stratify=X["country"],
)
X_test, X_valid, Y_test, Y_valid = train_test_split(
    X_pretest,
    Y_pretest,
    test_size=0.50,
    random_state=RSEED,
    stratify=X_pretest["country"],
)

In [None]:
# The minimal preprocessor is configured with the training column names, fitted
# on the training split only, and then reused unchanged on the validation split.
train_columns = X_train.columns.tolist()
minimal_preprocessor = get_minimal_preprocessor(train_columns)

X_train_min = minimal_preprocessor.fit_transform(X_train)
X_valid_min = minimal_preprocessor.transform(X_valid)

In [None]:
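# Optional (currently disabled): replace the country names with integer labels.
# The encoder is fitted on the training split only and reused for validation so
# that both splits share the same country-to-integer mapping.

# label_encoder = LabelEncoder()
# X_train_min['country'] = label_encoder.fit_transform(X_train_min['country'])
# X_valid_min['country'] = label_encoder.transform(X_valid_min['country'])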

In [None]:
# Sanity check after minimal preprocessing: dimensions first, then the total
# number of missing values left in each split.
for frame in (X_train_min, X_valid_min):
    print(frame.shape)

for split_name, frame in (("train", X_train_min), ("valid", X_valid_min)):
    print(f"NaNs in {split_name}: ", frame.isna().sum().sum())


In [None]:
# ## Clustering - agglomerative
# Import libraries
from scipy.spatial.distance import pdist
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score



### Agglomerative Clustering

In [None]:
# Build a readable leaf label for every observation: "<country>_<year>".
X_train_min['unique_id'] = X_train_min['country'] + '_' + X_train_min['year'].astype(str)
print("NaNs in train: ", X_train_min.isna().sum().sum())

# Cluster on the numeric features only; 'country' and 'unique_id' are strings.
numeric_features = X_train_min.select_dtypes(include=['number'])

# One linkage matrix over ALL observations.
# The previous version clustered each country separately and then called
# `hierarchy.linkage` on the stacked per-country linkage matrices. That treats
# linkage rows (cluster ids, merge distance, cluster size) as if they were
# observations, so the resulting tree did not describe the data and its leaves
# could not be matched to the 'unique_id' labels.
distances = pdist(numeric_features.to_numpy(), metric='euclidean')
combined_linkage_matrix = hierarchy.linkage(distances, method='weighted')

# Leaf i of the dendrogram is row i of `numeric_features`, so the labels have
# to be listed in row order.
country_labels = X_train_min['unique_id'].tolist()

# Very wide canvas so that the individual leaf labels stay legible.
fig, ax = plt.subplots(figsize=(200, 30))
dend = hierarchy.dendrogram(combined_linkage_matrix, labels=country_labels, ax=ax)
fig.savefig("images/mlpr_1804_countries.png", dpi=300)


In [None]:
# Old version (written together with Carmine), kept commented out for reference only.


 # create id for country and year (country label and year) 
# X_train_min['unique_id'] = X_train_min['country'] + '_' + X_train_min['year'].astype(str)

# # Group the data by "country"
# grouped_data = X_train_min.groupby('country')

# # Initialize an empty linkage matrix
# combined_linkage_matrix = None

# # Initialize an empty list to store country labels
# country_labels = grouped_data.first().index.tolist()
# #country_labels

# all_group_distances = []
# # Iterate through each group (country)
# for country, group in grouped_data:
#     #print(country, group)
#     #print(group.shape)
#     #print(group.iloc[:,2:-1].to_numpy())
#     # Calculate pairwise distances for the group
#     #print(type(group.iloc[:,2:].to_numpy()))

#     distances = pdist(np.asarray(group.iloc[:,2:-1]))
#     all_group_distances.append(distances)

# print(len(all_group_distances))
# #print(array_distances.shape)
# #     # Create linkage matrix for the group
# #linkage_matrix = hierarchy.linkage(array_distances, method='weighted')
    
# #     # Combine the linkage matrices
# #     if combined_linkage_matrix is None:
# #         combined_linkage_matrix = linkage_matrix
# #     else:
# #         combined_linkage_matrix = hierarchy.linkage(
# #             pd.concat([pd.DataFrame(combined_linkage_matrix), pd.DataFrame(linkage_matrix)]),
# #             method='weighted'
# #         )

# # # Print the number of labels and the number of leaves in the dendrogram
# # print("Number of labels:", len(country_labels))
# # print("Number of leaves in dendrogram:", len(combined_linkage_matrix) + 1)

# # # Create a dendrogram for the combined linkage matrix with country labels
# # fig, ax = plt.subplots(figsize=(200, 30))
# # dend = hierarchy.dendrogram(combined_linkage_matrix, labels=X_train_min['unique_id'])
# # plt.savefig("images/mlpr_1804_countries.png", dpi=300)





In [None]:
# Elbow method: plot the KMeans inertia (within-cluster sum of squares) for
# K = 1..24 to judge how many clusters are worthwhile.
#
# `clusters` was never computed anywhere in this notebook, so this cell raised
# a NameError on a fresh kernel. It is now derived here.

# Feature matrix: numeric columns plus one-hot ("dummified") country columns,
# matching the annotation drawn on the figure below.
# NOTE(review): reconstructed from the annotation text — confirm this is the
# feature set the plot was originally produced with.
X_train_hot = pd.concat(
    [
        X_train_min.select_dtypes(include=['number']),
        pd.get_dummies(X_train_min['country'], prefix='country', dtype=float),
    ],
    axis=1,
)

# One inertia value per candidate K; the range matches the x-axis of the plot.
clusters = [
    KMeans(n_clusters=k, random_state=RSEED).fit(X_train_hot).inertia_
    for k in range(1, 25)
]

fig = plt.figure(figsize=(10, 6))
fig.patch.set_facecolor('#f6f5f5')
plt.plot(range(1, 25), clusters)
plt.title('The Elbow Method', fontsize=20)
plt.xlabel('No. of Clusters')
plt.ylabel('Inertia (Within-cluster Sum of Squares)')
fig.text(0.5, 0.4, "Identify the best K-value for dummified countries")
plt.show()



### Silhouette scores (to be adapted later)

#### Labeled Silhouette Scores

In [None]:
# Step 5: Silhouette Score for Optimal K with Numeric Labels (Scaled Data)
#
# `X_train_lab` was never defined in this notebook, so this cell raised a
# NameError on a fresh kernel. It is built here as the numeric features plus
# the country encoded as ONE integer column.
# NOTE(review): reconstructed from the plot title used for these scores and
# from the disabled label-encoding cell — confirm this is the intended matrix.
# NOTE(review): integer country codes impose an arbitrary order and scale on
# the Euclidean distances used by KMeans and the silhouette score.
X_train_lab = X_train_min.select_dtypes(include=['number']).assign(
    country=LabelEncoder().fit_transform(X_train_min['country'])
)

silhouette_scores = []
labels_by_k = {}  # cluster assignment of every row, keyed by K

K_max = 15  # Maximum number of clusters to consider
for k in range(2, K_max + 1):
    kmeans = KMeans(n_clusters=k, random_state=RSEED)
    cluster_labels = kmeans.fit_predict(X_train_lab)
    silhouette_avg = silhouette_score(X_train_lab, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    labels_by_k[k] = cluster_labels

# Store the cluster labels of the best-scoring K for further use. Previously
# the labels of the last iteration (K = K_max) were kept, regardless of score.
# `silhouette_scores[0]` belongs to K = 2, hence the offset.
best_k = 2 + int(np.argmax(silhouette_scores))
cluster_labels_numeric = labels_by_k[best_k]



In [None]:
# Silhouette score per candidate K; the highest point marks the preferred K.
plt.figure(figsize=(8, 4))
plt.plot(range(2, K_max + 1), silhouette_scores, 'o-', color='b')
plt.gca().set(
    xlabel='Number of Clusters (K)',
    ylabel='Silhouette Score',
    title='Silhouette Score for Optimal K with scaled variables incl. one numeric country variable',
)
plt.show()

In [None]:
# Raw silhouette scores, ordered by the K values that were scanned.
print(f"Silhouette Scores: {silhouette_scores}")



