In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    adjusted_rand_score,
    classification_report,
    confusion_matrix,
    silhouette_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
# Install a blanket "ignore" filter so library warnings do not clutter the
# cell outputs.
warnings.filterwarnings('ignore')

# Seed NumPy's and TensorFlow's global random generators with the same value
# for reproducibility.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

: 

In [None]:
# Load the data
df = pd.read_csv('resources_human_capital_cleaned.csv')

print(f"Dataset shape: {df.shape}")
print(f"Number of countries: {len(df)}")
print(f"Number of features: {df.shape[1] - 1}")

# Basic preprocessing: keep the country names aside as identifiers and treat
# every remaining column as a model feature.
countries = df['Country']
X = df.drop(columns='Country')

# Handle missing values: replace each gap with the median of its column, then
# wrap the resulting array back into a DataFrame carrying the feature names.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(X)
X_imputed = pd.DataFrame(imputed_values, columns=X.columns)

# Standardize the data; the fitted scaler is kept so values can be mapped
# back to the original scale later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# 1. FIRST STAGE: K-MEANS CLUSTERING

In [None]:
# Determine optimal k: fit K-Means for every candidate k and record the
# inertia (for the elbow plot) and the silhouette score of the assignment.
inertia = []
silhouette_scores = []
k_range = range(2, 8)

for k in k_range:
    # A dedicated name is used for the candidate model so this sweep never
    # overwrites the final `kmeans` estimator when the cell is re-run.
    candidate_kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    candidate_kmeans.fit(X_scaled)
    inertia.append(candidate_kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, candidate_kmeans.labels_))

# Plot to choose k: elbow curve on the left, silhouette curve on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
axes[0].plot(k_range, inertia, 'bo-')
axes[0].set_xlabel('Number of clusters (k)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method')
axes[0].grid(True)

axes[1].plot(k_range, silhouette_scores, 'ro-')
axes[1].set_xlabel('Number of clusters (k)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Scores')
axes[1].grid(True)
plt.tight_layout()
plt.show()


# Choose optimal k (based on plots - let's use 4)

In [None]:
optimal_k = 4
# Fit the final model once and read the assignments off the fitted estimator.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(X_scaled)
kmeans_labels = kmeans.labels_
print(f"\nK-Means clustering completed with {optimal_k} clusters")
print(f"Cluster distribution: {np.bincount(kmeans_labels)}")

# 2. SECOND STAGE: ANN FOR CLUSTERING

In [None]:
# Prepare data for ANN: the scaled features are the inputs and the K-Means
# assignments are the targets. Both are copied so the ANN stage works on its
# own arrays.
X_ann, y_ann = X_scaled.copy(), kmeans_labels.copy()

In [None]:
# Split data for training and testing: 20% is held out, stratified on the
# cluster label, with a fixed random_state so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y_ann)
X_train, X_test, y_train, y_test = train_test_split(X_ann, y_ann, **split_options)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

In [None]:
# Convert labels to categorical (one-hot encoding), using the same number of
# classes for both splits.
y_train_categorical, y_test_categorical = (
    to_categorical(split_labels, num_classes=optimal_k)
    for split_labels in (y_train, y_test)
)

In [None]:
# Build ANN model
def build_ann_model(input_dim, n_clusters):
    """Build the uncompiled feed-forward network that predicts cluster labels.

    The stack is Dense(128) -> Dense(64) -> Dense(32), each with ReLU
    activation and followed by batch normalization. The first two hidden
    blocks also apply dropout at rate 0.3. A softmax layer with one unit per
    cluster closes the model.

    Args:
        input_dim: Number of input features per sample.
        n_clusters: Number of output units (one per cluster).

    Returns:
        A ``Sequential`` model; compiling it is left to the caller.
    """
    # (units, dropout rate) for each hidden block; None means no Dropout layer.
    hidden_blocks = ((128, 0.3), (64, 0.3), (32, None))

    stack = [layers.Input(shape=(input_dim,))]
    for units, dropout_rate in hidden_blocks:
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.BatchNormalization())
        if dropout_rate is not None:
            stack.append(layers.Dropout(dropout_rate))

    # Output layer
    stack.append(layers.Dense(n_clusters, activation='softmax'))

    return models.Sequential(stack)

In [None]:
# Create and compile model: the input width is taken from the training
# matrix, and the output width is the number of clusters.
input_dim = X_train.shape[1]
model = build_ann_model(input_dim, optimal_k)

# Categorical cross-entropy matches the one-hot targets and softmax output.
adam = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])

print("\nANN Model Architecture:")
model.summary()

In [None]:
# Train the model
print("\nTraining ANN...")

# Stop after 20 epochs without improvement in validation loss and restore
# the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=20, restore_best_weights=True
)

# 20% of the training split is used for validation during fitting.
fit_options = dict(
    validation_split=0.2,
    epochs=100,
    batch_size=16,
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train, y_train_categorical, **fit_options)

In [None]:
# Plot training history: loss on the left panel, accuracy on the right,
# each with its training and validation curve.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel:
# (train key, train label, validation key, validation label, y label, title)
panels = (
    ('loss', 'Training Loss', 'val_loss', 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
    ('accuracy', 'Training Accuracy', 'val_accuracy', 'Validation Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
)

for panel_ax, panel in zip(axes, panels):
    train_key, train_label, val_key, val_label, y_label, panel_title = panel
    panel_ax.plot(history.history[train_key], label=train_label)
    panel_ax.plot(history.history[val_key], label=val_label)
    panel_ax.set_xlabel('Epoch')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.legend()
    panel_ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# 3. EVALUATE THE ANN CLUSTERING
print("\n" + "="*50)
print("EVALUATION OF ANN CLUSTERING")
print("="*50)

# Predict on test set: the class with the highest softmax probability is
# taken as the predicted cluster.
y_pred_proba = model.predict(X_test)
y_pred = y_pred_proba.argmax(axis=1)

# Accuracy is measured against the K-Means labels of the held-out samples.
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Per-cluster precision / recall / F1
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: K-Means label, columns: ANN prediction)
cm = confusion_matrix(y_test, y_pred)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Cluster')
cm_ax.set_ylabel('True Cluster (K-Means)')
plt.show()

# 4. COMPARE K-MEANS AND ANN CLUSTERING
print("\n" + "="*50)
print("COMPARISON: K-MEANS vs ANN")
print("="*50)

# Predict cluster assignments for all data using ANN (this includes the
# samples the network was trained on).
all_pred_proba = model.predict(X_ann)
all_pred = all_pred_proba.argmax(axis=1)

# Element-wise match between the two assignments; its mean is the share of
# rows on which both methods agree.
matches = kmeans_labels == all_pred
agreement = matches.mean()
print(f"Agreement between K-Means and ANN: {agreement:.4f}")

# Per-country side-by-side view of both assignments
comparison_df = pd.DataFrame({
    'Country': countries,
    'KMeans_Cluster': kmeans_labels,
    'ANN_Cluster': all_pred,
    'Agreement': matches
})

mismatch_mask = ~comparison_df['Agreement']
n_mismatches = int(mismatch_mask.sum())
print(f"\nDisagreements: {n_mismatches} out of {len(comparison_df)} countries")

# List up to ten of the countries the two methods place differently.
if n_mismatches > 0:
    print("\nCountries with different cluster assignments:")
    disagreements = comparison_df[mismatch_mask]
    print(disagreements[['Country', 'KMeans_Cluster', 'ANN_Cluster']].head(10))


In [None]:
# 5. VISUALIZE CLUSTERS
print("\n" + "="*50)
print("CLUSTER VISUALIZATION")
print("="*50)

# Project the scaled features onto two principal components for plotting.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_ann)

print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {sum(pca.explained_variance_ratio_):.2%}")

# Three panels sharing the same 2-D projection
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
kmeans_ax, ann_ax, agree_ax = axes
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]

# Original K-Means clusters
scatter1 = kmeans_ax.scatter(pc1, pc2, c=kmeans_labels,
                             cmap='viridis', alpha=0.7, s=50)
kmeans_ax.set_title('K-Means Clustering')
plt.colorbar(scatter1, ax=kmeans_ax)

# ANN predicted clusters
scatter2 = ann_ax.scatter(pc1, pc2, c=all_pred,
                          cmap='plasma', alpha=0.7, s=50)
ann_ax.set_title('ANN Predicted Clusters')
plt.colorbar(scatter2, ax=ann_ax)

# Highlight disagreements: green where both methods agree, red otherwise.
colors = comparison_df['Agreement'].map({True: 'green', False: 'red'}).tolist()
scatter3 = agree_ax.scatter(pc1, pc2, c=colors, alpha=0.7, s=50)
agree_ax.set_title('Agreement between K-Means and ANN\n(Green=Agree, Red=Disagree)')

# Axis labels and grid are identical on every panel.
for panel_ax in axes:
    panel_ax.set_xlabel('PCA Component 1')
    panel_ax.set_ylabel('PCA Component 2')
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# 6. CLUSTER PROFILING AND INTERPRETATION
print("\n" + "="*50)
print("CLUSTER PROFILING")
print("="*50)

# Attach both cluster assignments to a copy of the original (unscaled) data,
# leaving `df` itself unchanged.
df_with_clusters = df.assign(KMeans_Cluster=kmeans_labels, ANN_Cluster=all_pred)

# Analyze each cluster's characteristics
def analyze_cluster(cluster_id, method='KMeans', data=None):
    """Print a short summary of one cluster and return key-feature statistics.

    Args:
        cluster_id: Value of the cluster label to select.
        method: Prefix of the label column; rows are selected from the
            ``f'{method}_Cluster'`` column (e.g. ``'KMeans'`` or ``'ANN'``).
        data: DataFrame holding a ``'Country'`` column, the cluster label
            column and the feature columns. When omitted, the notebook-level
            ``df_with_clusters`` frame is used, as before.

    Returns:
        dict mapping each key feature present in ``data`` to a dict with its
        ``'mean'``, ``'std'``, ``'min'`` and ``'max'`` within the cluster.
        Key features missing from ``data`` are skipped.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook global.
        data = df_with_clusters

    cluster_data = data[data[f'{method}_Cluster'] == cluster_id]

    # Select key features for profiling
    key_features = [
        'Population_millions_2024',
        'Population density_people per sq. km_2024',
        'Agri_Inputs_Agricultural employment_% of total employment_2020',
        'Edu_Participation_Gross enrollment ratio_Tertiary_% of relevant age group_2022',
        'Energy_Energy use_per capita_kilograms of oil equivalent_2015',
        'Population_Population age composition_Ages 65+_%_2024',
        'Labor_Labor force (ages 15 and older)_Total_millions_2021'
    ]

    print(f"\n{method} Cluster {cluster_id} Analysis:")
    print(f"Number of countries: {len(cluster_data)}")
    # Coerce to str so a missing or non-string country value cannot make
    # str.join raise.
    example_countries = [str(name) for name in cluster_data['Country'].head(5)]
    print(f"Example countries: {', '.join(example_countries)}")

    stats = {}
    for feature in key_features:
        if feature in cluster_data.columns:
            column = cluster_data[feature]
            stats[feature] = {
                'mean': column.mean(),
                'std': column.std(),
                'min': column.min(),
                'max': column.max()
            }

    return stats

# Analyze each K-Means cluster and show the returned statistics. Previously
# the return value was discarded, so the profile never reached the output.
for cluster_id in range(optimal_k):
    cluster_stats = analyze_cluster(cluster_id, 'KMeans')
    if cluster_stats:
        # One row per feature, one column per statistic (mean/std/min/max).
        print(pd.DataFrame(cluster_stats).T.round(2).to_string())

# 7. FEATURE IMPORTANCE ANALYSIS
print("\n" + "="*50)
print("FEATURE IMPORTANCE FOR CLUSTERING")
print("="*50)

# Use the first weight array of the model's first layer as a rough proxy for
# how strongly each input feature feeds into the network.
first_layer_weights = model.layers[0].get_weights()[0]

# Sum of absolute weights along axis 1 gives one score per row of the weight
# matrix; rows are paired with X.columns below.
feature_importance = np.sum(np.abs(first_layer_weights), axis=1)

# Rank the features and keep the 15 with the largest score.
importance_df = (
    pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_importance
    })
    .sort_values('Importance', ascending=False)
    .head(15)
)

print("\nTop 15 Most Important Features for ANN Clustering:")
print(importance_df.to_string(index=False))


In [None]:
# Visualize feature importance as a horizontal bar chart. The y-axis is
# inverted so the first (highest-scoring) row appears at the top.
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
bars = importance_ax.barh(importance_df['Feature'], importance_df['Importance'])
importance_ax.set_xlabel('Feature Importance (Weight Magnitude)')
importance_ax.set_title('Top 15 Features Influencing ANN Clustering Decisions')
importance_ax.invert_yaxis()
plt.tight_layout()
plt.show()

# 8. CLUSTER CENTROID ANALYSIS
print("\n" + "="*50)
print("CLUSTER CENTROID ANALYSIS")
print("="*50)

# Get cluster centroids from K-Means (these live in standardized space)
centroids = kmeans.cluster_centers_

# Inverse transform to original scale
centroids_original = scaler.inverse_transform(centroids)
centroids_df = pd.DataFrame(centroids_original, columns=X.columns)

# Select top features for centroid comparison
top_features = importance_df['Feature'].head(8).tolist()

print("\nCluster Centroids for Top Features:")
centroids_top = centroids_df[top_features]
print(centroids_top.round(2))

# Visualize centroid differences. The axes are created explicitly and handed
# to pandas: calling plt.figure() and then DataFrame.plot(figsize=...) opens
# two figures and leaves the first one empty.
centroid_fig, centroid_ax = plt.subplots(figsize=(12, 8))
centroids_top.T.plot(kind='bar', ax=centroid_ax)
centroid_ax.set_title('Cluster Centroids Comparison for Top Features')
centroid_ax.set_xlabel('Features')
centroid_ax.set_ylabel('Centroid Value (Original Scale)')
centroid_ax.legend(title='Cluster')
plt.setp(centroid_ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 9. PREDICTIVE POWER FOR NEW DATA
print("\n" + "="*50)
print("MODEL VALIDATION AND PREDICTIVE POWER")
print("="*50)

# Cross-validation to test model robustness
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neural_network import MLPClassifier

# A smaller scikit-learn MLP stands in for the Keras network here (for speed).
mlp_params = dict(
    hidden_layer_sizes=(64, 32),
    max_iter=100,
    random_state=42,
    early_stopping=True,
)
mlp = MLPClassifier(**mlp_params)

# 5-fold cross-validated accuracy of predicting the K-Means labels
cv_scores = cross_val_score(mlp, X_ann, y_ann, cv=5, scoring='accuracy')
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std() * 2  # reported as +/- two standard deviations

print(f"Cross-validation Accuracy Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_mean:.4f} (+/- {cv_spread:.4f})")

# 10. CLUSTER STABILITY ANALYSIS
print("\n" + "="*50)
print("CLUSTER STABILITY ANALYSIS")
print("="*50)

# Test clustering stability with different random seeds. Two things are
# measured per seed:
#   * the adjusted Rand index (ARI) between this clustering and the reference
#     `kmeans_labels`; ARI ignores how cluster ids are numbered, so it
#     measures whether the partition itself is stable;
#   * the held-out accuracy of a fresh ANN trained on this seed's labels.
stability_results = []
ari_results = []
for seed in range(5):
    kmeans_temp = KMeans(n_clusters=optimal_k, random_state=seed, n_init=10)
    labels_temp = kmeans_temp.fit_predict(X_scaled)

    ari_temp = adjusted_rand_score(kmeans_labels, labels_temp)
    ari_results.append(ari_temp)

    # Stratify like the main train/test split, but only when every cluster
    # has at least two members; otherwise stratification would raise.
    cluster_sizes = np.bincount(labels_temp, minlength=optimal_k)
    stratify_temp = labels_temp if cluster_sizes.min() >= 2 else None

    # Train ANN on these labels
    X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
        X_scaled, labels_temp, test_size=0.2, random_state=42,
        stratify=stratify_temp
    )

    y_train_cat_temp = to_categorical(y_train_temp, num_classes=optimal_k)

    model_temp = build_ann_model(input_dim, optimal_k)
    model_temp.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    model_temp.fit(
        X_train_temp, y_train_cat_temp,
        validation_split=0.2,
        epochs=50,
        batch_size=16,
        verbose=0
    )

    # Evaluate (verbose=0 keeps the per-seed output to the summary lines)
    y_pred_temp = np.argmax(model_temp.predict(X_test_temp, verbose=0), axis=1)
    accuracy_temp = accuracy_score(y_test_temp, y_pred_temp)

    stability_results.append(accuracy_temp)
    print(f"Seed {seed}: Test Accuracy = {accuracy_temp:.4f}")
    print(f"Seed {seed}: ARI vs reference K-Means = {ari_temp:.4f}")

print(f"\nAverage stability accuracy: {np.mean(stability_results):.4f}")
print(f"Stability std: {np.std(stability_results):.4f}")
print(f"Average ARI vs reference K-Means: {np.mean(ari_results):.4f}")

# 11. FINAL RECOMMENDATIONS
print("\n" + "="*50)
print("FINAL RECOMMENDATIONS")
print("="*50)

print("\n1. **Hybrid K-Means + ANN Approach Benefits:**")
print("   - ANN learns complex patterns beyond simple distance metrics")
print("   - Model can predict clusters for new data without rerunning K-Means")
print("   - Handles non-linear relationships between features")
print("   - Provides feature importance insights")

print("\n2. **When to Use This Approach:**")
print("   - Large datasets where K-Means alone is computationally expensive")
print("   - When you need to assign new points to existing clusters")
print("   - When clusters have complex, non-linear boundaries")
print("   - When interpretability of clustering decisions is important")

print("\n3. **Limitations:**")
print("   - Requires sufficient training data for ANN")
print("   - ANN predictions depend on K-Means initial clustering")
print("   - More complex than traditional clustering methods")
print("   - Risk of overfitting if ANN is too complex")

print("\n4. **For This Dataset:**")
print(f"   - {optimal_k} clusters identified")
# `agreement` is the share of all rows on which K-Means and the ANN coincide.
# The held-out test accuracy is a different number and is reported on its own
# line instead of being labelled as agreement.
print(f"   - {agreement:.2%} agreement between K-Means and ANN")
print(f"   - {accuracy:.2%} ANN accuracy on the held-out test set")
print(f"   - Top features: {importance_df['Feature'].head(3).tolist()}")
print("   - ANN successfully learned the clustering pattern")

# Save the trained model
model.save('kmeans_ann_clustering_model.h5')
print("\nModel saved as 'kmeans_ann_clustering_model.h5'")
