In [1]:
# Add Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
dataset_path = "./../datasets"

iris_dataset_path = dataset_path + "/iris.csv"                                         
ai_global_index_path = dataset_path + "/AI_index_db.csv"
global_earthquake_data_path = dataset_path + "/earthquakes.csv"

datasets = {
    "iris": pd.read_csv(iris_dataset_path),
    "ai_global_index": pd.read_csv(ai_global_index_path),
    "global_earthquake": pd.read_csv(global_earthquake_data_path)
}


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

import numpy as np

def affinity_propagation(S, damping=0.5, preference=None, max_iter=200, convergence_iter=15):
    """
    Custom Affinity Propagation implementation with full parameter support.
    
    Args:
        S: Similarity matrix (precomputed, shape [n_samples, n_samples])
        damping: Damping factor (0.5-1.0), same as sklearn's 'damping'
        preference: Preference for exemplars (if None, uses median(S) like sklearn)
        max_iter: Maximum iterations
        convergence_iter: Early stopping if no change
        
    Returns:
        Dictionary of {exemplar_index: [list_of_member_indices]}
    """
    n = S.shape[0]
    
    # Set preference (like sklearn's default)
    if preference is None:
        preference = np.median(S)
    
    # Initialize diagonal of S with preferences
    np.fill_diagonal(S, preference)
    
    # Initialize messages
    R = np.zeros((n, n))  # Responsibilities
    A = np.zeros((n, n))  # Availabilities
    
    exemplars_prev = np.zeros(n, dtype=int)
    stable_count = 0
    
    for iteration in range(max_iter):
        # ===== Responsibilities Update =====
        # Compute AS = A + S (for stability)
        AS = A + S
        
        # Find max values (excluding diagonal)
        max_indices = np.argmax(AS, axis=1)
        max_values = AS[np.arange(n), max_indices]
        
        # Set max values to -inf for secondary max calculation
        AS_copy = AS.copy()
        AS_copy[np.arange(n), max_indices] = -np.inf
        secondary_max = np.max(AS_copy, axis=1)
        
        # Update responsibilities with damping
        R_new = S - max_values[:, np.newaxis]
        R_new[np.arange(n), max_indices] = S[np.arange(n), max_indices] - secondary_max
        R = damping * R + (1 - damping) * R_new
        
        # ===== Availabilities Update =====
        # Compute positive responsibilities
        Rp = np.maximum(R, 0)
        np.fill_diagonal(Rp, R.diagonal())  # Keep self-responsibility
        
        # Update availabilities with damping
        A_new = np.sum(Rp, axis=0) - Rp
        A_new = np.minimum(A_new, 0)
        
        # Self-availability update
        A_new.flat[::n+1] = np.sum(Rp, axis=0) - np.diag(Rp)
        
        A = damping * A + (1 - damping) * A_new
        
        # ===== Convergence Check =====
        current_exemplars = np.argmax(A + R, axis=1)
        if np.array_equal(current_exemplars, exemplars_prev):
            stable_count += 1
            if stable_count >= convergence_iter:
                break
        else:
            stable_count = 0
            
        exemplars_prev = current_exemplars
    
    # ===== Cluster Assignment =====
    exemplars = np.unique(current_exemplars)
    clusters = {e: [] for e in exemplars}
    
    for i in range(n):
        clusters[current_exemplars[i]].append(i)
    
    return clusters

In [5]:
def find_optimum_metrics(datasets):
    results = {}
    
    for name, df in datasets.items():
        print(f"\n=== Processing {name} dataset ===")
        df = df.dropna()
        X = StandardScaler().fit_transform(df.select_dtypes(include=[np.number]))
        
        param_grid = {
            'damping': [0.5, 0.7, 0.9],
            'preference': [-200, -100, -50, -10]  # Wider range for preference
        }
        
        best_params = None
        best_metrics = {
            'Silhouette Score': -1,
            'Davies-Bouldin Index': float('inf'),
            'Calinski-Harabasz Index': -1,
            'n_clusters': 0
        }
        
        for damping, preference in product(param_grid['damping'], param_grid['preference']):
            print(f"\nTesting damping={damping}, preference={preference}")
            
            try:
                ap = AffinityPropagation(damping=damping, preference=preference, random_state=42)
                labels = ap.fit_predict(X)
                
                # Handle noise points (label = -1)
                unique_labels = set(labels)
                n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)
                
                if n_clusters < 2:
                    print(f"Only {n_clusters} clusters formed - skipping")
                    continue
                    
                print(f"Formed {n_clusters} clusters")
                
                metrics = {
                    'Silhouette Score': silhouette_score(X, labels),
                    'Davies-Bouldin Index': davies_bouldin_score(X, labels),
                    'Calinski-Harabasz Index': calinski_harabasz_score(X, labels),
                    'n_clusters': n_clusters
                }
                
                print("Current metrics:", {k: round(v, 4) if isinstance(v, float) else v 
                                         for k, v in metrics.items()})
                
                # Update best metrics (prioritizing Silhouette Score)
                if metrics['Silhouette Score'] > best_metrics['Silhouette Score']:
                    best_params = {'damping': damping, 'preference': preference}
                    best_metrics.update(metrics)
                    print("⭐ New best parameters found!")
                    
            except Exception as e:
                print(f"❌ Clustering failed: {str(e)}")
                continue
                
        # Store and display final results
        results[name] = {
            'Best Parameters': best_params,
            'Best Metrics': best_metrics
        }
        
        print("\n=== Final Results ===")
        print(f"Dataset: {name}")
        print("Optimal Parameters:", best_params)
        print("Best Metrics:")
        for k, v in best_metrics.items():
            print(f"- {k}: {round(v, 4) if isinstance(v, float) else v}")
        print("="*50)
    
    return results

# Usage
results = find_optimum_metrics({'ai_global_index': datasets.get('ai_global_index')})


=== Processing ai_global_index dataset ===


NameError: name 'product' is not defined

In [6]:
results = find_optimum_metrics({'iris': datasets.get('iris')})


=== Processing iris dataset ===


NameError: name 'product' is not defined

In [7]:
results = find_optimum_metrics({'global_earthquake': datasets.get('global_earthquake')})


=== Processing global_earthquake dataset ===


NameError: name 'product' is not defined

In [8]:
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (adjusted_rand_score, silhouette_score, 
                           davies_bouldin_score, calinski_harabasz_score)

def compare_affinity_implementations(datasets, best_parameters):
    """
    Compare custom and sklearn implementations of Affinity Propagation.
    
    Args:
        datasets: Dictionary of {dataset_name: dataframe}
        best_parameters: Dictionary of {dataset_name: parameters_dict}
                        where parameters_dict contains 'damping' and 'preference'
    
    Returns:
        Dictionary containing comparison results for all datasets
    """
    all_results = {}

    for name, df in datasets.items():
        print(f"\n{'='*50}\nProcessing dataset: {name}\n{'='*50}")
        df = df.dropna()

        # Get best parameters for this dataset
        params = best_parameters.get(name, {})
        damping = params.get('damping', 0.5)
        preference = params.get('preference', None)
        
        print(f"Using parameters - Damping: {damping}, Preference: {preference}")

        # Extract numerical features and normalize
        X = df.select_dtypes(include=[np.number]).values
        X = StandardScaler().fit_transform(X)
        
        # Compute similarity matrix
        S = -np.square(np.linalg.norm(X[:, None] - X[None, :], axis=2))
        
        # If preference is None, use median similarity (like sklearn default)
        if preference is None:
            preference = np.median(S)
            print(f"Using computed preference: {preference:.2f}")
        
        results = {'Custom Affinity': {}, 'Sklearn Affinity': {}}
        
        # Custom Implementation
        try:
            print("\nRunning custom Affinity Propagation...")
            custom_clusters = affinity_propagation(S, damping=damping, preference=preference)
            custom_labels = np.zeros(len(X), dtype=int)
            
            for cluster_id, members in enumerate(custom_clusters.values()):
                for idx in members:
                    custom_labels[idx] = cluster_id
            
            n_custom_clusters = len(set(custom_labels))
            print(f"Custom AP created {n_custom_clusters} clusters")
            
            if n_custom_clusters < 2 or n_custom_clusters >= len(X):
                raise ValueError(f"Invalid cluster count: {n_custom_clusters}")
                
            results['Custom Affinity'] = {
                "ARI": None,  # Will fill after sklearn runs
                "Silhouette": silhouette_score(X, custom_labels),
                "DBI": davies_bouldin_score(X, custom_labels),
                "CHI": calinski_harabasz_score(X, custom_labels),
                "n_clusters": n_custom_clusters
            }
            
        except Exception as e:
            print(f"Custom AP failed: {str(e)}")
            results['Custom Affinity'] = {
                "ARI": np.nan, "Silhouette": np.nan, 
                "DBI": np.nan, "CHI": np.nan, "n_clusters": 0
            }

        # Sklearn Implementation
        try:
            print("\nRunning sklearn Affinity Propagation...")
            sklearn_ap = AffinityPropagation(
                affinity='precomputed',
                damping=damping,
                preference=preference,
                random_state=42
            )
            sklearn_labels = sklearn_ap.fit_predict(S)
            
            n_sklearn_clusters = len(set(sklearn_labels))
            print(f"Sklearn AP created {n_sklearn_clusters} clusters")
            
            if n_sklearn_clusters < 2 or n_sklearn_clusters >= len(X):
                raise ValueError(f"Invalid cluster count: {n_sklearn_clusters}")
                
            results['Sklearn Affinity'] = {
                "ARI": adjusted_rand_score(sklearn_labels, custom_labels) if 'custom_labels' in locals() else np.nan,
                "Silhouette": silhouette_score(X, sklearn_labels),
                "DBI": davies_bouldin_score(X, sklearn_labels),
                "CHI": calinski_harabasz_score(X, sklearn_labels),
                "n_clusters": n_sklearn_clusters
            }
            
            # Update ARI for custom if sklearn succeeded
            if 'custom_labels' in locals():
                results['Custom Affinity']['ARI'] = adjusted_rand_score(sklearn_labels, custom_labels)
                
        except Exception as e:
            print(f"Sklearn AP failed: {str(e)}")
            results['Sklearn Affinity'] = {
                "ARI": np.nan, "Silhouette": np.nan, 
                "DBI": np.nan, "CHI": np.nan, "n_clusters": 0
            }

        # Print results
        print("\nComparison Results:")
        for method in results:
            print(f"\n{method}:")
            for metric, value in results[method].items():
                if isinstance(value, float):
                    print(f"{metric:>12}: {value:.4f}")
                else:
                    print(f"{metric:>12}: {value}")
        
        all_results[name] = results

    # Save results
    results_df = pd.DataFrame.from_dict(
        {(dataset, method): metrics 
         for dataset in all_results 
         for method, metrics in all_results[dataset].items()},
        orient='index'
    )
    results_df.to_csv("./../results/affinity_comparison_metrics.csv")
    print("\nSaved results to affinity_comparison_metrics.csv")
    
    return all_results


# Example Usage
if __name__ == "__main__":
    # Load your datasets
    

    # Define the best parameters for Affinity Propagation
    best_parameters = {
        'iris': {'damping': 0.7, 'preference': -100},
        'ai_global_index': {'damping': 0.9, 'preference': -10},
        'global_earthquake': {'damping': 0.5, 'preference': -200},
    }

    # Compare implementations
    results = compare_affinity_implementations(datasets, best_parameters)


Processing dataset: iris
Using parameters - Damping: 0.7, Preference: -100

Running custom Affinity Propagation...
Custom AP created 2 clusters

Running sklearn Affinity Propagation...
Sklearn AP created 2 clusters

Comparison Results:

Custom Affinity:
         ARI: 1.0000
  Silhouette: 0.5802
         DBI: 0.5976
         CHI: 248.9034
  n_clusters: 2

Sklearn Affinity:
         ARI: 1.0000
  Silhouette: 0.5802
         DBI: 0.5976
         CHI: 248.9034
  n_clusters: 2

Processing dataset: ai_global_index
Using parameters - Damping: 0.9, Preference: -10

Running custom Affinity Propagation...
Custom AP created 8 clusters

Running sklearn Affinity Propagation...
Sklearn AP created 2 clusters

Comparison Results:

Custom Affinity:
         ARI: 0.0143
  Silhouette: 0.2562
         DBI: 0.8951
         CHI: 33.1685
  n_clusters: 8

Sklearn Affinity:
         ARI: 0.0143
  Silhouette: 0.7358
         DBI: 0.1731
         CHI: 25.9479
  n_clusters: 2

Processing dataset: global_earthqua

In [9]:
# Example Usage
if __name__ == "__main__":
    # Load your datasets
    

    # Define the best parameters for Affinity Propagation
    best_parameters = {
        'iris': {'damping': 0.7, 'preference': -100},
        'ai_global_index': {'damping': 0.9, 'preference': -10},
        'global_earthquake': {'damping': 0.5, 'preference': -200},
    }

    # Prepare datasets dictionary
    

    # Compare implementations
    results = compare_affinity_implementations(datasets, best_parameters)


Processing dataset: iris
Using parameters - Damping: 0.7, Preference: -100

Running custom Affinity Propagation...
Custom AP created 2 clusters

Running sklearn Affinity Propagation...
Sklearn AP created 2 clusters

Comparison Results:

Custom Affinity:
         ARI: 1.0000
  Silhouette: 0.5802
         DBI: 0.5976
         CHI: 248.9034
  n_clusters: 2

Sklearn Affinity:
         ARI: 1.0000
  Silhouette: 0.5802
         DBI: 0.5976
         CHI: 248.9034
  n_clusters: 2

Processing dataset: ai_global_index
Using parameters - Damping: 0.9, Preference: -10

Running custom Affinity Propagation...
Custom AP created 8 clusters

Running sklearn Affinity Propagation...
Sklearn AP created 2 clusters

Comparison Results:

Custom Affinity:
         ARI: 0.0143
  Silhouette: 0.2562
         DBI: 0.8951
         CHI: 33.1685
  n_clusters: 8

Sklearn Affinity:
         ARI: 0.0143
  Silhouette: 0.7358
         DBI: 0.1731
         CHI: 25.9479
  n_clusters: 2

Processing dataset: global_earthqua