In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # For 3D plotting
import matplotlib.patches as mpatches
plt.rcParams['figure.dpi'] = 120
from PAM import *
from CLARA import *

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import silhouette_samples

def cluster_analytics(df, medoids_col='medoid', label_col='cluster_label'):
    """Summarise a medoid-based clustering with one row per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns plus a boolean medoid flag column and a
        cluster label column.
    medoids_col : str
        Name of the boolean column that is True on medoid rows.
    label_col : str
        Name of the column holding each row's cluster label.

    Returns
    -------
    pandas.DataFrame
        Indexed by cluster label (in ``value_counts`` order) with the columns
        'Cluster Size', 'Avg Silhouette Score', 'Avg Distance from Medoid'
        and 'Max/Min Distance Ratio'. A metric that is undefined for a
        cluster (no non-medoid members, no medoid row, or no other medoid to
        compare against) is reported as NaN instead of raising.

    Notes
    -----
    * Side effect kept for backward compatibility: the per-row silhouette
      value is written to ``df['silhouette']``.
    * Only the feature columns enter the silhouette and distance
      computations. The medoid flag, the label and any existing 'silhouette'
      column are excluded, so the distances are true feature-space distances
      and calling the function twice on the same frame gives the same result.
    """
    # Everything that is bookkeeping rather than a coordinate is left out.
    bookkeeping = [c for c in (medoids_col, label_col, 'silhouette') if c in df.columns]
    features = df.drop(columns=bookkeeping)
    labels = df[label_col]

    points = features.to_numpy(dtype=float)
    label_values = labels.to_numpy()
    is_medoid = df[medoids_col].to_numpy(dtype=bool)

    # 1. Cluster size
    cluster_sizes = labels.value_counts()
    summary = pd.DataFrame({'Cluster Size': cluster_sizes})

    # 2. Average silhouette per cluster
    df['silhouette'] = silhouette_samples(features, labels)
    summary['Avg Silhouette Score'] = df.groupby(label_col)['silhouette'].mean()

    # 3. Average distance from medoid within each cluster
    # 4. Max distance from the medoid over min distance to other medoids
    avg_distances = []
    max_min_ratios = []
    for label in cluster_sizes.index:
        in_cluster = label_values == label
        own_medoids = points[in_cluster & is_medoid]
        if len(own_medoids) == 0:
            # No medoid flagged for this cluster: both metrics are undefined.
            avg_distances.append(np.nan)
            max_min_ratios.append(np.nan)
            continue

        medoid = own_medoids[0]
        member_dists = np.linalg.norm(points[in_cluster & ~is_medoid] - medoid, axis=1)
        rival_dists = np.linalg.norm(points[is_medoid & ~in_cluster] - medoid, axis=1)

        avg_distances.append(member_dists.mean() if member_dists.size else np.nan)
        if member_dists.size and rival_dists.size:
            max_min_ratios.append(member_dists.max() / rival_dists.min())
        else:
            max_min_ratios.append(np.nan)

    # The lists were built in summary-index order, so positional assignment is safe.
    summary['Avg Distance from Medoid'] = avg_distances
    summary['Max/Min Distance Ratio'] = max_min_ratios

    return summary


### PAM testing

#### Test data

In [None]:
# Toy dataset: ten 2-D points whose x values fall in 1-10 or 25-29
df_dict = dict(
    x=[1, 5, 5, 5, 10, 25, 25, 25, 25, 29],
    y=[4, 1, 2, 4, 4, 4, 6, 7, 8, 7],
)

df = pd.DataFrame(data=df_dict)

In [None]:
# Eyeball the toy points before clustering them
df.plot.scatter(x='x', y='y', c='#157a30')

In [None]:
# Sweep k = 2..5 with PAM on the toy data and tabulate the two quality metrics
results = {
    'Number of Clusters': [],
    'Average Silhouette': [],
    'Average MSD': [],
}

for k in range(2, 6):
    print(k)
    clustering_model = PAM(df, k)
    clustering_model.fit(verbose=0)

    # evaluate_clustering_metrics() returns (average silhouette, average MSD)
    silhouette_avg, msd_avg = clustering_model.evaluate_clustering_metrics()

    # `results` keeps insertion order, so the tuple lines up with its keys
    for column, value in zip(results, (k, silhouette_avg, msd_avg)):
        results[column].append(value)

results_df = pd.DataFrame(results).set_index('Number of Clusters')
results_df

In [None]:
# Fit a two-cluster PAM model on the toy data and show its silhouette plot
pam = PAM(df, k=2)
pam.fit(verbose=2)
pam.plot_silhouette()

In [None]:
# NOTE(review): 'pca3' presumably selects a 3-component PCA view - confirm in PAM.visualize_clusters
pam.visualize_clusters(
    method='pca3',
    scale_factor=4,
)

In [None]:
# Write the PAM report for the toy data next to the notebook
pam.generate_report(
    scale_factor=4,
    return_report=False,
    save_markdown=True,
    save_plots=True,
    path='./',
    file_name='test_data_PAM_report.md',
)

In [None]:
# Per-cluster summary of the fitted PAM model.
# cluster_analytics() reads the 'medoid' and 'cluster_label' columns by default,
# so enrich_dataset() is expected to provide them - TODO confirm against PAM.
enriched_df = pam.enrich_dataset()
clustering_analytics = cluster_analytics(enriched_df)
clustering_analytics

In [None]:
# Inspect the enriched frame; cluster_analytics() has written a per-row 'silhouette' column into it
enriched_df

In [None]:
# Hand-entered 2-D point set (75 points) used as the second PAM test case
rupsini_dict = dict(
    x=[
        4, 5, 10, 9, 13, 13, 12, 15, 18, 19, 22, 27, 28, 24, 27,
        28, 30, 31, 32, 36, 28, 32, 35, 33, 38, 41, 38, 38, 32, 34,
        44, 44, 44, 46, 47, 49, 50, 53, 52, 55, 54, 60, 63, 86, 85,
        85, 78, 74, 97, 98, 98, 99, 99, 101, 108, 110, 108, 111, 115, 117,
        70, 77, 83, 61, 69, 78, 66, 58, 64, 69, 66, 61, 76, 72, 64,
    ],
    y=[
        53, 63, 59, 77, 49, 69, 88, 75, 61, 65, 74, 72, 76, 58, 55,
        60, 52, 60, 61, 72, 147, 149, 153, 154, 151, 150, 145, 143, 143, 141,
        156, 149, 143, 142, 149, 152, 142, 144, 152, 155, 124, 136, 139, 132, 115,
        96, 94, 96, 122, 116, 124, 119, 128, 115, 111, 111, 116, 126, 117, 115,
        4, 12, 21, 15, 15, 16, 18, 13, 20, 21, 23, 25, 27, 31, 30,
    ],
)
rupsini_df = pd.DataFrame(data=rupsini_dict)
rupsini_df.head()

In [None]:
# Eyeball the hand-entered points before clustering them
rupsini_df.plot.scatter(x='x', y='y', c='#157a30')

In [None]:
# Sweep k = 2..10 with PAM on the hand-entered data and tabulate the two quality metrics
results = {
    'Number of Clusters': [],
    'Average Silhouette': [],
    'Average MSD': [],
}

for k in range(2, 11):
    print(k)
    clustering_model = PAM(rupsini_df, k)
    clustering_model.fit(verbose=0)

    # evaluate_clustering_metrics() returns (average silhouette, average MSD)
    silhouette_avg, msd_avg = clustering_model.evaluate_clustering_metrics()

    # `results` keeps insertion order, so the tuple lines up with its keys
    for column, value in zip(results, (k, silhouette_avg, msd_avg)):
        results[column].append(value)

results_df = pd.DataFrame(results).set_index('Number of Clusters')
results_df

In [None]:
# Fit a four-cluster PAM model on the hand-entered data and show its silhouette plot
pam = PAM(rupsini_df, k=4)
pam.fit(verbose=2)
pam.plot_silhouette()

In [None]:
# Visualise the clusters of the PAM model fitted above
pam.visualize_clusters(scale_factor=5)

In [None]:
# NOTE(review): `path` is an absolute Google Drive location; the other report cells
# in this notebook write to './'. Outside a Colab session with Drive mounted this
# directory will not exist - confirm how generate_report handles that.
pam.generate_report(
    scale_factor=4,
    return_report=False,
    save_markdown=True,
    save_plots=True,
    path='/content/drive/MyDrive/MBA/Business Analytics/Project',
    file_name='rupsini_PAM_report.md',
)

### CLARA testing

#### Generate test dataset

In [None]:
## Generate test dataset

# Fixed seed so that Restart & Run All regenerates exactly the same points
SAMPLE_SEED = 42


def generate_samples(mean_x, mean_y, sigma, n, rng=None):
    """Draw `n` points from a 2-D normal distribution with a diagonal covariance.

    Parameters
    ----------
    mean_x, mean_y : float
        Coordinates of the distribution mean.
    sigma : float
        Standard deviation used for both axes (covariance is sigma**2 * I).
    n : int
        Number of points to draw.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, which is the original unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, 2).
    """
    mean = [mean_x, mean_y]
    cov = [[sigma**2, 0], [0, sigma**2]]  # Diagonal covariance, for spherical distribution
    sampler = np.random if rng is None else rng
    return sampler.multivariate_normal(mean, cov, n)


# One generator shared by the three draws keeps the whole dataset reproducible
sample_rng = np.random.default_rng(SAMPLE_SEED)

# Generating samples for each set of parameters
data1 = generate_samples(0, 10, 1.7, 120, rng=sample_rng)
data2 = generate_samples(20, 12, 0.7, 60, rng=sample_rng)
data3 = generate_samples(10, 20, 1.0, 20, rng=sample_rng)

# Creating DataFrames
df1 = pd.DataFrame(data1, columns=['x', 'y'])
df2 = pd.DataFrame(data2, columns=['x', 'y'])
df3 = pd.DataFrame(data3, columns=['x', 'y'])

# Concatenating into a single DataFrame with a fresh 0..n-1 index
clara_df = pd.concat([df1, df2, df3]).reset_index(drop=True)

# Scatter plot of the generated points
clara_df.plot.scatter(x='x', y='y', c='#157a30')

#### Get optimal number of clusters

In [None]:
# Sweep k = 2..10 with CLARA (5 samples per run) and tabulate the two quality metrics
results = {
    'Number of Clusters': [],
    'Average Silhouette': [],
    'Average MSD': [],
}

for k in range(2, 11):
    # NOTE(review): the PAM cells in this notebook construct PAM(df, k) without
    # `num_samples` and read metrics via evaluate_clustering_metrics(), so swapping
    # in PAM here needs more than a class-name change.
    clustering_model = CLARA(clara_df, k, num_samples=5)
    clustering_model.fit(verbose=0)

    # Metrics are read from attributes of the fitted model
    silhouette_avg = clustering_model.sample_silhouette_avg
    msd_avg = clustering_model.sample_msd_avg

    # `results` keeps insertion order, so the tuple lines up with its keys
    for column, value in zip(results, (k, silhouette_avg, msd_avg)):
        results[column].append(value)

results_df = pd.DataFrame(results).set_index('Number of Clusters')
results_df

#### Run clustering

In [None]:
# Final CLARA run with the chosen k, then write the markdown report next to the notebook
k = 3  # best choice based on metrics
clustering_model = CLARA(clara_df, k, num_samples=10)
clustering_model.fit(verbose=1)
clustering_model.generate_clara_report(
    file_name=f'Sph_Bivar_{k}clusters_CLARA_Report.md',
    save_markdown=True,
    path='./',
)

In [None]:
# Visualise the clusters of the CLARA model fitted above
clustering_model.visualize_clusters(scale_factor=5)

In [None]:
# Silhouette plot for the CLARA model fitted above
clustering_model.plot_silhouette()

#### Cluster analysis

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import silhouette_samples

def cluster_analytics(df, medoids_col='medoid', label_col='cluster_label'):
    """Summarise a medoid-based clustering with one row per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns plus a boolean medoid flag column and a
        cluster label column.
    medoids_col : str
        Name of the boolean column that is True on medoid rows.
    label_col : str
        Name of the column holding each row's cluster label.

    Returns
    -------
    pandas.DataFrame
        Indexed by cluster label (in ``value_counts`` order) with the columns
        'Cluster Size', 'Avg Silhouette Score', 'Avg Distance from Medoid'
        and 'Max/Min Distance Ratio'. A metric that is undefined for a
        cluster (no non-medoid members, no medoid row, or no other medoid to
        compare against) is reported as NaN instead of raising.

    Notes
    -----
    * Side effect kept for backward compatibility: the per-row silhouette
      value is written to ``df['silhouette']``.
    * Only the feature columns enter the silhouette and distance
      computations. The medoid flag, the label and any existing 'silhouette'
      column are excluded, so the distances are true feature-space distances
      and calling the function twice on the same frame gives the same result.
    """
    # Everything that is bookkeeping rather than a coordinate is left out.
    bookkeeping = [c for c in (medoids_col, label_col, 'silhouette') if c in df.columns]
    features = df.drop(columns=bookkeeping)
    labels = df[label_col]

    points = features.to_numpy(dtype=float)
    label_values = labels.to_numpy()
    is_medoid = df[medoids_col].to_numpy(dtype=bool)

    # 1. Cluster size
    cluster_sizes = labels.value_counts()
    summary = pd.DataFrame({'Cluster Size': cluster_sizes})

    # 2. Average silhouette per cluster
    df['silhouette'] = silhouette_samples(features, labels)
    summary['Avg Silhouette Score'] = df.groupby(label_col)['silhouette'].mean()

    # 3. Average distance from medoid within each cluster
    # 4. Max distance from the medoid over min distance to other medoids
    avg_distances = []
    max_min_ratios = []
    for label in cluster_sizes.index:
        in_cluster = label_values == label
        own_medoids = points[in_cluster & is_medoid]
        if len(own_medoids) == 0:
            # No medoid flagged for this cluster: both metrics are undefined.
            avg_distances.append(np.nan)
            max_min_ratios.append(np.nan)
            continue

        medoid = own_medoids[0]
        member_dists = np.linalg.norm(points[in_cluster & ~is_medoid] - medoid, axis=1)
        rival_dists = np.linalg.norm(points[is_medoid & ~in_cluster] - medoid, axis=1)

        avg_distances.append(member_dists.mean() if member_dists.size else np.nan)
        if member_dists.size and rival_dists.size:
            max_min_ratios.append(member_dists.max() / rival_dists.min())
        else:
            max_min_ratios.append(np.nan)

    # The lists were built in summary-index order, so positional assignment is safe.
    summary['Avg Distance from Medoid'] = avg_distances
    summary['Max/Min Distance Ratio'] = max_min_ratios

    return summary

In [None]:
# Enriched frame produced by the fitted CLARA model; preview the first three rows
enriched_dataset = clustering_model.enrich_dataset()
enriched_dataset.head(n=3)

In [None]:
# Per-cluster summary table for the CLARA result
cluster_analysis_df = cluster_analytics(enriched_dataset)
cluster_analysis_df