In [1]:
import pandas as pd 
import os 

In [7]:
# import os
# import pandas as pd

# def extract_silhouette_scores(base_path):
#     """
#     Extract silhouette scores from the specified directories and create a DataFrame with the results.
    
#     Parameters:
#     base_path (str): The base path where the result folders are stored.
    
#     Returns:
#     pd.DataFrame: DataFrame containing dataset names, cluster numbers, and silhouette scores.
#     """
#     results = []

#     # Walk through the directories
#     for root, dirs, files in os.walk(base_path):
#         for file in files:
#             if file == "silhouette_score.txt":
#                 # Extract the cluster number and dataset name from the directory structure
#                 parts = root.split('/')
            
#                 cluster_num = parts[-2].split('_')[-1]
#                 dataset_name = parts[-2].split('_')[0]
                
#                 # Read the silhouette score from the file
#                 score_file_path = os.path.join(root, file)
#                 with open(score_file_path, 'r') as f:
#                     score_line = f.read().strip()
#                     score = float(score_line.split(': ')[1])  # Extract the numeric value
                
#                 # Append the result to the list
#                 results.append({
#                     'Dataset': dataset_name,
#                     'Cluster_Num': int(cluster_num),
#                     'Silhouette_Score': score
#                 })

#     # Create a DataFrame from the results
#     df = pd.DataFrame(results)
#     return df

# # Example usage
# base_path = '/Volumes/T9/Data_downloads/new-data-outputs/ml_results/citycl'
# silhouette_scores_df = extract_silhouette_scores(base_path)

# # Display the DataFrame
# print(silhouette_scores_df)


In [26]:

def extract_clustering_metrics(base_path, dataset_from_parent=True):
    """
    Collect clustering quality scores stored under ``base_path`` into a DataFrame.

    Every directory below ``base_path`` (including ``base_path`` itself) that
    contains all three of ``silhouette_score.txt``, ``davies_bouldin_score.txt``
    and ``calinski_harabasz_score.txt`` contributes one row. Directories missing
    any of the three files are skipped.

    The cluster number is the last ``_``-separated token of the directory's own
    name. The dataset name is the first ``_``-separated token of the parent
    directory's name (default), or of the directory's own name when
    ``dataset_from_parent`` is False.

    Each score file is expected to contain ``<label>: <number>``; a file holding
    only the number is accepted as well.

    Parameters:
    base_path (str): The base path where the result folders are stored.
    dataset_from_parent (bool): Take the dataset name from the parent directory
        (True, default) or from the result directory itself (False).

    Returns:
    pd.DataFrame: Columns ``Dataset``, ``Cluster_Num``, ``Silhouette_Score``,
        ``Davies_Bouldin_Index`` and ``Calinski_Harabasz_Index``. When no
        directory qualifies the frame is empty but still has these columns.

    Raises:
    ValueError: If a directory name does not end in an integer token or a score
        cannot be parsed as a float.
    """
    # Output column -> file that stores the corresponding score.
    score_files = {
        'Silhouette_Score': 'silhouette_score.txt',
        'Davies_Bouldin_Index': 'davies_bouldin_score.txt',
        'Calinski_Harabasz_Index': 'calinski_harabasz_score.txt',
    }
    columns = ['Dataset', 'Cluster_Num', *score_files]
    results = []

    # Walk through the directories
    for root, _dirs, files in os.walk(base_path):
        if not all(filename in files for filename in score_files.values()):
            continue

        # abspath drops any trailing separator and works with the platform's
        # separator, so the directory names are never empty strings.
        run_dir = os.path.abspath(root)
        leaf_name = os.path.basename(run_dir)
        parent_name = os.path.basename(os.path.dirname(run_dir))
        dataset_source = parent_name if dataset_from_parent else leaf_name

        row = {
            'Dataset': dataset_source.split('_')[0],
            'Cluster_Num': int(leaf_name.split('_')[-1]),
        }
        for column, filename in score_files.items():
            with open(os.path.join(root, filename), 'r') as f:
                # Keep only what follows the last colon; float() ignores the
                # surrounding whitespace.
                row[column] = float(f.read().strip().rsplit(':', 1)[-1])
        results.append(row)

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(results, columns=columns)

def find_best_performers(df):
    """
    Summarise which row of ``df`` is best for each clustering metric.

    The row with the maximum ``Silhouette_Score``, the row with the minimum
    ``Davies_Bouldin_Index`` and the row with the maximum
    ``Calinski_Harabasz_Index`` are looked up, in that order.

    Parameters:
    df (pd.DataFrame): Frame with the columns ``Dataset``, ``Cluster_Num``,
        ``Silhouette_Score``, ``Davies_Bouldin_Index`` and
        ``Calinski_Harabasz_Index``.

    Returns:
    pd.DataFrame: Three rows (one per metric) with the columns ``Metric``,
        ``Best Performer``, ``Cluster_Num`` and ``Score``.
    """
    # (display label, score column, function returning the index label of the best row)
    criteria = [
        ('Silhouette Score', 'Silhouette_Score', pd.Series.idxmax),
        ('Davies-Bouldin Index', 'Davies_Bouldin_Index', pd.Series.idxmin),
        ('Calinski-Harabasz Index', 'Calinski_Harabasz_Index', pd.Series.idxmax),
    ]

    winners = [
        (label, column, df.loc[pick(df[column])])
        for label, column, pick in criteria
    ]

    return pd.DataFrame({
        'Metric': [label for label, _, _ in winners],
        'Best Performer': [row['Dataset'] for _, _, row in winners],
        'Cluster_Num': [row['Cluster_Num'] for _, _, row in winners],
        'Score': [row[column] for _, column, row in winners],
    })



In [28]:

# Example usage
# Result folders evaluated with this cell so far. The first two used to be
# assigned to base_path and then immediately overwritten, so they are kept
# here for reference only:
#   /Volumes/T9/Data_downloads/new-data-outputs/ml_results/citycl
#   /Volumes/T9/Data_downloads/new-data-outputs/ml_results/cityfinalv2   (uses data with the gas filter)
# Active run: uses all data, not including the gas filter.
base_path = '/Volumes/T9/Data_downloads/new-data-outputs/ml_results/v0_citycluster_all'
clustering_metrics_df = extract_clustering_metrics(base_path)

# Last expression of the cell: rendered as the best configuration per metric.
find_best_performers(clustering_metrics_df)

Unnamed: 0,Metric,Best Performer,Cluster_Num,Score
0,Silhouette Score,V49,5,0.458086
1,Davies-Bouldin Index,V51,8,0.700786
2,Calinski-Harabasz Index,V49,5,164.45614


In [33]:
# Load the scores stored under the v3_citycluster_all results folder; this
# rebinds the notebook-level base_path and clustering_metrics_df.
base_path = '/Volumes/T9/Data_downloads/new-data-outputs/ml_results/v3_citycluster_all' 
clustering_metrics_df = extract_clustering_metrics(base_path)

# Last expression of the cell: rendered as the best configuration per metric.
find_best_performers(clustering_metrics_df)

Unnamed: 0,Metric,Best Performer,Cluster_Num,Score
0,Silhouette Score,V55,9,0.456501
1,Davies-Bouldin Index,V55,9,0.660383
2,Calinski-Harabasz Index,V55,9,161.01892


In [34]:
# Row(s) for dataset V55 with 8 clusters. Both conditions are combined into a
# single mask: chaining two [] selections applied a mask built from the full
# frame to an already-filtered frame, which makes pandas warn that the boolean
# key has to be reindexed.
clustering_metrics_df.loc[
    clustering_metrics_df['Dataset'].isin(['V55'])
    & (clustering_metrics_df['Cluster_Num'] == 8)
]

  clustering_metrics_df[clustering_metrics_df['Dataset'].isin(['V55'])][clustering_metrics_df['Cluster_Num']==8 ]


Unnamed: 0,Dataset,Cluster_Num,Silhouette_Score,Davies_Bouldin_Index,Calinski_Harabasz_Index
9,V55,8,0.412886,0.684192,124.667069


In [15]:
# Display the metrics table currently bound to clustering_metrics_df
# (whichever results folder was loaded last in this kernel session).
clustering_metrics_df

Unnamed: 0,Dataset,Cluster_Num,Silhouette_Score,Davies_Bouldin_Index,Calinski_Harabasz_Index
0,cityfinalv2,5,0.411084,0.821578,126.587844
1,V41,5,0.388670,0.764450,117.202044
2,V41,6,0.355263,0.860230,90.700165
3,V41,7,0.352821,0.795240,101.272603
4,V41,8,0.357572,0.753416,99.118520
...,...,...,...,...,...
112,V61,5,0.435348,0.733885,105.658366
113,V61,6,0.398794,0.720636,94.792881
114,V61,7,0.319679,0.789494,74.922197
115,V61,8,0.378974,0.816748,88.275557


In [9]:

def extract_clustering_metrics(base_path, dataset_from_parent=False):
    """
    Collect clustering quality scores stored under ``base_path`` into a DataFrame.

    Every directory below ``base_path`` (including ``base_path`` itself) that
    contains all three of ``silhouette_score.txt``, ``davies_bouldin_score.txt``
    and ``calinski_harabasz_score.txt`` contributes one row. Directories missing
    any of the three files are skipped.

    Both the dataset name and the cluster number come from the result
    directory's own name by default: the first ``_``-separated token is the
    dataset and the last one is the cluster number (e.g. ``V49_5``). With
    ``dataset_from_parent=True`` the dataset name is instead the first
    ``_``-separated token of the parent directory's name.

    Each score file is expected to contain ``<label>: <number>``; a file holding
    only the number is accepted as well.

    Parameters:
    base_path (str): The base path where the result folders are stored.
    dataset_from_parent (bool): Take the dataset name from the parent directory
        (True) or from the result directory itself (False, default).

    Returns:
    pd.DataFrame: Columns ``Dataset``, ``Cluster_Num``, ``Silhouette_Score``,
        ``Davies_Bouldin_Index`` and ``Calinski_Harabasz_Index``. When no
        directory qualifies the frame is empty but still has these columns.

    Raises:
    ValueError: If a directory name does not end in an integer token or a score
        cannot be parsed as a float.
    """
    # Output column -> file that stores the corresponding score.
    score_files = {
        'Silhouette_Score': 'silhouette_score.txt',
        'Davies_Bouldin_Index': 'davies_bouldin_score.txt',
        'Calinski_Harabasz_Index': 'calinski_harabasz_score.txt',
    }
    columns = ['Dataset', 'Cluster_Num', *score_files]
    results = []

    # Walk through the directories
    for root, _dirs, files in os.walk(base_path):
        if not all(filename in files for filename in score_files.values()):
            continue

        # abspath drops any trailing separator and works with the platform's
        # separator, so the directory names are never empty strings.
        run_dir = os.path.abspath(root)
        leaf_name = os.path.basename(run_dir)
        parent_name = os.path.basename(os.path.dirname(run_dir))
        dataset_source = parent_name if dataset_from_parent else leaf_name

        row = {
            'Dataset': dataset_source.split('_')[0],
            'Cluster_Num': int(leaf_name.split('_')[-1]),
        }
        for column, filename in score_files.items():
            with open(os.path.join(root, filename), 'r') as f:
                # Keep only what follows the last colon; float() ignores the
                # surrounding whitespace.
                row[column] = float(f.read().strip().rsplit(':', 1)[-1])
        results.append(row)

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(results, columns=columns)


# Previously evaluated folder (its assignment was immediately overwritten, so
# it is kept here for reference only):
#   /Volumes/T9/Data_downloads/new-data-outputs/ml_results/cityfinal
base_path = '/Volumes/T9/Data_downloads/new-data-outputs/ml_results/cityfinalv2'
clustering_metrics_df = extract_clustering_metrics(base_path)

# Last expression of the cell: rendered as the best configuration per metric.
find_best_performers(clustering_metrics_df)

Unnamed: 0,Metric,Best Performer,Cluster_Num,Score
0,Silhouette Score,V49,5,0.450115
1,Davies-Bouldin Index,V51,8,0.673377
2,Calinski-Harabasz Index,V49,5,156.765394


In [6]:
# Keep only the rows whose Dataset is V49 or V46.
clustering_metrics_df.loc[
    clustering_metrics_df['Dataset'].isin(['V49', 'V46'])
]

Unnamed: 0,Dataset,Cluster_Num,Silhouette_Score,Davies_Bouldin_Index,Calinski_Harabasz_Index
46,V46,5,0.415272,0.741928,129.117304
47,V46,6,0.412868,0.794232,116.830581
48,V46,7,0.384714,0.765278,116.454251
49,V46,8,0.424395,0.67671,137.596403
50,V46,9,0.413697,0.813928,128.68292
51,V46,10,0.428521,0.805634,140.433839
64,V49,5,0.450115,0.744905,156.765394
65,V49,6,0.360556,0.883149,96.973267
66,V49,7,0.344211,0.837511,98.249069
67,V49,8,0.376594,0.820792,103.824158


In [37]:
# Keep only the rows whose Dataset is v81city or v83city.
clustering_metrics_df.loc[
    clustering_metrics_df['Dataset'].isin(['v81city', 'v83city'])
]

Unnamed: 0,Dataset,Cluster_Num,Silhouette_Score,Davies_Bouldin_Index,Calinski_Harabasz_Index
45,v83city,3,0.467493,0.731212,122.891593
46,v83city,5,0.477856,0.710944,156.826609
47,v83city,6,0.426387,0.814815,114.641046
48,v83city,7,0.432674,0.754352,118.603521
49,v83city,8,0.429977,0.697875,116.578673
50,v83city,9,0.475069,0.67657,149.736873
51,v83city,10,0.454011,0.751273,147.459995
52,v83city,11,0.394518,0.830874,127.382653
53,v83city,12,0.396611,0.794927,114.314504
72,v81city,3,0.433633,0.815181,107.466241


In [35]:
# Display the metrics table currently bound to clustering_metrics_df
# (whichever results folder was loaded last in this kernel session).
clustering_metrics_df

Unnamed: 0,Dataset,Cluster_Num,Silhouette_Score,Davies_Bouldin_Index,Calinski_Harabasz_Index
0,v85city,3,0.482365,0.721360,129.361402
1,v85city,5,0.421443,0.732061,117.825301
2,v85city,6,0.390060,0.858503,107.007205
3,v85city,7,0.447194,0.744901,128.968324
4,v85city,8,0.399014,0.717747,119.976485
...,...,...,...,...,...
105,v94city,6,0.327515,0.804331,79.387407
106,v94city,7,0.310446,0.805193,78.026972
107,v94city,8,0.343086,0.775736,94.998218
108,v94city,9,0.342970,0.823166,93.311294


Unnamed: 0,Metric,Best Performer,Cluster_Num,Score
0,Silhouette Score,v85city,3,0.484085
1,Davies-Bouldin Index,v86city,8,0.713608
2,Calinski-Harabasz Index,v87city,5,143.65272


In [23]:
# Order the metrics table by ascending Calinski-Harabasz index.
clustering_metrics_df.sort_values(by='Calinski_Harabasz_Index')

Unnamed: 0,Dataset,Cluster_Num,Silhouette_Score,Davies_Bouldin_Index,Calinski_Harabasz_Index
17,v86city,12,0.299912,0.857593,80.2356
44,v89city,12,0.287468,0.908837,87.945543
16,v86city,11,0.291735,0.874291,94.482732
7,v85city,11,0.310003,0.868123,95.432856
35,v88city,12,0.301331,0.843349,97.590807
8,v85city,12,0.353033,0.843369,99.63643
25,v87city,11,0.336905,0.819797,101.023571
13,v86city,8,0.368669,0.713608,101.088907
30,v88city,7,0.365989,0.818829,101.426815
14,v86city,9,0.35649,0.8209,101.636124


In [None]:
# Note: v86city with 8 clusters (matches the best Davies-Bouldin Index row reported above).