# Clustering process for outcomes

In [None]:
# Echo the two path variables as cell output.
# NOTE(review): neither name is assigned before this cell in the visible notebook
# (output_path is only assigned further down, input_path nowhere) — confirm they
# come from an earlier session, otherwise this cell raises NameError.
output_path
input_path

## Clustering process for outcomes

### Preparing for clustering

In [None]:
import pandas as pd
import json
import re

In [None]:
# Load the concatenated policy-extraction CSV; extract_items() later reads its
# 'extracted_features_and_correlations' column.
df = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/3_exit/extract_policies_ML_concat.csv")

In [None]:
import pandas as pd
import json
import re

# Function to clean and parse JSON
def clean_and_parse_json(json_string):
    """
    Parse a JSON object string, tolerating trailing commas.

    The text is first parsed exactly as given, so valid JSON is never altered
    (a string value containing ", }" stays intact). Only when that fails are
    trailing commas before "}" / "]" stripped and the parse retried.

    Parameters:
    - json_string: Raw text expected to hold a JSON object.

    Returns:
    - The parsed object, or None when the input is not a string, does not
      start with "{", or still cannot be parsed after cleaning.
    """
    # Non-string cells (e.g. NaN coming from pandas) cannot contain JSON
    if not isinstance(json_string, str):
        return None

    # Skip invalid strings
    if not json_string.strip().startswith("{"):
        return None

    # Fast path: already valid JSON, leave it untouched
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        pass

    # Fallback: remove trailing commas, then retry
    cleaned_string = re.sub(r",\s*}", "}", json_string)
    cleaned_string = re.sub(r",\s*]", "]", cleaned_string)
    try:
        return json.loads(cleaned_string)
    except json.JSONDecodeError:
        return None

# Function to check if the JSON is meaningful
def is_meaningful_json(parsed_data):
    """
    Tell whether a parsed JSON object carries anything other than "None" keys.

    Parameters:
    - parsed_data: Result of parsing one JSON cell.

    Returns:
    - False for non-dict input, for an empty dict, and for a dict whose only
      key is "None" with a value that is not a dict containing another key.
      True as soon as one top-level key differs from "None", or one dict
      value contains a key that differs from "None".
    """
    if not isinstance(parsed_data, dict):
        return False

    def _entry_is_meaningful(name, payload):
        if name != "None":
            return True
        if not isinstance(payload, dict):
            return False
        return any(inner != "None" for inner in payload)

    return any(_entry_is_meaningful(name, payload) for name, payload in parsed_data.items())

def extract_items(dataframe):
    """
    Flatten the JSON stored in 'extracted_features_and_correlations' into one
    row per (source row, policy, factor).

    Parameters:
    - dataframe: DataFrame with an 'extracted_features_and_correlations'
      column holding JSON text. Each top-level key other than "GEOGRAPHIC" is
      treated as a policy whose value is a dict that may hold "ACTOR", "MODE",
      "POPULATION" and "FACTOR"; "FACTOR" maps factor names to dicts that may
      hold "CORRELATION".

    Returns:
    - DataFrame indexed by (row_index, policy_index, factor_index) with the
      columns GEOGRAPHIC, POLICY, ACTOR, MODE, POPULATION, FACTOR, CORRELATION.
      Missing values are the string "None". A policy without any usable factor
      still gets one row with FACTOR and CORRELATION set to "None". When no
      row can be extracted, the result is empty but keeps the same index
      levels and columns.
    """
    index_columns = ['row_index', 'policy_index', 'factor_index']
    value_columns = ['GEOGRAPHIC', 'POLICY', 'ACTOR', 'MODE', 'POPULATION', 'FACTOR', 'CORRELATION']

    new_rows = []
    for idx, row in dataframe.iterrows():
        extracted_data = row['extracted_features_and_correlations']

        # Skip rows with "No abstract" or invalid data
        if extracted_data == "No abstract" or not isinstance(extracted_data, str):
            continue

        # Clean and parse JSON content
        parsed_data = clean_and_parse_json(extracted_data)
        if parsed_data is None or not is_meaningful_json(parsed_data):
            print(f"Skipping non-meaningful JSON for index {idx}")
            continue

        geographic = parsed_data.get("GEOGRAPHIC", "None")
        policy_index = 0  # 1-based counter of valid policies within this source row

        for policy, details in parsed_data.items():
            if policy == "GEOGRAPHIC":  # Metadata key, not a policy
                continue

            # Ensure details is a dictionary before accessing keys
            if not isinstance(details, dict):
                print(f"Skipping invalid details for policy {policy} at index {idx}: {details}")
                continue

            policy_index += 1

            # Fields shared by every row generated for this policy
            policy_fields = {
                'row_index': idx,  # Use original index as reference
                'policy_index': policy_index,
                'GEOGRAPHIC': geographic,
                'POLICY': policy,
                'ACTOR': details.get("ACTOR", "None"),
                'MODE': details.get("MODE", "None"),
                'POPULATION': details.get("POPULATION", "None"),
            }

            factors = details.get("FACTOR", {})
            if isinstance(factors, dict) and factors:
                for factor_index, (factor, corr_details) in enumerate(factors.items(), start=1):
                    # A factor entry that is not a dict carries no readable correlation
                    if isinstance(corr_details, dict):
                        correlation = corr_details.get("CORRELATION", "None")
                    else:
                        correlation = "None"
                    new_rows.append({
                        **policy_fields,
                        'factor_index': factor_index,
                        'FACTOR': factor,
                        'CORRELATION': correlation,
                    })
            else:
                # No factors (key missing, empty dict, or not a dict): keep the
                # policy itself so that it is not silently dropped
                new_rows.append({
                    **policy_fields,
                    'factor_index': 1,
                    'FACTOR': "None",
                    'CORRELATION': "None",
                })

    # Explicit columns keep set_index working even when nothing was extracted
    new_df = pd.DataFrame(new_rows, columns=index_columns + value_columns)

    # Set multi-level index
    new_df.set_index(index_columns, inplace=True)

    return new_df

In [None]:
# Flatten the JSON column of df into one row per (source row, policy, factor)
extracted_items_df = extract_items(df)

In [None]:
def concatenate_columns(df):
    """
    Add a 'concatenated_policy' column built from POLICY and MODE.

    For each row the two values are converted to text and joined with a single
    space, leaving out any value equal to the string "None". The column is
    written onto the DataFrame that was passed in, and that same object is
    returned.
    """
    def _join_policy_and_mode(record):
        kept_parts = []
        for part in (record['POLICY'], record['MODE']):
            if part != "None":
                kept_parts.append(str(part))
        return " ".join(kept_parts)

    df['concatenated_policy'] = df.apply(_join_policy_and_mode, axis=1)
    return df

# Add the 'concatenated_policy' column. concatenate_columns() writes onto the
# frame it receives and returns it, so updated_df and extracted_items_df refer
# to the same object afterwards.
updated_df = concatenate_columns(extracted_items_df)

In [None]:
# Load the manually edited cluster table (semicolon-separated) and drop every
# row that has a missing value in any column.
cluster_df = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/4_clustering/2_modif_cluster.csv",sep=';').dropna()

In [None]:
import ast  # Import the ast module to parse string representations of lists

# Ensure the 'sentences' column in cluster_df is treated as a list
cluster_df['sentences'] = cluster_df['sentences'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Build a sentence -> [cluster names] lookup once, so each policy is resolved
# with a single dictionary access instead of rescanning every cluster row.
# Cluster rows are visited in table order, which keeps the order of matches
# identical to a row-by-row scan.
clusters_by_sentence = {}
for _, cluster_row in cluster_df.iterrows():
    # Only string sentences can equal a concatenated policy; dict.fromkeys
    # removes repeats so one cluster row is recorded at most once per sentence
    cluster_sentences = dict.fromkeys(
        sentence for sentence in cluster_row['sentences'] if isinstance(sentence, str)
    )
    for sentence in cluster_sentences:
        clusters_by_sentence.setdefault(sentence, []).append(cluster_row['Cluster Name'])

# Initialize an empty list to store results
results = []

# Iterate over each row in updated_df
for index, row in updated_df.iterrows():
    policy = row['concatenated_policy']
    # Exact match of the policy against the clusters' sentence lists
    matched_clusters = clusters_by_sentence.get(policy, [])

    # If matches are found, create one duplicate row per matched cluster
    if matched_clusters:
        for cluster_index, cluster in enumerate(matched_clusters, start=1):
            new_row = row.copy()  # Copy the original row
            new_row['matched_cluster'] = cluster
            new_row['cluster_index'] = cluster_index  # 1-based rank of the match
            results.append(new_row)  # Add the new row to the results
    else:
        # If no match is found, add the original row with default cluster_index as 1
        row['matched_cluster'] = None
        row['cluster_index'] = 1  # Default index
        results.append(row)

# Create a new DataFrame from the results
expanded_df = pd.DataFrame(results)

# Ensure the cluster_index column is an integer type (the column does not
# exist when updated_df was empty and no row was produced)
if 'cluster_index' in expanded_df.columns:
    expanded_df['cluster_index'] = expanded_df['cluster_index'].astype(int)

In [None]:
# Turn the index back into ordinary columns and give the generated
# level_0 / level_1 / level_2 columns the names of the index levels created by
# extract_items (row_index, policy_index, factor_index).
expanded_df = expanded_df.reset_index().rename(columns={"level_0":"row_index","level_1":"policy_index","level_2":"factor_index"})

In [None]:
# Save the expanded policy/cluster table without the index column.
# NOTE(review): output_path is an empty placeholder and must be set before
# running; presumably it should point at the policy_cluster_factor_raw.csv file
# read in the Filtering section — confirm.
output_path  = "" 
  # Update with your desired output path
expanded_df.to_csv(output_path, index=False)

### Filtering

In [None]:
# Reload the raw policy/cluster/factor table from disk, replacing the
# in-memory expanded_df.
expanded_df = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/5_final_db/policy_cluster_factor_raw.csv" )

In [None]:
# Remove, in place, the rows whose CORRELATION is the literal text 'None' or 'none'.
# NOTE(review): recent pandas versions appear to parse the text "None" as NaN in
# read_csv by default; if so these rows arrive as NaN and are not removed here —
# confirm against the installed pandas version.
placeholder_rows = expanded_df.index[expanded_df['CORRELATION'].isin(['None', 'none'])]
expanded_df.drop(placeholder_rows, inplace=True)

# Remove, in place, the rows that were not matched to any cluster
expanded_df.dropna(subset=['matched_cluster'], inplace=True)

In [None]:
# Raw CORRELATION labels grouped by the direction they are folded into.
# NOTE(review): 'competing' is listed under both decrease and neutral; decrease
# is applied first, so it ends up as 'decreasing'. 'improving' is listed twice
# under increase. Confirm the intended grouping.
list_decrease = [ 'competing', 'regressive', 'worsening']
list_increase = ['improving', 'enhancing', 'improving', 'increasing in Germany','increasing in Germany, decreasing in California', 'reproduced or disrupted', 'U-shaped']
list_neutral = ['changing', 'competing', 'impact', 'impacted', 'influenced', 'influencing', 'may increase or decrease', 'mixed', 'stable', 'sustaining', 'transformations', 'transforming', 'variability', 'varying']

# Rewrite the labels group by group, in the order decrease -> increase -> neutral
for raw_labels, normalised_label in (
    (list_decrease, 'decreasing'),
    (list_increase, 'increasing'),
    (list_neutral, 'neutral'),
):
    expanded_df.loc[expanded_df['CORRELATION'].isin(raw_labels), 'CORRELATION'] = normalised_label

In [None]:
# Save the filtered table, with normalised CORRELATION labels, without the index column
output_path  = "C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/5_final_db/1_policy_cluster_factor_filtered.csv" 
  # Destination of the filtered dataset; update with your desired output path
expanded_df.to_csv(output_path, index=False)

### Clustering HDBSCAN

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import HDBSCAN
from joblib import Parallel, delayed
import numpy as np

In [None]:
# Reload the filtered table written by the Filtering section.
# NOTE(review): nothing in the visible cells reads this variable afterwards;
# presumably preprocessed_corpus is derived from it in a cell not shown — confirm.
policy_cluster_factor_filtered = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/5_final_db/1_policy_cluster_factor_filtered.csv" )

In [None]:
# Step 1: Load the "all-MiniLM-L6-v2" sentence-embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2: Give the corpus a continuous 0..n-1 index, so that positions in
# corpus_list line up with labels in preprocessed_corpus.
# NOTE(review): preprocessed_corpus is not assigned anywhere in the visible
# notebook before this line; it must already exist (a pandas object with
# reset_index and tolist), otherwise this raises NameError — confirm its origin.
preprocessed_corpus = preprocessed_corpus.reset_index(drop=True)

# Convert to a plain list so it can be sliced into batches
corpus_list = preprocessed_corpus.tolist()

# Step 3: Batch Embedding Function
def embed_batch(batch):
    """
    Encode one batch with the module-level `embedder`, with the progress bar
    switched off, and return whatever `embedder.encode` produces.
    """
    batch_vectors = embedder.encode(batch, show_progress_bar=False)
    return batch_vectors

# Step 4: Generate Embeddings in Batches (Parallelized)
def parallel_embedding(corpus, batch_size=512):
    """
    Embed `corpus` in consecutive slices of `batch_size` items.

    Each slice is passed to embed_batch through joblib's Parallel with
    n_jobs=-1, and the per-slice results are stacked vertically, in slice
    order, into a single array.

    Parameters:
    - corpus: Sliceable sequence of items to embed.
    - batch_size: Number of items per slice (default 512).

    Returns:
    - Array produced by np.vstack over the per-slice embeddings.
    """
    slice_starts = range(0, len(corpus), batch_size)
    pending_jobs = (
        delayed(embed_batch)(corpus[start:start + batch_size])
        for start in slice_starts
    )
    per_slice_embeddings = Parallel(n_jobs=-1)(pending_jobs)
    return np.vstack(per_slice_embeddings)

# Encode the whole corpus, 512 sentences per batch
batch_size = 512
corpus_embeddings = parallel_embedding(corpus_list, batch_size=batch_size)

# Step 5: Reduce the embeddings to 50 PCA components (fixed random_state),
# then pass the reduced vectors through scikit-learn's normalize()
pca = PCA(n_components=50, random_state=42)
reduced_embeddings = pca.fit_transform(corpus_embeddings)
reduced_embeddings = normalize(reduced_embeddings)

# Step 6: Apply HDBSCAN Clustering
# The number of clusters is not passed in; it is read back from the labels below
hdbscan_model = HDBSCAN(
    min_cluster_size=50,  # Minimum cluster size
    min_samples=10,        # Minimum samples in a neighborhood for a core point
    metric='euclidean',   # Distance metric
    cluster_selection_epsilon=0.5  # Adjust for fine-grained clustering
)
cluster_assignment = hdbscan_model.fit_predict(reduced_embeddings)

# Step 7: Count and print the clusters
# The label -1 is treated as noise and is not counted as a cluster
num_clusters_found = len(set(cluster_assignment)) - (1 if -1 in cluster_assignment else 0)
print(f"Number of clusters found: {num_clusters_found}")

# Group sentences by cluster label; the label is used directly as the list
# position, which assumes labels run from 0 to num_clusters_found - 1
clustered_sentences = [[] for _ in range(num_clusters_found)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id != -1:  # Exclude noise points
        clustered_sentences[cluster_id].append(corpus_list[sentence_id])

# Print clusters with 1-based numbering (label 0 is shown as "Cluster 1")
for i, cluster in enumerate(clustered_sentences):
    print(f"Cluster {i + 1}:")
    print(cluster)
    print("")

### Clustering re-processing

In [None]:
import pandas as pd
import random

# Collapse repeated sentences inside each cluster (set-based, so the original
# sentence order is not kept)
unique_clustered_sentences = [list(set(cluster)) for cluster in clustered_sentences]

# Build one summary record per cluster
data = []
for cluster_num, sentences in enumerate(unique_clustered_sentences):
    # Work on the text form of every sentence
    sentences = list(map(str, sentences))

    # Size of the de-duplicated cluster
    num_sentences = len(sentences)

    # Draw a random sample of at most 15 sentences (the whole cluster when it
    # holds fewer than 15); the sample is unseeded, so it differs between runs
    sample_size = num_sentences if num_sentences < 15 else 15
    sample_sentences = random.sample(sentences, sample_size)

    # Record the 1-based cluster number, its size and the joined sample
    summary_record = {
        "Cluster Number": cluster_num + 1,
        "Number of Sentences": num_sentences,
        "Sample Sentences": "; ".join(sample_sentences)
    }
    data.append(summary_record)

# Convert to DataFrame
cluster_summary_df = pd.DataFrame(data)

In [None]:
# Save the per-cluster summary without the index column.
# NOTE(review): output_path is an empty placeholder and must be set before running.
output_path  = "" 

  # Update with your desired output path
cluster_summary_df.to_csv(output_path, index=False)

In [None]:
# Clusters to split further, identified by their 1-based number (the numbering
# printed as "Cluster N" and stored as "Cluster Number" in the summary)
clusters_to_subdivide = [4, 17]


# Function to subdivide clusters and create a new dataframe
def subdivide_clusters_to_new_dataframe(clustered_sentences, cluster_assignment, reduced_embeddings, clusters_to_subdivide):
    """
    Re-cluster selected clusters with HDBSCAN and return all clusters as a DataFrame.

    Parameters:
    - clustered_sentences: List of sentence lists; position i holds the
      sentences of cluster label i.
    - cluster_assignment: Cluster label of every corpus item, in corpus order.
    - reduced_embeddings: Array of embeddings, row-aligned with cluster_assignment.
    - clusters_to_subdivide: 1-based numbers of the clusters to split.

    Returns:
    - DataFrame with columns "cluster_id" and "sentences". Clusters that are
      not subdivided keep their 1-based integer id; sub-clusters get the string
      id "<cluster number>-<sub-cluster label>" and come after them. A selected
      cluster that cannot be split (fewer than 5 points, or HDBSCAN finds only
      noise) is kept whole under its integer id instead of being dropped.

    Notes:
    - Sentences of sub-clusters are read from the module-level
      `preprocessed_corpus`, which must be positionally aligned with
      cluster_assignment.
    - Points labelled as noise inside a cluster that was split successfully
      are not carried over.
    """
    combined_clusters = []  # Clusters left untouched
    new_subclusters = []  # Results for the clusters selected for subdivision

    # Add clusters that are not being subdivided to the final list
    for cluster_id, sentences in enumerate(clustered_sentences):
        if (cluster_id + 1) not in clusters_to_subdivide:  # clusters_to_subdivide is 1-based
            combined_clusters.append({"cluster_id": cluster_id + 1, "sentences": sentences})

    # Subdivide the specified clusters
    for cluster_index in clusters_to_subdivide:
        # Adjust index for 0-based indexing (Python lists)
        cluster_id = cluster_index - 1

        # Positions of the corpus items assigned to the current cluster
        indices = [i for i, cid in enumerate(cluster_assignment) if cid == cluster_id]

        subclusters = []
        if len(indices) >= 5:  # HDBSCAN needs at least a few points
            cluster_embeddings = reduced_embeddings[indices]
            cluster_sentences = [preprocessed_corpus[i] for i in indices]

            # Apply HDBSCAN to subdivide the cluster
            hdbscan_model = HDBSCAN(
                min_cluster_size=5,  # Minimum cluster size
                min_samples=5,        # Minimum samples in a neighborhood for a core point
                metric='euclidean',   # Distance metric
                cluster_selection_epsilon=0.45  # Adjust for fine-grained clustering
                )
            hdbscan_labels = hdbscan_model.fit_predict(cluster_embeddings)

            # Sorted labels give a stable sub-cluster order
            for hdbscan_cluster_id in sorted(set(hdbscan_labels)):
                if hdbscan_cluster_id == -1:  # Skip noise
                    continue
                subclusters.append(
                    {
                        "cluster_id": f"{cluster_index}-{hdbscan_cluster_id}",
                        "sentences": [cluster_sentences[i] for i, label in enumerate(hdbscan_labels) if label == hdbscan_cluster_id],
                    }
                )

        if subclusters:
            new_subclusters.extend(subclusters)
        elif 0 <= cluster_id < len(clustered_sentences):
            # Could not be split: keep the original cluster rather than lose
            # its sentences
            new_subclusters.append(
                {"cluster_id": cluster_index, "sentences": clustered_sentences[cluster_id]}
            )

    # Append subdivided clusters to the remaining clusters
    combined_clusters.extend(new_subclusters)

    # Convert the combined clusters into a dataframe
    new_cluster_df = pd.DataFrame(combined_clusters)
    return new_cluster_df

# Split the clusters listed in clusters_to_subdivide and collect every cluster
# (untouched ones and new sub-clusters) in one DataFrame
new_cluster_df = subdivide_clusters_to_new_dataframe(
    clustered_sentences, cluster_assignment, reduced_embeddings, clusters_to_subdivide
)

# Display the new dataframe
print(new_cluster_df)

In [None]:
# Save the clusters obtained after subdivision, without the index column
output_path  = "C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/4_clustering/1_outcomes/extract_clusters_subdivide.csv" 

  # Destination of the subdivided clusters; update with your desired output path
new_cluster_df.to_csv(output_path, index=False)

In [None]:
def suppress_and_merge_clusters(cluster_df, clusters_to_suppress, clusters_to_merge):
    """
    Suppress and merge clusters in a DataFrame.

    Parameters:
    - cluster_df: DataFrame containing cluster information, with a
      "cluster_id" column and a "sentences" column holding lists.
    - clusters_to_suppress: List of cluster IDs to suppress.
    - clusters_to_merge: Dictionary where keys are clusters to keep, and values are lists of clusters to merge into them.

    Returns:
    - New DataFrame with a fresh 0..n-1 index in which the suppressed clusters
      are gone and every merged cluster's sentences have been appended to its
      target. The input DataFrame is left unchanged.

    Notes:
    - When a target cluster is absent (never existed or was suppressed), its
      source clusters are kept as they are, so no sentence is lost.
    - A target that lists itself among its sources is ignored as a source.
    """
    # Explicit copy: the cell assignment below must never write through to,
    # or warn about, the caller's DataFrame
    suppressed_df = cluster_df[~cluster_df["cluster_id"].isin(clusters_to_suppress)].copy()

    # Merge clusters
    for target_cluster, clusters_to_merge_into in clusters_to_merge.items():
        target_row = suppressed_df[suppressed_df["cluster_id"] == target_cluster]
        if target_row.empty:
            # Nothing to merge into: leave the source clusters in place
            continue

        # A cluster cannot be merged into itself
        source_clusters = [c for c in clusters_to_merge_into if c != target_cluster]

        # Collect the sentences of every source cluster that is present
        sentences_to_merge = []
        for merge_cluster in source_clusters:
            merge_rows = suppressed_df[suppressed_df["cluster_id"] == merge_cluster]
            if not merge_rows.empty:
                sentences_to_merge.extend(merge_rows.iloc[0]["sentences"])

        # Append sentences to the target cluster (a new list is built, so the
        # list object stored in the input DataFrame is not mutated)
        target_row_index = target_row.index[0]
        suppressed_df.at[target_row_index, "sentences"] = (
            suppressed_df.at[target_row_index, "sentences"] + sentences_to_merge
        )

        # Remove merged clusters
        suppressed_df = suppressed_df[~suppressed_df["cluster_id"].isin(source_clusters)].copy()

    # Reset the index for a clean DataFrame
    return suppressed_df.reset_index(drop=True)


# Clusters to remove entirely. Integer ids are clusters that were not
# subdivided; string ids use the "<cluster>-<sub-cluster>" form.
clusters_to_suppress = [
    1, 8,"17-1", "29-3", "86-2"
]

# Clusters to merge: each key is the cluster that is kept, each value lists
# the clusters whose sentences are appended to it
clusters_to_merge = {
    18: [56],
    82: ["4-2","4-4","54-6","61-2","89-0","44-1"]
}

# Apply suppression and merging
cleaned_cluster_df = suppress_and_merge_clusters(new_cluster_df, clusters_to_suppress, clusters_to_merge)

# Display the updated DataFrame
print(cleaned_cluster_df)

In [None]:
def remove_duplicate_sentences(cluster_df):
    """
    Removes duplicate sentences within each row of the DataFrame.

    The first occurrence of every sentence is kept and the original order is
    preserved, so the result is identical from one run to the next.

    Parameters:
    - cluster_df: DataFrame containing cluster information with a 'sentences' column.

    Returns:
    - The same DataFrame object, with its 'sentences' column replaced by
      de-duplicated lists.
    """
    # dict.fromkeys keeps insertion order, unlike set() whose order for
    # strings can change between interpreter runs
    cluster_df["sentences"] = cluster_df["sentences"].apply(lambda x: list(dict.fromkeys(x)))
    return cluster_df


# De-duplicate the sentences of every cluster; the function updates the
# DataFrame it receives and returns it
cleaned_cluster_df = remove_duplicate_sentences(cleaned_cluster_df)

# Display the updated DataFrame
print(cleaned_cluster_df)

In [None]:
def assign_cluster_names(cluster_df, cluster_name_mapping):
    """
    Assign cluster names based on the provided mapping.

    Parameters:
    - cluster_df: DataFrame containing clusters, with a "cluster_id" column.
    - cluster_name_mapping: Dictionary mapping cluster_id to cluster names.

    Returns:
    - Copy of the DataFrame with a new "Cluster Name" column; ids missing from
      the mapping are named "Unnamed Cluster". The input is not modified.
    """
    cluster_df = cluster_df.copy()  # Avoid modifying the original DataFrame

    # Map and fill in one assignment. Calling fillna(inplace=True) on the
    # selected column is chained assignment, which does not update the
    # DataFrame when pandas Copy-on-Write is active.
    cluster_df["Cluster Name"] = (
        cluster_df["cluster_id"].map(cluster_name_mapping).fillna("Unnamed Cluster")
    )

    return cluster_df


# Cluster id -> human-readable name. Only two entries are listed here (the
# original note says the mapping was shortened); every id not listed ends up
# named "Unnamed Cluster".
cluster_name_mapping = {
    18: "Access, Level and Quality of Service",
    78: "Accessibility"
}

# Apply the function to assign names
named_cluster_df = assign_cluster_names(cleaned_cluster_df, cluster_name_mapping)

# Display the updated DataFrame
print(named_cluster_df)

In [None]:
# Save the named clusters without the index column
output_path  = "C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/4_clustering/1_outcomes/1_extract_clusters_named.csv" 

  # Destination of the named clusters; update with your desired output path
named_cluster_df.to_csv(output_path, index=False)

## Meta clustering

In [None]:
# Inputs for the meta clustering:
# - updated_df_raw: raw policy/cluster/factor table (not used again in the visible cells)
# - updated_df: clustered policies and factors; its 'matched_cluster' column is used below
# - cluster_df: cluster table (semicolon-separated) providing 'Cluster Name' and 'Agg Cluster'
updated_df_raw = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/5_final_db/policy_cluster_factor_raw.csv"  )
updated_df = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/5_final_db/3_policy_and_factors_clustered.csv"  )
cluster_df = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/4_clustering/2_modif_cluster.csv", sep=";" )

In [None]:
# Keep only the rows that were matched to a cluster
updated_df = updated_df.dropna(subset='matched_cluster')

# Attach the aggregated cluster of every matched cluster (left join on the cluster name)
updated_df = updated_df.merge(
    cluster_df[['Cluster Name', 'Agg Cluster']],
    how='left',
    left_on='matched_cluster',
    right_on='Cluster Name',
)

# Count the matched rows per (aggregated cluster, cluster name) pair
grouped = updated_df.groupby(["Agg Cluster", "Cluster Name"])["matched_cluster"].count()

In [None]:
# Save the per-cluster counts for the radial tree visual
output_path = "C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/6_visuals/radial_tree.csv" 

# reset_index() turns the two group keys into ordinary columns before writing
grouped.reset_index().to_csv(output_path, index=False)