# Sufficiency index

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

## Sufficiency index calculation

In [None]:
# Load the clustered policy/factor table.
df = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/5_final_db/5_policy_and_factors_clustered_prevalence_normalized_final.csv")

# Per-row score: correlation sign x normalized policy similarity x numeric
# correlation (multiplied left to right).
df['correlation_cluster_normalized_global'] = (
    df['Corr Sign']
    * df['policy_similarity_normalized_global']
    * df['CORRELATION_num']
)

# Summarise the score for every (policy cluster, factor cluster, 'Agg ClusterFalse')
# combination: mean, number of non-null scores, and standard deviation
# (pandas' default sample std, which is NaN when a group has a single row).
aggregated_result = (
    df.groupby(['matched_cluster', 'Agg Cluster_factor', 'Agg ClusterFalse'])
    .agg(
        correlation_mean_cluster_factor=pd.NamedAgg(
            column='correlation_cluster_normalized_global', aggfunc='mean'
        ),
        row_count=pd.NamedAgg(
            column='correlation_cluster_normalized_global', aggfunc='count'
        ),
        correlation_std_dev=pd.NamedAgg(
            column='correlation_cluster_normalized_global', aggfunc='std'
        ),
    )
    .reset_index()
)

In [None]:
# Continue with the aggregated table. This rebinding is not a copy: the
# 'New_indice_1' column created below is also added to `aggregated_result`.
df = aggregated_result

# Factor clusters whose mean correlation is sign-flipped in Step A1.
invert_factors = [
    "Built Environment and Land-Use",
    "Energy and Resource Demand",
    "Environmental Impact and Externalities",
    "GHG and Particulate Emission",
    "Mobility Demand",
    "Private Vehicle Demand",
]

# Step A1: New_indice_1 is the mean correlation, negated for the factor
# clusters listed in invert_factors and left unchanged for the others.
needs_inversion = df['Agg Cluster_factor'].isin(invert_factors)
df['New_indice_1'] = df['correlation_mean_cluster_factor'].where(
    ~needs_inversion, -df['correlation_mean_cluster_factor']
)

# Step A2: rescale New_indice_1 into New_indice_2, separately for each
# Agg Cluster_factor, using that factor's own min and max.
min_max = (
    df.groupby('Agg Cluster_factor')['New_indice_1']
    .agg(min_value='min', max_value='max')
    .reset_index()
)
df = df.merge(min_max, on='Agg Cluster_factor', how='left')

# The bounds are scaled (min x 1.15, max x 1.1) before the min-max formula.
# NOTE(review): this looks intended to pad the range so that rescaled values
# stay strictly inside (0, 1); that only holds when a factor's min is negative
# and its max is positive - confirm this is true for the data.
padded_min = 1.15 * df['min_value']
padded_max = 1.1 * df['max_value']
df['New_indice_2'] = (df['New_indice_1'] - padded_min) / (padded_max - padded_min)

# Step B: Compute the indices
# Two families of Agg Cluster_factor values are defined here; each family is
# later filtered with .isin(...) to build its own pair of per-cluster indices.

# Factors feeding ECOLO_indice_1 / ECOLO_indice_2: every entry of
# invert_factors plus "Active Travel Demand and Public Transport Use".
ecolo_factors = [
    "Active Travel Demand and Public Transport Use",
    "Built Environment and Land-Use",
    "Energy and Resource Demand",
    "Environmental Impact and Externalities",
    "GHG and Particulate Emission",
    "Mobility Demand",
    "Private Vehicle Demand"
]

# Factors feeding SOCIO_indice_1 / SOCIO_indice_2.
socio_factors = [
    "Accessibility and Transport Service",
    "Economic Outcomes, Transport and Housing Affordability",
    "Health and Safety",
    "Social Equity and Equality",
    "Well-Being and Quality of Life"
]

# Geometric mean helper, applied per matched_cluster group further down.
def geometric_mean(series):
    """Return the geometric mean of *series*, computed in log space.

    A constant 1e-9 is added to every value before taking the logarithm so
    that an exact zero does not produce log(0). Values that are still
    non-positive after the shift yield NaN from the logarithm.
    """
    shifted = series + 1e-9
    mean_of_logs = np.mean(np.log(shifted))
    return np.exp(mean_of_logs)

# Row filters shared by the four indices below. Only (cluster, factor) rows
# aggregated from more than one source row are kept.
has_several_rows = df['row_count'] > 1
is_ecolo = df['Agg Cluster_factor'].isin(ecolo_factors)
is_socio = df['Agg Cluster_factor'].isin(socio_factors)
has_indice_1 = df['New_indice_1'].notna()
has_indice_2 = df['New_indice_2'].notna()

# ECOLO_indice_1: arithmetic mean of New_indice_1 over the ecolo factors.
indice_1_ecolo = (
    df[is_ecolo & has_indice_1 & has_several_rows]
    .groupby('matched_cluster')['New_indice_1']
    .mean()
    .reset_index()
    .rename(columns={'New_indice_1': 'ECOLO_indice_1'})
)

# SOCIO_indice_1: arithmetic mean of New_indice_1 over the socio factors.
indice_1_socio = (
    df[is_socio & has_indice_1 & has_several_rows]
    .groupby('matched_cluster')['New_indice_1']
    .mean()
    .reset_index()
    .rename(columns={'New_indice_1': 'SOCIO_indice_1'})
)

# ECOLO_indice_2: geometric mean of the rescaled New_indice_2 over the ecolo factors.
indice_2_ecolo = (
    df[is_ecolo & has_indice_2 & has_several_rows]
    .groupby('matched_cluster')['New_indice_2']
    .apply(geometric_mean)
    .reset_index()
    .rename(columns={'New_indice_2': 'ECOLO_indice_2'})
)

# SOCIO_indice_2: geometric mean of the rescaled New_indice_2 over the socio factors.
indice_2_socio = (
    df[is_socio & has_indice_2 & has_several_rows]
    .groupby('matched_cluster')['New_indice_2']
    .apply(geometric_mean)
    .reset_index()
    .rename(columns={'New_indice_2': 'SOCIO_indice_2'})
)

# Combine the four per-cluster tables into one, keyed on matched_cluster.
# The joins are left joins starting from indice_1_ecolo, so a cluster missing
# from that table does not appear in `result`.
# NOTE(review): confirm that dropping such clusters is intended.
result = indice_1_ecolo
for other_index in (indice_1_socio, indice_2_ecolo, indice_2_socio):
    result = result.merge(other_index, on='matched_cluster', how='left')

## Bubble graph

In [None]:
# Reload the row-level table; `df` was overwritten by the aggregated data above.
df = pd.read_csv("C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/5_final_db/5_policy_and_factors_clustered_prevalence_normalized_final.csv")

# Bubble size: number of non-null 'index' values per matched_cluster, attached
# to `result` as a 'count' column.
cluster_sizes = df.groupby('matched_cluster')['index'].count().reset_index(name='count')
result = result.merge(cluster_sizes, how='left', on='matched_cluster')

In [None]:
# Render all figure text in Times New Roman.
matplotlib.rcParams.update({'font.family': 'Times New Roman'})

# Bubble chart: one bubble per matched_cluster, placed by its two
# geometric-mean indices and sized by the 'count' column.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=result,
    x='ECOLO_indice_2',
    y='SOCIO_indice_2',
    size='count',
    sizes=(100, 1000),  # smallest / largest bubble area
    alpha=0.6,
)

# Point labels are currently switched off: head(0) selects no rows, so this
# loop body never runs. Pass a positive number to head() to label that many
# clusters, truncating names longer than 15 characters.
for _, point in result.head(0).iterrows():
    cluster_name = point['matched_cluster']
    if len(cluster_name) > 15:
        label = cluster_name[:15] + '...'
    else:
        label = cluster_name
    plt.text(
        point['ECOLO_indice_2'],
        point['SOCIO_indice_2'],
        label,
        fontsize=9,
        ha='right',
    )

# Threshold for each index. Despite the `_median` suffix, these are the
# 0.2 quantiles (upper bound of the first quintile), not medians.
ecolo_median = result['ECOLO_indice_2'].quantile(0.2)
socio_median = result['SOCIO_indice_2'].quantile(0.2)

# Draw the two thresholds as dashed reference lines.
plt.axvline(
    x=ecolo_median,
    color='grey',
    linestyle='--',
    label=f'Upper Limit 1st Quintile: {ecolo_median:.2f}',
)
plt.axhline(
    y=socio_median,
    color='black',
    linestyle='--',
    label=f'Lower Limit 1st Quintile: {socio_median:.2f}',
)

# Axis titles: x shows ECOLO_indice_2, y shows SOCIO_indice_2.
plt.xlabel("Upper Limit Index", fontsize=12)
plt.ylabel("Lower Limit Index", fontsize=12)

# Put the legend outside the axes, to the right, with extra spacing between entries.
plt.legend(
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
    labelspacing=0.9,
)

# Light dashed grid behind the bubbles.
plt.grid(True, linestyle='--', alpha=0.7)

# Fit everything inside the figure and display it.
plt.tight_layout()
plt.show()

## Export data

In [None]:
# Destination of the per-cluster index table (indices plus bubble sizes).
# Change this path to write the file somewhere else.
output_path = "C:/Users/easycash/Mon Drive/Thèse/1_Systematic mapping/6_structural_topic_model/6_visuals/bubble_graph.csv"

# Write `result` as CSV without the DataFrame's row index.
result.to_csv(path_or_buf=output_path, index=False)

## Select per quantile

In [None]:
# Select the clusters falling in one quadrant of the bubble graph: here,
# ECOLO_indice_2 at or below its threshold (ecolo_median) and SOCIO_indice_2
# above its threshold (socio_median). Edit the two comparisons to pick
# another quadrant.
low_ecolo = result['ECOLO_indice_2'] <= ecolo_median
high_socio = result['SOCIO_indice_2'] > socio_median
selected_clusters = result.loc[low_ecolo & high_socio, 'matched_cluster'].values

# Backward-compatible alias for code that still reads the old name.
# NOTE: this rebinding hides the builtin `list` for the rest of the session;
# prefer `selected_clusters` in new code and drop this line once nothing
# refers to the old name.
list = selected_clusters