## Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import json 

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
categorized_questions_file = '../categorized_questions.jsonl'

# read categorized questions
categorized_questions = []
with open(categorized_questions_file, 'r') as f:
    for line in f:
        categorized_questions.append(json.loads(line))

# check if common_questions are same as categorized_questions
categorized_questions_list = [list(q.values())[0] for q in categorized_questions]

# create a dictionary to map questions to their categories
question_to_category = {q['Question']: q['Category'] for q in categorized_questions}

In [4]:
# all unique categories in question_to_category
unique_categories = sorted(set(question_to_category.values()))
print(f"Lenght of unique categories: {len(unique_categories)}")
print(unique_categories)

Lenght of unique categories: 15
['A. Social values and attitudes', 'B. Religion and spirituality', 'C. Science and technology', 'D. Politics and policy', 'E. Demographics', 'G. International affairs', 'I. Gender and LGBTQ', 'J. News habits and media', 'K. Immigration and migration', 'L. Family and relationships', 'M. Race and ethnicity', 'N. Economy and work', 'O. Regions and countries', 'P. Methodological research', 'Q. Security']


## Load Dataset

#### Load of the CSV Data
- CSV Data: CSV Files for 17 countries
- Dollarstreet data is excluded because it is across more countries.

In [5]:
from scipy.spatial import distance
import ast

# List all files in the directory
directory = "/home/vsl333/cultural_values/notebooks/outputs"
country_image_file_list = ['cvqa_wvs_metadata_llava-v1.6-vicuna-13b_True_results.csv', 'cvqa_wvs_metadata_llava-v1.6-vicuna-13b_False_results.csv',
                            # 'cvqa_wvs_metadata_llava-v1.6-34b_True_results.csv', 'cvqa_wvs_metadata_llava-v1.6-34b_False_results.csv']
                            'cvqa_wvs_metadata_llava-next-72b-hf_True_results.csv', 'cvqa_wvs_metadata_llava-next-72b-hf_False_results.csv',]
print(country_image_file_list)

['cvqa_wvs_metadata_llava-v1.6-vicuna-13b_True_results.csv', 'cvqa_wvs_metadata_llava-v1.6-vicuna-13b_False_results.csv', 'cvqa_wvs_metadata_llava-v1.6-34b_True_results.csv', 'cvqa_wvs_metadata_llava-v1.6-34b_False_results.csv']


### Map question to thieir question category

#### Create a combioned dataframe for both country and image level data
- Remove all rows where sum <98
- All 4 dataframes should have data for same country-image pairs
- Concatenate image and text dataframes
- 'Image' colun will be True for Image level data and False for Country level data
- Compute JSD and Similarity for each question per countriy (across all categories)in 'jsd' and 'similarity' columns
- JSD: Jensen Shannon Divergence, Similarity is computed as 1 - JSD
- 'coutry_mean_similarity' and 'country_mean_jsd' columns have mean similarity and JSD values for each country

In [6]:
results = []
all_incorrect_df = pd.DataFrame()
for idx, each_file in enumerate(country_image_file_list):
    
    # 'True' => Country was in the prompt, 'False' => Country was not in the prompt, only image was used
    has_image = 'True' if 'False' in each_file else 'False' if 'True' in each_file else print("Missing: Neither 'True' nor 'False' is present in the file name")
    modelsize = '13b' if '13b' in each_file else '34b' if '34b' in each_file else '72b' if '72b' in each_file else print("Missing: Model size is not present in the file name")
    
    print(f"Processing file: {each_file}", f"Has Image: {has_image}", f"Model Size: {modelsize}")
    each_data = pd.read_csv(os.path.join(directory, each_file))
    data = each_data.copy()

    # Add category column to the DataFrame
    breakpoint()
    # data['img_category'] = data['image_path'].apply(lambda x: x.split('/')[-1].split('.')[0].split('_')[1:-2])
    data['img_category'] = data['image_path'].apply(lambda x: x.split('/')[-2:-1][0])

    # Example gt and pred (list of lists)
    gt = [ast.literal_eval(x) for x in data['selection_answers'].tolist()] # there are prob distrbutions of human answers
    pred = [ast.literal_eval(x) for x in data['prob_percent_values'].tolist()] # there are prob distrbutions of model answers

    # Original DataFrame, retaining all columns from `data`
    df = data.copy()

    # Keep track of matching indices
    matching_indices = [i for i, (g, p) in enumerate(zip(gt, pred)) if len(g) == len(p)]

    # Filter rows where len(g) == len(p)
    filtered_gt_pred = [(g, p) for g, p in zip(gt, pred) if len(g) == len(p)]

    # If there are any matching rows
    if filtered_gt_pred:
        gt_filtered, pred_filtered = zip(*filtered_gt_pred)

        # Calculate JSD
        jsd = [distance.jensenshannon(g, p) for g, p in zip(gt_filtered, pred_filtered)]
        similarity = [1 - value for value in jsd]

        try:
            # Add JSD and similarity values back to the DataFrame
            df.loc[matching_indices, 'jsd'] = jsd
            df.loc[matching_indices, 'similarity'] = similarity
        except Exception as e:
            print(f"Error processing JSD and similarity for indices {matching_indices}: {e}")
            breakpoint()
    else:
        df['jsd'] = None  # or handle the case accordingly
        df['similarity'] = None

    # Set JSD and similarity to NaN or a placeholder for rows that were filtered out
    df['jsd'] = pd.to_numeric(df['jsd'], errors='coerce')
    df['similarity'] = pd.to_numeric(df['similarity'], errors='coerce')

    df['Image'] = 'False' if has_image == 'False' else 'True' if has_image == 'True' else print("Something missing! Image is not True or False")
    df['model_size'] = modelsize

    # Calculate mean of jsd and similarity per country per 'Image' column and add new col 'mean_jsd' and 'mean_similarity'
    df['country_mean_jsd'] = df.groupby(['country', 'Image'])['jsd'].transform('mean')
    df['coutry_mean_similarity'] = df.groupby(['country', 'Image'])['similarity'].transform('mean')
        
    # Append results to list
    results.append(df)

# Combine all results into a single DataFrame
final_results = pd.concat(results)
print(f"Pre filtered shape: {final_results.shape}")



Processing file: cvqa_wvs_metadata_llava-v1.6-vicuna-13b_True_results.csv Has Image: False Model Size: 13b
Processing file: cvqa_wvs_metadata_llava-v1.6-vicuna-13b_False_results.csv Has Image: True Model Size: 13b
Processing file: cvqa_wvs_metadata_llava-v1.6-34b_True_results.csv Has Image: False Model Size: 34b
Processing file: cvqa_wvs_metadata_llava-v1.6-34b_False_results.csv Has Image: True Model Size: 34b
Pre filtered shape: (60756, 25)


In [7]:
# drop any rows with NaN values
final_results = final_results.dropna()
print(f"Post filtered shape: {final_results.shape}")

Post filtered shape: (60440, 25)


In [8]:
# show all rows where jsd is NaN, display selection_answers and prob_percent_values
final_results[final_results['jsd'].isnull()][['selection_answers', 'prob_percent_values', 'question_text', 'country']]

Unnamed: 0,selection_answers,prob_percent_values,question_text,country


In [9]:
# show all rows in final_result where sum_prob_percent_sorted < 99
mnp = final_results[final_results['sum_prob_percent_sorted'] < 99][['image_path', 'prompt', 'sum_prob_percent_sorted', 'Image', 'model_size', 'img_category', 'jsd', 'similarity']]
print(mnp[['Image', 'model_size']].value_counts())

pd.set_option('display.max_colwidth', 300)
mnp.groupby(['Image', 'model_size','image_path']).count()


Series([], Name: count, dtype: int64)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,prompt,sum_prob_percent_sorted,img_category,jsd,similarity
Image,model_size,image_path,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


#### Filter rows where sum_prob_percent_sorted <99


In [10]:
print(f"Size of the total data: {final_results.shape}")
# Filter rows where sum_prob_percent_sorted <99
filtered_final_result = final_results[final_results['sum_prob_percent_sorted'] > 99]
print(f"Size of the filtered data: {filtered_final_result.shape}")

Size of the total data: (60440, 25)
Size of the filtered data: (60440, 25)


### Add 'question_topic' to both dataframes
- 'question_topic' is the column which has each question mapped to a broad topic
- Broad topics are: {'B. Religion and spirituality', 'E. Demographics', 'A. Social values and attitudes', 'D. Politics and policy', 'O. Regions and countries', 'J. News habits and media', 'G. International affairs', 'M. Race and ethnicity', 'N. Economy and work', 'P. Methodological research', 'C. Science and technology', 'K. Immigration and migration', 'L. Family and relationships', 'Q. Security', 'I. Gender and LGBTQ'}

Country Order

In [11]:
filtered_final_result['question_topic'] = filtered_final_result['question_text'].map(question_to_category)
filtered_final_result.columns

Index(['img_id', 'image_path', 'country', 'image_code', 'income',
       'question_text', 'country_prompt', 'generic_prompt', 'option_labels',
       'full_options', 'prompt', 'options', 'top10_token_prob',
       'prob_percent_sorted', 'sum_prob_percent_sorted', 'prob_percent_keys',
       'prob_percent_values', 'selection_answers', 'img_category', 'jsd',
       'similarity', 'Image', 'model_size', 'country_mean_jsd',
       'coutry_mean_similarity', 'question_topic'],
      dtype='object')

In [12]:
# count rows where nan values are present
filtered_final_result.isna().sum()

img_id                     0
image_path                 0
country                    0
image_code                 0
income                     0
question_text              0
country_prompt             0
generic_prompt             0
option_labels              0
full_options               0
prompt                     0
options                    0
top10_token_prob           0
prob_percent_sorted        0
sum_prob_percent_sorted    0
prob_percent_keys          0
prob_percent_values        0
selection_answers          0
img_category               0
jsd                        0
similarity                 0
Image                      0
model_size                 0
country_mean_jsd           0
coutry_mean_similarity     0
question_topic             0
dtype: int64

In [13]:
rich_countries = ["United States", "France", "South Korea", "Italy"]
medium_countries = ["Brazil", "Mexico", "China"]
poor_countries = ["Pakistan", "Nigeria", "Vietnam"]

country_list = rich_countries + medium_countries + poor_countries

#### Separate by  question categories

In [14]:
unique_countries = filtered_final_result['country'].unique()
unique_countries

array(['Brazil', 'China', 'France', 'Italy', 'Mexico', 'Nigeria',
       'Pakistan', 'South Korea', 'United States', 'Vietnam'],
      dtype=object)

In [15]:

results_country_order = [country for country in country_list if country in unique_countries]
results_country_order

['United States',
 'France',
 'South Korea',
 'Italy',
 'Brazil',
 'Mexico',
 'China',
 'Pakistan',
 'Nigeria',
 'Vietnam']

### Compute Marginal Distribution

Func for marginal distribution calculation

In [16]:
def average_distributions(distributions):
    # Ensure all distributions are lists (not strings)
    distributions = [ast.literal_eval(dist) if isinstance(dist, str) else dist for dist in distributions]
    
    # Ensure all distributions are valid lists of numbers
    valid_distributions = [np.array(dist, dtype=float) for dist in distributions if isinstance(dist, list)]

    # Calculate the average distribution
    average_dist =  np.mean(np.array(valid_distributions), axis=0).tolist()
    
    return average_dist

# General function to calculate marginal distribution and merge it back into the DataFrame
def calculate_and_merge_marginal_distribution(df, groupby_columns, target_column, new_column_name):
    """
    Generalized function to calculate marginal distribution, reset index, and merge back to the original DataFrame.
    """
    marginal_df = (
        df.groupby(groupby_columns)[target_column]
        .apply(average_distributions)
        .reset_index(name=new_column_name)
    )
    return df.merge(marginal_df, on=groupby_columns, how='left')

Func to compute jsd and 1-jsd for the marginalized distributions

In [17]:
from scipy.spatial.distance import jensenshannon

# Function to calculate Jensen-Shannon Divergence (JSD)
def calculate_jsd(p, q):
    if len(p) == 0 or len(q) == 0:  # Check for empty distributions
        return None  # or return a specific value indicating no valid distributions
    return jensenshannon(p, q)

Code to compute jsd and similarity (1-jsd) for 
- each question topic
- marginal distribution distribution

In [18]:
# for each question_topic, calculate mean of jsd and similarity per country and save as different df 
# save all dfs in a dictionary

# Calculate mean of 'jsd' per 'question_topic', Image'. This seems useless
filtered_final_result['question_topic_mean_jsd'] = filtered_final_result.groupby(['question_topic', 'Image', 'model_size'])['jsd'].transform('mean')
# Calculate mean of 'similarity' per 'question_topic','Image'. Thiss eems useless

filtered_final_result['question_topic_mean_similarity'] = filtered_final_result.groupby(['question_topic', 'Image', 'model_size'])['similarity'].transform('mean')
# Calculate mean of 'question_topic_mean_similarity' per country
filtered_final_result['question_topic_country_mean_similarity'] = filtered_final_result.groupby(['question_topic', 'country', 'Image', 'model_size'])['similarity'].transform('mean')

filtered_final_result = calculate_and_merge_marginal_distribution(
    filtered_final_result, ['question_topic', 'question_text', 'Image', 'model_size'], 'prob_percent_values', 'md_topic_pred'
)

filtered_final_result = calculate_and_merge_marginal_distribution(
    filtered_final_result, ['question_topic', 'question_text', 'Image', 'model_size'], 'selection_answers', 'md_topic_gt'
)

# Step 3 and 4: Calculate marginalized model prediction and ground truth across all images per question only
filtered_final_result = calculate_and_merge_marginal_distribution(
    filtered_final_result, ['question_text', 'Image', 'model_size'], 'prob_percent_values', 'md_all_pred'
)

filtered_final_result = calculate_and_merge_marginal_distribution(
    filtered_final_result, ['question_text', 'Image', 'model_size'], 'selection_answers', 'md_all_gt'
)

# Calculate JSD for each question topic
filtered_final_result['md_jsd_topic'] = filtered_final_result.apply(
    lambda row: calculate_jsd(row['md_topic_pred'], row['md_topic_gt']), axis=1
)
filtered_final_result['md_sim_topic'] = 1 - filtered_final_result['md_jsd_topic']

# Calculate JSD for overall predictions and ground truth
filtered_final_result['md_jsd_overall'] = filtered_final_result.apply(
    lambda row: calculate_jsd(row['md_all_pred'], row['md_all_gt']), axis=1
)
filtered_final_result['md_sim_overall'] = 1 - filtered_final_result['md_jsd_overall']

# print column names
filtered_final_result.columns

Index(['img_id', 'image_path', 'country', 'image_code', 'income',
       'question_text', 'country_prompt', 'generic_prompt', 'option_labels',
       'full_options', 'prompt', 'options', 'top10_token_prob',
       'prob_percent_sorted', 'sum_prob_percent_sorted', 'prob_percent_keys',
       'prob_percent_values', 'selection_answers', 'img_category', 'jsd',
       'similarity', 'Image', 'model_size', 'country_mean_jsd',
       'coutry_mean_similarity', 'question_topic', 'question_topic_mean_jsd',
       'question_topic_mean_similarity',
       'question_topic_country_mean_similarity', 'md_topic_pred',
       'md_topic_gt', 'md_all_pred', 'md_all_gt', 'md_jsd_topic',
       'md_sim_topic', 'md_jsd_overall', 'md_sim_overall'],
      dtype='object')

In [19]:
filtered_final_result.shape

(60440, 37)

In [20]:
# # for each question_topic, calculate mean of jsd and similarity per country and save as different df 
# # save all dfs in a dictionary

# import shutil
# question_outputs_dir = 'cvqa_analysis/cvqa_ques_category'

# # delete directory if already exist
# if os.path.exists(question_outputs_dir):
#     shutil.rmtree(question_outputs_dir)
#     print(f"Existed! Deleted {question_outputs_dir} directory")

# # create directory
# os.makedirs(question_outputs_dir)
# print(f"Created {question_outputs_dir} directory")  

# # save final_result as csv
# filtered_final_result.to_csv(f"{question_outputs_dir}/all_category_results.csv", index=False)

# df_category_similarity = {}

# for category in filtered_final_result['question_topic'].unique():
#     df_category = filtered_final_result[filtered_final_result['question_topic'] == category]

#     # calculate mean of jsd and similarity per country
#     df_category['question_topic_mean_jsd'] = df_category.groupby(['country', 'Image'])['jsd'].transform('mean')
#     df_category['question_topic_mean_similarity'] = df_category.groupby(['country', 'Image'])['similarity'].transform('mean')

#     df_category_similarity[category] = df_category

#     df_category_similarity[category].to_csv(f"{question_outputs_dir}/{category}_results.csv", index=False)


### Save final df per (question)topic category

Directory to save csv files

In [21]:
import shutil
from tqdm import tqdm

question_topic_ouput_dir = 'topic_csv'

# delete directory if already exist. Include subdirectories
if os.path.exists(question_topic_ouput_dir):
    shutil.rmtree(question_topic_ouput_dir)
    print(f"Existed! Deleted {question_topic_ouput_dir} directory")

# create directory
os.makedirs(question_topic_ouput_dir)
print(f"Question Topic Level Data will be saved at: {question_topic_ouput_dir}")  

Existed! Deleted topic_csv directory
Question Topic Level Data will be saved at: topic_csv


In [22]:
unique_img_categories = filtered_final_result['img_category'].unique()
print(unique_img_categories)

['People_and_everyday_life' 'sports_and_recreation'
 'public_figure_and_pop_culture' 'Objects' 'Brands' 'Geography'
 'Cooking_and_food' 'tradition']


Save csv files

In [23]:

filtered_final_result.to_csv(f"{question_topic_ouput_dir}/all_results.csv", index=False)
print(f"Saved data: {question_topic_ouput_dir}/all_results.csv")

# Create separate DataFrames per question topic and save them
df_category_similarity = {}
question_topics = sorted(filtered_final_result['question_topic'].unique())

for topic in tqdm(question_topics):
    df_question_topic = filtered_final_result[filtered_final_result['question_topic'] == topic]
    df_category_similarity[topic] = df_question_topic
    df_question_topic.to_csv(f"{question_topic_ouput_dir}/{topic}.csv", index=False)
    print(f"Saved data: {question_topic_ouput_dir}/{topic}.csv")

print(f"Question Topic Level Data saved at: {question_topic_ouput_dir}")


Saved data: topic_csv/all_results.csv


  0%|          | 0/15 [00:00<?, ?it/s]

Saved data: topic_csv/A. Social values and attitudes.csv


 13%|█▎        | 2/15 [00:00<00:04,  2.78it/s]

Saved data: topic_csv/B. Religion and spirituality.csv


 20%|██        | 3/15 [00:01<00:05,  2.26it/s]

Saved data: topic_csv/C. Science and technology.csv


 33%|███▎      | 5/15 [00:04<00:09,  1.07it/s]

Saved data: topic_csv/D. Politics and policy.csv
Saved data: topic_csv/E. Demographics.csv


 40%|████      | 6/15 [00:04<00:06,  1.35it/s]

Saved data: topic_csv/G. International affairs.csv


 47%|████▋     | 7/15 [00:05<00:04,  1.62it/s]

Saved data: topic_csv/I. Gender and LGBTQ.csv


 53%|█████▎    | 8/15 [00:05<00:03,  1.95it/s]

Saved data: topic_csv/J. News habits and media.csv


 60%|██████    | 9/15 [00:05<00:03,  1.94it/s]

Saved data: topic_csv/K. Immigration and migration.csv


 73%|███████▎  | 11/15 [00:07<00:02,  1.60it/s]

Saved data: topic_csv/L. Family and relationships.csv
Saved data: topic_csv/M. Race and ethnicity.csv


 80%|████████  | 12/15 [00:08<00:02,  1.21it/s]

Saved data: topic_csv/N. Economy and work.csv


 87%|████████▋ | 13/15 [00:10<00:01,  1.04it/s]

Saved data: topic_csv/O. Regions and countries.csv


 93%|█████████▎| 14/15 [00:16<00:02,  2.70s/it]

Saved data: topic_csv/P. Methodological research.csv


100%|██████████| 15/15 [00:17<00:00,  1.16s/it]

Saved data: topic_csv/Q. Security.csv
Question Topic Level Data saved at: topic_csv





In [24]:
df_category_similarity.keys()

dict_keys(['A. Social values and attitudes', 'B. Religion and spirituality', 'C. Science and technology', 'D. Politics and policy', 'E. Demographics', 'G. International affairs', 'I. Gender and LGBTQ', 'J. News habits and media', 'K. Immigration and migration', 'L. Family and relationships', 'M. Race and ethnicity', 'N. Economy and work', 'O. Regions and countries', 'P. Methodological research', 'Q. Security'])

In [25]:
so that notebook breaks here if you run from top

SyntaxError: invalid syntax (3735556707.py, line 1)

### Performing Bootstrapping for Statistical Significance

In [26]:
import numpy as np
import pandas as pd

# Ensure 'similarity' is numeric
filtered_final_result['similarity'] = pd.to_numeric(filtered_final_result['similarity'], errors='coerce')
# Drop rows with NaN similarity scores
filtered_final_result = filtered_final_result.dropna(subset=['similarity'])


### Create a function to perform bootstrapping on your similarity scores.

In [None]:
# def bootstrap_mean(data, n_bootstrap=10000):
#     n = len(data)
#     bootstrap_means = np.empty(n_bootstrap)
#     for i in range(n_bootstrap):
#         sample = np.random.choice(data, size=n, replace=True)
#         bootstrap_means[i] = np.mean(sample)
#     return bootstrap_means


### Calculate Confidence Intervals and P-values

Perform bootstrapping for each group (e.g., per question_topic, Image, model_size) to compute confidence intervals and p-values.

In [None]:
# bootstrap_results = []

# # Group by 'question_topic', 'Image', 'model_size'
# grouped = filtered_final_result.groupby(['question_topic', 'Image', 'model_size'])

# for name, group in grouped:
#     similarity_scores = group['similarity'].values
    
#     # Skip groups with insufficient data
#     if len(similarity_scores) < 5:
#         continue
    
#     # Perform bootstrapping
#     bootstrap_means = bootstrap_mean(similarity_scores)
    
#     # Calculate observed mean
#     observed_mean = np.mean(similarity_scores)
    
#     # Calculate 95% confidence interval
#     ci_lower = np.percentile(bootstrap_means, 2.5)
#     ci_upper = np.percentile(bootstrap_means, 97.5)
    
#     # Perform hypothesis testing (e.g., test if mean similarity > 0.5)
#     p_value = np.mean(bootstrap_means <= 0.5)
    
#     # Store results
#     bootstrap_results.append({
#         'question_topic': name[0],
#         'Image': name[1],
#         'model_size': name[2],
#         'observed_mean_similarity': observed_mean,
#         'ci_lower': ci_lower,
#         'ci_upper': ci_upper,
#         'p_value': p_value
#     })

# # Convert results to DataFrame
# bootstrap_df = pd.DataFrame(bootstrap_results)

# # Display results
# print(bootstrap_df)

### Comparing Groups Using Hypothesis Testing


In [None]:
import numpy as np
import pandas as pd
from decimal import Decimal, getcontext

# Set precision for Decimal operations
getcontext().prec = 20  # Adjust precision as needed

def bootstrap_mean_diff(data1, data2, n_bootstrap=100000):
    observed_diff = np.mean(data1) - np.mean(data2)
    combined = np.concatenate([data1, data2])
    n1 = len(data1)
    mean_diffs = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        np.random.shuffle(combined)
        sample1 = combined[:n1]
        sample2 = combined[n1:]
        mean_diffs[i] = np.mean(sample1) - np.mean(sample2)
    extreme_count = np.sum(np.abs(mean_diffs) >= np.abs(observed_diff))
    # Convert extreme_count to standard Python int
    p_value = Decimal(int(extreme_count)) / Decimal(n_bootstrap)
    return observed_diff, p_value

bootstrap_results = []

grouped = filtered_final_result.groupby(['question_topic', 'model_size'])

for name, group in grouped:
    group_true = group[group['Image'] == 'True']['similarity'].values
    group_false = group[group['Image'] == 'False']['similarity'].values
    if len(group_true) < 5 or len(group_false) < 5:
        continue
    observed_diff, p_value = bootstrap_mean_diff(group_true, group_false, n_bootstrap=1000000)
    bootstrap_results.append({
        'question_topic': name[0],
        'model_size': name[1],
        # 'observed_mean_diff': observed_diff,
        'p_value': float(p_value)
    })

bootstrap_df = pd.DataFrame(bootstrap_results)

# Adjust display format to show p-values with higher precision
pd.set_option('display.float_format', '{:.10e}'.format)

print(bootstrap_df)

In [None]:
# Adjust display format to show p-values with higher precision
pd.set_option('display.float_format', '{:.10e}'.format)

print(bootstrap_df)

In [None]:
Break again here

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Get the unique categories
categories = df_category_similarity.keys()

# Determine the number of subplots needed
num_categories = len(categories)
num_cols = 2
num_rows = (num_categories + 1) // num_cols

# Create a figure with subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, num_rows * 4))

# Flatten the axes array for easy iteration
axes = axes.flatten()

# Initialize variables for the legend
legend_handles = None
legend_labels = None

# Iterate through each category and create a subplot
for idx, category in enumerate(categories):
    ax = axes[idx]
    
    # Get the dataframe for the current category
    cat_df = df_category_similarity[category]
    
    # Filter and sort the dataframe
    cat_df = cat_df[['country', 'question_topic_mean_jsd', 'question_topic_mean_similarity', 'Image']].drop_duplicates()
    cat_df = cat_df.sort_values(['country', 'Image'])
    
    # Filter cat_df to only include countries in results_country_order
    cat_df = cat_df[cat_df['country'].isin(results_country_order)]
    
    # If cat_df is empty, skip this plot
    if cat_df.empty:
        # Remove the axis
        fig.delaxes(ax)
        continue
    
    # Set 'country' as a categorical variable with ordered categories
    cat_df['country'] = pd.Categorical(cat_df['country'], categories=results_country_order, ordered=True)
    
    # Sort cat_df by 'country' to ensure correct order
    cat_df = cat_df.sort_values('country')
    
    # Create a bar plot
    sns.barplot(
        x='country',
        y='question_topic_mean_similarity',
        hue='Image',
        data=cat_df,
        palette='viridis',
        ax=ax
    )
    
    # Collect legend handles and labels from the first subplot
    if legend_handles is None:
        handles, labels = ax.get_legend_handles_labels()
        new_labels = ['Only Country, No Image' if label == 'False' else 'Only Image, No Country' for label in labels]
        legend_handles = handles
        legend_labels = new_labels
    
    # Remove legend from the subplot
    ax.get_legend().remove()
    
    # Add labels and title
    ax.set_xlabel('Country')
    ax.set_ylabel('Question Topic Mean Similarity')
    ax.set_title(f'{category}', fontsize=10)
    
    # Rotate x-axis labels for better readability
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    
# Remove any unused subplots
for idx in range(num_categories, len(axes)):
    fig.delaxes(axes[idx])

# Add a single legend to the figure
fig.legend(legend_handles, legend_labels, title='Image', fontsize=8, loc='upper right')

# Adjust layout with vertical space between subplots
plt.tight_layout()
plt.subplots_adjust(hspace=1, right=0.85)  # Adjust 'right' to make space for the legend
plt.show()


#### Sort countries in order of income (High Income to Low Income)

In [None]:
# rich_countries = ["Russia", "Romania", "France", "Spain", "South Korea"]
# medium_countries = ["Brazil", "Indonesia", "Mexico", "Philippines", "Mongolia", "China", "Colombia"]
# poor_countries = ["Ethiopia", "Nigeria", "Egypt", "Kenya", "Pakistan"]


# country_order= rich_countries + medium_countries + poor_countries 
# print(len(country_order))
# # sort final result by country_order and put it in a new dataframe
# final_result_sorted = final_result.set_index('country').loc[country_order].reset_index()
# final_result_sorted

### Set the color palette for Countries Mapped to Income Level

In [None]:
country_mapping = {}
for country in rich_countries:
    country_mapping[country] = 'rich'
for country in medium_countries:
    country_mapping[country] = 'medium'
for country in poor_countries:
    country_mapping[country] = 'poor'

# Define a color palette for the categories
# Define a color palette for the categories
category_palette = {
    'rich': 'darkgreen',
    'medium': 'lightgreen',
    'poor': 'lightcoral'
}

# Map each country to its respective category
final_result_sorted['country_group'] = final_result_sorted['country'].map(country_mapping)
# Map colors based on the category_group
final_result_sorted['income_color'] = final_result_sorted['country_group'].map(category_palette)
final_result_sorted


### Set the color palette for Countries Mapped to Region

In [None]:
country_regions = {"Russia": "Europe", "Romania": "Europe", "France": "Europe", "Spain": "Europe", "South Korea": "Asia",
                "Brazil": "South America", "Indonesia": "Asia", "Mexico": "North America", "Philippines": "Asia",
                "Mongolia": "Asia", "China": "Asia", "Colombia": "South America", "Ethiopia": "Africa",
                "Nigeria": "Africa", "Egypt": "Africa", "Kenya": "Africa", "Pakistan": "Asia"}

final_result_sorted['region'] = final_result_sorted['country'].map(country_regions)

region_palette = {
    'Europe': 'yellowgreen',
    'Asia': 'orangered',
    'Africa': 'darkseagreen',
    'North America': 'turquoise',
    'South America': 'paleturquoise'
}

final_result_sorted['region_color'] = final_result_sorted['region'].map(region_palette)
final_result_sorted

## Plot the Heatmap for Similarity
- This is plotted for each country (across all categories)
- Similarity = 1 - JSD
- Origin: Mean Similarity across all countries
- Distance Per Country = Similarity Score Per Country - Origin (Mean Similarity Score) 
- Higher Similarity Distance = Closer to Human Distriubtion

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 8))

# Calculate the mean similarity value
origin = final_result_sorted['similarity'].mean()

# Sort data based on similarity
final_result_sorted = final_result_sorted.sort_values(by='similarity', ascending=False)

# Calculate deviation from the mean (center the bars at the mean similarity)
final_result_sorted['similarity_centered'] = final_result_sorted['similarity'] - origin

mapped_dict = dict(zip(final_result_sorted['country'], final_result_sorted['income_color']))

# Plot vertical bar chart centered on the mean similarity
bars = sns.barplot(
    x='country',
    y='similarity_centered',
    data=final_result_sorted,
    palette=mapped_dict
)

# Add a horizontal line for the centered mean (origin)
# plt.axhline(0, color='blue', linestyle='--', label=f'Mean similarity = {origin:.2f}')

# Set the y-axis limits to highlight the range
plt.ylim(-0.085, 0.075)

# Remove x-axis labels
plt.xticks([])

# Add labels and title
plt.ylabel('Similarity Distance Score - Income Wise')
plt.xlabel('')  # Optionally, remove x-axis label as well
plt.title('Similarity Distance Score For Each Country')

# Display legend about color mapping. Top right corner. Label: income, color: color
# Display in a bo
# Create custom legend handles
legend_handles = [plt.Line2D([0], [0], color=color, lw=4) for color in category_palette.values()]

plt.legend(
    handles=legend_handles,
    labels=[f'{key}' for key, value in category_palette.items()],
    title='Income',
    loc='upper right',
    fontsize=8
)

# Annotate bars with the country names
for bar, country in zip(bars.patches, final_result_sorted['country']):
    height = bar.get_height()
    # Place text slightly above the bar
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        height + (0.001 if height >= 0 else -0.001),  # Adjust position based on height
        country,
        ha='center',
        va='bottom' if height >= 0 else 'top',  # Adjust alignment based on direction
        fontsize=8,
        rotation=90  # Rotate text for better readability
    )

plt.tight_layout()
plt.subplots_adjust(bottom=0.2)
# Show the plot
plt.show()


### Plot of Similarity Heatmap for each country (across all categories) - Region Wise

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 8))

# Calculate the mean similarity value
origin = final_result_sorted['similarity'].mean()

# Sort data based on similarity
final_result_sorted = final_result_sorted.sort_values(by='similarity', ascending=False)

# Calculate deviation from the mean (center the bars at the mean similarity)
final_result_sorted['similarity_centered'] = final_result_sorted['similarity'] - origin

mapped_dict = dict(zip(final_result_sorted['country'], final_result_sorted['region_color']))

# Plot vertical bar chart centered on the mean similarity
bars = sns.barplot(
    x='country',
    y='similarity_centered',
    data=final_result_sorted,
    palette=mapped_dict
)

# Add a horizontal line for the centered mean (origin)
# plt.axhline(0, color='blue', linestyle='--', label=f'Mean similarity = {origin:.2f}')

# Set the y-axis limits to highlight the range
plt.ylim(-0.085, 0.075)

# Remove x-axis labels
plt.xticks([])

# Add labels and title
plt.ylabel('Similarity Distance Score: Region Wise')
plt.xlabel('')  # Optionally, remove x-axis label as well
plt.title('Similarity Distance Score For Each Country')

# Display legend with color mapping. Top right corner. Label: region, color: color
# Display in a box
# Create custom legend handles
legend_handles = [plt.Line2D([0], [0], color=color, lw=4) for color in region_palette.values()]

plt.legend(
    handles=legend_handles,
    labels=[f'{key}' for key, value in region_palette.items()],
    title='Region',
    loc='upper right',
    fontsize=8
)

# Annotate bars with the country names
for bar, country in zip(bars.patches, final_result_sorted['country']):
    height = bar.get_height()
    # Place text slightly above the bar
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        height + (0.001 if height >= 0 else -0.001),  # Adjust position based on height
        country,
        ha='center',
        va='bottom' if height >= 0 else 'top',  # Adjust alignment based on direction
        fontsize=8,
        rotation=90  # Rotate text for better readability
    )

plt.tight_layout()
plt.subplots_adjust(bottom=0.2)
# Show the plot
plt.show()


### Plot For Normalized Similarity Distance

In [None]:
## Plot Normalzed Similarity Heatmap for each country

# import numpy as np
sim_values = final_result_sorted['similarity'].to_list()
min_sim = np.min(sim_values)
max_sim = np.max(sim_values)
rescaled_sim = [(sim - min_sim) / (max_sim - min_sim) for sim in sim_values]

final_result_sorted['rescaled_similarity'] = rescaled_sim

#---------------------------------------------------------------------------------
plt.figure(figsize=(10, 6))


# Sort data based on rescaled imilarity
final_result_sorted = final_result_sorted.sort_values(by='rescaled_similarity', ascending=False)

# Map colors based on the category_group
# mapped_palette = final_result_sorted['country_group'].map(final_result_sorted['income_color'])
mapped_dict = dict(zip(final_result_sorted['country'], final_result_sorted['income_color']))

# Plot vertical bar chart centered on the mean similarity
bars = sns.barplot(
    x='country',
    y='rescaled_similarity',
    data=final_result_sorted,
    palette=mapped_dict
)

# Add a horizontal line for the centered mean (origin)
plt.axhline(0.5, color='blue', linestyle='--', label=f'Normalized Similarity')

# Set the y-axis limits to highlight the range
plt.ylim(0, 1.2)

# Remove x-axis labels
plt.xticks([])

# Add labels and title
plt.ylabel('Normalized Similarity Distance Score - Income Wise')
plt.xlabel('')  # Optionally, remove x-axis label as well
plt.title('Normalized Similarity Distance Score For Each Country')

# Display legend about color mapping. Top right corner. Label: income, color: color
# Display in a bo
# Create custom legend handles
legend_handles = [plt.Line2D([0], [0], color=color, lw=4) for color in category_palette.values()]

plt.legend(
    handles=legend_handles,
    labels=[f'{key}' for key, value in category_palette.items()],
    title='Income',
    loc='upper right',
    fontsize=8
)

# Annotate bars with the country names
for bar, country in zip(bars.patches, final_result_sorted['country']):
    height = bar.get_height()
    # Place text slightly above the bar
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        height + (0.001 if height >= 0 else -0.001),  # Adjust position based on height
        country,
        ha='center',
        va='bottom' if height >= 0 else 'top',  # Adjust alignment based on direction
        fontsize=8,
        rotation=90  # Rotate text for better readability
    )

plt.tight_layout()
plt.subplots_adjust(bottom=0.2)
# Show the plot
plt.show()


### Plot for Normalized Similarity Distance region wise

In [None]:
## Plot Normalzed Similarity Heatmap for each country

# import numpy as np
sim_values = final_result_sorted['similarity'].to_list()
min_sim = np.min(sim_values)
max_sim = np.max(sim_values)
rescaled_sim = [(sim - min_sim) / (max_sim - min_sim) for sim in sim_values]

final_result_sorted['rescaled_similarity'] = rescaled_sim

#---------------------------------------------------------------------------------
plt.figure(figsize=(10, 6))


# Sort data based on rescaled imilarity
final_result_sorted = final_result_sorted.sort_values(by='rescaled_similarity', ascending=False)

# Map colors based on the category_group
# mapped_palette = final_result_sorted['country_group'].map(final_result_sorted['region_color'])
mapped_dict = dict(zip(final_result_sorted['country'], final_result_sorted['region_color']))

# Plot vertical bar chart centered on the mean similarity
bars = sns.barplot(
    x='country',
    y='rescaled_similarity',
    data=final_result_sorted,
    palette=mapped_dict
)

# Add a horizontal line for the centered mean (origin)
plt.axhline(0.5, color='blue', linestyle='--', label=f'Normalized Similarity: 0.5')

# Set the y-axis limits to highlight the range
plt.ylim(0, 1.2)

# Remove x-axis labels
plt.xticks([])

# Add labels and title
plt.ylabel('Normalized Similarity Distance Score - Region Wise')
plt.xlabel('')  # Optionally, remove x-axis label as well
plt.title('Normalized Similarity Distance Score For Each Country')

# Display legend
# Display legend about color mapping. Top right corner. Label: region, color: color
# Display in a bo
# Create custom legend handles
legend_handles = [plt.Line2D([0], [0], color=color, lw=4) for color in region_palette.values()]

plt.legend(
    handles=legend_handles,
    labels=[f'{key}' for key, value in region_palette.items()],
    title='Region',
    loc='upper right',
    fontsize=8
)

# Annotate bars with the country names
for bar, country in zip(bars.patches, final_result_sorted['country']):
    height = bar.get_height()
    # Place text slightly above the bar
    plt.text(
        bar.get_x() + bar.get_width() / 2,
        height + (0.001 if height >= 0 else -0.001),  # Adjust position based on height
        country,
        ha='center',
        va='bottom' if height >= 0 else 'top',  # Adjust alignment based on direction
        fontsize=8,
        rotation=90  # Rotate text for better readability
    )

plt.tight_layout()
plt.subplots_adjust(bottom=0.2)
# Show the plot
plt.show()


---------------------------------------

## Computer JSD, SIM Per Category Per Countries

In [None]:
results = []

for f in files:
    data = pd.read_csv(os.path.join(directory, f))

    # Add category column to the DataFrame
    data['category'] = data['image_path'].apply(lambda x: x.split('/')[-1].split('.')[0].split('_')[1:-2])
    data['category'] = data['category'].apply(lambda x: '_'.join(x))

    # Example gt and pred (list of lists)
    gt = [ast.literal_eval(x) for x in data['selection_answers'].tolist()]
    pred = [ast.literal_eval(x) for x in data['prob_percent_values'].tolist()]

    # Original DataFrame, retaining all columns from `data`
    df = data.copy()

    # Keep track of matching indices
    matching_indices = [i for i, (g, p) in enumerate(zip(gt, pred)) if len(g) == len(p)]

    # Filter rows where len(g) == len(p)
    filtered_gt_pred = [(g, p) for g, p in zip(gt, pred) if len(g) == len(p)]

    # If there are any matching rows
    if filtered_gt_pred:
        gt_filtered, pred_filtered = zip(*filtered_gt_pred)

        # Calculate JSD
        jsd = [distance.jensenshannon(g, p) for g, p in zip(gt_filtered, pred_filtered)]
        similarity = [1 - value for value in jsd]

        # Add JSD and similarity values back to the DataFrame
        df.loc[matching_indices, 'jsd'] = jsd
        df.loc[matching_indices, 'similarity'] = similarity
    else:
        df['jsd'] = None  # or handle the case accordingly
        df['similarity'] = None

    # Set JSD and similarity to NaN or a placeholder for rows that were filtered out
    df['jsd'] = pd.to_numeric(df['jsd'], errors='coerce')
    df['similarity'] = pd.to_numeric(df['similarity'], errors='coerce')

    # Calculate mean of jsd and similarity per country and per category
    mean_per_country_category = df.groupby(['country', 'category'])[['jsd', 'similarity']].mean().reset_index()

    # Append results to list
    results.append(mean_per_country_category)

# Combine all results into a single DataFrame
final_result = pd.concat(results, ignore_index=True)

In [None]:
final_result_sorted = final_result.set_index('country').loc[country_order].reset_index()
# Create a new column in the DataFrame for country category
final_result_sorted['category_group'] = final_result_sorted['country'].map(country_mapping)
final_result_sorted['category_color'] = final_result_sorted['category_group'].map(category_palette)

In [None]:
final_result_sorted['region'] = final_result_sorted['country'].map(country_regions)
final_result_sorted['region_color'] = final_result_sorted['region'].map(region_palette)

## Plot the Heatmap for Similarity for each category

In [None]:
income_dir = "income_category"
if not os.path.exists(income_dir):
    os.makedirs(income_dir)

# Plot similarity per category for all countries and save each plot
categories = final_result_sorted['category'].unique()

for category in categories:
    plt.figure(figsize=(12, 6))

    # Filter data for the current category (include all countries under this category)
    category_data = final_result_sorted[final_result_sorted['category'] == category]

    # # Drop NaNs in the similarity column
    # category_data = category_data.dropna(subset=['similarity'])

    # Sort data based on similarity for better visualization
    category_data = category_data.sort_values(by='similarity', ascending=False)

    # Calculate the mean similarity value for the current category across all countries
    origin = category_data['similarity'].mean()
    # print(f"origin: {origin}")

    # Calculate deviation from the mean (center the bars at the mean similarity)
    category_data['similarity_centered'] = category_data['similarity'] - origin

    # Map colors based on the category_group
    mapped_palette = category_data['category_group'].map(category_palette)

    # Ensure no missing values in the mapped palette
    if mapped_palette.isnull().any():
        print("Warning: Some category groups are not mapped to colors.")
        print(category_data[mapped_palette.isnull()])

    mapped_dict = dict(zip(category_data['country'], mapped_palette))
    # Plot vertical bar chart centered on the mean similarity
    bars = sns.barplot(
        x='country',
        y='similarity_centered',
        data=category_data,
        palette=mapped_dict
    )

    # Add a horizontal line for the centered mean (origin)
    plt.axhline(0, color='blue', linestyle='--', label=f'Mean similarity = {origin:.2f}')

    # Set the y-axis limits to highlight the range
    plt.ylim(-0.065, 0.065)

    # Remove x-axis labels
    plt.xticks([])

    # Add labels and title
    plt.ylabel('Similarity Deviation from Mean')
    plt.xlabel('Country')
    plt.title(f'Countries by Deviation in Similarity from the Mean for Category: {category}')

    # Display legend
    # Display legend about color mapping. Top right corner. Label: income, color: color
    # Display in a box
    # Create custom legend handles
    legend_handles = [plt.Line2D([0], [0], color=color, lw=4) for color in category_palette.values()]

    plt.legend(
        handles=legend_handles,
        labels=[f'{key}' for key, value in category_palette.items()],
        title='Income',
        loc='upper right',
        fontsize=8
    )

    # Annotate bars with the country names
    for bar, country in zip(bars.patches, category_data['country']):
        height = bar.get_height()
        # Place text slightly above the bar
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            height + (0.01 if height >= 0 else -0.01),  # Adjust position based on height
            country,
            ha='center',
            va='bottom' if height >= 0 else 'top',  # Adjust alignment based on direction
            fontsize=6,
            rotation=90  # Rotate text for better readability
        )
    # Save the plot as a file
    plt.savefig(f'{income_dir}/{category}.png', bbox_inches='tight')
    print(f"Saved category: {category}")

    # Close the plot to free memory
    plt.close()

### Plot of Similarity Heatmap for each category (across all countries) - Region Wise

In [None]:
region_dir = "regions_categories"
if not os.path.exists(region_dir):
    os.makedirs(region_dir)

# Plot similarity per category for all countries and save each plot
for category in categories:
    plt.figure(figsize=(12, 6))

    # Filter data for the current category (include all countries under this category)
    category_data = final_result_sorted[final_result_sorted['category'] == category]

    # # Drop NaNs in the similarity column
    # category_data = category_data.dropna(subset=['similarity'])

    # Sort data based on similarity for better visualization
    category_data = category_data.sort_values(by='similarity', ascending=False)

    # Calculate the mean similarity value for the current category across all countries
    origin = category_data['similarity'].mean()
    # print(f"origin: {origin}")

    # Calculate deviation from the mean (center the bars at the mean similarity)
    category_data['similarity_centered'] = category_data['similarity'] - origin

    # Map colors based on the category_group
    mapped_palette = category_data['region_color']

    # Ensure no missing values in the mapped palette
    if mapped_palette.isnull().any():
        print("Warning: Some category groups are not mapped to colors.")
        print(category_data[mapped_palette.isnull()])

    mapped_dict = dict(zip(category_data['country'], mapped_palette))
    # Plot vertical bar chart centered on the mean similarity
    bars = sns.barplot(
        x='country',
        y='similarity_centered',
        data=category_data,
        palette=mapped_dict
    )

    # Add a horizontal line for the centered mean (origin)
    # plt.axhline(0, color='blue', linestyle='--', label=f'Mean similarity = {origin:.2f}')

    # Set the y-axis limits to highlight the range
    plt.ylim(-0.065, 0.065)

    # Remove x-axis labels
    plt.xticks([])

    # Add labels and title
    plt.ylabel('Similarity Deviation from Mean')
    plt.xlabel('Country')
    plt.title(f'Countries by Deviation in Similarity from the Mean for Category: {category}')

    # Display legend
    # Display legend about color mapping. Top right corner. Label: region, color: color
    # Display in a box
    # Create custom legend handles
    legend_handles = [plt.Line2D([0], [0], color=color, lw=4) for color in region_palette.values()]

    plt.legend(
        handles=legend_handles,
        labels=[f'{key}' for key, value in region_palette.items()],
        title='Region',
        loc='upper right',
        fontsize=8
    )

    # Annotate bars with the country names
    for bar, country in zip(bars.patches, category_data['country']):
        height = bar.get_height()
        # Place text slightly above the bar
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            height + (0.01 if height >= 0 else -0.01),  # Adjust position based on height
            country,
            ha='center',
            va='bottom' if height >= 0 else 'top',  # Adjust alignment based on direction
            fontsize=6,
            rotation=90  # Rotate text for better readability  
        )

    # Save the plot as a file
    plt.savefig(f'{region_dir}/similarity_per_category_{category}.png', bbox_inches='tight')
    print(f"Saved category: {category}")

    # Close the plot to free memory
    plt.close()

-----------------------------------------------------------------------