In [None]:
import os
import pandas as pd
from tqdm import tqdm

Read files


In [None]:
# read files from output directory
output_dir = '../output_allimgs'
output_files = os.listdir(output_dir)
output_files 

In [None]:
from datasets import load_dataset

go_dataset = load_dataset("Anthropic/llm_global_opinions")

# load as a pandas DataFrame and keep only the rows where 'source' == 'WVS'
go_dataset_df = go_dataset['train'].to_pandas()
go_dataset_wvs = go_dataset_df[go_dataset_df['source'] == 'WVS']
go_dataset_wvs.head()

In [None]:
import shutil
# Select which pair of runs to compare. The flags are checked in order, so
# `img_comparison` wins if both are True.
img_comparison = False
img_country_comparison = True

if img_comparison:
    setting_1 = 'Image True, Country False'
    setting_2 = 'Image False, Country False'
    file_1 = os.path.join(output_dir, 'llava_img_True_country_False.csv')
    file_2 = os.path.join(output_dir, 'llava_img_False_country_False.csv')
    image_dir = 'income_comparison_images_true_country_false'
elif img_country_comparison:
    setting_1 = 'Image True, Country True'
    setting_2 = 'Image False, Country True'
    file_1 = os.path.join(output_dir, 'llava_img_True_country_True.csv')
    file_2 = os.path.join(output_dir, 'llava_img_False_country_True.csv')
    image_dir = 'income_comparison_images_true_country_true'
else:
    # With both flags off none of setting_*/file_*/image_dir would be bound and
    # the next statement would fail with an unhelpful NameError; fail fast instead.
    raise ValueError(
        "No comparison selected: set `img_comparison` or `img_country_comparison` to True."
    )

# delete all files in the directory if exists, then recreate it empty
if os.path.exists(image_dir):
    shutil.rmtree(image_dir)
os.makedirs(image_dir)

df_1 = pd.read_csv(file_1)  # image true
df_2 = pd.read_csv(file_2)  # image false

In [None]:
import ast

def convert_to_list(x):
    """Parse a stringified Python literal and return its contents as a list.

    Args:
        x: String holding a Python literal (e.g. a dict, list or tuple),
            parsed with ``ast.literal_eval``.

    Returns:
        list: The dict's values (in insertion order) when the literal is a
        dict, otherwise the literal's elements.
    """
    parsed = ast.literal_eval(x)
    if isinstance(parsed, dict):
        return list(parsed.values())
    return list(parsed)

def get_distribution_list(df, options_prob_col, wvs_distribution_col):
    """Add list-valued versions of two stringified distribution columns to ``df``.

    Every cell of ``options_prob_col`` and ``wvs_distribution_col`` is parsed
    with ``convert_to_list``; the results are stored in the new columns
    ``'options_prob_list'`` and ``'wvs_distribution_list'``.

    Args:
        df: DataFrame to extend. It is modified in place.
        options_prob_col: Name of the column holding the stringified option
            probabilities.
        wvs_distribution_col: Name of the column holding the stringified
            reference distribution.

    Returns:
        The same ``df`` object, with the two list columns added.
    """
    # Assign plain lists so values are placed positionally. Wrapping them in a
    # fresh ``pd.Series`` (as was done for the first column only) gives them a
    # 0..n-1 index that pandas aligns on labels, which misplaces rows or yields
    # NaN whenever ``df`` carries any other index (e.g. after filtering).
    df['options_prob_list'] = [convert_to_list(val) for val in df[options_prob_col]]
    df['wvs_distribution_list'] = [convert_to_list(val) for val in df[wvs_distribution_col]]

    return df

# Assuming df_1 and df_2 are already defined DataFrames
df_1 = get_distribution_list(df_1, 'options_prob', 'wvs_distribution')
df_2 = get_distribution_list(df_2, 'options_prob', 'wvs_distribution')

In [None]:
df_1.shape, df_2.shape

In [None]:
wvs_questions = go_dataset_wvs['question'].unique()
model_questions = df_1['question'].unique()

common_questions = set(wvs_questions).intersection(set(model_questions))
len(common_questions)

# For every unique question, look up its "options" value in go_dataset_wvs and add it to df_aggregated
def add_options_to_df(questions, df_aggregated, go_dataset_wvs):
    """Fill an ``'options'`` column in ``df_aggregated`` from the reference table.

    For each entry of ``questions``, the ``'options'`` value of the first
    matching row in ``go_dataset_wvs`` is written to every row of
    ``df_aggregated`` that has the same ``'question'``.

    Args:
        questions: Iterable of question values to process.
        df_aggregated: DataFrame with a ``'question'`` column; modified in place.
        go_dataset_wvs: Reference DataFrame with ``'question'`` and
            ``'options'`` columns.

    Returns:
        The same ``df_aggregated`` object.

    Raises:
        IndexError: If a question has no row in ``go_dataset_wvs``.
    """
    for current_question in questions:
        reference_options = go_dataset_wvs.loc[
            go_dataset_wvs['question'] == current_question, 'options'
        ]
        # The first reference row supplies the value for this question.
        question_options = reference_options.values[0]

        target_rows = df_aggregated['question'] == current_question
        df_aggregated.loc[target_rows, 'options'] = question_options
    return df_aggregated

df_aggregated_1 = add_options_to_df(common_questions, df_1, go_dataset_wvs)
df_aggregated_2 = add_options_to_df(common_questions, df_2, go_dataset_wvs)

In [None]:
df_aggregated_1.shape, df_aggregated_2.shape 

#### Get income from dollarstreet data

In [None]:
from main.data_handling import DataLoader

data_loader = DataLoader("Anthropic/llm_global_opinions", "../data/dollarstreet/images_v2.csv")
dollarstreet_data = data_loader.get_dollarstreet_data()

# get 'id' and 'income' columns from dollarstreet_data
dollarstreet_id_income = dollarstreet_data[['id', 'income']]
dollarstreet_id_income.head()

#### Add income to df_aggregated_1 and df_aggregated_2 as a new column

In [None]:
# add progress_apply to show progress bar
def add_income_to_df(df_aggregated, dollarstreet_id_income):
    """Attach an ``'income'`` column to ``df_aggregated`` by looking up ``image_id``.

    Args:
        df_aggregated: DataFrame with an ``'image_id'`` column; modified in place.
        dollarstreet_id_income: DataFrame with ``'id'`` and ``'income'``
            columns. When an id occurs more than once, its first row is used.

    Returns:
        The same ``df_aggregated`` object, with ``'income'`` added.

    Raises:
        KeyError: If any ``image_id`` has no matching ``'id'`` row.
    """
    # Build the id -> income lookup once instead of scanning the whole income
    # table for every row. keep='first' matches the "first match" behaviour of
    # indexing the filtered rows with .values[0].
    income_by_id = (
        dollarstreet_id_income
        .drop_duplicates(subset='id', keep='first')
        .set_index('id')['income']
    )

    # Unknown ids must still fail loudly rather than silently become NaN.
    known = df_aggregated['image_id'].isin(income_by_id.index)
    missing_ids = df_aggregated.loc[~known, 'image_id'].unique()
    if missing_ids.size > 0:
        raise KeyError(
            f"{missing_ids.size} image_id value(s) have no income entry, "
            f"e.g. {list(missing_ids[:5])}"
        )

    df_aggregated['income'] = df_aggregated['image_id'].map(income_by_id)
    return df_aggregated

df_aggregated_1 = add_income_to_df(df_aggregated_1, dollarstreet_id_income)
df_aggregated_2 = add_income_to_df(df_aggregated_2, dollarstreet_id_income)

In [None]:
df_aggregated_1.shape, df_aggregated_2.shape

In [None]:
df_aggregated_1.head(2)

In [None]:
df_aggregated_2.head(2)

In [None]:
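# NOTE(review): this whole cell is commented out, yet later cells still read
# `joint_dict_country_df1` / `joint_dict_country_df2`, which are only assigned
# on the last line of this cell. On a fresh kernel ("Restart & Run All") those
# later cells therefore fail with a NameError.
# # Initialize the new column with empty dictionaries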

# def create_joint_dict(country, df, questions, use_image=False): # TODO: use_image to be used later
#     """ This function returns a dict which has questions as keys and distribution for that question as values"""
#     joint_dict = {}
#     # for each country all the unique images_ids
#     unique_image_ids = df[df['country'] == country]['image_id'].unique() 
#     all_image_ids = unique_image_ids if use_image else unique_image_ids[:1]
#     # for each image_id get the questions
#     for question in questions:
#         for image_id in all_image_ids:
#             joint_dict[country][image_id] =[
#                                                 [
#                                                 question, 
#                                                 image_id,
#                                                 df[(df['question'] == question) & (df['country'] == country) & (df['image_id'] == image_id)]['image_id'].values[0], 
#                                                 df[(df['question'] == question) & (df['country'] == country) & (df['image_id'] == image_id)]['options_prob_list'].values[0],
#                                                 df[(df['question'] == question) & (df['country'] == country) & (df['image_id'] == image_id)]['wvs_distribution_list'].values[0],
#                                                 df[(df['question'] == question) & (df['country'] == country) & (df['image_id'] == image_id)]['options'].values[0],
#                                                 df[(df['question'] == question) & (df['country'] == country) & (df['image_id'] == image_id)]['img_path'].values[0],
#                                                 df[(df['question'] == question) & (df['country'] == country) & (df['image_id'] == image_id)]['income'].values[0]
#                                                 ]
#                                             ]   
#     return joint_dict


# def create_joint_dict_all_countries(df1, df2):
#     countries = df1['country'].unique()
#     joint_dict_df1 = []
#     joint_dict_df2 = []
#     for country in tqdm(countries):
#         questions = df_1[df_1['country'] == country]['question'].unique()
#         joint_dict_df1.append(create_joint_dict(country, df1, questions, use_image=True))
#         joint_dict_df2.append(create_joint_dict(country, df2, questions, use_image=False))

#     return joint_dict_df1, joint_dict_df2

# joint_dict_country_df1, joint_dict_country_df2 = create_joint_dict_all_countries(df_aggregated_1, df_aggregated_2)

In [None]:
df_aggregated_1.head(2)

In [None]:
# calculate the JSD between the options_prob_list and wvs_distribution_list columns
from scipy.spatial.distance import jensenshannon

def calculate_jsd(options_prob_list, wvs_distribution_list, base=None):
    """Return the Jensen-Shannon distance between two probability vectors.

    Thin wrapper around ``scipy.spatial.distance.jensenshannon``.

    Args:
        options_prob_list: First distribution (sequence of numbers).
        wvs_distribution_list: Second distribution, of the same length.
        base: Optional logarithm base forwarded to scipy. The default
            (``None``) keeps scipy's default.

    Returns:
        float: The distance, or NaN when either 1-D vector sums to zero.
    """
    import numpy as np  # local import: numpy is not imported before this cell

    p = np.asarray(options_prob_list, dtype=float)
    q = np.asarray(wvs_distribution_list, dtype=float)

    # scipy normalises each vector by its sum, so an all-zero vector becomes
    # 0/0: the result is NaN plus an "invalid value encountered in divide"
    # RuntimeWarning. Return the same NaN explicitly, without the warning.
    for vec in (p, q):
        if vec.ndim == 1 and vec.size > 0 and vec.sum() == 0:
            return float('nan')

    return jensenshannon(p, q, base=base)

# add similarity column to df_aggregated which is 1-JSD
# TODO: How to handle: invalid value encountered in divide q = q / np.sum(q, axis=axis, keepdims=True)

df_aggregated_1['similarity'] = df_aggregated_1.apply(lambda x: 1 - calculate_jsd(x['options_prob_list'], x['wvs_distribution_list']), axis=1)
df_aggregated_2['similarity'] = df_aggregated_2.apply(lambda x: 1 - calculate_jsd(x['options_prob_list'], x['wvs_distribution_list']), axis=1)

In [None]:
# check for country Nigeria, give unique income values
df_aggregated_1[df_aggregated_1['country'] == 'Nigeria']['income'].unique()

In [None]:
# import numpy as np

# # print number of questions for each image_id
# print(df_aggregated_1['image_id'].value_counts().head(2))
# print(df_aggregated_2['image_id'].value_counts().head(2))

# # print number of unique image_id per country
# print(df_aggregated_1['country'].value_counts().tail(2))
# print(df_aggregated_2['country'].value_counts().tail(2))


df_aggregated_1['mean_similarity'] = df_aggregated_1.groupby('image_id')['similarity'].transform('mean')
df_aggregated_2['mean_similarity'] = df_aggregated_2.groupby('image_id')['similarity'].transform('mean')

print(df_aggregated_1.shape, df_aggregated_2.shape)


In [None]:
# sort both frames by image_id, then mean_similarity (both descending)
df_aggregated_1 = df_aggregated_1.sort_values(by=['image_id', 'mean_similarity'], ascending=False)
df_aggregated_2 = df_aggregated_2.sort_values(by=['image_id', 'mean_similarity'], ascending=False)


In [97]:
# plot mean_similarity for each image_id for each country separately.
# each image_id has an income value, so we can plot mean_similarity vs income for each country

# output image dir
if os.path.exists(image_dir):
    shutil.rmtree(image_dir)
os.makedirs(image_dir)

import matplotlib.pyplot as plt

def plot_mean_similarity_vs_income(df_aggregated, image_dir):
    """Save one income-vs-mean-similarity scatter plot per country.

    For every distinct ``'country'`` a figure is created with one marker per
    ``'image_id'``, positioned at that image's ``'income'`` (log-scaled x axis)
    and ``'mean_similarity'``. Both values are read from the first row of each
    image. The figure is written to ``<image_dir>/<country>.png``.

    Args:
        df_aggregated: DataFrame with ``'country'``, ``'image_id'``,
            ``'income'`` and ``'mean_similarity'`` columns.
        image_dir: Existing directory that receives the PNG files.

    Returns:
        None
    """
    countries = df_aggregated['country'].unique()
    for country in tqdm(countries):
        df_country = df_aggregated[df_aggregated['country'] == country]
        # One row per image (the first one), in order of first appearance.
        per_image = df_country.drop_duplicates(subset='image_id', keep='first')

        # all image_ids of a country in same plot
        fig, ax = plt.subplots()
        # One plot call per image, so each marker takes its own colour from
        # the colour cycle.
        for income, mean_similarity in zip(per_image['income'], per_image['mean_similarity']):
            ax.plot(income, mean_similarity, 'o')

        ax.set_xscale('log')
        ax.set_xlim(10, 20000)
        ax.set_xticks([10, 50, 100, 200, 500, 1000, 2000, 3000, 4000, 5000, 10000, 15000, 20000])
        # Plain numbers instead of powers of ten on the log axis.
        ax.get_xaxis().set_major_formatter(plt.ScalarFormatter())
        # Rotate x axis labels 45 degrees
        ax.tick_params(axis='x', labelrotation=45, labelsize=8)
        ax.set_xlabel('Income in USD')
        ax.set_ylabel('Mean Similarity')
        ax.set_title(f'{country}')

        fig.savefig(os.path.join(image_dir, f'{country}.png'))
        # Close explicitly so figures do not pile up in memory across countries.
        plt.close(fig)
    

plot_mean_similarity_vs_income(df_aggregated_1, image_dir)

100%|██████████| 42/42 [00:05<00:00,  7.57it/s]


In [None]:
# Plot the jsd values for each country side by side in bar chart
# make country as y-axis and jsd as x-axis

import matplotlib.pyplot as plt
import numpy as np

def plot_jsd(jsd_country_df1, jsd_country_df2):
    """Show a grouped horizontal bar chart of two per-country value mappings.

    Countries are on the y axis and values on the x axis. The bar legends use
    the module-level ``setting_1`` / ``setting_2`` labels.

    Args:
        jsd_country_df1: Mapping of country -> numeric value for the first
            setting. Its key order defines the bar order.
        jsd_country_df2: Mapping of country -> numeric value for the second
            setting. It must contain every key of ``jsd_country_df1``.

    Returns:
        None

    Raises:
        KeyError: If a country of ``jsd_country_df1`` is missing from
            ``jsd_country_df2``.
    """
    countries = list(jsd_country_df1.keys())
    jsd_values_df1 = [jsd_country_df1[country] for country in countries]
    # Look up the second series by country, not by dict order, so the two bars
    # of a row always belong to the same country.
    jsd_values_df2 = [jsd_country_df2[country] for country in countries]

    x = np.arange(len(countries))
    width = 0.35

    fig, ax = plt.subplots(figsize=(10, 20))
    ax.barh(x + width/2, jsd_values_df1, width, label=setting_1)
    ax.barh(x - width/2, jsd_values_df2, width, label=setting_2)

    # Write each value at the end of its own bar, vertically centred on it.
    for pos, v1, v2 in zip(x, jsd_values_df1, jsd_values_df2):
        ax.text(v1, pos + width/2, str(round(v1, 2)), va='center', color='blue', fontsize=8)
        ax.text(v2, pos - width/2, str(round(v2, 2)), va='center', color='red', fontsize=8)

    ax.set_xlabel('Similarity (1-JSD)')
    ax.set_ylabel('Country')
    ax.set_yticks(x)
    ax.set_yticklabels(countries, fontsize=10)
    ax.legend()
    plt.tight_layout()  # to make sure the labels are not cut off
    plt.show()

plot_jsd(jsd_country_df1, jsd_country_df2)

In [None]:

def plot_jsd_difference(jsd_country_df1, jsd_country_df2):
    """Show a horizontal bar chart of per-country differences (first minus second).

    Bars are ordered from the largest to the smallest difference, and each bar
    is annotated with its country name and value. The x-axis label uses the
    module-level ``setting_1`` / ``setting_2`` labels.

    Args:
        jsd_country_df1: Mapping of country -> numeric value (minuend).
        jsd_country_df2: Mapping of country -> numeric value (subtrahend).
            It must contain every key of ``jsd_country_df1``.

    Returns:
        None
    """
    # Per-country difference, largest first. The sort is stable, so ties keep
    # their original order.
    ranked = [
        (name, jsd_country_df1[name] - jsd_country_df2[name])
        for name in jsd_country_df1
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    countries = [name for name, _ in ranked]
    differences = [delta for _, delta in ranked]
    positions = np.arange(len(countries))

    fig, ax = plt.subplots(figsize=(15, 20))
    bars = ax.barh(positions, differences, color='blue')

    ax.set_xlabel(f'Similarity Difference ({setting_1} - {setting_2})')
    ax.set_ylabel('Country')

    # No y ticks: every bar is labelled directly next to its end instead.
    ax.set_yticks([])

    # Put the vertical axis line at x=0 so it separates positive and negative bars.
    ax.spines['left'].set_position(('data', 0))

    ax.xaxis.grid(True)
    ax.tick_params(axis='y', which='both', left=True, right=False)

    # Annotate each bar at its tip: to the left for negative bars, otherwise to the right.
    for country_name, bar in zip(countries, bars):
        bar_length = bar.get_width()
        y_center = bar.get_y() + bar.get_height() / 2
        points_left = bar_length < 0
        label = (
            f'{country_name} ({bar_length:.3f})'
            if points_left
            else f'({bar_length:.3f}) {country_name}'
        )
        ax.text(bar_length, y_center, label,
                va='center', ha='right' if points_left else 'left',
                color='black', fontsize=12)

    plt.show()

plot_jsd_difference(jsd_country_df1, jsd_country_df2)

In [None]:
a = [0, 0.2, 0.8, 0]
b = [0.1, 0.2, 0.9, 0]
ans = jensenshannon(a, b)
ans

In [None]:
import scipy, copy
import numpy as np
from scipy.spatial.distance import jensenshannon

# for each country, calculate jsd for each question and add the similarity value to the list
def add_jsd_tolist(joint_dict_country_wvs_options):
    """Append a similarity score (1 - Jensen-Shannon distance) to every entry.

    Args:
        joint_dict_country_wvs_options: Iterable of dicts. The first key of
            each dict is taken as the country, and its value is a list of
            per-question lists. The two distributions are read from positions
            1 and 2 of each per-question list.
            NOTE(review): later cells read the similarity at index 6, which
            implies six leading fields. Confirm that positions 1 and 2 really
            hold the distributions.

    Returns:
        The same ``joint_dict_country_wvs_options`` object. Each per-question
        list is extended in place with the similarity as its last element.
    """
    for country_dict in tqdm(joint_dict_country_wvs_options):
        country = list(country_dict.keys())[0]
        for ques_vals in country_dict[country]:
            dist1, dist2 = np.array(ques_vals[1]), np.array(ques_vals[2])
            similarity = 1 - jensenshannon(dist1, dist2)
            ques_vals.append(similarity)

    return joint_dict_country_wvs_options

joint_dict_df1 = copy.deepcopy(joint_dict_country_df1)
joint_dict_df2 = copy.deepcopy(joint_dict_country_df2)

jsd_df1 = add_jsd_tolist(joint_dict_df1)
jsd_df2 = add_jsd_tolist(joint_dict_df2)

In [None]:
# jsd_df1

In [None]:
def _valid_similarities(per_country_dicts, similarity_index=6):
    """Collect the non-NaN similarity values per country.

    ``per_country_dicts`` is an iterable of ``{country: rows}`` dicts. The
    similarity is read from ``row[similarity_index]`` of each row.
    """
    return {
        country: [row[similarity_index] for row in rows if not np.isnan(row[similarity_index])]
        for country_dict in per_country_dicts
        for country, rows in country_dict.items()
    }


def _min_max_mean(values):
    """Return ``[min, max, mean]`` of ``values``, or three NaNs when it is empty.

    ``np.min`` / ``np.max`` raise ValueError on an empty sequence, which happens
    for a country whose similarities are all NaN.
    """
    if len(values) == 0:
        return [np.nan, np.nan, np.nan]
    return [np.min(values), np.max(values), np.mean(values)]


# remove nan values
jsd_values_df1 = _valid_similarities(jsd_df1)
jsd_values_df2 = _valid_similarities(jsd_df2)

# calculate the min, max and mean of the similarity values per country
jsd_min_max_mean_df1 = {country: _min_max_mean(val) for country, val in jsd_values_df1.items()}
jsd_min_max_mean_df2 = {country: _min_max_mean(val) for country, val in jsd_values_df2.items()}

print(jsd_min_max_mean_df1)
print(jsd_min_max_mean_df2)

In [None]:
# split the per-country entries into two groups: similarity <= low_threshold and similarity >= high_threshold
# (entries strictly between the two thresholds are dropped); do this for all countries
low_threshold = 0.3
high_threshold = 0.8

def split_jsd(values):
    """Split every country's entries into low- and high-similarity groups.

    The similarity is read from index 6 of each entry and compared with the
    module-level ``low_threshold`` and ``high_threshold``. Entries strictly
    between the two thresholds, and entries whose similarity is NaN, end up in
    neither group.

    Args:
        values: Iterable of dicts. The first key of each dict is taken as the
            country, and its value is a list of per-question lists.

    Returns:
        tuple: ``(jsd_low_split, jsd_high_split)``, two dicts mapping each
        country to a list of shallow copies of the selected entries.
    """
    similarity_index = 6

    jsd_low_split = {}
    jsd_high_split = {}
    for country_dict in tqdm(values):
        country = list(country_dict.keys())[0]
        jsd_low = []
        jsd_high = []
        for ques_vals in country_dict[country]:
            similarity = ques_vals[similarity_index]
            if similarity <= low_threshold:
                jsd_low.append(ques_vals.copy())
            elif similarity >= high_threshold:
                jsd_high.append(ques_vals.copy())

        jsd_low_split[country] = jsd_low
        jsd_high_split[country] = jsd_high

    return jsd_low_split, jsd_high_split

jsd_low_split_df1, jsd_high_split_df1 = split_jsd(jsd_df1)
jsd_low_split_df2, jsd_high_split_df2 = split_jsd(jsd_df2)

print(f" JSD values <= {low_threshold}: {jsd_low_split_df1}")

print(f"DF1: sum of values <= {low_threshold}: {sum([len(val) for val in jsd_low_split_df1.values()])}")
print(f"DF1: sum of values >= {high_threshold}: {sum([len(val) for val in jsd_high_split_df1.values()])}")

print(f"DF2: sum of values <= {low_threshold}: {sum([len(val) for val in jsd_low_split_df2.values()])}")
print(f"DF2: sum of values >= {high_threshold}: {sum([len(val) for val in jsd_high_split_df2.values()])}")

In [None]:
# jsd_low_split_df1['Iran']

In [None]:
# find the questions that are low-similarity in both df1 and df2
# (the question is the first element of each inner list)

def get_common_questions(df1, df2):
    """Return, per country, the questions present in both inputs.

    Args:
        df1: Dict mapping country -> list of entries. The question is the
            first element of each entry.
        df2: Dict with the same structure. It must contain every country key
            of ``df1``.

    Returns:
        dict: country -> set of questions that occur for that country in both
        ``df1`` and ``df2``.

    Raises:
        KeyError: If a country of ``df1`` is missing from ``df2``.
    """
    common_questions = {}
    for country, entries_df1 in df1.items():
        questions_df1 = {entry[0] for entry in entries_df1}
        questions_df2 = {entry[0] for entry in df2[country]}
        common_questions[country] = questions_df1 & questions_df2
    return common_questions

common_questions = get_common_questions(jsd_low_split_df1, jsd_low_split_df2)
common_questions_count = {key: len(value) for key, value in common_questions.items()}
print(common_questions_count)