In [None]:
import os
import pandas as pd

Read files


In [None]:
# Enumerate whatever the experiment runs have written to the output directory
output_dir = '../output'
output_files = os.listdir(path=output_dir)
output_files 

In [None]:
import shutil
# Choose which pair of runs to compare. Exactly one of the flags is expected
# to be True; `img_comparison` wins when both are set.
img_comparison = True
img_country_comparison = False

if img_comparison:
    file_1 = os.path.join(output_dir, 'llava_img_True_country_False.csv')
    file_2 = os.path.join(output_dir, 'llava_img_False_country_False.csv')
    image_dir = '../jsd_images_true_country_false'
elif img_country_comparison:
    file_1 = os.path.join(output_dir, 'llava_img_True_country_True.csv')
    file_2 = os.path.join(output_dir, 'llava_img_False_country_True.csv')
    image_dir = '../jsd_images_true_country_true'
else:
    # Without this branch the cell failed further down with a NameError on
    # `image_dir`, which hides the real cause.
    raise ValueError('Set img_comparison or img_country_comparison to True')


# Start from an empty image directory: drop stale plots from a previous run,
# then recreate the directory so that figures can be saved into it later.
if os.path.exists(image_dir):
    shutil.rmtree(image_dir)
os.makedirs(image_dir)

df_1 = pd.read_csv(file_1)
df_2 = pd.read_csv(file_2)

In [None]:
import ast

def convert_to_list(x):
    """Parse a Python-literal string and return its contents as a list.

    The string is evaluated with ``ast.literal_eval``. A dict yields the list
    of its values (in insertion order); any other iterable literal is passed
    to ``list``.
    """
    parsed = ast.literal_eval(x)
    if isinstance(parsed, dict):
        return list(parsed.values())
    return list(parsed)

# Parse the stringified `options_prob` and `wvs_distribution` columns of both
# frames into real Python lists, stored in new `*_list` columns.
for frame in (df_1, df_2):
    parsed_probs = [convert_to_list(raw) for raw in frame['options_prob']]
    frame['options_prob_list'] = pd.Series(parsed_probs)

for frame in (df_1, df_2):
    parsed_wvs = [convert_to_list(raw) for raw in frame['wvs_distribution']]
    frame['wvs_distribution_list'] = parsed_wvs

In [None]:
from datasets import load_dataset

go_dataset = load_dataset("Anthropic/llm_global_opinions")

# Convert the train split to pandas and keep only the rows whose source is WVS
go_dataset_df = go_dataset['train'].to_pandas()
is_wvs_row = go_dataset_df['source'] == 'WVS'
go_dataset_wvs = go_dataset_df[is_wvs_row]
go_dataset_wvs.head()

In [None]:
# Initialize the new column with empty dictionaries

def create_joint_dict(questions, df):
    """Collapse per-(question, country) rows into one row per question.

    For every question listed in ``questions`` three dictionaries keyed by
    country are built from the rows of ``df`` that belong to that question:

    * ``country_prob_dict``   -> values of ``options_prob_list``
    * ``country_images_dict`` -> values of ``image_id``
    * ``country_wvs_dict``    -> values of ``wvs_distribution_list``

    Parameters
    ----------
    questions : iterable
        Question texts to aggregate. Questions present in ``df`` but absent
        from this iterable still get a row, with NaN in the dict columns.
    df : pandas.DataFrame
        Must contain the columns ``question``, ``country``,
        ``options_prob_list``, ``image_id`` and ``wvs_distribution_list``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``question``, ``country_prob_dict``, ``country_images_dict``
        and ``country_wvs_dict``; one row per distinct question, carrying the
        index label of that question's first row in ``df``.
    """
    dict_sources = {
        'country_prob_dict': 'options_prob_list',
        'country_images_dict': 'image_id',
        'country_wvs_dict': 'wvs_distribution_list',
    }
    requested = set(questions)

    # One pass over the groups instead of re-filtering the full frame for
    # every question.
    per_question = {}
    for question, group in df.groupby('question', sort=False):
        if question not in requested:
            continue
        per_question[question] = {
            target: dict(zip(group['country'], group[source]))
            for target, source in dict_sources.items()
        }

    # First occurrence of each question, keeping the original index labels.
    df_aggregated = df[['question']].drop_duplicates(subset='question').copy()
    for target in dict_sources:
        df_aggregated[target] = [
            per_question[question][target] if question in per_question else float('nan')
            for question in df_aggregated['question']
        ]

    print(df_aggregated.shape)
    return df_aggregated
    
# Both runs must cover exactly the same set of questions. Comparing the sets
# checks the actual question texts; the previous `.unique().all()` comparison
# reduced each array to a single value and did not compare the contents.
assert df_1['question'].nunique() == df_2['question'].nunique()
assert set(df_1['question'].unique()) == set(df_2['question'].unique())

questions = df_1['question'].unique()
df_aggregated_1 = create_joint_dict(questions, df_1)
df_aggregated_2 = create_joint_dict(questions, df_2)

In [None]:
wvs_questions = go_dataset_wvs['question'].unique()
model_questions = df_aggregated_1['question'].unique()

# Questions present both in the WVS subset and in the model outputs
common_questions = set(wvs_questions) & set(model_questions)
len(common_questions)

# Copy the answer options of each question from the WVS reference frame into the aggregated frame
def add_options_to_df(questions, df_aggregated, go_dataset_wvs):
    """Fill an ``options`` column of ``df_aggregated`` from ``go_dataset_wvs``.

    For each entry of ``questions`` the ``options`` value of the first
    matching row in ``go_dataset_wvs`` is written to every row of
    ``df_aggregated`` with the same ``question``. ``df_aggregated`` is
    modified in place and also returned. A question with no matching row in
    ``go_dataset_wvs`` raises ``IndexError``.
    """
    for wanted in questions:
        reference_options = go_dataset_wvs.loc[go_dataset_wvs['question'] == wanted, 'options']
        first_options = reference_options.values[0]

        target_rows = df_aggregated['question'] == wanted
        df_aggregated.loc[target_rows, 'options'] = first_options
    return df_aggregated

# Attach the answer options to both aggregated frames
df_aggregated_1, df_aggregated_2 = (
    add_options_to_df(common_questions, frame, go_dataset_wvs)
    for frame in (df_aggregated_1, df_aggregated_2)
)

In [None]:
# Sanity check of scipy's Jensen-Shannon distance on two toy distributions.
# The import lives in this cell because the notebook's scipy import cell comes
# later; without it this cell fails on a fresh kernel (Restart & Run All).
from scipy.spatial.distance import jensenshannon

a = [0, 0.2, 0.8, 0]
b = [0.8, 0.2, 0.0, 0]
ans = jensenshannon(a, b)
ans

In [None]:
import scipy
import numpy as np
from scipy.spatial.distance import jensenshannon

# Per-country Jensen-Shannon comparison between the values of two dictionaries
def jensen_shannon_divergence(p, q, options=None):
    """Compare two country -> distribution mappings country by country.

    Parameters
    ----------
    p, q : dict
        Map a country to a sequence of probabilities. Every key of ``p`` must
        also be a key of ``q``; a missing key raises ``KeyError``.
    options : any, optional
        Stored unchanged in every result entry.

    Returns
    -------
    list of dict
        One ``{'country', 'jsd', 'options'}`` entry per key of ``p``, in the
        iteration order of ``p``. ``jsd`` is the value returned by
        ``scipy.spatial.distance.jensenshannon``, which scipy documents as
        the Jensen-Shannon *distance* (square root of the divergence).
    """
    jsd_list = []
    for country, p_values in p.items():
        p_array = np.array(p_values)
        q_array = np.array(q[country])
        jsd_value = jensenshannon(p_array, q_array)
        # The stray breakpoint() that used to sit here stopped execution on
        # every country and has been removed.
        jsd_list.append({'country': country, 'jsd': jsd_value, 'options': options})
    return jsd_list


# Score every question of both aggregated frames: model distribution vs. WVS distribution
for frame in (df_aggregated_1, df_aggregated_2):
    frame['jsd'] = [
        jensen_shannon_divergence(model_probs, wvs_probs, row_options)
        for model_probs, wvs_probs, row_options in zip(
            frame['country_prob_dict'], frame['country_wvs_dict'], frame['options']
        )
    ]

In [None]:
# Show cell contents in full instead of truncating long values
pd.set_option('display.max_colwidth', None)

# Inspect the first aggregated row: question, both dictionaries and the JSD entries
preview_columns = ['question', 'country_prob_dict', 'country_wvs_dict', 'jsd']
df_aggregated_1[preview_columns].head(1)

In [None]:
# Plot a bar chart of comparison of jsd values for df_aggregated_1 and df_aggregated_2 for each question
import matplotlib.pyplot as plt
import shutil
from tqdm import tqdm



def plot_jsd(df_1, df_2, question):
    """Save a grouped bar chart of per-country JSD values for one question.

    Parameters
    ----------
    df_1, df_2 : pandas.DataFrame
        Frames with a ``question`` column and a ``jsd`` column holding lists
        of ``{'country', 'jsd', ...}`` dicts. ``df_1`` is drawn as
        ``with_image`` and ``df_2`` as ``without_image``.
    question : str
        Question to plot; it must be present in both frames, otherwise
        ``IndexError`` is raised.

    The figure is written to ``{image_dir}/{file_name}.png`` where
    ``image_dir`` is the module-level output directory and ``file_name`` is
    built from the first 15 words of the question. Nothing is returned.
    """
    jsd_1 = df_1[df_1['question'] == question]['jsd'].values[0]
    jsd_2 = df_2[df_2['question'] == question]['jsd'].values[0]

    countries = [entry['country'] for entry in jsd_1]
    jsd_values_1 = [entry['jsd'] for entry in jsd_1]
    # Pair the second series with the first by country name, not by position,
    # so a different ordering cannot put a bar under the wrong label.
    jsd_2_by_country = {entry['country']: entry['jsd'] for entry in jsd_2}
    jsd_values_2 = [jsd_2_by_country.get(country, np.nan) for country in countries]

    # Questions with many countries (e.g. 42) need a wider canvas.
    fig_width = 80 if len(countries) > 10 else 40
    fig, ax = plt.subplots(figsize=(fig_width, 20))

    bar_width = 0.30
    index = np.arange(len(countries))
    ax.bar(index, jsd_values_1, bar_width, label='with_image')
    ax.bar(index + bar_width, jsd_values_2, bar_width, label='without_image')

    # Value labels are centred on their own bar; non-finite values get no label.
    for offset, values in ((0.0, jsd_values_1), (bar_width, jsd_values_2)):
        for i, value in enumerate(values):
            if np.isfinite(value):
                ax.text(i + offset, value, f'{value:.2f}', ha='center', va='bottom', fontsize=30)

    ax.set_xlabel('Countries', fontsize=40)
    ax.set_ylabel('JSD', fontsize=40)
    ax.set_title(f'Q: {question}', fontsize=60)
    # Ticks sit between the two bars of a group; anchoring the rotated labels
    # on their right end keeps them aligned without a manual offset.
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels(countries, fontsize=40, rotation=45, ha='right', rotation_mode='anchor')
    # y axis runs from 0 to 1 in steps of 0.1
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    ax.tick_params(axis='y', labelsize=40)
    ax.legend(loc='upper right', fontsize=50)
    # reference line at y=0.5
    ax.axhline(y=0.5, color='r', linestyle='--')
    fig.tight_layout()

    # The output directory may have been removed earlier in the notebook.
    os.makedirs(image_dir, exist_ok=True)
    # File name: first 15 words of the question joined by '_', with '/' replaced
    file_name = '_'.join(question.split()[:15]).replace('/', '_')
    fig.savefig(f'{image_dir}/{file_name}.png')
    plt.close(fig)

# df_aggregated_1_test = df_aggregated_1[:10]
# df_aggregated_2_test = df_aggregated_2[:10]
# Plot every question. The sorted list was previously computed but the loop
# iterated the unsorted array; iterate the sorted one so that the processing
# order is deterministic.
questions_list = df_aggregated_1['question'].unique()
questions = sorted(questions_list)
for question in tqdm(questions):
    plot_jsd(df_aggregated_1, df_aggregated_2, question)

In [None]:
# Find the (question, country) pairs whose JSD differs strongly between the two runs
def find_diff(df_aggregated_1, df_aggregated_2, threshold=0.20):
    """List the country/question pairs whose JSD values differ by more than ``threshold``.

    Parameters
    ----------
    df_aggregated_1, df_aggregated_2 : pandas.DataFrame
        Frames with a ``question`` column and a ``jsd`` column holding lists
        of ``{'country', 'jsd', ...}`` dicts. Every question of
        ``df_aggregated_1`` must also be present in ``df_aggregated_2``,
        otherwise ``IndexError`` is raised.
    threshold : float, optional
        Minimum absolute difference (exclusive) for a pair to be reported.
        Defaults to 0.20.

    Returns
    -------
    list of dict
        Entries with the keys ``question``, ``country``, ``diff``, ``jsd_1``
        and ``jsd_2``, ordered by question and then by the country order of
        ``df_aggregated_1``.
    """
    diff_list = []
    # The questions come from the frame itself, not from a notebook global.
    for question in df_aggregated_1['question'].unique():
        jsd_1 = df_aggregated_1[df_aggregated_1['question'] == question]['jsd'].values[0]
        jsd_2 = df_aggregated_2[df_aggregated_2['question'] == question]['jsd'].values[0]

        # Match entries by country so a different ordering in the second
        # frame cannot pair values of two different countries.
        jsd_2_by_country = {entry['country']: entry['jsd'] for entry in jsd_2}
        for entry in jsd_1:
            country = entry['country']
            if country not in jsd_2_by_country:
                continue
            value_1 = entry['jsd']
            value_2 = jsd_2_by_country[country]
            diff = abs(value_1 - value_2)
            if diff > threshold:
                diff_list.append({'question': question, 'country': country, 'diff': diff, 'jsd_1': value_1, 'jsd_2': value_2})
    return diff_list

# Number of (question, country) pairs with a large JSD gap between the two runs
diff_list = find_diff(df_aggregated_1=df_aggregated_1, df_aggregated_2=df_aggregated_2)

len(diff_list)


In [None]:
# Tabulate the flagged pairs and look at the last 20 of them
diff_df = pd.DataFrame(data=diff_list)
diff_df.tail(n=20)

In [None]:
# Collect, per question, the JSD values of df_aggregated_1 and df_aggregated_2 across all countries

all_jsd_values = []

def jsd_values(country, df_1, df_2):
    """Return the JSD values of both frames for one question.

    Parameters
    ----------
    country : str
        Despite its name, this is the *question* text to look up; the
        parameter name is kept so existing callers continue to work.
    df_1, df_2 : pandas.DataFrame
        Frames with a ``question`` column and a ``jsd`` column holding lists
        of ``{'country', 'jsd', ...}`` dicts. The question must be present in
        both, otherwise ``IndexError`` is raised.

    Returns
    -------
    dict
        ``{'country': {'question', 'jsd_values_1', 'jsd_values_2'}}``. The
        outer key is the literal string ``'country'``, as in the original
        implementation.
    """
    # Use the argument; the previous body read a module-level `question`
    # variable and only worked when called from a loop that defined it.
    question = country
    jsd_1 = df_1[df_1['question'] == question]['jsd'].values[0]
    jsd_2 = df_2[df_2['question'] == question]['jsd'].values[0]
    jsd_values_1 = [entry['jsd'] for entry in jsd_1]
    jsd_values_2 = [entry['jsd'] for entry in jsd_2]
    return {
        'country': {
            'question': question,
            'jsd_values_1': jsd_values_1,
            'jsd_values_2': jsd_values_2,
        }
    }

# Append one entry per question to all_jsd_values.
# NOTE(review): the loop variable is deliberately left as `question` - the
# original jsd_values body reads that module-level name; renaming it is only
# safe once jsd_values relies on its own argument.
for question in questions_list:
    all_jsd_values.append(jsd_values(question, df_aggregated_1, df_aggregated_2))


