# **Setup**

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local modules take effect immediately.
%load_ext autoreload
%autoreload 2

### Import

In [None]:
!pip install transformers
!pip install python-Levenshtein

In [None]:
import ast
import math
import string

import Levenshtein
import numpy as np
import pandas as pd
import scipy
import scipy.spatial.distance
import scipy.stats
import torch
import transformers as tf
from transformers import RobertaTokenizer, RobertaForCausalLM, RobertaConfig
from transformers import T5Tokenizer, T5ForConditionalGeneration

### Drive Path

Run the following cell to mount Google Drive.

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be
# reached through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Enter the path for the project files in drive (for example, /content/drive/MyDrive/proj)

In [None]:
# Make the project folder the working directory; the relative paths used later
# (data/final_dataset.csv, results.csv) are resolved against it.
%cd '/content/drive/MyDrive/CS263/proj'

/content/drive/MyDrive/CS263/proj


### Device Setup

Make sure to change runtime to T4 to use GPU

In [None]:
USE_GPU = True

# Start from the CPU and upgrade to an accelerator only when one is requested
# and available: CUDA is preferred, Apple MPS is the second choice.
device = torch.device('cpu')
if USE_GPU:
    if torch.cuda.is_available():
        device = torch.device('cuda')
    elif torch.backends.mps.is_available():
        device = torch.device('mps')
print(f"USING {device}")

USING cpu


# **Generation**

### Data

In [None]:
# Load the survey dataset (path is relative to the current working directory)
# and preview its first rows.
df = pd.read_csv("data/final_dataset.csv")
df.head()

Unnamed: 0,country,wvs_q,query,options,culture,language,religion,ground
0,USA,27,To what extent do you agree with the statement...,"{1: 'Agree strongly', 2: 'Agree', 3: 'Disagree...",American,English,"Christianity, especially Protestantism","{""1"": 31.1, ""2"": 49.4, ""3"": 16.4, ""4"": 2.6}"
1,China,27,To what extent do you agree with the statement...,"{1: 'Agree strongly', 2: 'Agree', 3: 'Disagree...",Han,Mandarin Chinese,Confucianism and Buddhism,"{""1"": 23.3, ""2"": 52.6, ""3"": 21.5, ""4"": 2.4}"
2,Iran,27,To what extent do you agree with the statement...,"{1: 'Agree strongly', 2: 'Agree', 3: 'Disagree...",Persian,Persian (Farsi),Shia Islam,"{""1"": 49.2, ""2"": 47.4, ""3"": 2.7, ""4"": 0.7}"
3,Kenya,27,To what extent do you agree with the statement...,"{1: 'Agree strongly', 2: 'Agree', 3: 'Disagree...",Kikuyu,Swahili,Christianity,"{""1"": 66.4, ""2"": 27.8, ""3"": 4.3, ""4"": 0.8}"
4,,27,To what extent do you agree with the statement...,"{1: 'Agree strongly', 2: 'Agree', 3: 'Disagree...",none,none,none,


`dataframe_amt` is the number of rows of the dataframe to use (mainly for testing). `generation_amt` is the number of generations sampled per question to estimate the answer distribution.

In [None]:
# Number of dataframe rows to process (lower it for quick test runs).
dataframe_amt = 75
# Number of sampled generations per question used to estimate a distribution.
generation_amt = 200
# Take an explicit copy: head() returns a slice of the loaded frame, and the
# column assignments in later cells would otherwise trigger pandas'
# SettingWithCopyWarning.
df = df.head(dataframe_amt).copy()

In [None]:
def _parse_literal(value):
    """Parse a Python-literal string; return non-string values (parsed dicts, NaN) unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value

# The CSV stores the option and ground-truth dicts as strings. Parsing only the
# string cells leaves NaN ground truths untouched and makes this cell safe to
# re-run after the columns have already been converted.
df['options'] = df['options'].apply(_parse_literal)
df['ground'] = df['ground'].apply(_parse_literal)

Find closest option index that the result aligns to

In [None]:
def get_ind(options, answer, max_distance=3):
  """Map a raw model answer to the key of the option it refers to.

  Args:
    options: dict mapping option keys (ints) to option text.
    answer: raw answer string produced by the model.
    max_distance: exclusive upper bound on the Levenshtein distance between the
      answer and an option text for the two to count as a match.

  Returns:
    The matching option key, or None when the answer is neither a valid option
    number nor close enough to any option text.
  """
  translator = str.maketrans('', '', string.punctuation)
  # Drop punctuation and surrounding whitespace so answers such as "2." or
  # " 2 " are still recognised as option numbers.
  answer = answer.translate(translator).strip()
  # isdecimal() (unlike isdigit()) rejects characters such as '²' that int()
  # cannot parse.
  if answer.isdecimal() and int(answer) in options:
    return int(answer)
  answer = answer.lower()

  best_key = None
  best_distance = max_distance
  for key, option in options.items():
    distance = Levenshtein.distance(option.lower(), answer)
    # Strict comparison: on ties the earliest option wins.
    if distance < best_distance:
      best_distance = distance
      best_key = key
  return best_key

Convert raw list of results to distribution

In [None]:
def results_distribution(results, options):
  """Convert a list of chosen option keys into a relative-frequency dict.

  Args:
    results: iterable of option keys; entries that are not keys of `options`
      (for example None for unmatched answers) are ignored.
    options: dict whose keys define the categories of the distribution.

  Returns:
    Dict mapping every option key to its share of the counted results. When no
    result matches any option, every share is 0.0 instead of raising
    ZeroDivisionError.
  """
  counts = dict.fromkeys(options, 0)
  for result in results:
    if result in counts:
      counts[result] += 1
  total = sum(counts.values())
  if total == 0:
    # Nothing could be counted; zeros keep the value writable to CSV and
    # readable back with ast.literal_eval (unlike NaN).
    return {key: 0.0 for key in counts}
  return {key: count / total for key, count in counts.items()}

### T5


In [None]:
def get_T5_context(country, culture, language, religion):
  """Build the persona instruction that precedes the question in the T5 prompt.

  Args:
    country, culture, language, religion: persona attributes as strings;
      a missing (NaN/None) country selects the persona-free instruction.

  Returns:
    The instruction text.
  """
  if pd.isna(country):
    return "For the following question, select exactly one option. Do not include any extra commentary."
  # Adjacent string literals are joined with no separator, so each sentence
  # carries its own trailing space ("... English. You were born ...").
  return ("Imagine you are from the " + country + " whose native language is " + language + ". "
    "You were born and raised here within the " + culture + " culture and practice " + religion + ". "
    "Answer the following question from this perspective. Select exactly one option. Do not include any extra commentary.")

In [None]:
# Tokenizer/model pairs already loaded, keyed by model name.
_T5_CACHE = {}


def _load_T5(model_name="google/flan-t5-base"):
    """Return the (tokenizer, model) pair for model_name, loading it only once."""
    if model_name not in _T5_CACHE:
        tokenizer = tf.AutoTokenizer.from_pretrained(model_name)
        # Place the model on the device selected in the Device Setup cell.
        model = tf.AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        _T5_CACHE[model_name] = (tokenizer, model)
    return _T5_CACHE[model_name]


def generate_T5(context, question, options, num_generation=1):
    """Sample answers from flan-t5-base and map each one to an option key.

    Args:
        context: instruction/persona text placed before the question.
        question: the survey question.
        options: dict mapping option keys to option text; its repr is embedded
            in the prompt.
        num_generation: number of answers to sample.

    Returns:
        List of length num_generation holding, for each sampled answer, the
        value returned by get_ind (an option key, or None if nothing matched).
    """
    # Loading the weights for every dataframe row was the dominant cost; the
    # pair is now loaded on the first call and reused afterwards.
    tokenizer, model = _load_T5()

    input_text = f"context: {context} question: {question} options: {options}"

    # Encode the prompt into token ids on the same device as the model
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    # Sample num_generation short answers from the model
    top_k_outputs = model.generate(
        input_ids,
        do_sample=True,
        top_k=50,
        num_return_sequences=num_generation,
        max_length=5,
        temperature=0.7  # Adjust temperature to increase randomness
    )

    # Decode the generated ids to text
    outputs = [tokenizer.decode(i, skip_special_tokens=True) for i in top_k_outputs]

    # Map every decoded answer to the closest option key
    return [get_ind(options, output) for output in outputs]

In [None]:
# Build the T5 instruction for every row, then sample generation_amt answers per
# row; each row of T5_output is the list of option keys returned by generate_T5.
df['T5_context'] = df.apply(lambda x: get_T5_context(x['country'], x['culture'], x['language'], x['religion']), axis=1)
df['T5_output'] = df.apply(lambda x: generate_T5(x['T5_context'], x['query'], x['options'], generation_amt), axis=1)
df['T5_output'].head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


0    [2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, ...
1    [1, 2, 3, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 1, 2, ...
2    [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2, 2, 2, 1, ...
3    [1, 2, 2, 2, 3, 3, 2, 2, 3, 2, 2, 1, 2, 2, 2, ...
4    [2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, ...
Name: T5_output, dtype: object

In [None]:
# Turn each row's list of sampled option keys into a relative-frequency dict
# over that row's options.
df['T5_distribution'] = df.apply(lambda x: results_distribution(x['T5_output'], x['options']), axis=1)
df['T5_distribution'].head()

0    {1: 0.18090452261306533, 2: 0.7788944723618091...
1    {1: 0.22110552763819097, 2: 0.7035175879396985...
2               {1: 0.215, 2: 0.675, 3: 0.08, 4: 0.03}
3               {1: 0.225, 2: 0.675, 3: 0.08, 4: 0.02}
4    {1: 0.22613065326633167, 2: 0.7035175879396985...
Name: T5_distribution, dtype: object

### Bart

In [None]:
def get_bart_context(country, culture, language, religion):
  """Format the persona attributes as a bracketed tag list.

  Returns "[]" when the country is missing; otherwise a string of the form
  "[Country: ..., Language: ..., Culture: ..., Religion: ...]".
  """
  if not pd.isna(country):
    pieces = ["[Country: ", country,
              ", Language: ", language,
              ", Culture: ", culture,
              ", Religion: ", religion,
              "]"]
    return "".join(pieces)
  return "[]"

In [None]:

# Zero-shot pipelines already built, keyed by model name.
_BART_CLASSIFIERS = {}


def _load_bart_classifier(model_name="facebook/bart-large-mnli"):
    """Return the zero-shot classification pipeline for model_name, building it only once."""
    if model_name not in _BART_CLASSIFIERS:
        # Run the pipeline on the device selected in the Device Setup cell.
        _BART_CLASSIFIERS[model_name] = tf.pipeline(
            "zero-shot-classification", model=model_name, device=device)
    return _BART_CLASSIFIERS[model_name]


def generate_bart(context, question, options):
    """Score every option with zero-shot classification and normalise the scores.

    Args:
        context: persona tag string placed before the question.
        question: the survey question.
        options: dict mapping option keys to option text (the candidate labels).

    Returns:
        Dict mapping option keys to scores rescaled to sum to 1, in the label
        order returned by the classifier.
    """
    # Building the pipeline for every dataframe row reloaded the model each
    # time; it is now created on the first call and reused afterwards.
    classifier = _load_bart_classifier()

    input_text = f"context: {context} question: {question}"

    # Perform zero-shot classification; multi_label=True scores each label
    # independently, so the raw scores do not sum to 1.
    result = classifier(input_text, candidate_labels=list(options.values()), multi_label=True)

    # Reverse lookup from option text to option key; with duplicate texts the
    # first key wins, as list.index() did.
    label_to_key = {}
    for key, label in options.items():
        label_to_key.setdefault(label, key)

    scores = {label_to_key[label]: score
              for label, score in zip(result['labels'], result['scores'])}

    total = sum(scores.values())
    return {key: score / total for key, score in scores.items()}


# Build the persona tag string for every row, then store the dict of normalised
# zero-shot scores returned by generate_bart as that row's distribution.
df['bart_context'] = df.apply(lambda x: get_bart_context(x['country'], x['culture'], x['language'], x['religion']), axis=1)
df['bart_distribution'] = df.apply(lambda x: generate_bart(x['bart_context'], x['query'], x['options']), axis=1)
df['bart_distribution'].head()

0    {2: 0.6097378175543802, 1: 0.2869598213634336,...
1    {2: 0.5497419550076191, 1: 0.34252183144144654...
2    {2: 0.5671487570680173, 1: 0.32552000420718624...
3    {2: 0.5766047305196125, 1: 0.32189608848070916...
4    {2: 0.6306777948719335, 1: 0.3190395766305648,...
Name: bart_distribution, dtype: object

# **Evaluation**

### Ground Truth Format

Convert the ground-truth option keys to int so that they match the keys of the generated distributions, and divide the percentages by 100 to get proportions.

In [None]:
def _to_proportions(ground):
    """Convert {key: percent} to {int(key): fraction}; a NaN cell yields NaN."""
    if ground != ground:  # only NaN compares unequal to itself
        return float('nan')
    return {int(option): (percent / 100) for option, percent in ground.items()}

# Note: running this cell a second time divides the values by 100 again.
df['ground'] = df['ground'].apply(_to_proportions)

### To CSV

In [None]:
# index=False: the positional index carries no information, and writing it
# adds another "Unnamed: 0" column every time the file is saved and reloaded.
df.to_csv('results.csv', index=False)

### Load dataset

In [None]:
# Reload the saved results; the dict-valued columns come back as strings and
# are parsed into Python dicts again.
df = pd.read_csv('results.csv')
for column in ('options', 'T5_distribution', 'bart_distribution'):
    df[column] = df[column].apply(ast.literal_eval)
# 'ground' is NaN for rows without a ground truth; NaN != NaN, so those cells
# are passed through unparsed.
df['ground'] = df['ground'].apply(lambda cell: cell if cell != cell else ast.literal_eval(cell))

In [None]:
# Three comparisons are evaluated below:
#   1. no-context generation  vs. the ground truth of each of the 4 countries
#   2. no-context generation  vs. the generation for each of the 4 contexts
#   3. generation with context vs. the respective country's ground truth
# Rows with a ground truth are kept in filtered_df; rows without one
# (presumably the persona-free baseline rows) go to no_context.
filtered_df = df[df['ground'].notnull()]
no_context = df[df['ground'].isna()]

## T5

In [None]:
# Attach to every ground-truth row the no-context T5 distribution of the same
# query; the suffix names the added column T5_distribution_nocontext.
merged_t5 = filtered_df.merge(no_context[['query', 'T5_distribution']], on='query', suffixes=('', '_nocontext'))

### Jensen Shannon

In [None]:
def jensen_shannon(a, b):
    """Jensen-Shannon distance between two distributions given as dicts.

    Args:
        a, b: dicts mapping option keys to probabilities (or unnormalised
            weights; scipy normalises each vector to sum to 1).

    Returns:
        The Jensen-Shannon distance (square root of the divergence, natural
        logarithm base) as computed by scipy.spatial.distance.jensenshannon.
    """
    # Align both dicts on the union of their keys. A key missing from one side
    # counts as probability 0 instead of raising KeyError or being dropped.
    # sorted() raises TypeError on mixed key types (e.g. '1' vs 1), which
    # surfaces a key-format mismatch instead of hiding it.
    keys = sorted(set(a) | set(b))
    arr_a = [a.get(key, 0.0) for key in keys]
    arr_b = [b.get(key, 0.0) for key in keys]
    return scipy.spatial.distance.jensenshannon(arr_a, arr_b)

In [None]:
# jensen_T5_1: ground truth        vs. no-context generation
# jensen_T5_2: context generation  vs. no-context generation
# jensen_T5_3: ground truth        vs. context generation
merged_t5['jensen_T5_1'] = merged_t5.apply(lambda x: jensen_shannon(x['ground'], x['T5_distribution_nocontext']), axis=1)
merged_t5['jensen_T5_2'] = merged_t5.apply(lambda x: jensen_shannon(x['T5_distribution'], x['T5_distribution_nocontext']), axis=1)
merged_t5['jensen_T5_3'] = merged_t5.apply(lambda x: jensen_shannon(x['ground'], x['T5_distribution']), axis=1)

In [None]:
# Per-row Jensen-Shannon values for the three comparisons.
merged_t5[['jensen_T5_1', 'jensen_T5_2', 'jensen_T5_3']]

Unnamed: 0,jensen_T5_1,jensen_T5_2,jensen_T5_3
0,0.164067,0.069129,0.228237
1,0.17608,0.030373,0.178409
2,0.199635,0.059248,0.217762
3,0.324171,0.041003,0.322149
4,0.259338,0.029268,0.266418
5,0.046796,0.054823,0.09326
6,0.103005,0.046899,0.067612
7,0.285544,0.053163,0.24322
8,0.370315,0.026777,0.365578
9,0.16372,0.073088,0.165882


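Without context, the model's answers actually align best with Iran's ground truth and worst with the USA's (`jensen_T5_1`). With context, the answers still align best with Iran's ground truth but align worst with China's (`jensen_T5_3`). However, when the answers generated with context are compared with the answers generated without context (`jensen_T5_2`), the USA answers are the closest to the no-context answers, and the China and Iran answers are the farthest.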

In [None]:
# Mean of each Jensen-Shannon comparison per country.
merged_t5.groupby('country')[['jensen_T5_1', 'jensen_T5_2', 'jensen_T5_3']].mean()

Unnamed: 0_level_0,jensen_T5_1,jensen_T5_2,jensen_T5_3
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,0.32121,0.089766,0.33946
Iran,0.279995,0.093004,0.288449
Kenya,0.314036,0.080192,0.322098
USA,0.339639,0.078487,0.329361


### Earth Mover's Distance

In [None]:
def wasserstein_distance(a, b):
    """Earth mover's distance between two distributions over ordered options.

    Args:
        a, b: dicts mapping option keys to probabilities (or unnormalised
            non-negative weights with a positive sum).

    Returns:
        The 1-D Wasserstein distance, with the options placed at positions
        0, 1, 2, ... in sorted key order, so moving all mass to the
        neighbouring option costs 1.
    """
    # scipy.stats.wasserstein_distance takes sample *values* first and the
    # probability mass as weights. Passing the probabilities as values compares
    # the two sets of numbers and ignores which option carries which mass.
    keys = sorted(set(a) | set(b))
    positions = list(range(len(keys)))
    # A key missing from one side counts as zero mass.
    weights_a = [a.get(key, 0.0) for key in keys]
    weights_b = [b.get(key, 0.0) for key in keys]
    return scipy.stats.wasserstein_distance(positions, positions,
                                            u_weights=weights_a, v_weights=weights_b)

# Same three comparisons as the jensen_T5_* columns, computed with
# wasserstein_distance, followed by the per-country mean.
merged_t5['emd_T5_1'] = merged_t5.apply(lambda x: wasserstein_distance(x['ground'], x['T5_distribution_nocontext']), axis=1)
merged_t5['emd_T5_2'] = merged_t5.apply(lambda x: wasserstein_distance(x['T5_distribution'], x['T5_distribution_nocontext']), axis=1)
merged_t5['emd_T5_3'] = merged_t5.apply(lambda x: wasserstein_distance(x['ground'], x['T5_distribution']), axis=1)
merged_t5.groupby('country')[['emd_T5_1', 'emd_T5_2', 'emd_T5_3']].mean()

Unnamed: 0_level_0,emd_T5_1,emd_T5_2,emd_T5_3
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,0.081354,0.027693,0.096155
Iran,0.09159,0.039601,0.09898
Kenya,0.075732,0.0292,0.08133
USA,0.080325,0.026808,0.074073


### Hellinger Distance

In [None]:
def hellinger(p,q):
    """Hellinger distance between two discrete distributions.

    Args:
        p, q: aligned sequences of non-negative probabilities.

    Returns:
        sqrt(sum((sqrt(p_i) - sqrt(q_i))**2)) / sqrt(2); 0 for identical inputs
        and 1 for normalised distributions with disjoint support.
    """
    squared_gaps = sum((math.sqrt(p_i) - math.sqrt(q_i)) ** 2 for p_i, q_i in zip(p, q))
    # The square root over the sum is part of the definition; without it the
    # value is sqrt(2) times the squared distance.
    return math.sqrt(squared_gaps) / math.sqrt(2.)

def hellinger_distance(a, b):
    """Hellinger distance between two distributions given as dicts.

    Args:
        a, b: dicts mapping option keys to probabilities.

    Returns:
        The Hellinger distance of the two probability vectors aligned on the
        union of the keys; a key missing from one side counts as probability 0.
    """
    keys = sorted(set(a) | set(b))
    arr_a = [a.get(key, 0.0) for key in keys]
    arr_b = [b.get(key, 0.0) for key in keys]
    return hellinger(arr_a, arr_b)

# Same three comparisons as the jensen_T5_* columns, computed with
# hellinger_distance, followed by the per-country mean.
merged_t5['hell_T5_1'] = merged_t5.apply(lambda x: hellinger_distance(x['ground'], x['T5_distribution_nocontext']), axis=1)
merged_t5['hell_T5_2'] = merged_t5.apply(lambda x: hellinger_distance(x['T5_distribution'], x['T5_distribution_nocontext']), axis=1)
merged_t5['hell_T5_3'] = merged_t5.apply(lambda x: hellinger_distance(x['ground'], x['T5_distribution']), axis=1)
merged_t5.groupby('country')[['hell_T5_1', 'hell_T5_2', 'hell_T5_3']].mean()

Unnamed: 0_level_0,hell_T5_1,hell_T5_2,hell_T5_3
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
China,0.194312,0.014121,0.204437
Iran,0.174354,0.013783,0.169902
Kenya,0.164694,0.011221,0.164855
USA,0.209449,0.010427,0.195011


## Bart

In [None]:
# Attach to every ground-truth row the no-context Bart distribution of the same
# query; the suffix names the added column bart_distribution_nocontext.
merged_bart = filtered_df.merge(no_context[['query', 'bart_distribution']], on='query', suffixes=('', '_nocontext'))

In [None]:
def wasserstein_distance(a, b):
    """Earth mover's distance between two distributions over ordered options.

    Args:
        a, b: dicts mapping option keys to probabilities (or unnormalised
            non-negative weights with a positive sum).

    Returns:
        The 1-D Wasserstein distance, with the options placed at positions
        0, 1, 2, ... in sorted key order, so moving all mass to the
        neighbouring option costs 1.
    """
    # scipy.stats.wasserstein_distance takes sample *values* first and the
    # probability mass as weights. Passing the probabilities as values compares
    # the two sets of numbers and ignores which option carries which mass.
    keys = sorted(set(a) | set(b))
    positions = list(range(len(keys)))
    # A key missing from one side counts as zero mass.
    weights_a = [a.get(key, 0.0) for key in keys]
    weights_b = [b.get(key, 0.0) for key in keys]
    return scipy.stats.wasserstein_distance(positions, positions,
                                            u_weights=weights_a, v_weights=weights_b)

# emd_bart_1: ground truth        vs. no-context Bart distribution
# emd_bart_2: context distribution vs. no-context Bart distribution
# emd_bart_3: ground truth        vs. context Bart distribution
# The last line averages each comparison per country.
merged_bart['emd_bart_1'] = merged_bart.apply(lambda x: wasserstein_distance(x['ground'], x['bart_distribution_nocontext']), axis=1)
merged_bart['emd_bart_2'] = merged_bart.apply(lambda x: wasserstein_distance(x['bart_distribution'], x['bart_distribution_nocontext']), axis=1)
merged_bart['emd_bart_3'] = merged_bart.apply(lambda x: wasserstein_distance(x['ground'], x['bart_distribution']), axis=1)
merged_bart.groupby('country')[['emd_bart_1', 'emd_bart_2', 'emd_bart_3']].mean()