In [None]:
# © 2025 Dayita Chaudhuri and Velagapudi Athul
# All rights reserved. Joint work.

In [4]:
from scoring import analyze_survey_alignment, compute_similarity_per_theme

### Indian States
The Indian states of West Bengal, Telangana/Andhra Pradesh, Maharashtra, and Delhi are considered for this analysis.

In [None]:
import pandas as pd
from tqdm.auto import tqdm

# Region-wise alignment scores for India, stored as
# total_results[model][year][language] -> DataFrame (one row per region).
total_results = {}

models = ['aya', 'llama3']
years = ['2022', '2012', '2006']
languages = ['en', 'te', 'bn', 'hi', 'mr']

# One progress-bar tick per (model, year, language) combination.
total_iters = len(models) * len(years) * len(languages)

# Column layout shared by every per-run table.
table_columns = ['State', 'Region', 'Model', 'Year', 'Language', 'soft_metric', 'hard_metric']


def _state_label(region):
    """Return the capitalized second '-'-separated piece of the region's first token."""
    first_token = region.split()[0]
    return first_token.split('-')[1].capitalize()


with tqdm(total=total_iters, desc="Processing All", unit="task") as pbar:
    for model in models:
        total_results[model] = {}
        for year in years:
            total_results[model][year] = {}
            for language in languages:
                results = analyze_survey_alignment(
                    year=year,
                    country='india',
                    region_wise=True,
                    verbose=False,
                    model=model,
                    language=language,
                )

                # `results` is treated as a mapping keyed by region; each key
                # becomes one row, then the run's metadata is attached.
                final_table = (
                    pd.DataFrame.from_dict(results, orient='index')
                    .rename_axis('Region')
                    .reset_index()
                    .assign(
                        State=lambda frame: frame['Region'].apply(_state_label),
                        Model=model,
                        Year=year,
                        Language=language,
                    )
                )[table_columns]

                total_results[model][year][language] = final_table
                pbar.update(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=rename_map_v_to_q, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=rename_map_v_to_q, inplace=True)
Processing All:  23%|██▎       | 7/30 [00:47<04:39, 12.13s/task]

In [None]:
import pandas as pd

# Flatten total_results[model][year][language] (one DataFrame per run) into a
# single long frame, keeping the original model -> year -> language order.
combined_list = [
    table
    for by_year in total_results.values()
    for by_language in by_year.values()
    for table in by_language.values()
]

df = pd.concat(combined_list, ignore_index=True)

# Column order of the final tables: soft then hard metric for each language,
# following the `languages` list used to produce the results.
ordered_cols = [f"{lang}_{kind}" for lang in languages for kind in ('soft', 'hard')]

# One wide table per model: rows are states, columns are <language>_<soft|hard>.
tables = {}

for model in models:
    df_model = df[df['Model'] == model]

    # NOTE(review): the pivot is keyed on State/Language only, so rows that
    # share a (State, Language) pair but differ in Year or Region are collapsed
    # by aggfunc='first'. Given the concat order above this presumably keeps
    # the first year in `years` -- confirm this is intended, or add 'Year' to
    # the index.
    pivoted = df_model.pivot_table(
        index='State',
        columns='Language',
        values=['soft_metric', 'hard_metric'],
        aggfunc='first'
    )

    # Flatten the (metric, language) column pairs to names like "en_soft".
    pivoted.columns = [f"{lang}_{metric.split('_')[0]}" for metric, lang in pivoted.columns]

    # reindex (rather than pivoted[ordered_cols]) so that a language column
    # missing from the pivot shows up as NaN instead of raising KeyError; this
    # matches how the Countries table is built.
    pivoted = pivoted.reindex(columns=ordered_cols).reset_index()
    pivoted = pivoted.round(3)
    tables[model] = pivoted

print("Model: AYA")
aya_table = tables['aya']
print("Model: LLaMA3")
llama_table = tables['llama3']
aya_table, llama_table

(  State  en_soft  en_hard  te_soft  te_hard  bn_soft  bn_hard  hi_soft  \
 0    Br    0.671    0.310    0.682    0.332    0.723    0.327    0.707   
 1    Dl    0.688    0.322    0.686    0.375    0.714    0.359    0.735   
 2    Hr    0.676    0.306    0.649    0.347    0.647    0.291    0.659   
 3    Mh    0.671    0.313    0.669    0.372    0.702    0.332    0.696   
 4    Pb    0.658    0.332    0.643    0.325    0.664    0.322    0.667   
 5    Tg    0.719    0.357    0.678    0.362    0.716    0.354    0.720   
 6    Up    0.698    0.354    0.665    0.378    0.685    0.343    0.704   
 7    Wb    0.705    0.366    0.640    0.334    0.669    0.345    0.685   
 
    hi_hard  mr_soft  mr_hard  
 0    0.301    0.720    0.376  
 1    0.383    0.708    0.465  
 2    0.302    0.689    0.467  
 3    0.313    0.703    0.461  
 4    0.325    0.646    0.380  
 5    0.364    0.679    0.401  
 6    0.345    0.668    0.401  
 7    0.342    0.647    0.389  ,
   State  en_soft  en_hard  te_sof

## Countries

In [40]:
import pandas as pd
from tqdm.auto import tqdm

import warnings

# Country-level alignment scores, stored as
# total_results[model][year][language][country] -> dict with the country name
# and its soft/hard metric.
total_results = {}

models = ['aya']
years = ['2022']
languages = ['en', 'ru', 'hi', 'ja', 'ar', 'es']
countries = ['india', 'US', 'russia', 'japan', 'colombia', 'egypt']

# One progress-bar tick per (model, year, language, country) combination.
total_iters = len(models) * len(years) * len(languages) * len(countries)

with tqdm(total=total_iters, desc="Processing All", unit="task") as pbar:
    for model in models:
        total_results[model] = {}
        for year in years:
            total_results[model][year] = {}
            for language in languages:
                total_results[model][year][language] = {}
                for country in countries:
                    try:
                        results = analyze_survey_alignment(
                            year=year,
                            country=country,
                            region_wise=False,
                            verbose=False,
                            model=model,
                            language=language
                        )
                        soft_metric = results.get('soft_metric')
                        hard_metric = results.get('hard_metric')
                    except Exception as exc:
                        # Keep the sweep going, but surface the failure and
                        # record NaN so a failed run cannot be mistaken for a
                        # genuine score of 0.0 in the summary table.
                        warnings.warn(
                            f"analyze_survey_alignment failed for model={model!r}, "
                            f"year={year!r}, language={language!r}, "
                            f"country={country!r}: {exc!r}",
                            RuntimeWarning,
                        )
                        soft_metric = float('nan')
                        hard_metric = float('nan')

                    # Same record shape on success and failure.
                    combined = {
                        'Country': country,
                        'soft_metric': soft_metric,
                        'hard_metric': hard_metric
                    }

                    total_results[model][year][language][country] = combined
                    pbar.update(1)

Processing All: 100%|██████████| 36/36 [00:16<00:00,  2.19task/s]


In [41]:
# Flatten total_results[model][year][language][country] into one record per
# run, tagging each record with the model, year and language it came from.
combined_list = [
    {**record, "Model": model_name, "Year": survey_year, "Language": lang_code}
    for model_name, by_year in total_results.items()
    for survey_year, by_language in by_year.items()
    for lang_code, by_country in by_language.items()
    for record in by_country.values()
]

df = pd.DataFrame(combined_list)

# Wide layout: one row per country, one (metric, language) column pair per score.
pivoted = df.pivot_table(
    index="Country",
    columns="Language",
    values=["soft_metric", "hard_metric"],
    aggfunc="first",
)

# Collapse the (metric, language) column pairs into names such as "en_soft".
pivoted.columns = [
    f"{lang_code}_{metric_name.split('_')[0]}"
    for metric_name, lang_code in pivoted.columns
]

# Soft then hard metric for each language, in the order of `languages`.
ordered_cols = [
    f"{lang_code}_{kind}"
    for lang_code in languages
    for kind in ("soft", "hard")
]
pivoted = pivoted.reindex(columns=ordered_cols)
pivoted = pivoted.reset_index()

pivoted

Unnamed: 0,Country,en_soft,en_hard,ru_soft,ru_hard,hi_soft,hi_hard,ja_soft,ja_hard,ar_soft,ar_hard,es_soft,es_hard
0,US,0.734679,0.426064,0.750505,0.429477,0.754904,0.426836,0.770497,0.462229,0.740039,0.435288,0.744805,0.44374
1,colombia,0.677347,0.33574,0.684377,0.292517,0.686229,0.277211,0.756883,0.377551,0.68793,0.30017,0.668452,0.293367
2,egypt,0.682506,0.328901,0.689747,0.308806,0.568107,0.212304,0.693573,0.299156,0.693808,0.293124,0.615923,0.267793
3,india,0.689275,0.336207,0.666316,0.328887,0.717546,0.343743,0.705417,0.339748,0.678561,0.343369,0.705758,0.421494
4,japan,0.754589,0.439204,0.751574,0.423208,0.736324,0.424061,0.792529,0.469283,0.750953,0.440273,0.758845,0.444539
5,russia,0.691741,0.352107,0.70263,0.294118,0.721831,0.310355,0.745959,0.351489,0.708898,0.323121,0.695964,0.323404
