In [8]:
import json
import pandas as pd

# Load the English question set, the Telugu question set and the per-question
# selection flags. The files are opened as UTF-8 explicitly: JSON text is UTF-8
# and the Telugu file contains non-ASCII characters, so relying on the platform's
# default locale encoding can fail or mis-decode on some systems.
with open("../data/translated_questions/questions_en.json", "r", encoding="utf-8") as f:
    questions = json.load(f)
with open("../data/translated_questions/questions_te.json", "r", encoding="utf-8") as f:
    questions_te = json.load(f)
with open("../data/chosen_cols_updated.json", "r", encoding="utf-8") as f:
    chosen_cols = json.load(f)['chosen_cols']
df = pd.read_csv("../gemma_responses/most_frequent_answers_allstates_india_en.csv")

# Print every question that is flagged as chosen, has a column in the answers
# CSV, but has no entry in the Telugu question file.
# `.get` is used so a question absent from `chosen_cols` is treated as "not
# chosen" instead of aborting the listing with a KeyError.
# NOTE(review): assumes `chosen_cols` is a dict keyed by question id — confirm.
for q in questions:
    # `== True` is kept on purpose so the match semantics of the flag values
    # stay exactly as before.
    if chosen_cols.get(q) == True and q in df.columns and q not in questions_te:
        print(q)

In [3]:
import pandas as pd
import json

# Split the combined all-states answer table into one CSV file per value of
# the 'region' column (the row index is not written out).
df = pd.read_csv("../gemma_responses/most_frequent_answers_allstates_india_en.csv")
regional_groups = df.groupby('region')
for region, group in regional_groups:
    # The region value is embedded verbatim in the output file name.
    filename = f"../gemma_responses/most_frequent_answers_{region}_en.csv"
    group.to_csv(path_or_buf=filename, index=False)

In [1]:
from scoring import analyze_survey_alignment, compute_similarity_per_theme
import pandas as pd
import json

### Indian States
The Indian states of West Bengal, Telangana/Andhra Pradesh, Maharashtra, and Delhi are considered for this analysis.

In [3]:
import pandas as pd

# State -> language code passed as `language=` for the non-English run.
languages = {
    'telangana': 'te',
    'bengal': 'bn',
    'delhi': 'hi',
    'maharashtra': 'mr',
}

# Years to score, in processing order.
years = ['2022', '2012', '2006']

# year -> DataFrame holding one row of metrics per state.
all_results = {}

for year in years:
    print(f"\n--- Processing Year: {year} ---")
    yearly_results = {}

    for state, lang in languages.items():
        # Keyword arguments common to the English and regional-language calls.
        shared_kwargs = dict(
            mode='state',
            year=year,
            state=state,
            model='gemma',
            region_wise=False,
            verbose=False,
        )
        try:
            # First call omits `language`; second call passes the state's code.
            en_results = analyze_survey_alignment(**shared_kwargs)
            ln_results = analyze_survey_alignment(language=lang, **shared_kwargs)

            # A metric missing from a result dict is stored as None.
            yearly_results[state] = {
                'State': state,
                'soft_metric_en': en_results.get('soft_metric'),
                'hard_metric_en': en_results.get('hard_metric'),
                'soft_metric_ln': ln_results.get('soft_metric'),
                'hard_metric_ln': ln_results.get('hard_metric'),
            }
        except Exception as e:
            # Best effort: report the failure and move on to the next state;
            # the failed state simply has no row for this year.
            print(f"⚠️ Error processing {state} ({year}): {e}")

    # One table per year, indexed by state name.
    yearly_df = pd.DataFrame.from_dict(yearly_results, orient='index')
    all_results[year] = yearly_df
    print(f"\n✅ Completed Year {year}")
    display(yearly_df)

# Stack the yearly tables into one frame, tagging each row with its year.
combined_df = pd.concat(
    [frame.assign(Year=yr) for yr, frame in all_results.items()],
    ignore_index=True,
)

display(combined_df)



--- Processing Year: 2022 ---

✅ Completed Year 2022


Unnamed: 0,State,soft_metric_en,hard_metric_en,soft_metric_ln,hard_metric_ln
telangana,telangana,0.671702,0.312036,0.701273,0.36714
bengal,bengal,0.655924,0.334004,0.654505,0.306849
delhi,delhi,0.677164,0.36403,0.677769,0.325984
maharashtra,maharashtra,0.596114,0.296593,0.562756,0.24581



--- Processing Year: 2012 ---

✅ Completed Year 2012


Unnamed: 0,State,soft_metric_en,hard_metric_en,soft_metric_ln,hard_metric_ln
telangana,telangana,0.726855,0.412844,0.700168,0.131439
bengal,bengal,0.662845,0.340237,0.462121,0.085227
delhi,delhi,0.701863,0.264095,0.553674,0.069892
maharashtra,maharashtra,0.660152,0.333636,0.600239,0.078853



--- Processing Year: 2006 ---

✅ Completed Year 2006


Unnamed: 0,State,soft_metric_en,hard_metric_en,soft_metric_ln,hard_metric_ln
telangana,telangana,0.70113,0.368421,0.44297,0.079646
bengal,bengal,0.692748,0.359281,0.197133,0.010753
delhi,delhi,0.596078,0.305882,0.03366,0.0
maharashtra,maharashtra,0.643791,0.258824,0.206209,0.023529


Unnamed: 0,State,soft_metric_en,hard_metric_en,soft_metric_ln,hard_metric_ln,Year
0,telangana,0.671702,0.312036,0.701273,0.36714,2022
1,bengal,0.655924,0.334004,0.654505,0.306849,2022
2,delhi,0.677164,0.36403,0.677769,0.325984,2022
3,maharashtra,0.596114,0.296593,0.562756,0.24581,2022
4,telangana,0.726855,0.412844,0.700168,0.131439,2012
5,bengal,0.662845,0.340237,0.462121,0.085227,2012
6,delhi,0.701863,0.264095,0.553674,0.069892,2012
7,maharashtra,0.660152,0.333636,0.600239,0.078853,2012
8,telangana,0.70113,0.368421,0.44297,0.079646,2006
9,bengal,0.692748,0.359281,0.197133,0.010753,2006


## Countries
Japan, the US, and Russia are considered for this analysis.

In [None]:
import pandas as pd

# Country -> language code passed as `language=` for the native-language run.
languages = {
    'japan': 'ja',
    'US': 'en',
    'russia': 'ru'
}

# Years to evaluate, in processing order.
years = ['2022', '2012', '2006']

# Column layout of the combined table (also used to guarantee these columns
# exist when no country could be scored at all).
column_order = ['Year', 'Country', 'soft_metric_en', 'hard_metric_en', 'soft_metric_ln', 'hard_metric_ln']

# year -> DataFrame holding one row of metrics per country.
all_results = {}

for year in years:
    print(f"\n--- Processing Year: {year} ---")
    yearly_results = {}

    for country, lang in languages.items():
        try:
            # English results (no `language` argument is passed).
            en_results = analyze_survey_alignment(
                mode='country',
                year=year,
                country=country,
                model='gemma',
                region_wise=False,
                verbose=False
            )

            # Native-language results.
            # NOTE(review): for 'US' the native language is 'en', so this call
            # may duplicate the English run — confirm against the default
            # language of analyze_survey_alignment before skipping it.
            ln_results = analyze_survey_alignment(
                mode='country',
                year=year,
                country=country,
                model='gemma',
                language=lang,
                region_wise=False,
                verbose=False
            )

            # A metric missing from a result dict is stored as None.
            yearly_results[country] = {
                'Country': country,
                'soft_metric_en': en_results.get('soft_metric', None),
                'hard_metric_en': en_results.get('hard_metric', None),
                'soft_metric_ln': ln_results.get('soft_metric', None),
                'hard_metric_ln': ln_results.get('hard_metric', None)
            }

        except Exception as e:
            # Best effort: report the failure and continue with the next
            # country; the failed country has no row for this year.
            print(f"⚠️ Error processing {country} ({year}): {e}")
            continue

    # One table per year, indexed by country name.
    yearly_df = pd.DataFrame.from_dict(yearly_results, orient='index')
    all_results[year] = yearly_df

    print(f"\n✅ Completed {year}")
    display(yearly_df)

# Combine all years into a single DataFrame for easier analysis. Years in which
# every country failed produce an empty frame and are skipped here.
frames = [
    frame.assign(Year=yr)
    for yr, frame in all_results.items()
    if not frame.empty
]
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Reorder columns for readability. `reindex` is used instead of `[[...]]` so
# that a run in which nothing succeeded yields an empty table with the expected
# columns rather than a KeyError that hides the per-country errors printed above.
combined_df = combined_df.reindex(columns=column_order)

display(combined_df)



--- Processing Year: 2022 ---

✅ Completed 2022


Unnamed: 0,Country,soft_metric_en,hard_metric_en,soft_metric_ln,hard_metric_ln
japan,japan,0.776615,0.426676,0.781745,0.357553
US,US,0.784094,0.478719,0.784094,0.478719
russia,russia,0.69966,0.360947,0.698065,0.285501



--- Processing Year: 2012 ---


  wvs_df['urban_rural'] = [['urban', 'rural']] * len(wvs_df)
  wvs_df['urban_rural'] = [['urban', 'rural']] * len(wvs_df)
