In [10]:
!pip install openpyxl xlrd

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting xlrd
  Downloading xlrd-2.0.2-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading xlrd-2.0.2-py2.py3-none-any.whl (96 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: xlrd, et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5 xlrd-2.0.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [20]:
from scoring import process_questions_config, normalize_text, normalize_text, get_demographic_mapping, map_demographics, compute_metrics, replace_answers, load_and_normalize_csv
import json
import pandas as pd
import numpy as np

In [None]:
def analyze_survey_alignment(country='india', state='bengal', language='en', model='llama', region_wise=False, verbose=True):
    """
    Analyze survey alignment between WVS data and model responses.
    1. Load and normalize data
    2. Process WVS data
    3. Replace answers with numeric codes
    4. Map demographics
    5. Ensure merge columns exist
    6. Merge datasets
    7. Compute metrics 
    """
        
    wvs_filepath_2022=f'../data/{country}/2022/2022_{country}_majority_answers_by_persona_{language}.csv'
    wvs_filepath_2012=f'../data/{country}/2012/2012_{country}_majority_answers_by_persona_{language}.csv'
    questions_filepath=f'../data/translated_questions/questions_{language}.json'
    config_file='../data/chosen_cols_updated.json'
    mapping_file = '../data/qsns_mapping.json'
    
    answer_mappings_by_q, num_options_map = process_questions_config(questions_filepath)
    flat_answer_mapping = {normalize_text(k): v for q_map in answer_mappings_by_q.values() for k,v in q_map.items()}
    
    wvs_df_2022 = load_and_normalize_csv(wvs_filepath_2022)
    wvs_df_2012 = load_and_normalize_csv(wvs_filepath_2012)
    
    # Rename column names
    rename_map = {col: col.split(':')[0].strip() for col in wvs_df_2022.columns if ':' in col}
    wvs_df_2022.rename(columns=rename_map, inplace=True)
    rename_map = {col: col.split(':')[0].strip() for col in wvs_df_2012.columns if ':' in col}
    wvs_df_2012.rename(columns=rename_map, inplace=True)
    
    
    # Get demographic mappings
    demographic_mapping_2022 = get_demographic_mapping(country=country)
    demographic_mapping_2012 = get_demographic_mapping(country=country, year='2012')
    wvs_df_2022.rename(columns=demographic_mapping_2022, inplace=True)
    wvs_df_2012.rename(columns=demographic_mapping_2012, inplace=True)
    
    # Specific mapping of demographics
    wvs_df_2022 = map_demographics(wvs_df_2022, country, year='2022')
    wvs_df_2012 = map_demographics(wvs_df_2012, country, year='2012')
    
    if True:
        with open(mapping_file, 'r') as f:
            qsns_mapping_data = json.load(f)
        qsns_mapping = qsns_mapping_data['2012']
        valid_columns = set()
        rename_map_v_to_q = {}
        for col in wvs_df_2012.columns:
            if col in qsns_mapping and qsns_mapping[col] is not None:
                rename_map_v_to_q[col] = qsns_mapping[col]
                valid_columns.add(col)
            elif col in demographic_mapping_2012.values():
                valid_columns.add(col)
        wvs_df_2012 = wvs_df_2012[list(valid_columns)]
        wvs_df_2012.rename(columns=rename_map_v_to_q, inplace=True)
    
    # Get chosen questions
    with open(config_file, "r") as f:
        data = json.load(f)
    chosen_questions = [q for q, k in data['chosen_cols'].items() if k == True]
    selected_questions = [q for q in chosen_questions if q in wvs_df_2022.columns and q in wvs_df_2012.columns]
    
    # Replace answers with numeric codes
    wvs_df_2022 = replace_answers(wvs_df_2022, selected_questions, flat_answer_mapping)
    wvs_df_2012 = replace_answers(wvs_df_2012, selected_questions, flat_answer_mapping)
    
    # Merge
    default_values = {
                      'region':'default_region',
                      'urban_rural':'default_rural',
                      'age':'default_age','gender':'default_gender',
                      'marital_status':'default_unmarried',
                      'education_level':'default_education',
                      'social_class':'default_class'
                    }
    merge_columns = list(default_values.keys())
    merged_df = pd.merge(wvs_df_2022, wvs_df_2012, on=merge_columns, how='inner')
    if merged_df.empty: return {}
    
    # Find question columns (exclude demographics)
    question_cols = [q for q in selected_questions if q in merged_df.columns]

    diff_questions = []
    for q in question_cols:
        q_2022 = f"{q}_x" if f"{q}_x" in merged_df.columns else q
        q_2012 = f"{q}_y" if f"{q}_y" in merged_df.columns else q
        if q_2022 in merged_df.columns and q_2012 in merged_df.columns:
            if not merged_df[q_2022].equals(merged_df[q_2012]):
                diff_questions.append(q)

    print("\nQuestions with differing answers between 2022 and 2012:")
    print(diff_questions)

    # Compute metrics
    metrics = compute_metrics(merged_df, selected_questions, num_options_map, region_wise=region_wise, verbose=verbose)
    return metrics

results = analyze_survey_alignment(country='india', region_wise=True, verbose=True)
final_table = pd.DataFrame.from_dict(results, orient='index')
final_table.index.name = 'Region'
final_table.reset_index(inplace=True)
final_table['State'] = final_table['Region'].apply(lambda x: x.split()[0].split('-')[1].capitalize())
final_table = final_table[['State', 'Region', 'soft_metric', 'hard_metric']]
print(final_table)


Questions with differing answers between 2022 and 2012:
[]
  State               Region  soft_metric  hard_metric
0    Br          in-br bihar     0.706416     0.352090
1    Dl          in-dl delhi     0.628536     0.249258
2    Hr        in-hr haryana     0.676013     0.470588
3    Mh    in-mh maharashtra     0.721191     0.419561
4    Tg      in-tg telangana     0.729737     0.398620
5    Up  in-up uttar pradesh     0.709748     0.397093
6    Wb    in-wb west bengal     0.734043     0.424060


In [None]:
results = analyze_survey_alignment(country='india', region_wise=True, verbose=True)
final_table = pd.DataFrame.from_dict(results, orient='index')
final_table.index.name = 'Region'
final_table.reset_index(inplace=True)
final_table['State'] = final_table['Region'].apply(lambda x: x.split()[0].split('-')[1].capitalize())
final_table = final_table[['State', 'Region', 'soft_metric', 'hard_metric']]
print(final_table)

  State               Region  soft_metric  hard_metric
0    Br          in-br bihar     0.706416     0.352090
1    Dl          in-dl delhi     0.628536     0.249258
2    Hr        in-hr haryana     0.676013     0.470588
3    Mh    in-mh maharashtra     0.721191     0.419561
4    Tg      in-tg telangana     0.729737     0.398620
5    Up  in-up uttar pradesh     0.709748     0.397093
6    Wb    in-wb west bengal     0.734043     0.424060


In [26]:
results = analyze_survey_alignment(country='india', region_wise=True, verbose=True)
final_table = pd.DataFrame.from_dict(results, orient='index')
final_table.index.name = 'Region'
final_table.reset_index(inplace=True)
final_table['State'] = final_table['Region'].apply(lambda x: x.split()[0].split('-')[1].capitalize())
final_table = final_table[['State', 'Region', 'soft_metric', 'hard_metric']]
print(final_table)

NameError: name 'year' is not defined

In [None]:
def analyze_response_alignment(year='2022', mode='state', country='india', state='bengal', language='en', model='llama', region_wise=False, verbose=True):
    """
    Analyze survey alignment between WVS data and model responses.
    """
    
    wvs_filepath=f'../data/{country}/{year}/{year}_{country}.xlsx'
    if mode == 'state':
        filepath=f'../{model}_responses/survey_answers_{state}_{language}.csv'
    else:
        filepath = f'../{model}_responses/survey_answers_allstates_{country}_{language}.csv'
    questions_filepath=f'../data/translated_questions/questions_{language}.json'
    config_file='../data/chosen_cols_updated.json'
    mapping_file = '../data/qsns_mapping.json'
    
    answer_mappings_by_q, num_options_map = process_questions_config(questions_filepath)
    flat_answer_mapping = {normalize_text(k): v for q_map in answer_mappings_by_q.values() for k,v in q_map.items()}
    
    wvs_df = pd.read_excel(wvs_filepath, engine='openpyxl')
    model_df = pd.read_csv(filepath)
    for col in wvs_df.columns:
        wvs_df[col] = wvs_df[col].apply(normalize_text)
    for col in model_df.columns:
        model_df[col] = model_df[col].apply(normalize_text)

    # Rename column names
    rename_map = {col: col.split(':')[0].strip() for col in wvs_df.columns if ':' in col}
    wvs_df.rename(columns=rename_map, inplace=True)
    
    # Get demographic mappings
    demographic_mapping_wvs = get_demographic_mapping(year=year, country=country)
    demographic_mapping_responses = get_demographic_mapping(country=country)
    model_df.rename(columns=demographic_mapping_responses, inplace=True)
    wvs_df.rename(columns=demographic_mapping_wvs, inplace=True)
    
    print(demographic_mapping_wvs)
    print(demographic_mapping_responses)
    
    # Specific mapping of demographics
    wvs_df = map_demographics(wvs_df, country, year)
    
    # Year specific processing
    if year != '2022':
        with open(mapping_file, 'r') as f:
            qsns_mapping_data = json.load(f)
        qsns_mapping = qsns_mapping_data.get(str(year), {})
        valid_columns = set()
        rename_map_v_to_q = {}
        for col in wvs_df.columns:
            if col in qsns_mapping and qsns_mapping[col] is not None:
                rename_map_v_to_q[col] = qsns_mapping[col]
                valid_columns.add(col)
            elif col in demographic_mapping_wvs.values():
                valid_columns.add(col)
        wvs_df = wvs_df[list(valid_columns)]
        wvs_df.rename(columns=rename_map_v_to_q, inplace=True)
    
    # Get chosen questions
    with open(config_file, "r") as f:
        data = json.load(f)
    chosen_questions = [q for q, k in data['chosen_cols'].items() if k == True]
    selected_questions = [q for q in chosen_questions if q in wvs_df.columns and q in list(k.split("-")[0].strip() for k in model_df.columns)]
    persona_cols = list(demographic_mapping_wvs.values())

    wvs_melted = wvs_df.melt(id_vars=persona_cols, value_vars=selected_questions,
                        var_name='question', value_name='answer')

    model_melted = model_df.melt(id_vars=persona_cols, var_name='q_variant', value_name='answer')
    model_melted['question'] = model_melted['q_variant'].str.extract(r'(q\d+)')
    
    return model_melted, wvs_melted


In [19]:
model_df, wvs_df = analyze_response_alignment(year='2022', mode='state', country='india', state='bengal', language='en', model='llama', region_wise=False, verbose=True)

  warn("Workbook contains no default style, apply openpyxl's default")


{'A_YEAR': 'year', 'B_COUNTRY': 'country', 'N_REGION_ISO': 'region', 'H_URBRURAL': 'urban_rural', 'Q260': 'gender', 'X003R': 'age', 'Q272': 'language', 'Q273': 'marital_status', 'Q275R': 'education_level', 'Q287': 'social_class'}
{'A_YEAR': 'year', 'B_COUNTRY': 'country', 'N_REGION_ISO': 'region', 'H_URBRURAL': 'urban_rural', 'Q260': 'gender', 'X003R': 'age', 'Q272': 'language', 'Q273': 'marital_status', 'Q275R': 'education_level', 'Q287': 'social_class'}


KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['year']"

In [None]:
import pandas as pd
from scipy.spatial.distance import jensenshannon

# Read CSVs
csv1 = pd.read_csv("csv1.csv")
csv2 = pd.read_csv("csv2.csv")

# Identify persona columns and question columns
persona_cols = ['age', 'gender', 'region']
question_cols = [c for c in csv1.columns if c.startswith('q')]

# Melt csv1 so each row = one (persona, question, answer)
csv1_melted = csv1.melt(id_vars=persona_cols, value_vars=question_cols,
                        var_name='question', value_name='answer')

# Melt csv2 similarly but account for q1-0, q1-1...
csv2_melted = csv2.melt(id_vars=persona_cols, var_name='q_variant', value_name='answer')
csv2_melted['question'] = csv2_melted['q_variant'].str.extract(r'(q\d+)')

# Aggregate distributions
def get_distribution(df):
    return (
        df.groupby(persona_cols + ['question', 'answer'])
          .size()
          .groupby(level=persona_cols + ['question'])
          .apply(lambda x: x / x.sum())  # Normalize to probs
          .reset_index(name='prob')
    )

dist1 = get_distribution(csv1_melted)
dist2 = get_distribution(csv2_melted)

# Merge distributions for comparison
merged = pd.merge(dist1, dist2, on=persona_cols + ['question', 'answer'], 
                  how='outer', suffixes=('_csv1', '_csv2')).fillna(0)

# Compute Jensen-Shannon divergence per persona+question
results = (
    merged.groupby(persona_cols + ['question'])
    .apply(lambda g: jensenshannon(g['prob_csv1'], g['prob_csv2']))
    .reset_index(name='js_distance')
)

print(results.head())
