## Others

In [None]:
import pandas as pd
import os

# List of states
states = ['bihar', 'delhi', 'haryana', 'maharashtra', 'punjab', 'telangana', 'up', 'bengal']
language_code = 'en'

# Initialize an empty DataFrame
combined_df = pd.DataFrame()

# Read and concatenate data for all states
for state in states:
    file_path = f"responses/llama_responses/most_frequent_answers_{state}_{language_code}.csv"
    if os.path.exists(file_path):
        state_df = pd.read_csv(file_path)
        state_df['State'] = state  # Add a column for the state
        combined_df = pd.concat([combined_df, state_df], ignore_index=True)
    else:
        print(f"File not found: {file_path}")

# Save the combined DataFrame to a CSV file
combined_df.to_csv(f"responses/llama_responses/most_frequent_answers_allstates_india_{language_code}.csv", index=False)

In [None]:
import json

# Load input files
with open('data/chosen_cols.json', 'r') as f:
    chosen_cols = json.load(f)

with open('data/qsns_mapping.json', 'r') as f:
    qsns_mapping = json.load(f)

with open('data/questions.json', 'r') as f:
    questions = json.load(f)

# Compute the list of true_qsns
true_qsns = [
    qsn for qsn, val in chosen_cols['chosen_cols'].items()
    if val 
    and qsn in qsns_mapping['2012'].values() 
    and qsn in qsns_mapping['2006'].values() 
    and qsn in questions
]

print(f"Number of matching questions: {len(true_qsns)}")

with open('data/chosen_cols_updated.json', 'r') as f:
    data = json.load(f)
    
for key in data['chosen_cols']:
    if key in true_qsns:
        data['chosen_cols'][key] = True
    else:
        data['chosen_cols'][key] = False

with open('data/chosen_cols_updated.json', 'w') as f:
    json.dump(data, f, indent=4)

Number of matching questions: 91


In [23]:
selected_questions_only_text = {qsn: questions[qsn] for qsn in true_qsns}
with open('data/selected_questions_only.json', 'w') as f:
    json.dump(selected_questions_only_text, f, indent=4)

In [None]:
from collections import defaultdict

with open('data/themes.json', 'r') as f:
    themes = json.load(f)

# Prepare ranges as tuples (start, end) for easier comparison
theme_ranges = {}
for key, theme_name in themes.items():
    start, end = map(int, key.split('-'))
    theme_ranges[(start, end)] = theme_name

# Count questions per theme
theme_counts = defaultdict(int)

for q in true_qsns:
    # Extract question number as integer
    q_num = int(q[1:]) 
    for (start, end), theme_name in theme_ranges.items():
        if start <= q_num <= end:
            theme_counts[theme_name] += 1
            break 

# Display results
for theme, count in theme_counts.items():
    print(f"{theme}: {count} question(s)")

SOCIAL VALUES, ATTITUDES & STEREOTYPES: 24 question(s)
SOCIAL CAPITAL, TRUST & ORGANIZATIONAL MEMBERSHIP: 23 question(s)
ECONOMIC VALUES: 6 question(s)
SECURITY: 1 question(s)
POSTMATERIALIST INDEX: 6 question(s)
SCIENCE & TECHNOLOGY: 4 question(s)
RELIGIOUS VALUES: 1 question(s)
ETHICAL VALUES AND NORMS: 11 question(s)
POLITICAL CULTURE & POLITICAL REGIMES: 15 question(s)


## Start

In [None]:
year = '2022'
country = 'india'
language_code = 'en'
filename = '{year}_{country}'.format(year=year, country=country)

In [2]:
import pandas as pd

df = pd.read_excel(f"data/{country}/{year}/{filename}.xlsx")

rows, cols = df.shape
print(f"Rows: {rows}, Columns: {cols}")

  warn("Workbook contains no default style, apply openpyxl's default")


Rows: 1810, Columns: 396


In [3]:
import json

with open("data/chosen_cols_updated.json", "r") as f:
    data = json.load(f)
    chosen_cols = [colname for colname in data["chosen_cols"] if data["chosen_cols"][colname] is True]
    persona_cols = data["persona_cols"][year].get(country, data["persona_cols"][year]['all'])

## Chosen Persona Features

In [4]:
all_persona_cols = list(persona_cols.values())
grouped = df[all_persona_cols].groupby(all_persona_cols).size().reset_index(name='Counts')
grouped.to_csv(f"data/{country}/{year}/{filename}_persona_groups_{language_code}.csv", index=False)

In [5]:
grouped_cleaned = grouped[grouped["Counts"] >= 4]
grouped_cleaned.to_csv(f"data/{country}/{year}/{filename}_persona_groups_cleaned_{language_code}.csv", index=False)

## Persona Majority Answers

In [1]:
import pandas as pd
import numpy as np

def calculate_majority_answers(year='2022', country='india', language_code='en'):
    persona_filepath = f"data/{country}/{year}/{year}_{country}_persona_groups_cleaned_{language_code}.csv"
    raw_data_filepath = f"data/{country}/{year}/{year}_{country}.xlsx"
    output_filepath = f"data/{country}/{year}/{year}_{country}_majority_answers_by_persona_{language_code}.csv"

    personas_df = pd.read_csv(persona_filepath)
    raw_data_df = pd.read_excel(raw_data_filepath)

    # Identify the demographic columns to group by
    demographic_columns = personas_df.columns.drop('Counts').tolist()

    # Identify question columns (contain ':' but are not demographics)
    question_columns = [col for col in raw_data_df.columns if ':' in col and col not in demographic_columns]

    print(f"Grouping by {len(demographic_columns)} demographic columns.")
    print(f"Analyzing {len(question_columns)} question columns.")

    # Ensure both DataFrames have consistent types for grouping columns
    for col in demographic_columns:
        personas_df[col] = personas_df[col].astype(str)
        raw_data_df[col] = raw_data_df[col].astype(str)

    # Function to compute majority value (mode)
    def get_majority(series):
        modes = series.mode()
        return modes[0] if not modes.empty else np.nan

    # Create aggregation mapping
    agg_dict = {col: get_majority for col in question_columns}

    # Replace values
    print("Calculating majority answers for each persona group...")
    majority_answers_df = raw_data_df.groupby(demographic_columns).agg(agg_dict).reset_index()

    # Merge results
    final_df = pd.merge(personas_df, majority_answers_df, on=demographic_columns, how='left')
    
    for col in question_columns:
        final_df[col] = final_df[col].astype(str).apply(lambda x: ' '.join(x.split()))

    # Save final result
    final_df.to_csv(output_filepath, index=False)
    print(f"\nSuccessfully created output file at: {output_filepath}")

    return final_df

In [2]:
year = '2022'
countries = ['india', 'US', 'russia', 'japan', 'colombia', 'egypt']
language_codes = ['en', 'ru', 'hi', 'ja', 'sp', 'ar']

for country, language_code in zip(countries, language_codes):
    print(f"\nProcessing country: {country}, language: {language_code}")
    calculate_majority_answers(year=year, country=country, language_code=language_code)


Processing country: india, language: en


  warn("Workbook contains no default style, apply openpyxl's default")


Grouping by 10 demographic columns.
Analyzing 445 question columns.
Calculating majority answers for each persona group...

Successfully created output file at: data/india/2022/2022_india_majority_answers_by_persona_en.csv

Processing country: US, language: ru


FileNotFoundError: [Errno 2] No such file or directory: 'data/US/2022/2022_US_persona_groups_cleaned_ru.csv'

In [7]:
result_df = calculate_majority_answers(year=year, country=country, language_code=language_code)

  warn("Workbook contains no default style, apply openpyxl's default")


Grouping by 10 demographic columns.
Analyzing 386 question columns.
Calculating majority answers for each persona group...

Successfully created output file at: data/russia/2022/2022_russia_majority_answers_by_persona_en.csv


## Chosen Columns

In [None]:
from IPython.display import display
import pandas as pd

def is_chosen_col(col):
    qsn = col.split(':')[0].strip()
    return qsn in chosen_cols

chosen_col_names = list(persona_cols.values()) + [col for col in df.columns if is_chosen_col(col)]
chosen = df[chosen_col_names]
chosen.to_csv(f"data/{country}/{year}/{filename}_cleaned_{language_code}.csv", index=False)

rows, cols = chosen.shape
print(f"Rows: {rows}, Columns: {cols}")

Rows: 2500, Columns: 10
