In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

# Location of the WVS time-series extract. The default mirrors the original
# layout under the current user's home directory so the notebook is not tied
# to one machine; set the WVS_CSV_PATH environment variable to load the file
# from somewhere else without editing this cell.
WVS_CSV_PATH = os.environ.get(
    'WVS_CSV_PATH',
    os.path.expanduser(
        '~/CaseStudies/world_value_survey/analysis/data/wvs/WVS_Time_Series.csv'
    ),
)
df = pd.read_csv(WVS_CSV_PATH)

In [2]:
# Political preference is held in column Q240; the plan is to turn it into a
# categorical (factor / dummy-encoded) variable.
pol_pref = ["Q240"]

# TODO: rows with negative Q240 codes still need to be removed before encoding;
# no filtering is performed in this cell.





0

In [4]:
# Baseline happiness: mean of Q46 over every row currently in `df`.
avg_hap = df['Q46'].mean()

# Distinct Q46 values present in the data. As the last expression of the cell
# this is the only value the notebook renders.
unique_hap = df['Q46'].unique()
unique_hap

array([0.66666667, 0.33333333, 1.        , 0.        ])

In [None]:
    # NOTE(review): this cell is indented as if it were the body of a function
    # and it reads `merged_df`, which is not defined in any cell shown in this
    # notebook -- confirm where `merged_df` comes from (or whether `df` was
    # meant) before running it. The attach_* functions defined in later cells
    # perform the same steps on an explicit DataFrame argument.

    # Keep only rows whose political preference (Q240), happiness (Q46) and
    # income (Q288) codes are all strictly positive.
    merged_df = merged_df[(merged_df['Q240'] > 0) & (merged_df['Q46']>0) & (merged_df['Q288']>0)]
    # One-hot encode political preference into `pol_value_<code>` columns.
    pol_dummies = pd.get_dummies(merged_df['Q240'], prefix= "pol_value")
    merged_df = pd.concat([merged_df,pol_dummies], axis = 1)
    # Average happiness over the filtered rows (no income average is computed
    # in this cell).
    avg_hap = merged_df['Q46'].mean()

    # Boolean flag: True where Q46 is at or above the average.
    merged_df = merged_df.assign(above_avg_hap  = merged_df['Q46']>=avg_hap)

In [None]:
def attach_income_index(df: pd.DataFrame) -> pd.DataFrame:
    """Flag respondents whose income is at or above the sample average.

    Rows whose income code ``Q288`` is not strictly positive are dropped, the
    mean of the remaining ``Q288`` values is computed, and a binary column
    ``above_avg_inc`` is attached (1 = at or above that mean, 0 = below).

    Parameters:
        df (pd.DataFrame): survey data containing a numeric ``Q288`` column.

    Returns:
        pd.DataFrame: a new frame holding only the rows with ``Q288 > 0`` plus
        the ``above_avg_inc`` column. The input frame is not modified.
    """
    # Work exclusively on the argument: the previous version mixed in a global
    # `merged_df`, so the filter above was ignored and the function failed on a
    # fresh kernel.
    valid = df[df['Q288'] > 0]
    avg_inc = valid['Q288'].mean()

    return valid.assign(above_avg_inc=(valid['Q288'] >= avg_inc).astype(int))

In [None]:
def attach_pol_pref(df: pd.DataFrame) -> pd.DataFrame:
    """Attach one-hot encoded political preference columns.

    Rows whose political preference code ``Q240`` is not strictly positive are
    dropped; the remaining codes are expanded into indicator columns named
    ``pol_value_<code>``, which are appended to the frame.

    Parameters:
        df (pd.DataFrame): survey data containing a ``Q240`` column (the
            original docstring describes it as the wave7.csv frame stored by
            data_filter_merge.ipynb).

    Returns:
        pd.DataFrame: a new frame with only the valid-``Q240`` rows and the
        additional ``pol_value_<code>`` indicator columns. The input frame is
        not modified.
    """
    # Ensure we only use valid observations of political preference.
    valid = df[df['Q240'] > 0]

    # Encode the *filtered* argument. The previous version encoded a global
    # `merged_df`, which is undefined on a fresh kernel and, when defined, may
    # not share rows with `df`.
    pol_dummies = pd.get_dummies(valid['Q240'], prefix="pol_value")

    return pd.concat([valid, pol_dummies], axis=1)


In [None]:

def attach_happiness_index(df: pd.DataFrame) -> pd.DataFrame:
    """Attach happiness, parental-standard and hardship features.

    The function performs three steps on a copy of the input:

    1. Keep rows with a valid happiness answer (``Q46 > 0``) and add
       ``above_med_hap`` (1 if ``Q46`` is at or above the median of the kept
       rows, else 0).
    2. Treat non-positive ``Q56`` codes as missing, fill them with the most
       frequent valid code of the respondent's country (0 when the country has
       no valid code), and one-hot encode the result as
       ``standard_parents_better`` (code 1) and ``standard_parents_worse``
       (code 2). Code 3 ("same") is the dropped reference category.
    3. For ``Q51``..``Q55``: reverse valid answers (``x -> 4 + 1 - x``, i.e. a
       1-4 answer scale is assumed), replace non-positive codes with the
       country median of the valid reversed answers, min-max scale each
       question to [0, 1] over all rows, and store the row-wise mean in
       ``hardships_questions``.

    Parameters:
        df (pd.DataFrame): survey data with numeric columns ``Q46``, ``Q56``,
            ``Q51``..``Q55`` and a country column ``B_COUNTRY_ALPHA``.

    Returns:
        pd.DataFrame: a new frame restricted to rows with ``Q46 > 0`` and
        extended with the columns described above. ``Q56`` and
        ``Q51``..``Q55`` are overwritten with their imputed / scaled values.
        The input frame is not modified.
    """
    hardships_questions = [f"Q{i}" for i in range(51, 56)]  # Q51..Q55
    scale_max = 4  # highest answer code assumed by the scale reversal

    # Happiness: filter valid answers and flag at-or-above-median respondents.
    # `assign` returns a new frame, so the caller's data is never touched.
    valid_hap = df[df['Q46'] > 0]
    med_hap = valid_hap['Q46'].median()
    result = valid_hap.assign(
        above_med_hap=(valid_hap['Q46'] >= med_hap).astype(int)
    )

    # 1. Convert 'Q56' to dummy variables -----------------------------------
    def _fill_with_mode(values: pd.Series) -> pd.Series:
        # Most frequent valid code in the group; 0 if the group has none.
        modes = values.mode()
        return values.fillna(modes.iloc[0] if not modes.empty else 0)

    # Non-positive codes become NaN, then get the country-level mode.
    # `transform` keeps the original index, so no index surgery is needed and
    # the result does not depend on pandas' `group_keys` behaviour.
    q56 = result['Q56'].where(result['Q56'] > 0)
    result['Q56'] = q56.groupby(result['B_COUNTRY_ALPHA']).transform(_fill_with_mode)

    # Cast to float before encoding so the generated names are always of the
    # form 'standard_parents_1.0', even when Q56 contained no invalid codes and
    # therefore stayed integer-typed.
    dummies = (
        pd.get_dummies(result['Q56'].astype(float), prefix='standard_parents')
        .astype(int)
        .rename(columns={
            'standard_parents_1.0': 'standard_parents_better',
            'standard_parents_2.0': 'standard_parents_worse',
            'standard_parents_3.0': 'standard_parents_same',
        })
        # "same" is the reference category; tolerate samples where nobody
        # chose it instead of raising KeyError.
        .drop(columns=['standard_parents_same'], errors='ignore')
    )
    result = pd.concat([result, dummies], axis=1)

    # 2. Reverse the scale and impute invalid answers -----------------------
    answers = result[hardships_questions]
    # Valid (> 0) answers are flipped; non-positive codes are left untouched
    # so they can still be recognised as invalid below.
    answers = answers.where(answers <= 0, scale_max + 1 - answers)

    # Median of the valid answers per country and question.
    country_median = (
        answers.where(answers > 0)
        .groupby(result['B_COUNTRY_ALPHA'])
        .transform('median')
    )
    # Only non-positive codes are replaced; pre-existing NaN stay NaN.
    answers = answers.mask(answers <= 0, country_median).astype(float)

    # 3. Normalize the hardship questions using min-max scaling -------------
    # Same arithmetic as a default min-max scaler: (x - min) / (max - min),
    # with a zero range treated as 1 so constant columns map to 0.
    col_min = answers.min()
    col_range = (answers.max() - col_min).replace(0, 1)
    result[hardships_questions] = (answers - col_min) / col_range

    # Composite variable: Hardships
    result['hardships_questions'] = result[hardships_questions].mean(axis=1)

    return result