In [18]:
import pandas as pd

# Columns selected from the merged survey frame, in output order.
# Runs of numbered items are generated instead of being spelled out one by one.
columns_to_keep = [
    'W1_IDNR', 'S00_SPRACHE', 'S01_SEX', 'S02', 'S02_AGEGRP', 'K00',
    # F02_01 .. F02_05
    *(f'F02_{n:02d}' for n in range(1, 6)),
    'F03',
    # A11_01 .. A11_09
    *(f'A11_{n:02d}' for n in range(1, 10)),
    # A12_01 .. A12_07
    *(f'A12_{n:02d}' for n in range(1, 8)),
    # K05 is not part of the selection.
    'K01', 'K02', 'K03', 'K04', 'K06',
    'PART',
    # Q01_01 .. Q01_33 followed by Q01_WNKA
    *(f'Q01_{n:02d}' for n in range(1, 34)),
    'Q01_WNKA',
    # Q32_01 .. Q32_33 followed by Q32_WNKA
    *(f'Q32_{n:02d}' for n in range(1, 34)),
    'Q32_WNKA',
    'Geschlecht', 'Nationalitaet', 'Einwohnerstatus', 'Zivilstand', 'GEBMONATJAHR',
    'AlterZP', 'Gebiet', 'Anzahl_Personen_Haushalt', 'Haushaltsform', 'ID_Subjekt',
]

# The 33 numbered Q32 items, Q32_01 .. Q32_33 (zero-padded to two digits).
q32_columns = [f'Q32_{item:02d}' for item in range(1, 34)]

# Locations of the two raw survey exports.
w1_path = 'raw/w1_survey_data.csv'
w2_path = 'raw/w2_survey_data.csv'

df1, df2 = (pd.read_csv(csv_path) for csv_path in (w1_path, w2_path))

# Join the frames on df1's IDNR and df2's W1_IDNR. No `how` is given, so this
# is pandas' default inner join: only ids present in both files survive.
merged_df = df1.merge(df2, left_on='IDNR', right_on='W1_IDNR')

# Row counts before and after the join.
rows_in_w1, rows_in_w2, rows_in_combined = len(df1), len(df2), len(merged_df)

# Non-numeric Q32_01 entries become NaN so they can be filtered out below.
merged_df['Q32_01'] = pd.to_numeric(merged_df['Q32_01'], errors='coerce')

# Keep rows with a numeric Q32_01 and restrict to the selected columns.
filtered_df = merged_df.loc[merged_df['Q32_01'].notna(), columns_to_keep]

def find_positive_q32(row, columns=None):
    """Return the Q32 item numbers with a positive value as a ', '-joined string.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row as ``pd.Series``)
        Must support ``row[col]`` for every name in ``columns``.
    columns : iterable of str, optional
        Column names to inspect. Defaults to the module-level ``q32_columns``.

    Returns
    -------
    str
        The suffix after the last ``'_'`` of every column whose value,
        truncated with ``int()``, is greater than zero (e.g. ``'01, 05'``).
        Empty string when no column qualifies.

    Notes
    -----
    Values are coerced with ``pd.to_numeric(..., errors='coerce')`` before the
    comparison. Missing values and text that is not a number are skipped
    instead of raising ``ValueError`` from ``int()``.
    """
    if columns is None:
        columns = q32_columns

    positive_items = []
    for col in columns:
        value = row[col]
        if pd.isna(value):
            continue
        # Non-numeric text becomes NaN here rather than crashing int() below.
        score = pd.to_numeric(value, errors='coerce')
        if pd.notna(score) and int(score) > 0:
            positive_items.append(col.split('_')[-1])
    return ', '.join(positive_items)

# One string per row listing the Q32 item numbers find_positive_q32 reports.
filtered_df.loc[:, 'votes'] = filtered_df.apply(find_positive_q32, axis='columns')

# Cell output: (rows, columns) of the filtered frame.
filtered_df.shape

(510, 113)

In [20]:
# Lookup tables for describe_person, built once instead of on every row.

# K01 code -> coarse education label.
_EDUCATION_LEVELS = {
    1: 'basically educated',  # Combining No school, Primary, High school
    2: 'basically educated',
    3: 'basically educated',
    4: 'vocationally trained',  # Combining Apprenticeship, Vocational training
    5: 'vocationally trained',
    6: 'secondary-educated',  # General secondary, Commercial diploma, Advanced secondary diploma
    7: 'secondary-educated',
    8: 'secondary-educated',
    9: 'secondary-educated',
    10: 'highly educated',  # Advanced vocational training and University
    11: 'highly educated',
    12: 'highly educated',
    13: 'highly educated',
    98: 'unknown education level',
    99: 'no response on education'
}

# A12_* column -> beneficiary group named in the description.
_BENEFICIARY_GROUPS = {
    'A12_01': 'families with children',
    'A12_02': 'children',
    'A12_03': 'youth',
    'A12_04': 'adults',
    'A12_05': 'people with disabilities',
    'A12_06': 'elderly',
    'A12_07': 'poor people'
}

# A11_* column -> project theme named in the description.
_THEMES = {
    'A11_01': 'education',
    'A11_02': 'urban greenery',
    'A11_03': 'public space',
    'A11_04': 'welfare',
    'A11_05': 'culture',
    'A11_06': 'environmental protection',
    'A11_07': 'public transit and roads',
    'A11_08': 'sport',
    'A11_09': 'health'
}

_MARITAL_STATUS_TRANSLATIONS = {
    'verheiratet': 'married',
    'ledig': 'single',
    'in eingetragener Partnerschaft': 'in a registered partnership',
    'geschieden': 'divorced',
    'verwitwet': 'widowed'
}

_HOUSEHOLD_FORM_TRANSLATIONS = {
    'Einzelhaushalt': 'single-person household',
    'Paar-/Zwei-Personenhaushalt': 'couple/two-person household',
    'Mehrpersonen-Haushalt ohne Kinder': 'multi-person household',
    # Singular "household": the value is embedded in "You live in a ...".
    'Mehrpersonen-Haushalt mit Kinder': 'multi-person household with children'
}

_NATIONALITY_TRANSLATIONS = {
    'Schweiz': 'Switzerland',
    'Afghanistan': 'Afghanistan',
    'Iran': 'Iran',
    'Spanien': 'Spain',
    'Deutschland': 'Germany',
    'Bosnien und Herzegowina': 'Bosnia and Herzegovina',
    'Österreich': 'Austria',
    'Griechenland': 'Greece',
    'Niederlande': 'Netherlands',
    'Vereinigte Staaten': 'United States',
    'Serbien': 'Serbia',
    'China': 'China',
    'Australien': 'Australia',
    'Tschechien': 'Czech Republic',
    'Italien': 'Italy',
    'Ukraine': 'Ukraine',
    'Polen': 'Poland',
    'Russland': 'Russia',
    'Frankreich': 'France',
    'Indien': 'India',
    'Kamerun': 'Cameroon',
    'Irak': 'Iraq',
    'Tunesien': 'Tunisia',
    'Mexiko': 'Mexico',
    'Türkiye': 'Turkey',
    'Litauen': 'Lithuania',
    'Bangladesch': 'Bangladesh',
    'Kosovo': 'Kosovo',
    'Portugal': 'Portugal',
    'Ungarn': 'Hungary'
}

_UNKNOWN_HOUSEHOLD_FORM = 'Unknown household form'
_UNKNOWN_POLITICS = 'unknown political orientation'
_UNKNOWN_AGE_GROUP = 'Unknown'


def _age_group(age):
    """Return the age bracket label for ``age``; 'Unknown' when it is missing."""
    if pd.isna(age):
        return _UNKNOWN_AGE_GROUP
    if age < 18:
        return 'Under 18'
    # Upper bounds are exclusive so fractional ages cannot fall between brackets.
    if age < 30:
        return '18-29'
    if age < 45:
        return '30-44'
    if age < 65:
        return '45-64'
    return '65+'


def _political_label(score):
    """Map an F03 self-placement score to a label; unknown if missing or outside 0-10."""
    # NOTE(review): the 0-10 range is inferred from the symmetric cut-offs
    # (0-1 / 2-4 / 5 / 6-8 / 9+) - confirm against the survey codebook.
    if pd.isna(score) or not 0 <= score <= 10:
        return _UNKNOWN_POLITICS
    if score in (0, 1):
        return "very left"
    if score < 5:
        return "left-leaning"
    if score == 5:
        return "moderate"
    if score < 9:
        return "right-leaning"
    return "very right"


def _intro_sentence(age, gender, area, politics):
    """Build the opening sentence, leaving out any part whose value is unknown."""
    who = f"{age}-year-old {gender}" if pd.notna(age) else gender
    where = f" living in {area} in Aarau" if pd.notna(area) else " living in Aarau"
    stance = "" if politics == _UNKNOWN_POLITICS else f", and you are {politics} politically"
    return f"You are a {who}{where}{stance}."


def describe_person(row, verbose=True):
    """Derive persona columns and a second-person text description for one respondent.

    Parameters
    ----------
    row : pd.Series
        One respondent. Reads ``S01_SEX``, ``S02``, ``F03``, ``K01``, ``K02``,
        ``K03``, ``K04``, ``A11_01``..``A11_09``, ``A12_01``..``A12_07``,
        ``Haushaltsform``, ``Nationalitaet``, ``Zivilstand`` and ``Gebiet``.
    verbose : bool, default True
        When true, print the generated description (the previous behaviour).

    Returns
    -------
    pd.Series
        A copy of ``row`` extended, in this order, with ``Gender``, ``Age``,
        ``Age Group``, ``Politics``, ``Top Preferences``, ``Education``,
        ``Household Form``, ``Nationality``, ``Marital Status``, ``Area``,
        ``Birthplace Info``, ``Children Info``, ``Important Beneficiaries``
        and ``Description``. The input ``row`` is not modified.

    Notes
    -----
    Missing values are kept out of the text: a missing or out-of-range ``F03``
    yields ``'unknown political orientation'`` (and no politics clause) instead
    of "very right"; a missing age yields age group ``'Unknown'`` instead of
    ``'65+'``; missing age, area or nationality and an untranslated household
    form are omitted from the description instead of appearing as "nan" or
    "a Unknown household form".
    """
    # DataFrame.apply may hand in an object it reuses; work on a private copy.
    row = row.copy()

    # NOTE(review): every value other than 1 (including missing) is labelled
    # "woman" - confirm the S01_SEX coding against the codebook.
    row['Gender'] = "man" if row['S01_SEX'] == 1 else "woman"

    row['Age'] = row['S02']
    row['Age Group'] = _age_group(row['Age'])
    row['Politics'] = _political_label(row['F03'])

    # A theme counts as a top preference when its A11_* column equals 1.
    top_preferences = [label for col, label in _THEMES.items() if row[col] == 1]
    row['Top Preferences'] = ', '.join(top_preferences)
    row['Education'] = _EDUCATION_LEVELS.get(row['K01'], 'Unknown')
    row['Household Form'] = _HOUSEHOLD_FORM_TRANSLATIONS.get(row['Haushaltsform'], _UNKNOWN_HOUSEHOLD_FORM)
    # Nationalities without a translation are passed through unchanged.
    row['Nationality'] = _NATIONALITY_TRANSLATIONS.get(row['Nationalitaet'], row['Nationalitaet'])
    row['Marital Status'] = _MARITAL_STATUS_TRANSLATIONS.get(row['Zivilstand'], 'Unknown marital status')
    row['Area'] = row['Gebiet']

    # NOTE(review): the meaning of K02 == 0 and K03 in {2, 3} is not visible
    # here - confirm against the codebook.
    has_migration_background = row['K02'] == 0 or row['K03'] in [2, 3]
    row['Birthplace Info'] = "Has migration background" if has_migration_background else "No migration background"

    if row['K04'] == 1:
        row['Children Info'] = 'Has children'
    elif row['K04'] == 2:
        row['Children Info'] = 'No children'
    else:
        row['Children Info'] = 'Unknown children status'

    # A beneficiary group is "very important" when its A12_* column equals 5.
    important_beneficiaries = [label for col, label in _BENEFICIARY_GROUPS.items() if row[col] == 5]
    row['Important Beneficiaries'] = ', '.join(important_beneficiaries) if important_beneficiaries else 'None'

    description_parts = [_intro_sentence(row['Age'], row['Gender'], row['Area'], row['Politics'])]

    household_sentences = []
    if row['Household Form'] != _UNKNOWN_HOUSEHOLD_FORM:
        household_sentences.append(f"You live in a {row['Household Form']}.")
    if row['Children Info'] == 'Has children':
        household_sentences.append("You have children.")
    if household_sentences:
        description_parts.append(' '.join(household_sentences))

    nationality = row['Nationality']
    # NaN is truthy, so test for missing explicitly before the other checks.
    if pd.notna(nationality) and nationality and nationality != 'Switzerland':
        description_parts.append(f"You are originally from {nationality}.")

    if has_migration_background:
        description_parts.append("You have a migration background.")

    if top_preferences:
        description_parts.append(f"Your top urban project preferences are: {row['Top Preferences']}.")

    if important_beneficiaries:
        description_parts.append(f"For you, issues related to {row['Important Beneficiaries']} are very important.")

    if row['Education'] == "highly educated":
        description_parts.append("You are highly educated.")

    row['Description'] = ' '.join(description_parts)

    if verbose:
        print(row['Description'])
    return row

# Run describe_person on every row; the returned rows carry the derived
# columns and the generated description.
filtered_df = filtered_df.apply(describe_person, axis='columns')

# Plain-text preview of the first five enriched rows.
print(filtered_df.head(n=5))

You are a 35-year-old woman living in Altstadt in Aarau, and you are left-leaning politically. You live in a single-person household. You have a migration background. Your top urban project preferences are: education, urban greenery, environmental protection. For you, issues related to poor people are very important.
You are a 36-year-old woman living in Hinterdorf in Aarau, and you are left-leaning politically. You live in a multi-person households with children. You have children. Your top urban project preferences are: education, environmental protection, public transit and roads. For you, issues related to families with children are very important. You are highly educated.
You are a 41-year-old woman living in Gönhard in Aarau, and you are left-leaning politically. You live in a multi-person households with children. You have children. Your top urban project preferences are: education, welfare, health. For you, issues related to families with children, children, youth, adults, peop

In [22]:
def analyze_group_preferences(df, political_group):
    """Summarise age, preferences and beneficiaries for a set of political labels.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'Politics', 'Age', 'Top Preferences' and
        'Important Beneficiaries'; the last two hold ', '-separated strings.
    political_group : list-like
        'Politics' values that define the group.

    Returns
    -------
    tuple
        ``(mean age, preference counts, beneficiary counts)``; both counts are
        Series indexed by label and sorted in descending order.
    """
    members = df.loc[df['Politics'].isin(political_group)]

    def ranked_counts(column):
        # Split each ', '-joined cell into indicator columns, total them,
        # then order from most to least frequent.
        indicator_totals = members[column].str.get_dummies(sep=', ').sum()
        return indicator_totals.sort_values(ascending=False)

    mean_age = members['Age'].mean()
    return mean_age, ranked_counts('Top Preferences'), ranked_counts('Important Beneficiaries')

# 'Politics' labels that make up each side of the comparison.
left_group = ['very left', 'left-leaning']
right_group = ['very right', 'right-leaning']

# Each call returns (mean age, preference counts, beneficiary counts).
left_age, left_preferences, left_beneficiaries = analyze_group_preferences(
    df=filtered_df, political_group=left_group
)
right_age, right_preferences, right_beneficiaries = analyze_group_preferences(
    df=filtered_df, political_group=right_group
)

def print_group_analysis(group_name, age, preferences, beneficiaries):
    """Print a plain-text report for one group.

    Parameters
    ----------
    group_name : str
        Heading printed before the word "Group".
    age : float
        Average age, shown with two decimals.
    preferences, beneficiaries : mapping-like with ``.items()``
        Label -> count pairs, printed in iteration order with the label
        capitalised.
    """
    print(f"{group_name} Group\n")
    print(f"Average Age: {age:.2f} years")

    sections = (
        ("Top Preferences", preferences),
        ("Important Beneficiaries", beneficiaries),
    )
    for heading, tallies in sections:
        print(f"\n{heading}:")
        for label, total in tallies.items():
            print(f"  - {label.capitalize()}: {total}")

    # Two trailing newlines separate consecutive reports.
    print("\n")

# Print the left-group report, then the right-group report.
print_group_analysis(
    group_name="Left", age=left_age,
    preferences=left_preferences, beneficiaries=left_beneficiaries,
)
print_group_analysis(
    group_name="Right", age=right_age,
    preferences=right_preferences, beneficiaries=right_beneficiaries,
)


Left Group

Average Age: 46.68 years

Top Preferences:
  - Environmental protection: 176
  - Education: 161
  - Welfare: 133
  - Urban greenery: 101
  - Public transit and roads: 90
  - Public space: 89
  - Culture: 66
  - Health: 58
  - Sport: 29

Important Beneficiaries:
  - Poor people: 128
  - Children: 119
  - Youth: 113
  - None: 102
  - Families with children: 96
  - People with disabilities: 80
  - Elderly: 49
  - Adults: 32


Right Group

Average Age: 54.68 years

Top Preferences:
  - Education: 70
  - Public transit and roads: 63
  - Health: 53
  - Environmental protection: 44
  - Sport: 44
  - Urban greenery: 41
  - Public space: 35
  - Welfare: 32
  - Culture: 24

Important Beneficiaries:
  - None: 66
  - Children: 36
  - Families with children: 35
  - People with disabilities: 34
  - Youth: 32
  - Elderly: 30
  - Poor people: 27
  - Adults: 16




In [24]:
# Columns written to the output file, in this order.
columns_to_save = [
    'ID_Subjekt',
    'Gender',
    'Age',
    'Politics',
    'Top Preferences',
    'Education',
    'Birthplace Info',
    'Children Info',
    'Important Beneficiaries',
    'Description',
    'Nationalitaet',
    'Zivilstand',
    'Gebiet',
    'Haushaltsform',
    'votes',
]

# Keep only rows whose 'votes' value is present and not the empty string,
# restricted to the export columns.
subset_df = filtered_df.loc[
    filtered_df['votes'].notna() & filtered_df['votes'].ne(''),
    columns_to_save,
]

# Quick check of the result before writing it out.
print(subset_df.head())
print(subset_df.shape)

# Write the subset as CSV without the index column.
subset_df.to_csv('processed/aarau_pb_vote_final.csv', index=False)



  ID_Subjekt Gender  Age      Politics  \
0     105568  woman   35  left-leaning   
2     109007  woman   41  left-leaning   
4     110099    man   23  left-leaning   
6     104232    man   55    very right   
9     177339    man   34     very left   

                                     Top Preferences           Education  \
0  education, urban greenery, environmental prote...  secondary-educated   
2                         education, welfare, health     highly educated   
4  education, environmental protection, public tr...  secondary-educated   
6  education, public space, environmental protection     highly educated   
9    urban greenery, environmental protection, sport     highly educated   

            Birthplace Info Children Info  \
0  Has migration background   No children   
2   No migration background  Has children   
4  Has migration background   No children   
6  Has migration background   No children   
9   No migration background   No children   

                   