In [40]:
import pandas as pd

columns_to_keep = [
    'W1_IDNR', 'S00_SPRACHE', 'S01_SEX', 'S02', 'S02_AGEGRP', 'K00', 'F02_01', 'F02_02', 'F02_03', 'F02_04', 'F02_05',
    'F03', 'A11_01', 'A11_02', 'A11_03', 'A11_04', 'A11_05', 'A11_06', 'A11_07', 'A11_08', 'A11_09',
    'A12_01', 'A12_02', 'A12_03', 'A12_04', 'A12_05', 'A12_06', 'A12_07', 'K01', 'K02', 'K03', 'K04', 'K06',
    'PART', 'Q01_01', 'Q01_02', 'Q01_03', 'Q01_04', 'Q01_05', 'Q01_06', 'Q01_07', 'Q01_08', 'Q01_09', 'Q01_10',
    'Q01_11', 'Q01_12', 'Q01_13', 'Q01_14', 'Q01_15', 'Q01_16', 'Q01_17', 'Q01_18', 'Q01_19', 'Q01_20',
    'Q01_21', 'Q01_22', 'Q01_23', 'Q01_24', 'Q01_25', 'Q01_26', 'Q01_27', 'Q01_28', 'Q01_29', 'Q01_30',
    'Q01_31', 'Q01_32', 'Q01_33', 'Q01_WNKA', 'Q32_01', 'Q32_02', 'Q32_03', 'Q32_04', 'Q32_05', 'Q32_06',
    'Q32_07', 'Q32_08', 'Q32_09', 'Q32_10', 'Q32_11', 'Q32_12', 'Q32_13', 'Q32_14', 'Q32_15', 'Q32_16',
    'Q32_17', 'Q32_18', 'Q32_19', 'Q32_20', 'Q32_21', 'Q32_22', 'Q32_23', 'Q32_24', 'Q32_25', 'Q32_26',
    'Q32_27', 'Q32_28', 'Q32_29', 'Q32_30', 'Q32_31', 'Q32_32', 'Q32_33', 'Q32_WNKA', 'Geschlecht','Nationalitaet','Einwohnerstatus','Zivilstand','GEBMONATJAHR','AlterZP','Gebiet','Anzahl_Personen_Haushalt','Haushaltsform','ID_Subjekt'
]

q32_columns = [
    'Q32_01', 'Q32_02', 'Q32_03', 'Q32_04', 'Q32_05', 'Q32_06',
    'Q32_07', 'Q32_08', 'Q32_09', 'Q32_10', 'Q32_11', 'Q32_12',
    'Q32_13', 'Q32_14', 'Q32_15', 'Q32_16', 'Q32_17', 'Q32_18',
    'Q32_19', 'Q32_20', 'Q32_21', 'Q32_22', 'Q32_23', 'Q32_24',
    'Q32_25', 'Q32_26', 'Q32_27', 'Q32_28', 'Q32_29', 'Q32_30',
    'Q32_31', 'Q32_32', 'Q32_33'
]

w1_path = 'w1.csv'
w2_path = 'w2.csv'

df1 = pd.read_csv(w1_path)
df2 = pd.read_csv(w2_path)

merged_df = pd.merge(df1, df2, left_on='IDNR', right_on='W1_IDNR')

rows_in_w1 = df1.shape[0]
rows_in_w2 = df2.shape[0]
rows_in_combined = merged_df.shape[0]

merged_df['Q32_01'] = pd.to_numeric(merged_df['Q32_01'], errors='coerce')

filtered_df = merged_df[merged_df['Q32_01'].notna()]

filtered_df = filtered_df.loc[:, columns_to_keep]

def find_positive_q32(row):
    positive_cols = [col.split('_')[-1] for col in q32_columns if pd.notna(row[col]) and int(row[col]) > 0]
    return ', '.join(positive_cols)

filtered_df['votes'] = filtered_df.apply(find_positive_q32, axis=1)

filtered_df.shape

(510, 113)

In [77]:
def describe_person(row):

    education_levels = {
        1: 'No school graduation',
        2: 'Primary school',
        3: 'High school',
        4: 'Apprenticeship',
        5: 'Vocational training',
        6: 'General secondary education',
        7: 'Commercial diploma',
        8: 'Advanced secondary diploma',
        9: 'Gymnasium or teacher training',
        10: 'Advanced vocational training',
        11: 'Graduate of technical or business school',
        12: 'Graduate of applied sciences college or teacher training',
        13: 'University degree',
        98: 'Unknown education level',
        99: 'No response on education'
    }

    parents_birthplace = {
        1: 'both parents were born in Switzerland',
        2: 'one parent was born in Switzerland',
        3: 'both parents immigrated',
        98: 'does not know the birthplace of parents',
        99: 'did not answer about parents’ birthplace'
    }

    beneficiary_groups = {
        'A12_01': 'Families with children',
        'A12_02': 'Children',
        'A12_03': 'Youth',
        'A12_04': 'Adults',
        'A12_05': 'People with disabilities',
        'A12_06': 'Elderly',
        'A12_07': 'Poor people'
    }

    themes = {
        'A11_01': 'Education',
        'A11_02': 'Urban greenery',
        'A11_03': 'Public space',
        'A11_04': 'Welfare',
        'A11_05': 'Culture',
        'A11_06': 'Environmental protection',
        'A11_07': 'Public transit and roads',
        'A11_08': 'Sport',
        'A11_09': 'Health'
    }

    marital_status_translations = {
        'verheiratet': 'married',
        'ledig': 'single',
        'in eingetragener Partnerschaft': 'in a registered partnership',
        'geschieden': 'divorced',
        'verwitwet': 'widowed'
    }

    household_form_translations = {
        'Einzelhaushalt': 'single-person household',
        'Paar-/Zwei-Personenhaushalt': 'couple/two-person household',
        'Mehrpersonen-Haushalt ohne Kinder': 'multi-person household with no kids',
        'Mehrpersonen-Haushalt mit Kinder': 'multi-person households with children'
    }

    nationality_translations = {
        'Schweiz': 'Switzerland',
        'Afghanistan': 'Afghanistan',
        'Iran': 'Iran',
        'Spanien': 'Spain',
        'Deutschland': 'Germany',
        'Bosnien und Herzegowina': 'Bosnia and Herzegovina',
        'Österreich': 'Austria',
        'Griechenland': 'Greece',
        'Niederlande': 'Netherlands',
        'Vereinigte Staaten': 'United States',
        'Serbien': 'Serbia',
        'China': 'China',
        'Australien': 'Australia',
        'Tschechien': 'Czech Republic',
        'Italien': 'Italy',
        'Ukraine': 'Ukraine',
        'Polen': 'Poland',
        'Russland': 'Russia',
        'Frankreich': 'France',
        'Indien': 'India',
        'Kamerun': 'Cameroon',
        'Irak': 'Iraq',
        'Tunesien': 'Tunisia',
        'Mexiko': 'Mexico',
        'Türkiye': 'Turkey',
        'Litauen': 'Lithuania',
        'Bangladesch': 'Bangladesh',
        'Kosovo': 'Kosovo',
        'Portugal': 'Portugal',
        'Ungarn': 'Hungary'
    }

    row['Gender'] = "man" if row['S01_SEX'] == 1 else "woman"

    row['Age'] = row['S02']

    if row['F03'] in [0, 1]:
        row['Politics'] = "very left"
    elif row['F03'] < 5:
        row['Politics'] = "left-leaning"
    elif row['F03'] == 5:
        row['Politics'] = "moderate"
    elif row['F03'] < 9:
        row['Politics'] = "right-leaning"
    else:
        row['Politics'] = "very right"

    top_preferences = [themes[theme] for theme in themes if row[theme] == 1]
    row['Top Preferences'] = ', '.join(top_preferences)
    row['Education'] = education_levels.get(row['K01'], 'Unknown')
    row['Household Form'] = household_form_translations.get(row['Haushaltsform'], 'Unknown household form')
    translated_nationality = nationality_translations.get(row['Nationalitaet'], row['Nationalitaet'])
    row['Nationality'] = translated_nationality
    row['Marital Status'] = marital_status_translations.get(row['Zivilstand'], 'Unknown marital status')
    row['Area'] = row['Gebiet']

    row['Birthplace Info'] = "Has migration background" if row['K02'] == 0 or row['K03'] in [2, 3] else "No migration background"

    # Process children information
    row['Children Info'] = 'Has children' if row['K04'] == 1 else 'No children' if row['K04'] == 0 else 'Unknown children status'

    # Process beneficiary importance description
    important_beneficiaries = [beneficiary_groups[key] for key in beneficiary_groups if row[key] == 5]
    row['Important Beneficiaries'] = ', '.join(important_beneficiaries) if important_beneficiaries else 'None'

    description_parts = [
        f"You are a {row['Age']}-year-old {row['Gender']} living {row['Area']} in Aarau, and you are {row['Politics']} politically."
    ]

    if row['Nationality'] != 'Switzerland':
        description_parts.append(f"Your nationality is {row['Nationality']} and you live in a {row['Household Form']}.")
    else:
        description_parts.append(f"You live in a {row['Household Form']}.")

    if row['Top Preferences']:
        description_parts.append(f"Your top urban project preferences are: {row['Top Preferences']}.")

    if row['Important Beneficiaries'] != 'None':
        description_parts.append(f"For you, issues related to {row['Important Beneficiaries']} are very important.")

    description_parts.append(f"You have attained {row['Education']} as your highest level of education.")

    if row['Birthplace Info'] != 'No migration background':
        description_parts.append(row['Birthplace Info'])

    if row['Children Info'] not in ['Unknown children status', '']:
        description_parts.append(row['Children Info'])

    row['Description'] = ' '.join(description_parts)
    print(row['Description'])
    return row

# Apply the function to each row in the DataFrame
filtered_df = filtered_df.apply(describe_person, axis=1)

# Display the head of the DataFrame to verify
print(filtered_df.head())

You are a 35-year-old woman living Altstadt in Aarau, and you are left-leaning politically. You live in a single-person household. Your top urban project preferences are: Education, Urban greenery, Environmental protection. For you, issues related to Poor people are very important. You have attained Gymnasium or teacher training as your highest level of education. Has migration background
You are a 36-year-old woman living Hinterdorf in Aarau, and you are left-leaning politically. You live in a multi-person households with children. Your top urban project preferences are: Education, Environmental protection, Public transit and roads. For you, issues related to Families with children are very important. You have attained University degree as your highest level of education. Has children
You are a 41-year-old woman living Gönhard in Aarau, and you are left-leaning politically. You live in a multi-person households with children. Your top urban project preferences are: Education, Welfare,

In [78]:
# Define the columns to be saved
columns_to_save = ['ID_Subjekt', 'Gender', 'Age', 'Politics', 'Top Preferences',
                   'Education', 'Birthplace Info', 'Children Info', 'Important Beneficiaries', 'Description', 'Nationalitaet','Zivilstand','Gebiet','Haushaltsform', 'votes'
                   ]

# Create a subset of the DataFrame with only the specified columns
subset_df = filtered_df[columns_to_save]

# Filter the subset to include only rows where 'votes' is not empty
subset_df = subset_df[subset_df['votes'].notna() & (subset_df['votes'] != '')]

# Display the head of the DataFrame to verify
print(subset_df.head())
print(subset_df.shape)

# Save the filtered subset DataFrame to a CSV file
subset_df.to_csv('aarau_pb_data_des.csv', index=False)



  ID_Subjekt Gender  Age      Politics  \
0     105568  woman   35  left-leaning   
2     109007  woman   41  left-leaning   
4     110099    man   23  left-leaning   
6     104232    man   55    very right   
9     177339    man   34     very left   

                                     Top Preferences  \
0  Education, Urban greenery, Environmental prote...   
2                         Education, Welfare, Health   
4  Education, Environmental protection, Public tr...   
6  Education, Public space, Environmental protection   
9    Urban greenery, Environmental protection, Sport   

                                           Education  \
0                      Gymnasium or teacher training   
2  Graduate of applied sciences college or teache...   
4                         Advanced secondary diploma   
6                                  University degree   
9                                  University degree   

            Birthplace Info            Children Info  \
0  Has migration 