In [1]:
import pandas

# Extract relevant data from SNN11 dataset

In [2]:
demographic_data_nl = pandas.read_csv("ESS11-NL-subset/ESS11-subset.csv")
# df_noord_holland_urban = demographic_data_nl[(demographic_data_nl['region'] == 'NL33') & (demographic_data_nl['domicil'] == 1) | (demographic_data_nl['domicil'] == 2)] #TODO: decide filtering; filtered by Noord-Holland and Urban
df_noord_holland_urban = demographic_data_nl[(demographic_data_nl['domicil'] == 1) | (demographic_data_nl['domicil'] == 2)] #TODO: decide filtering; filtered by Noord-Holland and Urban
print(len(df_noord_holland_urban))


463


In [3]:
chosen_cols = ['idno', #id
               'agea', #age
                'gndr', #sex
                'nobingnd', #gender
                'cntbrthd', 'ctzcntr','fbrncntc', 'mbrncntc', 'feethngr', #ethnicity
                'rlgblg', 'rlgdnanl', #religion; NB: replace with 'rlgblg' for generalized religion
                'edlvenl', #education; NB: replace with 'edulvlb' for generalized education
                'mainact', #employment/main activity
                'fltdpr', 'flteeff', 'slprl', 'wrhpp', 'fltlnl', 'enjlf', 'fltsd', 'cldgng', #well-being: proxies for depreeesion (PHQ-9)
                ]

rename_dict = {
    'idno': 'ParticipantID',
    'agea': 'Age',
    'gndr': 'SexAtBirth',
    'nobingnd': 'GenderIdentitySelfReported',
    'cntbrthd': 'CountryOfBirth', 'ctzcntr': 'Nationality', 'fbrncntc': 'FatherCountryOfBirth', 'mbrncntc': 'MotherCountryOfBirth', #nationality
    'feethngr': 'FeelsSameEthnicGroup',
    'rlgblg': 'ReligiousBelonging',
    'rlgdnanl': 'ReligiousDenominationNL',
    'edlvenl': 'EducationLevelNL',
    'mainact': 'WorkStatus',
    'fltdpr': 'FeltDepressed',
    'flteeff': 'LowEnergy',
    'slprl': 'SleepProblems',
    'wrhpp': 'FeltHappy',
    'fltlnl': 'FeltLonely',
    'enjlf': 'EnjoyedLife',
    'fltsd': 'FeltSad',
    'cldgng': 'Trouble'
}


In [4]:
df = demographic_data_nl[chosen_cols]
df = df.rename(columns=rename_dict)

print(df.head())

   ParticipantID  Age  SexAtBirth  GenderIdentitySelfReported CountryOfBirth  \
0          50072   36           1                           1           6666   
1          50144   48           1                           1           6666   
2          50166   59           1                           1             IR   
3          50193   46           2                           2           6666   
4          50202   80           2                           2           6666   

   Nationality FatherCountryOfBirth MotherCountryOfBirth  \
0            1                 6666                 6666   
1            1                 6666                 6666   
2            1                   IR                   IR   
3            1                 6666                 6666   
4            1                 6666                 6666   

   FeelsSameEthnicGroup  ReligiousBelonging  ...  EducationLevelNL  \
0                     1                   2  ...                14   
1                 

In [5]:
#export
df.to_csv('out.csv', index=False)

# Translate the dataset into our cohort questionnaire data structure

Categorize age

In [6]:
def translate_age(age):
    if age < 18:
        return 'Onder de 18 jaar'
    elif 18 <= age <= 24:
        return '18-24 jaar'
    elif 25 <= age <= 34:
        return '25-34 jaar'
    elif 35 <= age <= 44:
        return '35-44 jaar'
    elif 45 <= age <= 54:
        return '45-54 jaar'
    elif 55 <= age <= 64:
        return '55-64 jaar'
    else:
        return '65 jaar of ouder'

df['AgeGroup'] = df['Age'].apply(translate_age)

Categorize sex

In [7]:
df['SexAtBirth'] = df['SexAtBirth'].map({1: 'Man', 2: 'Vrouw'})

Categorize gender identity

In [8]:
def translate_gender_identity(row): #TODO: check if this corresponds to the questionnaire
    if row['GenderIdentitySelfReported'] == 1 and row['SexAtBirth'] == 'Man':
        return 'Man'
    elif row['GenderIdentitySelfReported'] == 2 and row['SexAtBirth'] == 'Vrouw':
        return 'Vrouw'
    elif row['GenderIdentitySelfReported'] == 2 and row['SexAtBirth'] == 'Man':
        return 'Trans vrouw'
    elif row['GenderIdentitySelfReported'] == 1 and row['SexAtBirth'] == 'Vrouw':
        return 'Trans man'
    elif row['GenderIdentitySelfReported'] == 3:
        return 'Anders (specificeer)'
    elif row['GenderIdentitySelfReported'] in [7, 8, 9]:
        return 'Ik wil het liever niet zeggen'
    else:
        return 'Onbekend'
    
df['GenderIdentity'] = df.apply(translate_gender_identity, axis=1)


Education mapping

In [9]:
education_mapping_snn11 = {
    1: 'Basisonderwijs',
    2: 'Basisonderwijs',
    3: 'Vmbo/Mavo',
    4: 'Vmbo/Mavo',
    5: 'Mbo (niveau 1-4)',
    6: 'Havo/Vwo',
    7: 'Havo/Vwo',
    8: 'Mbo (niveau 1-4)',
    9: 'Mbo (niveau 1-4)',
    10: 'Mbo (niveau 1-4)',
    11: 'Mbo (niveau 1-4)',
    12: 'HBO',
    13: 'HBO',
    14: 'HBO',
    15: 'Universiteit (Bachelor, Master, of hoger)',
    16: 'Universiteit (Bachelor, Master, of hoger)',
    17: 'Universiteit (Bachelor, Master, of hoger)',
    18: 'Universiteit (Bachelor, Master, of hoger)',
    5555: 'Anders (specificeer)',
    7777: 'Ik wil het liever niet zeggen',
    8888: 'Ik wil het liever niet zeggen',
    9999: 'Ik wil het liever niet zeggen'
}

df['EducationLevel'] = df['EducationLevelNL'].map(education_mapping_snn11).fillna('Anders (specificeer)')

Employment mapping

In [10]:
work_status_mapping_snn11 = {
    1: 'Betaalde baan (fulltime)',
    2: 'Scholier of student',
    3: 'Geen betaald werk om andere redenen',
    4: 'Geen betaald werk om andere redenen',
    5: 'Gedeeltelijk / geen betaald werk vanwege gezondheidsproblemen',
    6: 'Gepensioneerd of met prepensioen',
    7: 'Vrijwilligerswerk',
    8: 'Zorg voor het huishouden (eventueel kinderen)',
    9: 'Anders (specificeer)',
    66: 'Ik wil het liever niet zeggen',
    77: 'Ik wil het liever niet zeggen',
    88: 'Ik wil het liever niet zeggen',
    99: 'Ik wil het liever niet zeggen'
}

df['EducationLevel'] = df['EducationLevelNL'].map(education_mapping_snn11).fillna('Anders (specificeer)')

Religious mapping

In [11]:
religion_mapping_snn11 = {
    1: 'Christendom',
    2: 'Christendom',
    3: 'Christendom',
    4: 'Christendom',
    5: 'Christendom',
    6: 'Christendom',
    7: 'Christendom',
    8: 'Christendom',
    9: 'Christendom',
    10: 'Christendom',
    11: 'Christendom',
    12: 'Christendom',
    13: 'Christendom',
    14: 'Christendom',
    15: 'Christendom',
    16: 'Christendom',
    17: 'Christendom',
    18: 'Christendom',
    19: 'Christendom',
    20: 'Christendom',
    21: 'Christendom',
    22: 'Hindoeïsme',
    23: 'Boeddhisme',
    24: 'Anders (specificeer)',
    25: 'Jodendom',
    26: 'Islam',
    27: 'Anders (specificeer)',
    28: 'Anders (specificeer)',
    6666: 'Geen religie',
    7777: 'Ik wil het liever niet zeggen',
    9999: 'Ik wil het liever niet zeggen'
}


df['Religion'] = df['ReligiousDenominationNL'].map(religion_mapping_snn11).fillna('Anders (specificeer)')


Ethnicity and nationality variables

In [12]:
#NB: country of birth variables do not need to be translated, only nationality and ethnicity

#nationality
def infer_nationality(row):
    if row['CountryOfBirth'] == 'NL':
        return 'NL'  # Born in NL → assume Dutch nationality
    if row['MotherCountryOfBirth'] == 'NL' or row['FatherCountryOfBirth'] == 'NL':
        return 'NL'  # Parent born in NL → probably Dutch nationality
    # Otherwise, keep the respondent's own country of birth code
    return row['CountryOfBirth']

df['Nationality'] = df.apply(infer_nationality, axis=1)

#ethnicity: based nationality and whether they feel they are a minority impute


def infer_ethnicity(row):
    countries = [row['CountryOfBirth'], row['MotherCountryOfBirth'], row['FatherCountryOfBirth']]
    
    # Fully Dutch and feels like majority
    if all(c == 'NL' for c in countries) and row['FeelsSameEthnicGroup'] in [1, 2, 3]:
        return 'Nederlands'
    
    # First-generation immigrant case (born abroad)
    if row['CountryOfBirth'] != 'NL':
        if row['CountryOfBirth'] == 'TR':
            return 'Turks'
        elif row['CountryOfBirth'] == 'MA':
            return 'Marokkaans'
        elif row['CountryOfBirth'] == 'SR':
            return 'Surinaams'
        elif row['CountryOfBirth'] in ['AN', 'CW', 'AW']:  # Netherlands Antilles, Curaçao, Aruba
            return 'Antilliaans of Arubaan'
        elif row['CountryOfBirth'] == 'ID':
            return 'Indonesisch'
        else:
            return 'Anders (specificeer)'
    
    # Second-generation immigrant case (parent born abroad)
    if row['MotherCountryOfBirth'] != 'NL' or row['FatherCountryOfBirth'] != 'NL':
        if 'TR' in countries:
            return 'Turks'
        elif 'MA' in countries:
            return 'Marokkaans'
        elif 'SR' in countries:
            return 'Surinaams'
        elif any(c in ['AN', 'CW', 'AW'] for c in countries):
            return 'Antilliaans of Arubaan'
        elif 'ID' in countries:
            return 'Indonesisch'
        else:
            return 'Anders (specificeer)'
    
    
    return 'Nederlands' # Default fallback

df['Ethnicity'] = df.apply(infer_ethnicity, axis=1)


In [None]:
df.head() #TODO: look if data is clean. Now country of birth values are not; check religion

Unnamed: 0,ParticipantID,Age,SexAtBirth,GenderIdentitySelfReported,CountryOfBirth,Nationality,FatherCountryOfBirth,MotherCountryOfBirth,FeelsSameEthnicGroup,ReligiousBelonging,...,FeltHappy,FeltLonely,EnjoyedLife,FeltSad,Trouble,AgeGroup,GenderIdentity,EducationLevel,Religion,Ethnicity
0,50072,36,Man,1,6666,6666,6666,6666,1,2,...,4,1,4,1,1,35-44 jaar,Man,HBO,Geen religie,Anders (specificeer)
1,50144,48,Man,1,6666,6666,6666,6666,1,2,...,3,1,4,2,1,45-54 jaar,Man,HBO,Geen religie,Anders (specificeer)
2,50166,59,Man,1,IR,IR,IR,IR,1,2,...,3,1,3,1,1,55-64 jaar,Man,"Universiteit (Bachelor, Master, of hoger)",Geen religie,Anders (specificeer)
3,50193,46,Vrouw,2,6666,6666,6666,6666,1,2,...,4,1,3,1,2,45-54 jaar,Vrouw,Mbo (niveau 1-4),Geen religie,Anders (specificeer)
4,50202,80,Vrouw,2,6666,6666,6666,6666,1,2,...,2,1,4,2,4,65 jaar of ouder,Vrouw,Vmbo/Mavo,Geen religie,Anders (specificeer)
