In [1]:
import pandas
import random
import json
from collections import Counter

# Extract relevant data from SNN11 dataset

In [2]:
demographic_data_nl = pandas.read_csv("ESS11-NL-subset/ESS11-subset.csv")
# df_noord_holland_urban = demographic_data_nl[(demographic_data_nl['region'] == 'NL33') & (demographic_data_nl['domicil'] == 1) | (demographic_data_nl['domicil'] == 2)] #urban areas in Noord Holland
df_NL_urban = demographic_data_nl[(demographic_data_nl['domicil'] == 1) | (demographic_data_nl['domicil'] == 2)] #Urban (and suburban/urban-adjacent) areas in the netherlands

pandas.set_option('display.max_columns', None)


In [3]:
chosen_cols = ['idno', #id
               'agea', #age
                'gndr', #sex
                'nobingnd', #gender
                'cntbrthd', 'brncntr', 'ctzcntr', 'facntr', 'fbrncntc', 'mocntr', 'mbrncntc', 'feethngr', 'anctrya1', 'anctrya2',#ethnicity and nationality
                'rlgblg', 'rlgdnanl', #religion; NB: replace with 'rlgblg' for generalized religion
                'edlvenl', #education; NB: replace with 'edulvlb' for generalized education
                'mainact', #employment/main activity
                'fltdpr', 'flteeff', 'slprl', 'wrhpp', 'fltlnl', 'enjlf', 'fltsd', 'cldgng', #well-being: proxies for depreeesion (PHQ-9)
                ]

rename_dict = {
    'idno': 'ParticipantID',
    'agea': 'Age',
    'gndr': 'SexAtBirth',
    'nobingnd': 'GenderIdentitySelfReported',
    'cntbrthd': 'CountryOfBirth', 'brncntr': 'BornInNL', 'ctzcntr': 'CitizenOfNL', 'facntr':'FatherBornInNL', 'fbrncntc': 'FatherCountryOfBirth', 'mocntr':'MotherBornInNL', 'mbrncntc': 'MotherCountryOfBirth', 'anctrya1':'ancestry1', 'anctrya2':'ancestry2',#nationality
    'feethngr': 'FeelsSameEthnicGroup',
    'rlgblg': 'ReligiousBelonging',
    'rlgdnanl': 'ReligiousDenominationNL',
    'edlvenl': 'EducationLevelNL',
    'mainact': 'WorkStatus',
    'fltdpr': 'FeltDepressed',
    'flteeff': 'LowEnergy',
    'slprl': 'SleepProblems',
    'wrhpp': 'FeltHappy',
    'fltlnl': 'FeltLonely',
    'enjlf': 'EnjoyedLife',
    'fltsd': 'FeltSad',
    'cldgng': 'TroubleGettingGoing'
}


In [4]:
df = df_NL_urban[chosen_cols]
df = df.rename(columns=rename_dict)

# print(df.head())

# Filter rows where CountryOfBirth and Nationality differ
diff_nat_birth = df[(df['BornInNL'] == 2) & (df['CitizenOfNL'] == 1)]
print(len(diff_nat_birth))


47


# Translate the dataset into our cohort questionnaire data structure

Categorize age

In [5]:
def translate_age(age):
    if age < 18:
        return 'Onder de 18 jaar'
    elif 18 <= age <= 24:
        return '18-24 jaar'
    elif 25 <= age <= 34:
        return '25-34 jaar'
    elif 35 <= age <= 44:
        return '35-44 jaar'
    elif 45 <= age <= 54:
        return '45-54 jaar'
    elif 55 <= age <= 64:
        return '55-64 jaar'
    else:
        return '65 jaar of ouder'

df['AgeGroup'] = df['Age'].apply(translate_age)

Categorize sex

In [6]:
df['SexAtBirth'] = df['SexAtBirth'].map({1: 'Man', 2: 'Vrouw'})

Categorize gender identity

In [7]:
def translate_gender_identity(row): #TODO: check if this corresponds to the questionnaire
    if row['GenderIdentitySelfReported'] == 1 and row['SexAtBirth'] == 'Man':
        return 'Man'
    elif row['GenderIdentitySelfReported'] == 2 and row['SexAtBirth'] == 'Vrouw':
        return 'Vrouw'
    elif row['GenderIdentitySelfReported'] == 2 and row['SexAtBirth'] == 'Man':
        return 'Trans vrouw'
    elif row['GenderIdentitySelfReported'] == 1 and row['SexAtBirth'] == 'Vrouw':
        return 'Trans man'
    elif row['GenderIdentitySelfReported'] == 3:
        return random.choice(['Genderqueer', 'Gender non-binair', 'Anders (specificeer)']) #if 'other' make a random choice between the three
    else:
        return 'Ik wil het liever niet zeggen'
    
df['GenderIdentity'] = df.apply(translate_gender_identity, axis=1)


Education mapping

In [8]:
education_mapping_snn11 = {
    1: 'Basisonderwijs',
    2: 'Basisonderwijs',
    3: 'Vmbo/Mavo',
    4: 'Vmbo/Mavo',
    5: 'Mbo (niveau 1-4)',
    6: 'Havo/Vwo',
    7: 'Havo/Vwo',
    8: 'Mbo (niveau 1-4)',
    9: 'Mbo (niveau 1-4)',
    10: 'Mbo (niveau 1-4)',
    11: 'Mbo (niveau 1-4)',
    12: 'HBO',
    13: 'HBO',
    14: 'HBO',
    15: 'Universiteit (Bachelor, Master, of hoger)',
    16: 'Universiteit (Bachelor, Master, of hoger)',
    17: 'Universiteit (Bachelor, Master, of hoger)',
    18: 'Universiteit (Bachelor, Master, of hoger)',
    5555: 'Anders (specificeer)',
    7777: 'Ik wil het liever niet zeggen',
    8888: 'Ik wil het liever niet zeggen',
    9999: 'Ik wil het liever niet zeggen'
}

df['EducationLevel'] = df['EducationLevelNL'].map(education_mapping_snn11).fillna('Anders (specificeer)')

Employment mapping

In [9]:
work_status_mapping_snn11 = {
    1: 'Betaalde baan (fulltime)',
    2: 'Scholier of student',
    3: 'Geen betaald werk om andere redenen',
    4: 'Geen betaald werk om andere redenen',
    5: 'Gedeeltelijk / geen betaald werk vanwege gezondheidsproblemen',
    6: 'Gepensioneerd of met prepensioen',
    7: 'Vrijwilligerswerk',
    8: 'Zorg voor het huishouden (eventueel kinderen)',
    9: 'Anders (specificeer)',
    66: 'Ik wil het liever niet zeggen',
    77: 'Ik wil het liever niet zeggen',
    88: 'Ik wil het liever niet zeggen',
    99: 'Ik wil het liever niet zeggen'
}

df['WorkStatus'] = df['WorkStatus'].map(education_mapping_snn11).fillna('Anders (specificeer)')

Religious mapping

In [10]:
religion_mapping_snn11 = {
    1: 'Christendom',
    2: 'Christendom',
    3: 'Christendom',
    4: 'Christendom',
    5: 'Christendom',
    6: 'Christendom',
    7: 'Christendom',
    8: 'Christendom',
    9: 'Christendom',
    10: 'Christendom',
    11: 'Christendom',
    12: 'Christendom',
    13: 'Christendom',
    14: 'Christendom',
    15: 'Christendom',
    16: 'Christendom',
    17: 'Christendom',
    18: 'Christendom',
    19: 'Christendom',
    20: 'Christendom',
    21: 'Christendom',
    22: 'Hindoeïsme',
    23: 'Boeddhisme',
    24: 'Anders (specificeer)',
    25: 'Jodendom',
    26: 'Islam',
    27: 'Anders (specificeer): Humanistisch Verbond',
    28: 'Anders (specificeer): Andere niet-christelijke religies',
    6666: 'Geen religie',
    7777: 'Ik wil het liever niet zeggen',
    9999: 'Ik wil het liever niet zeggen'
}


def map_religion(row):
    if row['ReligiousBelonging'] in [7, 8, 9]:
        return 'Ik wil het liever niet zeggen'
    elif row['ReligiousBelonging'] == 2:
        return 'Geen religie'
    elif row['ReligiousBelonging'] == 1:
        return religion_mapping_snn11.get(row['ReligiousDenominationNL'])
    else: 
        return 'Anders (specificeer): ' + str(row['ReligiousDenominationNL'])



df['Religion'] = df.apply(map_religion, axis=1)


Ethnicity and nationality variables

In [11]:
def check_for_unrecognized_countries(row, row_name):
        if row[row_name] == '1000':
            return 'DDR'
        elif row[row_name] == '2000':
            return 'USSR'
        elif row[row_name] == '3000':
            return 'Czechoslovakia'
        elif row[row_name] == '4000':
            return 'Yugoslavia'
        elif row[row_name] == '5000':
            return 'East-Timor'
        elif row[row_name] == '6000':
            return 'Serbia and Montenegro'
        elif row[row_name] == '6500':
            return 'Alien\'s passport'
        else:
            return row[row_name]

#country of birth
def infer_country_of_birth(row):
    if row['BornInNL'] == 1:
        return 'NL'
    else:
        return check_for_unrecognized_countries(row, 'CountryOfBirth')
    
df['CountryOfBirth'] = df.apply(infer_country_of_birth, axis=1)

def infer_mother_country_of_birth(row):
    if row['MotherBornInNL'] == 1:
        return 'NL'
    else:
        return check_for_unrecognized_countries(row, 'MotherCountryOfBirth')

df['MotherCountryOfBirth'] = df.apply(infer_mother_country_of_birth, axis=1)
    
def infer_father_country_of_birth(row):
    if row['FatherBornInNL'] == 1:
        return 'NL'
    else:
        return check_for_unrecognized_countries(row, 'FatherCountryOfBirth')
    
df['FatherCountryOfBirth'] = df.apply(infer_father_country_of_birth, axis=1)

#nationality
def infer_nationality(row):
    if row['CitizenOfNL'] == 1:
        return 'NL'
    else:
        return check_for_unrecognized_countries(row, 'CountryOfBirth')
    

df['Nationality'] = df.apply(infer_nationality, axis=1)


def find_dominant_country(countries):
    counts = Counter(countries)
    most_common = counts.most_common(1)
    return most_common[0][0] if most_common else None


with open('ancestry_mapping_snn11.json', 'r') as f:
    ancestry_mapping = json.load(f)
        

df['Ethnicity'] = df['ancestry1'].map(lambda x: ancestry_mapping.get(str(x), 'Anders (specificeer)'))


In [12]:
#display all rows
# pandas.set_option('display.max_rows', None)

#ethnicity check
# df[['FeelsSameEthnicGroup', 'CountryOfBirth', 'MotherCountryOfBirth', 'FatherCountryOfBirth','Nationality', 'Ethnicity', 'Religion']] #TODO: look if data is clean. Now country of birth values are not; check religion

Extract relevant data for the cohort

In [14]:
extracted_data  = df[['ParticipantID',
                     'AgeGroup',
                     'SexAtBirth',
                     'GenderIdentity',
                     'CountryOfBirth', 'Nationality', 'MotherCountryOfBirth', 'FatherCountryOfBirth', 'Ethnicity',
                     'Religion',
                     'EducationLevel',
                     'WorkStatus',
                     'FeltDepressed', 'LowEnergy', 'SleepProblems', 'FeltHappy', 'FeltLonely', 'EnjoyedLife', 'FeltSad', 'TroubleGettingGoing',]]

extracted_data
# Breakdown of WorkStatus values
work_summary = df['WorkStatus'].value_counts(dropna=False).to_frame(name='Count')
work_summary['Percentage'] = (work_summary['Count'] / work_summary['Count'].sum() * 100).round(2)
print(work_summary)

                    

                      Count  Percentage
WorkStatus                             
Anders (specificeer)    299       64.58
Basisonderwijs          108       23.33
Mbo (niveau 1-4)         31        6.70
Havo/Vwo                 22        4.75
Vmbo/Mavo                 3        0.65
