<a href="https://colab.research.google.com/github/dsliwka/EEMP2024/blob/main/Notebooks/LPPEngaDataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from scipy import stats


## Load the LPP data: the CSV file is read directly from its raw GitHub URL
path_to_data = "https://raw.githubusercontent.com/dsliwka/EEMP2024/main/Data/LPP-CF_1215_v1.csv"
df = pd.read_csv(path_to_data)


## Restrict the sample to wave 2 (as here the LPP campus file has more information);
## the full data stay available in df, the working sample is dfp
dfp = df.loc[df["welle"] == 2]

## Generate Engagement Scale
## The scale is the row-wise mean of the engagement items, reversed via 6 - x
## (presumably the items are coded 1-5 with lower numbers meaning stronger
## agreement -- TODO confirm against the codebook).
## The pattern is anchored with "^" so that only columns whose names start with
## "menga" enter the mean: DataFrame.filter(regex=...) uses a regular-expression
## search, and in the unanchored pattern "menga*" the "*" merely repeats the final
## "a" zero or more times, so it would match every column containing "meng".
## The items are taken from dfp itself (not from the unfiltered df), so the result
## does not rely on implicit index alignment.
dfp = dfp.assign(enga=6 - dfp.filter(regex="^menga").mean(axis=1))

## reverse health scale (as lower numbers represent better health in the survey)
dfp["mgesund_allg"] = 6 - dfp["mgesund_allg"]

## keep only rows where we have non-missing data on income, industry & engagement
dfp = dfp.dropna(subset=["meink_kateg", "branche", "enga"])

## Build the Big5 trait scores as the mean of their items.
## Items written as "6 - item" are reversed before averaging; the other items
## enter with their original coding.
dfp = dfp.assign(
    conscientiousness=(6 - dfp["mbig_gruend"] + dfp["mbig_faul"] + 6 - dfp["mbig_effi"]) / 3,
    extraversion=(6 - dfp["mbig_komm"] + dfp["mbig_zur"] + 6 - dfp["mbig_gesell"]) / 3,
    neuroticism=(6 - dfp["mbig_sorgen"] + dfp["mbig_entsp"] + 6 - dfp["mbig_nervoes"]) / 3,
    openness=(6 - dfp["mbig_origi"] + 6 - dfp["mbig_kunst"] + 6 - dfp["mbig_phant"] + 6 - dfp["mbig_wissb"]) / 4,
    agreeableness=(dfp["mbig_grob"] + 6 - dfp["mbig_verzeih"] + 6 - dfp["mbig_freundl"]) / 3,
)

## Remove every row in which at least one of the Big5 traits is missing
dfp = dfp.dropna(
    subset=["conscientiousness", "extraversion", "neuroticism", "openness", "agreeableness"]
)

## Standardize all personality traits & engagement (i.e. subtract the mean and divide by the SD)
## Note: for simplicity the z-scores come from scipy's stats.zscore; each column is
## standardized on its own and then written back under the same name
dfp = dfp.assign(
    **{
        column: stats.zscore(dfp[column])
        for column in (
            "conscientiousness",
            "extraversion",
            "neuroticism",
            "openness",
            "agreeableness",
            "enga",
        )
    }
)

## Discard every column that still contains at least one missing value
dfp = dfp.dropna(axis="columns")

## Keep only the subset of variables listed below (in this order)
dfp = dfp.loc[:, [
    'enga', 'branche', 'ost', 'size',
    'bauswahl_interview', 'bauswahl_ac', 'bauswahl_kognit', 'bauswahl_pers',
    'bauswahl_arbeitsprobe', 'bauswahl_andere',
    'bmagespr', 'bzv', 'bentwplan', 'bbeurt', 'bvargehalt', 'bsonderz_nv',
    'bmabefr', 'bmabind_flexaz', 'bmabind_verg', 'bhomeoff', 'babsent_anz',
    'msex', 'alter', 'mstib', 'mleitung', 'maz_voll_teil', 'mheim',
    'maz_freizeit', 'mwb', 'mmagespr', 'mikt_nutz', 'mgesund_allg',
    'mkind_anz_gr', 'mkindu3', 'meink_kateg',
    'conscientiousness', 'extraversion', 'neuroticism', 'openness', 'agreeableness',
]]

## The variable "branche", which contains the industry in which a person is working,
## is categorical: replace it with one dummy variable per industry
dfp = pd.get_dummies(dfp, columns=['branche'])

## Report how many observations and variables remain after cleaning
print("We have", len(dfp.index), "observations and", len(dfp.columns), "variables.")


We have 886 observations and 44 variables.


In [None]:
##
## Note: Use the "cleaned" DataFrame dfp for your prediction task!
##

