# Clean Questionnaire Data Screening

In [38]:
import json
import re
from pathlib import Path

import pandas as pd
import numpy as np
import pingouin as pg

import matplotlib.pyplot as plt
import seaborn as sns

from fau_colors import cmaps
import biopsykit as bp

from empkins_io.datasets.d03.micro_gapvii import MicroBaseDataset

%load_ext autoreload
%autoreload 2
%matplotlib widget

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
plt.close("all")

palette = sns.color_palette(cmaps.faculties)
sns.set_theme(context="notebook", style="ticks", palette=palette)

plt.rcParams["figure.figsize"] = (10, 5)
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["mathtext.default"] = "regular"

palette

In [40]:
deploy_type = "local"

In [41]:
config_dict = json.load(Path("../config.json").open(encoding="utf-8"))

base_path = Path(config_dict[deploy_type]["base_path"])
base_path

PosixPath('/Volumes/luca_ssd/Study_Data/2022_12_GAPVII_Micro')

In [42]:
dataset = MicroBaseDataset(base_path)
dataset

Unnamed: 0,subject,condition,phase
0,VP_001,tsst,Prep
1,VP_001,tsst,Pause_1
2,VP_001,tsst,Talk
3,VP_001,tsst,Pause_2
4,VP_001,tsst,Pause_3
...,...,...,...
1691,VP_110,ftsst,Pause_2
1692,VP_110,ftsst,Pause_3
1693,VP_110,ftsst,Math
1694,VP_110,ftsst,Pause_4


## Merge 1.0 and 1.1 for all questionnaires

In [43]:
for time in ["screening", "pre", "post"]:
    # load 1.0
    data1 = pd.read_csv(base_path.joinpath(f"data_tabular/questionnaires/cleaned/{time}_1_0_cleaned.csv"), sep=";")

    # load 1.1
    data11 = pd.read_csv(base_path.joinpath(f"data_tabular/questionnaires/cleaned/{time}_1_1_cleaned.csv"), sep=";")

    # append
    data = pd.concat((data1,data11))

    # save
    data.to_excel(base_path.joinpath(f"data_tabular/questionnaires/processed/unipark_{time}.xlsx"))

In [44]:
code_mapping = dataset.code_mapping
code_mapping = code_mapping.reset_index().set_index("Code")
code_mapping.head()

Unnamed: 0_level_0,subject
Code,Unnamed: 1_level_1
LI17P,VP_001
IH30R,VP_002
HB25M,VP_003
SK29B,VP_004
BK24I,VP_005


## Load Questionnaire Data and Map Code to VP-ID

In [45]:
quest_data = pd.read_excel(base_path.joinpath("data_tabular/questionnaires/processed/unipark_screening.xlsx"))
quest_data = quest_data.rename(columns={"VPN_Kennung": "Code"})
quest_data = quest_data.set_index("Code")

quest_data = quest_data.join(code_mapping)

quest_data = quest_data.dropna(subset=["subject"]).set_index("subject")

quest_data

Unnamed: 0_level_0,Unnamed: 0,lfdn,external_lfdn,tester,dispcode,lastpage,quality,duration,Einverstaendnis_1,Einverstaendnis_2,...,FFA_09,FFA_10,FFA_11,FFA_12,FFA_13,FFA_14,v_510,rts7364387,rts7364406,rts7374062
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VP_074,73,1554,0,0,31,7176240,-77,533,1,1,...,-77.0,-77.0,-77.0,-77.0,-77,-77,-66,191.0,0.0,0.0
VP_074,207,1858,0,0,31,7176254,-77,1796,1,1,...,3.0,3.0,3.0,3.0,2,2,Erste Vorlesung biologische Psychologie,150.0,0.0,1794.0
VP_009,188,468,0,0,31,7176254,-77,1898,1,1,...,,,,,,,,,,
VP_033,329,823,0,0,31,7176254,-77,9010,1,1,...,,,,,,,,,,
VP_006,51,111,0,0,31,7176254,-77,1217,1,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VP_108,313,2119,0,0,31,7176254,-77,2492,1,1,...,3.0,2.0,2.0,2.0,2,4,Mir wurde ein flyer zugesendet aufgrund der sc...,150.0,0.0,2485.0
VP_083,167,1781,0,0,32,7176254,-77,-1,1,1,...,3.0,3.0,4.0,2.0,1,3,Flyer von einer Freundin (vermutlich Langemarc...,66210.0,0.0,88448.0
VP_014,106,225,0,0,31,7176254,-77,1831,1,1,...,,,,,,,,,,
VP_071,126,1695,0,0,31,7176254,-77,1691,1,1,...,2.0,3.0,3.0,3.0,2,3,Werbung in Whatsapp Gruppe,156.0,0.0,1685.0


In [46]:
# keep last entry for each subject
quest_data = quest_data.groupby("subject").last()

In [47]:
quest_data.replace(-77, np.nan, inplace=True)

In [48]:
quest_data

Unnamed: 0_level_0,Unnamed: 0,lfdn,external_lfdn,tester,dispcode,lastpage,quality,duration,Einverstaendnis_1,Einverstaendnis_2,...,FFA_09,FFA_10,FFA_11,FFA_12,FFA_13,FFA_14,v_510,rts7364387,rts7364406,rts7374062
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VP_001,28,57,0,0,31,7176254,,2793,1,1,...,,,,,,,,,,
VP_002,60,126,0,0,32,7176254,,-1,1,1,...,,,,,,,,,,
VP_003,24,51,0,0,32,7176254,,-1,1,1,...,,,,,,,,,,
VP_004,48,104,0,0,32,7176254,,-1,1,1,...,,,,,,,,,,
VP_005,78,165,0,0,32,7176254,,-1,1,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VP_106,301,2085,0,0,31,7176254,,2408,1,1,...,3.0,2.0,2.0,3.0,2,3,Durch eine Freundin,180.0,0.0,2394.0
VP_107,289,2053,0,0,31,7176254,,1457,1,1,...,3.0,3.0,3.0,3.0,2,4,Vorlesung,127.0,0.0,1453.0
VP_108,313,2119,0,0,31,7176254,,2492,1,1,...,3.0,2.0,2.0,2.0,2,4,Mir wurde ein flyer zugesendet aufgrund der sc...,150.0,0.0,2485.0
VP_109,262,1987,0,0,31,7176254,,1438,1,1,...,4.0,4.0,4.0,4.0,1,3,bismarckstrasse 1,183.0,0.0,1433.0


In [49]:
# load renaming json
renaming_dict = json.load(Path("../../renaming.json").open(encoding="utf-8"))

renaming_dict

{'Geschlecht': 'Gender',
 'Alter': 'Age',
 'Geburtsdatum': 'Birth_Date',
 'Familienstand': 'Marital',
 'Bildungsabschluss': 'Education',
 'Beschaeftigungsverhaeltnis': 'Profession',
 'Beschaeftigungsverhaeltnis_Sonstige': 'Profession_Other',
 'Ethnische_Herkunft': 'Ethnicity',
 'Studiengang': 'Course_Study',
 'Berufsfeld': 'Professional_Field',
 'Muttersprache': 'Native_Language',
 'Koerpergroesse': 'Height',
 'Koerpergewicht': 'Weight',
 'Haendigkeit': 'Handedness',
 'Kontrazeptiva_Keine': 'Contraceptives_None',
 'Kontrazeptiva_Pille': 'Contraceptives_Pill',
 'Kontrazeptiva_hormonfrei': 'Contraceptives_Hormone_Free',
 'Kontrazeptiva_hormonell_andere': 'Contraceptives_Hormone_Other',
 'Menstruation_regelm_Zyklus': 'Menstruation_Regular_Cycle',
 'Menstruation_unregelm_Zyklus_Grund': 'Menstruation_Unregular_Cycle_Reason',
 'Menstruation_letzte_Regelblutung': 'Menstruation_Last_Period',
 'Zykluslaenge': 'Cycle_Length',
 'Menstruation_Zyklus_Schwankung': 'Menstruation_Cycle_Fluctuations',


In [50]:
quest_data = quest_data.rename(renaming_dict, axis=1)

# get only columns that  are in renaming dict and in the quest_data
columns = list(set(list(renaming_dict.values())) & set(quest_data.columns))

quest_data_filtered = quest_data[columns]

quest_data_filtered.head()

Unnamed: 0_level_0,Medication_Psychostimulants,Medication_Hormonal_Preparation,Disease_Acute_Symptoms,Disease_Liver,Education,Medication_Antidepressant,Birth_Date,Disease_Skeleton,Disease_Metabolism,Contraceptives_Pill,...,Ethnicity,Disease_Blood,Marital,Menstruation_Ovulation_Tracking,Disease_Nervous_System,Contraceptives_Hormone_Free,Disease_Kidney,Native_Language,Disease_Infect,Disease_Psychological
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VP_001,2,2,2,2,6,2,1998-01-07,2,2,0.0,...,5,2,1,,2,1.0,2,1,2,2
VP_002,2,2,2,2,6,2,1997-10-18,2,2,0.0,...,5,2,2,,2,1.0,2,1,2,2
VP_003,2,2,2,2,5,2,2002-12-13,2,2,0.0,...,5,2,1,,2,0.0,2,1,2,2
VP_004,2,2,2,2,5,2,2001-03-03,2,2,,...,5,2,1,,2,,2,1,2,2
VP_005,2,2,2,2,6,2,1996-10-20,2,2,0.0,...,5,2,2,,2,1.0,2,1,2,2


In [51]:
# replace all missing values
quest_data_filtered = quest_data_filtered.replace("-66", np.nan)
quest_data_filtered = quest_data_filtered.replace(-77, np.nan)

In [52]:
quest_data_filtered

Unnamed: 0_level_0,Medication_Psychostimulants,Medication_Hormonal_Preparation,Disease_Acute_Symptoms,Disease_Liver,Education,Medication_Antidepressant,Birth_Date,Disease_Skeleton,Disease_Metabolism,Contraceptives_Pill,...,Ethnicity,Disease_Blood,Marital,Menstruation_Ovulation_Tracking,Disease_Nervous_System,Contraceptives_Hormone_Free,Disease_Kidney,Native_Language,Disease_Infect,Disease_Psychological
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VP_001,2,2,2,2,6,2,1998-01-07,2,2,0.0,...,5,2,1,,2,1.0,2,1,2,2
VP_002,2,2,2,2,6,2,1997-10-18,2,2,0.0,...,5,2,2,,2,1.0,2,1,2,2
VP_003,2,2,2,2,5,2,2002-12-13,2,2,0.0,...,5,2,1,,2,0.0,2,1,2,2
VP_004,2,2,2,2,5,2,2001-03-03,2,2,,...,5,2,1,,2,,2,1,2,2
VP_005,2,2,2,2,6,2,1996-10-20,2,2,0.0,...,5,2,2,,2,1.0,2,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VP_106,2,2,2,2,5,2,01.03.04,2,2,0.0,...,5,2,1,1.0,2,1.0,2,1,2,2
VP_107,2,2,2,2,5,2,02.04.00,2,2,0.0,...,5,2,2,1.0,2,1.0,2,1,2,2
VP_108,2,2,2,2,5,2,01.10.96,2,2,,...,5,2,1,,2,,2,1,2,2
VP_109,2,2,2,2,6,2,11.02.99,2,2,,...,1,2,1,,2,,2,1,2,2


## Add Condition Order

In [53]:
condition_order = pd.read_csv(base_path.joinpath("data_tabular/extras/processed/condition_order.csv"))
condition_order = condition_order.set_index("subject")["condition_order"]
condition_order.head()

# change condtion order to codes
condition_order = condition_order.replace("tsst_first", 1)
condition_order = condition_order.replace("ftsst_first", 2)

quest_data_filtered.insert(1, "Condition_Order", condition_order)

In [54]:
quest_data_filtered

Unnamed: 0_level_0,Medication_Psychostimulants,Condition_Order,Medication_Hormonal_Preparation,Disease_Acute_Symptoms,Disease_Liver,Education,Medication_Antidepressant,Birth_Date,Disease_Skeleton,Disease_Metabolism,...,Ethnicity,Disease_Blood,Marital,Menstruation_Ovulation_Tracking,Disease_Nervous_System,Contraceptives_Hormone_Free,Disease_Kidney,Native_Language,Disease_Infect,Disease_Psychological
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VP_001,2,2,2,2,2,6,2,1998-01-07,2,2,...,5,2,1,,2,1.0,2,1,2,2
VP_002,2,2,2,2,2,6,2,1997-10-18,2,2,...,5,2,2,,2,1.0,2,1,2,2
VP_003,2,2,2,2,2,5,2,2002-12-13,2,2,...,5,2,1,,2,0.0,2,1,2,2
VP_004,2,1,2,2,2,5,2,2001-03-03,2,2,...,5,2,1,,2,,2,1,2,2
VP_005,2,1,2,2,2,6,2,1996-10-20,2,2,...,5,2,2,,2,1.0,2,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VP_106,2,1,2,2,2,5,2,01.03.04,2,2,...,5,2,1,1.0,2,1.0,2,1,2,2
VP_107,2,2,2,2,2,5,2,02.04.00,2,2,...,5,2,2,1.0,2,1.0,2,1,2,2
VP_108,2,1,2,2,2,5,2,01.10.96,2,2,...,5,2,1,,2,,2,1,2,2
VP_109,2,1,2,2,2,6,2,11.02.99,2,2,...,1,2,1,,2,,2,1,2,2


In [209]:
# add to codebook

codebook = pd.read_csv(base_path.joinpath("data_tabular/questionnaires/codebook.csv"))
codebook.set_index("variable", inplace=True)

In [210]:
codebook

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Gender,female,male,other,,,,,,,,,,
Education,kein Schulabschluss,Hauptschule,Mittlere Reife,Fachabitur,Abitur,Bachelor,Master/Diplom,Promotion,Habilitation,,,,
Marital,ledig,feste Partnerschaft,verheiratet,getrennt lebend,geschieden,verwitwet,,,,,,,
Profession,Arbeitslos/Arbeitssuchend,Auszubildende/r,Student/in,Hausfrau/mann,Angestellte/r,Beamte/in,Selbstständig,Elternzeit,Beurlaubt,Sabbatjahr,Berufsunfähig/Frührente,Ruhestand,sonstiges
Ethnicity,Asiatisch,Arabisch,Afrikanisch,Indigen,Weiß,Andere,Keine Antwort,,,,,,
Condition_Order,tsst_first,ftsst_first,,,,,,,,,,,
Handedness,right,left,,,,,,,,,,,


In [211]:
# add row for condition order with nan values
codebook.loc["Condition_Order"] = np.nan

codebook.loc["Condition_Order", "1"] = "tsst_first"
codebook.loc["Condition_Order", "2"] = "ftsst_first"

In [212]:
# export
codebook.to_csv(base_path.joinpath("data_tabular/questionnaires/codebook.csv"))

## Export Base

In [55]:
quest_data_filtered.sort_index(axis=1, inplace=True)
quest_data_filtered.to_csv(base_path.joinpath("data_tabular/questionnaires/final/base_data.csv"))

## PASA

In [59]:
pasa_data = pd.read_csv(base_path.joinpath("data_tabular/questionnaires/processed/pasa.csv"))
pasa_data = pasa_data.set_index(["subject", "condition"])


pasa_data.columns = pasa_data.columns.astype(str).str.zfill(2)
pasa_data = pasa_data.unstack("condition")

pasa_data = pasa_data.sort_index(axis=1, level="condition")

# flatten multiindex
pasa_data.columns = ["_".join(col).strip() for col in pasa_data.columns.values]

pasa_data.head()


Unnamed: 0_level_0,PASA_01_ftsst,PASA_02_ftsst,PASA_03_ftsst,PASA_04_ftsst,PASA_05_ftsst,PASA_06_ftsst,PASA_07_ftsst,PASA_08_ftsst,PASA_09_ftsst,PASA_10_ftsst,...,PASA_07_tsst,PASA_08_tsst,PASA_09_tsst,PASA_10_tsst,PASA_11_tsst,PASA_12_tsst,PASA_13_tsst,PASA_14_tsst,PASA_15_tsst,PASA_16_tsst
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VP_001,5.0,5.0,4.0,4.0,4.0,2.0,1.0,4.0,4.0,3.0,...,1.0,4.0,3.0,2.0,3.0,4.0,4.0,5.0,3.0,4.0
VP_002,6.0,4.0,2.0,4.0,4.0,4.0,2.0,3.0,6.0,3.0,...,1.0,4.0,4.0,3.0,1.0,5.0,1.0,4.0,1.0,4.0
VP_003,3.0,4.0,5.0,5.0,4.0,2.0,1.0,4.0,3.0,3.0,...,4.0,5.0,3.0,3.0,3.0,5.0,4.0,4.0,3.0,4.0
VP_004,5.0,2.0,5.0,2.0,2.0,5.0,1.0,3.0,4.0,4.0,...,3.0,3.0,5.0,3.0,5.0,3.0,3.0,4.0,4.0,2.0
VP_005,5.0,4.0,4.0,4.0,3.0,2.0,2.0,4.0,5.0,4.0,...,2.0,3.0,2.0,2.0,4.0,3.0,4.0,5.0,4.0,3.0


In [60]:
pasa_data.to_csv(base_path.joinpath("data_tabular/questionnaires/final/pasa.csv"))

## ADS-L

In [61]:
quest_data.filter(like="ADS").to_csv(base_path.joinpath("data_tabular/questionnaires/final/ads.csv"))

## STADI

In [62]:
quest_data.filter(like="STADI").to_csv(base_path.joinpath("data_tabular/questionnaires/final/stadi.csv"))

## Brief Cope

In [63]:
quest_data.filter(like="Brief").to_csv(base_path.joinpath("data_tabular/questionnaires/final/brief_cope.csv"))

## PSS

In [64]:
quest_data.filter(like="PSS").to_csv(base_path.joinpath("data_tabular/questionnaires/final/pss.csv"))

## BFIK

In [65]:
quest_data.filter(like="BFIK").to_csv(base_path.joinpath("data_tabular/questionnaires/final/bfik.csv"))

## RSE

In [66]:
quest_data.filter(like="RSE").to_csv(base_path.joinpath("data_tabular/questionnaires/final/rse.csv"))

## SCS

In [67]:
quest_data.filter(like="SCS").to_csv(base_path.joinpath("data_tabular/questionnaires/final/scs.csv"))

## RSQ

In [68]:
quest_data.filter(like="RSQ").to_csv(base_path.joinpath("data_tabular/questionnaires/final/rsq.csv"))

## BES

In [69]:
quest_data.filter(like="BES").to_csv(base_path.joinpath("data_tabular/questionnaires/final/bes.csv"))

## SOC

In [70]:
quest_data.filter(like="SOC").to_csv(base_path.joinpath("data_tabular/questionnaires/final/soc.csv"))

## TSGS

In [71]:
quest_data.filter(like="TSGS").to_csv(base_path.joinpath("data_tabular/questionnaires/final/tsgs.csv"))