# Import

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None

# Load

In [None]:
# Read the raw lab export (semicolon-separated), discard the two id columns
# that are not used, and give the remaining columns short working names.
data = (
    pd.read_csv('8_1_raw.csv', sep=';', low_memory=False)
    .drop(columns=['n2labor_c_recordid', 'labor001_c_n2leistid'])
    .rename(columns={
        'labor001_c_n2kattext': 'type',
        'c_n2value': 'value',
        'c_n2unit': 'unit',
        'lab_datetime': 'timestamp',
        'Fallnummer': 'case',
    })
)

# Keep only the working columns, in a fixed order.
data = data[['case', 'timestamp', 'type', 'value', 'unit']]
data

# Check

In [None]:
# Show rows without a usable lab type. read_csv turns empty fields into NaN
# by default, so comparing against '' alone would never match those rows;
# check for both NaN and a literal empty string.
data[data['type'].isna() | (data['type'] == '')]

# Process

In [None]:
def append(name: str, data: pd.DataFrame, lab_data: pd.DataFrame, types: 'list[str]', units: 'list[str] | None' = None) -> pd.DataFrame:
    """Relabel raw lab rows under one canonical type and append them to ``lab_data``.

    Rows of ``data`` whose ``type`` column equals one of ``types`` are collected
    (in the order the types are listed), optionally restricted to the given
    ``units``, relabelled with ``name`` and concatenated below ``lab_data``.
    For every raw type, and for the final selection, the row count and the
    distinct units are printed so unit mismatches can be spotted by eye.

    Args:
        name: Canonical type label written into the ``type`` column.
        data: Raw lab table; must provide the ``type`` and ``unit`` columns.
        lab_data: Rows accumulated so far; it is not modified.
        types: Raw type names that all map to ``name``.
        units: Units to keep. ``None`` or an empty list keeps every unit.

    Returns:
        A new DataFrame holding ``lab_data`` followed by the relabelled rows,
        with a fresh integer index. If ``types`` is empty, a copy of
        ``lab_data`` is returned.
    """
    subsets = []
    for raw_type in types:
        type_data = data[data['type'] == raw_type]
        print(f'{raw_type}: {len(type_data)} {type_data["unit"].unique()}')
        subsets.append(type_data)

    # pd.concat raises ValueError on an empty list, so there is nothing to add.
    if not subsets:
        return lab_data.copy()

    subset = pd.concat(subsets, ignore_index=True)
    if units:
        subset = subset[subset['unit'].isin(units)]

    # assign() builds a new frame instead of writing into a filtered slice.
    subset = subset.assign(type=name)
    print(f'Total: {len(subset)} {subset["unit"].unique()}')

    return pd.concat([lab_data, subset], ignore_index=True)

In [None]:
# Empty accumulator for the harmonised lab rows; every mapping cell below
# concatenates its relabelled rows onto this frame.
lab_data = pd.DataFrame(columns=['case', 'timestamp', 'type', 'value'])

In [None]:
# 25 OH Vitamin D3 = 25 OH Vitamin D3
lab_data = append('vd25', data, lab_data, ['25-Hydroxy-Vitamin D3', '25-OH-Vitamin D3', '25-OH-Vitamin D3 Se'])

In [None]:
# Base Excess = Basenüberschuss
lab_data = append('be', data, lab_data, ['ABE', 'Base Excess', 'BasenÃ¼berschuÃ', 'SBE'])

In [None]:
# Antithrombin = Antithrombin
lab_data = append('at', data, lab_data, ['AT3', 'Antithrombin', 'Antithrombin  AktivitÃ¤t'])

In [None]:
# Albumin = Albumin
lab_data = append('alb', data, lab_data, ['Albumin', 'Albumin (HP)', 'Albumin HP', 'Albumin Se', 'Albumin i.Se'], ['g/l'])

In [None]:
# Alkaline Phosphatase = Alkalische Phosphatase
lab_data = append('alp', data, lab_data, ['Alk. Phosphatase', 'Alk. Phosphatase (HP)', 'Alk.Phosphatase', 'Alk.Phosphatase HP', 'Alk.Phosphatase Se'])

In [None]:
# Ammonia = Ammoniak
lab_data = append('nh3', data, lab_data, ['Ammoniak', 'Ammoniak (EDTA)', 'Ammoniak EDTA'])

In [None]:
# Amylase = Amylase
lab_data = append('ams', data, lab_data, ['Amylase', 'Amylase HP', 'Amylase Se'], ['U/l'])

In [None]:
# Basophils Absolute = Basophile Absolut
lab_data = append('baso', data, lab_data, ['Basophile absolut'])

baso = data[(data['type'] == 'Basophile') & (data['unit'] == '/nl')]
baso['type'] = 'baso'
lab_data = pd.concat([lab_data, baso], ignore_index=True)

In [None]:
# Basophils Relative = Basophile Relativ
lab_data = append('baso_rel', data, lab_data, ['Basophile %'])

baso_rel = data[(data['type'] == 'Basophile') & (data['unit'] == '%')]
baso_rel['type'] = 'baso_rel'
lab_data = pd.concat([lab_data, baso_rel], ignore_index=True)

In [None]:
# Total Bilirubin = Gesamt Bilirubin
lab_data = append('tbil', data, lab_data, ['Bilirubin', 'Bilirubin gesamt Se', 'Bilirubin, gesamt', 'Bilirubin, gesamt HP', 'Bilirubin, total', 'Bilirubin, total (HP)', 'tBil'], ['mg/dl', 'mg/dL'])

In [None]:
# Bilirubin Direct = Bilirubin Direkt
lab_data = append('dbil', data, lab_data, ['Bilirubin direkt Se', 'Bilirubin, conjugiert', 'Bilirubin, direkt', 'Bilirubin, direkt (HP)', 'Bilirubin, direkt HP'])

In [None]:
# Bilirubin Indirect = Bilirubin Indirekt
lab_data = append('ibil', data, lab_data, ['Bilirubin indirekt'])

In [None]:
# Potential of Hydrogen (pH) = Potenzial von Wasserstoff
lab_data = append('ph', data, lab_data, ['Blut-pH-Wert', 'pH', 'pH-Wert'])

In [None]:
# Creatine Kinase = Kreatin Kinase
lab_data = append('ck', data, lab_data, ['CK', 'CK (HP)', 'Creatinkinase (CK)', 'Creatinkinase (CK) HP', 'Creatinkinase (CK) Se'])

In [None]:
# Creatine Kinase MB = Kreatin Kinase MB
lab_data = append('ck_mb', data, lab_data, ['CK-MB', 'CK-MB (HP)', 'CK-MB HP', 'CK-MB Se'])

In [None]:
# Carboxyhemoglobin = Carboxyhämoglobin
lab_data = append('cohb', data, lab_data, ['CO-Hb', 'COHb'])

In [None]:
# C Reactive Protein = C Reaktives Protein
lab_data = append('crp', data, lab_data, ['CRP', 'CRP (HP)', 'CRP HP', 'CRP Se'], ['mg/l'])

In [None]:
# Calcium = Kalzium
lab_data = append('ca', data, lab_data, ['Ca++', 'Calcium', 'Calcium (HP)', 'Calcium Se'], ['mmol/L', 'mmol/l'])

In [None]:
# Chloride = Chlorid
lab_data = append('cl', data, lab_data, ['Chlorid', 'Chlorid (HP)', 'Chlorid Se', 'Cl-'])

In [None]:
# Total Cholesterol = Gesamt Cholesterin
lab_data = append('tc', data, lab_data, ['Cholesterin', 'ges.Cholesterin', 'ges.Cholesterin HP', 'ges.Cholesterin Se'])

In [None]:
# Creatinine = Kreatinin
lab_data = append('cr', data, lab_data, ['Creatinin', 'Creatinin (enz)', 'Creatinin (enzymat.)', 'Kreatinin', 'Kreatinin (JaffÃ©)', 'Kreatinin (JaffÃ©) (HP)', 'Kreatinin (JaffÃ©) HP', 'Kreatinin (JaffÃ©) Se', 'Kreatinin (enzym.)', 'Kreatinin (enzym.) HP', 'Kreatinin (enzym.) Se'], ['mg/dl'])

In [None]:
# Cystatin C = Cystatin C
lab_data = append('cys_c', data, lab_data, ['Cystatin C', 'Cystatin C HP', 'Cystatin C Se'], ['mg/l'])

In [None]:
# D Dimer = D Dimere
lab_data = append('d_dim', data, lab_data, ['D-Dimer', 'D-Dimere'])

In [None]:
# Iron = Eisen
lab_data = append('fe', data, lab_data, ['Eisen', 'Eisen (HP)', 'Eisen Se'])

In [None]:
# Eosinophils = Eosinophile
lab_data = append('eos', data, lab_data, ['Eosinophile absolut'])

eos = data[(data['type'] == 'Eosinophile') & (data['unit'] == '/nl')]
eos['type'] = 'eos'
lab_data = pd.concat([lab_data, eos], ignore_index=True)

In [None]:
# Eosinophils Relative = Eosinophile Relativ
lab_data = append('eos_rel', data, lab_data, ['Eosinophile %'])

eos_rel = data[(data['type'] == 'Eosinophile') & (data['unit'] == '%')]
eos_rel['type'] = 'eos_rel'
lab_data = pd.concat([lab_data, eos_rel], ignore_index=True)

In [None]:
# Erythroblasts = Erythroblasten
lab_data = append('ebl', data, lab_data, ['Erythroblasten absolut'])

ebl = data[(data['type'] == 'Erythroblasten') & (data['unit'] == '/nl')]
ebl['type'] = 'ebl'
lab_data = pd.concat([lab_data, ebl], ignore_index=True)

In [None]:
# Erythroblasts Relative = Erythroblasten Relativ
lab_data = append('ebl_rel', data, lab_data, ['Erythroblasten %'])

ebl_rel = data[(data['type'] == 'Erythroblasten') & (data['unit'] == '%')]
ebl_rel['type'] = 'ebl_rel'
lab_data = pd.concat([lab_data, ebl_rel], ignore_index=True)

In [None]:
# Erythrocytes = Erythrozyten
lab_data = append('rbc', data, lab_data, ['Erythrozyten'], ['/pl'])

In [None]:
# Fraction of Inspired Oxygen = Inspiratorische Sauerstofffraktion
lab_data = append('fio2', data, lab_data, ['FIO2'])

In [None]:
# Ferritin = Ferritin
lab_data = append('fer', data, lab_data, ['Ferritin', 'Ferritin HP', 'Ferritin SE', 'Ferritin Se'])

In [None]:
# Fibrinogen = Fibrinogen
lab_data = append('fg', data, lab_data, ['Fibrinogen'], ['g/l'])

In [None]:
# Schistocytes = Fragmentozyten
lab_data = append('schisto', data, lab_data, ['Fragmentozyten'], ['%'])

In [None]:
# Gamma Glutamyltransferase = Gamma Glutamyltransferase
lab_data = append('ggt', data, lab_data, ['GGT', 'GGT (HP)', 'gamma-GT', 'gamma-GT HP', 'gamma-GT Se'])

In [None]:
# Glutamate Dehydrogenase = Glutamat Dehydrogenase
lab_data = append('gdh', data, lab_data, ['GLDH', 'GLDH HP', 'GLDH Se'])

In [None]:
# Glucose = Glukose
lab_data = append('glu', data, lab_data, ['GLU', 'Glu', 'Glucose', 'Glucose HP', 'Glucose Se'], ['mg/dl', 'mg/dL'])

In [None]:
# ASAT (GOT) = ASAT (GOT)
lab_data = append('asat', data, lab_data, ['GOT (AST)', 'GOT (AST) (HP)', 'GOT (AST) HP', 'GOT (AST) Se'])

In [None]:
# ALAT (GPT) = ALAT (GPT)
lab_data = append('alat', data, lab_data, ['GPT (ALT)', 'GPT (ALT) (HP)', 'GPT (ALT) HP', 'GPT (ALT) Se'])

In [None]:
# Bicarbonate = Bikarbonat
lab_data = append('hco3', data, lab_data, ['HCO3-', 'SBC', 'Standard Bicarbonat', 'Standardbicarbonat', 'aktuelles Bicarbonat'])

In [None]:
# High Density Lipoprotein = HDL Cholesterin
lab_data = append('hdl', data, lab_data, ['HDL-Cholesterin', 'HDL-Cholesterin HP', 'HDL-Cholesterin Se'])

In [None]:
# Deoxyhemoglobin = Desoxyhämoglobin
lab_data = append('hhb', data, lab_data, ['HHb'])

In [None]:
# Haptoglobin = Haptoglobin
lab_data = append('hp', data, lab_data, ['Haptoglobin', 'Haptoglobin HP', 'Haptoglobin Se'], ['g/l'])

In [None]:
# Urea = Harnstoff
lab_data = append('urea', data, lab_data, ['Harnstoff', 'Harnstoff (HP)', 'Harnstoff HP', 'Harnstoff Se'], ['mg/dl'])

In [None]:
# Uric Acid = Harnsäure
lab_data = append('ua', data, lab_data, ['HarnsÃ¤ure', 'HarnsÃ¤ure (HP)', 'HarnsÃ¤ure HP', 'HarnsÃ¤ure Se'], ['mg/dl'])

In [None]:
# Hemoglobin = Hämoglobin
lab_data = append('hb', data, lab_data, ['Hb', 'HÃ¤moglobin', 'tHb'], ['g/dl', 'g/dL'])

In [None]:
# Glycated Hemoglobin = Glykosyliertes Hämoglobin
lab_data = append('hba1c', data, lab_data, ['HbA1c', 'HbA1c (EDTA)'])

In [None]:
# Hematocrit = Hämatokrit
lab_data = append('hct', data, lab_data, ['Hct', 'HÃ¤matokrit', 'HÃ¤matokrit (l/l)'], ['%'])

In [None]:
# I/T Ratio = I/T Quotient
lab_data = append('it_ratio', data, lab_data, ['I/T Quotient maschinell'])

In [None]:
# International Normalized Ratio = International Normalized Ratio
lab_data = append('inr', data, lab_data, ['INR', 'TPZ-INR'])

In [None]:
# Immature Platelet Fraction = Unreife Thrombozytenfraktion
lab_data = append('ipf', data, lab_data, ['Immature PlÃ¤ttchenfraktion'])

In [None]:
# Immunoglobulin A = Immunoglobulin A
lab_data = append('iga', data, lab_data, ['Immunglobulin A', 'Immunglobulin A HP', 'Immunglobulin A Se'], ['g/l'])

In [None]:
# Immunoglobulin E = Immunoglobulin E
lab_data = append('ige', data, lab_data, ['Immunglobulin E', 'Immunglobulin E HP', 'Immunglobulin E Se'], ['kU/l'])

In [None]:
# Immunoglobulin G = Immunoglobulin G
lab_data = append('igg', data, lab_data, ['Immunglobulin G', 'Immunglobulin G HP', 'Immunglobulin G Se'], ['g/l'])

In [None]:
# Immunoglobulin M = Immunoglobulin M
lab_data = append('igm', data, lab_data, ['Immunglobulin M', 'Immunglobulin M HP', 'Immunglobulin M Se'], ['g/l'])

In [None]:
# Potassium = Kalium
lab_data = append('k', data, lab_data, ['K+', 'Kalium', 'Kalium HP', 'Kalium Se'], ['mmol/L', 'mmol/l'])

In [None]:
# Lactate Dehydrogenase = Laktatdehydrogenase
lab_data = append('ldh', data, lab_data, ['LDH', 'LDH (HP)', 'LDH HP', 'LDH Se'])

In [None]:
# Low Density Lipoprotein = LDL Cholesterin
lab_data = append('ldl', data, lab_data, ['LDL-Cholesterin', 'LDL-Cholesterin HP', 'LDL-Cholesterin Se'])

In [None]:
# Lactate = Laktat
lab_data = append('lac', data, lab_data, ['Lac', 'Lactat', 'Laktat'])

In [None]:
# Leukocytes = Leukozyten
lab_data = append('wbc', data, lab_data, ['Leukozyten'], ['/nl'])

In [None]:
# Lipase = Lipase
lab_data = append('lps', data, lab_data, ['Lipase', 'Lipase (HP)', 'Lipase HP', 'Lipase Se'])

In [None]:
# Lymphocytes = Lymphozyten
lab_data = append('lym', data, lab_data, ['Lymphozyten abs.', 'Lymphozyten absolut'], ['/nl'])

lym = data[(data['type'] == 'Lymphozyten') & (data['unit'] == '/nl')]
lym['type'] = 'lym'
lab_data = pd.concat([lab_data, lym], ignore_index=True)

In [None]:
# Lymphocytes Relative = Lymphozyten Relativ
lab_data = append('lym_rel', data, lab_data, ['Lymphozyten %', 'Lymphozyten rel.'])

lym_rel = data[(data['type'] == 'Lymphozyten') & (data['unit'] == '%')]
lym_rel['type'] = 'lym_rel'
lab_data = pd.concat([lab_data, lym_rel], ignore_index=True)

In [None]:
# Mean Corpuscular Hemoglobin = Mittleres Korpuskuläres Hämoglobin
lab_data = append('mch', data, lab_data, ['MCH'])

In [None]:
# Mean Corpuscular Hemoglobin Concentration = Mittlere Korpuskuläre Hämoglobin Konzentration
lab_data = append('mchc', data, lab_data, ['MCHC'])

In [None]:
# Mean Corpuscular Volume = Mittleres Korpuskuläres Volumen
lab_data = append('mcv', data, lab_data, ['MCV'])

In [None]:
# Mean Platelet Volume = Mittleres Thrombozytenvolumen
lab_data = append('mpv', data, lab_data, ['MPV'])

In [None]:
# Magnesium = Magnesium
lab_data = append('mg', data, lab_data, ['Magnesium', 'Magnesium (HP)', 'Magnesium Se'])

In [None]:
# Methemoglobin = Methämoglobin
lab_data = append('methb', data, lab_data, ['MetHb'])

In [None]:
# Monocytes = Monozyten
lab_data = append('mono', data, lab_data, ['Monozyten abs.', 'Monozyten absolut'])

mono = data[(data['type'] == 'Monozyten') & (data['unit'] == '/nl')]
mono['type'] = 'mono'
lab_data = pd.concat([lab_data, mono], ignore_index=True)

In [None]:
# Monocytes Relative = Monozyten Relativ
lab_data = append('mono_rel', data, lab_data, ['Monozyten %', 'Monozyten rel.'])

mono_rel = data[(data['type'] == 'Monozyten') & (data['unit'] == '%')]
mono_rel['type'] = 'mono_rel'
lab_data = pd.concat([lab_data, mono_rel], ignore_index=True)

In [None]:
# Myelocytes = Myelozyten
lab_data = append('myelo', data, lab_data, ['Myelozyten'])

In [None]:
# Myoglobin = Myoglobin
lab_data = append('mb', data, lab_data, ['Myoglobin', 'Myoglobin HP', 'Myoglobin Se'], ['Âµg/l'])

In [None]:
# N Terminal Pro B Type Natriuretic Peptide = N Terminal Pro B Type Natriuretic Peptide
lab_data = append('nt_probnp', data, lab_data, ['NT pro BNP', 'NT-pro BNP', 'NT-pro BNP (HP)'], ['ng/l'])

In [None]:
# Sodium = Natrium
lab_data = append('na', data, lab_data, ['Na+', 'Natrium', 'Natrium HP', 'Natrium Se'], ['mmol/L', 'mmol/l'])

In [None]:
# Neutrophils = Neutrophile
lab_data = append('pmn', data, lab_data, ['Neutrophile absolut'])

pmn = data[(data['type'] == 'Neutrophile') & (data['unit'] == '/nl')]
pmn['type'] = 'pmn'
lab_data = pd.concat([lab_data, pmn], ignore_index=True)

In [None]:
# Neutrophils Relative = Neutrophile Relativ
lab_data = append('pmn_rel', data, lab_data, ['Neutrophile %'])

pmn_rel = data[(data['type'] == 'Neutrophile') & (data['unit'] == '%')]
pmn_rel['type'] = 'pmn_rel'
lab_data = pd.concat([lab_data, pmn_rel], ignore_index=True)

In [None]:
# Oxygen Saturation = Sauerstoffsättigung
lab_data = append('so2', data, lab_data, ['O2-SÃ¤ttigung', 'sO2'])

In [None]:
# Oxyhemoglobin = Oxyhämoglobin
lab_data = append('o2hb', data, lab_data, ['O2Hb'])

In [None]:
# Phosphorus = Phosphor
lab_data = append('p', data, lab_data, ['Phosphor, anorg.'], ['mmol/l'])

In [None]:
# Procalcitonin = Procalcitonin
lab_data = append('pct', data, lab_data, ['Procalcitonin', 'Procalcitonin (HP)', 'Procalcitonin HP', 'Procalcitonin Se'])

In [None]:
# Protein = Protein
lab_data = append('pro', data, lab_data, ['Protein', 'Protein HP'], ['g/l'])

In [None]:
# Pseudocholinesterase = Pseudocholinesterase
lab_data = append('pche', data, lab_data, ['PCHE', 'PCHE (HP)', 'Pseudo-Cholinesterase', 'Pseudo-Cholinesterase HP', 'Pseudo-Cholinesterase Se'])

In [None]:
# Quick Value = Quick Wert
lab_data = append('quick', data, lab_data, ['Quick (TPZ)'])

In [None]:
# Red Cell Distribution Width = Erythrozytenverteilungsbreite
lab_data = append('rdw', data, lab_data, ['RDW', 'RDW-CV'])

In [None]:
# Reticulocytes = Retikulozyten
lab_data = append('rtic', data, lab_data, ['Retikulozyten'], ['/nl'])

In [None]:
# Temperature = Temperatur
lab_data = append('t', data, lab_data, ['T', 'Temperatur'])

In [None]:
# Prothrombin Time = Thromboplastinzeit
lab_data = append('pt', data, lab_data, ['TPZ-Wert'])

In [None]:
# Thyroid Stimulating Hormone = Schilddrüsenstimulierendes Hormon
lab_data = append('tsh', data, lab_data, ['TSH', 'TSH bas.', 'TSH bas. Se', 'TSH bas. i.Se', 'TSH basal', 'TSH basal (HP)', 'TSH basal Se'])

In [None]:
# Platelets = Thrombozyten
lab_data = append('plt', data, lab_data, ['Thrombozyten'])

In [None]:
# Transferrin = Transferrin
lab_data = append('trans', data, lab_data, ['Transferrin', 'Transferrin HP', 'Transferrin Se'], ['g/l'])

In [None]:
# Transferrin Saturation = Transferrinsättigung
lab_data = append('ts', data, lab_data, ['Transferrin-SÃ¤ttigung', 'Transferrin-SÃ¤ttigung HP', 'Transferrin-SÃ¤ttigung Se', 'TransferrinsÃ¤ttigung'])

In [None]:
# Total Triglycerides = Gesamt Triglyceride
lab_data = append('tg', data, lab_data, ['Triglyceride', 'Triglyceride HP', 'Triglyceride Se'])

In [None]:
# Activated Partial Thromboplastin Time = Aktivierte Partielle Thromboplastinzeit
lab_data = append('aptt', data, lab_data, ['aPTT'])

In [None]:
# Phosphate = Phosphat
lab_data = append('po4', data, lab_data, ['anorg. PO4  HP', 'anorg. PO4  Se'])

In [None]:
# Carbon Dioxide Partial Pressure = Kohlendioxidpartialdruck
lab_data = append('pco2', data, lab_data, ['pCO2'])

In [None]:
# Oxygen Partial Pressure = Sauerstoffpartialdruck
lab_data = append('po2', data, lab_data, ['pO2'])

In [None]:
# Immature Granulocytes = Unreife Granulozyten
lab_data = append('ig', data, lab_data, ['unreife Granulozyten absolut'])

ig = data[(data['type'] == 'unreife Granulozyten') & (data['unit'] == '/nl')]
ig['type'] = 'ig'
lab_data = pd.concat([lab_data, ig], ignore_index=True)

In [None]:
# Immature Granulocytes Relative = Unreife Granulozyten Relativ
lab_data = append('ig_rel', data, lab_data, ['unreife Granulozyten %'])

# Rows typed plain 'unreife Granulozyten' count as relative only when their
# unit is '%'. They must carry the same 'ig_rel' label as the rows appended
# above; a different label would split one analyte across two type names.
ig_rel = data[(data['type'] == 'unreife Granulozyten') & (data['unit'] == '%')]
ig_rel = ig_rel.assign(type='ig_rel')
lab_data = pd.concat([lab_data, ig_rel], ignore_index=True)

# Clean

In [None]:
# Parse the lab values as floats in one vectorised pass; entries that cannot
# be parsed as a number become NaN (and are removed by the later dropna).
# NOTE(review): values written with a decimal comma would also end up as NaN
# here - confirm the export uses a decimal point.
lab_data['value'] = pd.to_numeric(lab_data['value'], errors='coerce').astype(float)

In [None]:
# Normalise column types: integer case number, parsed timestamp, string type.
lab_data = lab_data.assign(
    case=lab_data['case'].astype(int),
    timestamp=pd.to_datetime(lab_data['timestamp']),
    type=lab_data['type'].astype(str),
)

# The unit is no longer needed once the rows are harmonised. Afterwards remove
# incomplete rows and exact duplicates (keeping the first occurrence), then
# order the rows chronologically.
lab_data = (
    lab_data
    .drop(columns=['unit'])
    .dropna()
    .drop_duplicates(keep='first')
    .sort_values(by=['timestamp'])
)

lab_data

In [None]:
# Load the cases that are part of the study.
included_cases = pd.read_csv('../4_cases/4_3_clean.csv')

# Report the size of the lab data before restricting it to the included cases.
print(f'Length of lab data before: {len(lab_data)}')
print(f'Number of unique cases in lab data before: {lab_data["case"].nunique()}')

# Keep only lab rows whose case number appears in included_cases.
lab_data = lab_data[lab_data['case'].isin(included_cases['case'])]

# Report the size again so the number of removed rows/cases is visible.
print(f'Length of lab data after: {len(lab_data)}')
print(f'Number of unique cases in lab data after: {lab_data["case"].nunique()}')

lab_data

# Save

In [None]:
# Write the cleaned lab data without the index column, then display it.
lab_data.to_csv('8_3_clean.csv', index=False)
lab_data

In [None]:
# Count the rows per harmonised lab type and store the counts next to the
# cleaned data; the type label is written as the index column.
types = lab_data.groupby('type').size()
types.to_csv('8_4_types.csv', index=True)
types