# Import

In [None]:
import pandas as pd
import numpy as np

# Setup

In [None]:
# Output frame for the cleaned columns; each helper call below adds one column to it.
data = pd.DataFrame()

In [None]:
# Raw case table. The helpers below clean its columns in place before copying them to `data`.
raw = pd.read_csv('1_raw/1_data.csv')
raw

In [None]:
# Attach per-case organ-system information from the systems table to `raw`:
# `surgery_system_count` (number of comma-separated entries, NaN when the case
# has no usable entry) and one 0/1 flag column per organ system.
system_data = pd.read_csv('1_raw/2_systems.csv')

# One systems string per case. The first row of a case wins (same as taking
# `.iloc[0]` of the matching rows); rows without a case id can never match and
# are dropped up front.
systems_by_case = (
    system_data
    .dropna(subset=['case'])
    .drop_duplicates(subset='case')
    .set_index('case')['systems']
)
systems = raw['case'].map(systems_by_case)

# A case that is absent from the table, or whose systems cell is empty, gets
# no count and all flags set to 0.
has_systems = systems.notna()
systems = systems.fillna('').astype(str)

raw['surgery_system_count'] = (systems.str.count(',') + 1).where(has_systems)

# Flag column -> keyword searched as a plain substring in the systems string.
system_keywords = {
    'surgery_system_esophagus': 'Esophagus',
    'surgery_system_stomach': 'Magen',
    'surgery_system_intestine': 'Kolorektal',
    'surgery_system_liver': 'Leber',
    'surgery_system_pancreas': 'Pankreas',
}
for column, keyword in system_keywords.items():
    raw[column] = systems.str.contains(keyword, regex=False).astype(int)

raw

# Logic

In [None]:
def numeric(name: str, raw_name: str, min: float = np.nan, max: float = np.nan):
    """Clean a numeric raw column and copy it to ``data[name]`` as float.

    The column ``raw[raw_name]`` is converted to numbers (unparseable entries
    become NaN), values outside the optional bounds are set to NaN, and the
    cleaned column is written back to ``raw`` so that later calls on the same
    raw column see the cleaned values. Diagnostics and per-cohort summary
    statistics are printed.

    Args:
        name: Column name to create in the global ``data`` frame.
        raw_name: Column name to read from (and overwrite in) the global ``raw`` frame.
        min: Lower plausibility bound; smaller values become NaN. NaN/None disables it.
        max: Upper plausibility bound; larger values become NaN. NaN/None disables it.
    """
    column = raw[raw_name]

    # Coerce once; report the entries that were present but could not be parsed.
    values = pd.to_numeric(column, errors='coerce')
    print('Non Numeric unique Values:', column[values.isna() & column.notna()].unique())

    # `min` / `max` shadow the builtins, but they are the keyword names callers use.
    if min is not None and not np.isnan(min):
        below = values < min
        print('Values smaller than min:', below.sum())
        values = values.mask(below)
    if max is not None and not np.isnan(max):
        above = values > max
        print('Values greater than max:', above.sum())
        values = values.mask(above)

    raw[raw_name] = values

    # Completeness and distribution overall and for cohort 0 / cohort 1 of `raw['center']`.
    cohort = raw['center']
    subsets = (
        ('All', values),
        ('Data', values[cohort == 0]),
        ('Validation', values[cohort == 1]),
    )
    for label, series in subsets:
        share = round(series.count() / len(series) * 100, 2)
        print(f'{label}: {share} % (Mean: {round(series.mean(), 2)}, Std: {round(series.std(), 2)}, Min: {round(series.min(), 2)}, Max: {round(series.max(), 2)})')

    data[name] = values.astype(float)

In [None]:
def categorial_numeric_encoding(name: str, raw_name: str, not_supported: list = None):
    """Encode a categorical raw column as 0-based integer codes in ``data[name]``.

    Values listed in ``not_supported`` become missing. The remaining distinct
    values are sorted (when they are mutually comparable) and numbered
    0, 1, 2, ... in that order. The encoded column is written back to
    ``raw[raw_name]`` so that later calls on the same raw column see the codes,
    and is stored in ``data[name]`` with the nullable ``Int64`` dtype.
    The code table and per-cohort summary statistics are printed.

    Args:
        name: Column name to create in the global ``data`` frame.
        raw_name: Column name to read from (and overwrite in) the global ``raw`` frame.
        not_supported: Values to treat as missing; ``None`` or empty means none.
    """
    column = raw[raw_name]
    if not_supported:
        column = column.replace(not_supported, np.nan)

    categories = column.dropna().unique()
    try: categories = sorted(categories)
    except TypeError: pass  # mixed, non-comparable values: keep order of appearance

    # Map all categories in one step. Recoding value by value in place would let
    # an already assigned code be mistaken for a later category (e.g. -1 -> 0
    # followed by 0 -> 1 would merge both categories into 1).
    codes = {value: code for code, value in enumerate(categories)}
    encoded = column.map(codes)

    for value, code in codes.items():
        print(code, value, round((encoded == code).sum() / len(encoded) * 100, 2), '%')

    raw[raw_name] = encoded

    # Completeness and distribution overall and for cohort 0 / cohort 1 of `raw['center']`.
    # `center` is read after the write-back because this helper also encodes `center` itself.
    cohort = raw['center']
    subsets = (
        ('All', encoded),
        ('Data', encoded[cohort == 0]),
        ('Validation', encoded[cohort == 1]),
    )
    for label, series in subsets:
        share = round(series.count() / len(series) * 100, 2)
        print(f'{label}: {share} % (Mean: {round(series.mean(), 2)}, Std: {round(series.std(), 2)}, Min: {round(series.min(), 2)}, Max: {round(series.max(), 2)})')

    data[name] = encoded.astype('Int64')

In [None]:
def categorial_one_hot_encoding(name: str, raw_name: str, mapping: dict = None, not_supported: list = None):
    """Normalise a categorical raw column and copy it to ``data[name]`` as strings.

    Values are first renamed via ``mapping``, then values listed in
    ``not_supported`` become missing. The result is written back to
    ``raw[raw_name]``. No indicator columns are created here; the column is
    only prepared as labels. The share of each label and the per-cohort
    completeness are printed.

    Args:
        name: Column name to create in the global ``data`` frame.
        raw_name: Column name to read from (and overwrite in) the global ``raw`` frame.
        mapping: Old value -> new label; ``None`` or empty means no renaming.
        not_supported: Values to treat as missing; ``None`` or empty means none.
    """
    column = raw[raw_name]
    if mapping:
        column = column.replace(mapping)
    if not_supported:
        column = column.replace(not_supported, np.nan)
    # Assign the result instead of replacing in place on `raw[raw_name]`: an
    # in-place replace on a selected column does not reach `raw` under Copy-on-Write.
    raw[raw_name] = column

    labels = column.dropna().unique()
    try: labels = sorted(labels)
    except TypeError: pass  # mixed, non-comparable values: keep order of appearance

    for value in labels:
        print(value, round((column == value).sum() / len(column) * 100, 2), '%')

    # Completeness overall and for cohort 0 / cohort 1 of `raw['center']`.
    cohort = raw['center']
    subsets = (
        ('All', column),
        ('Data', column[cohort == 0]),
        ('Validation', column[cohort == 1]),
    )
    for label, series in subsets:
        print(f'{label}: {round(series.count() / len(series) * 100, 2)} %')

    # NOTE(review): with an object column, astype(str) turns missing values into the
    # text 'nan', which later completeness checks count as present - confirm this is intended.
    data[name] = column.astype(str)

In [None]:
def timestamp(name: str, raw_name: str):
    """Parse a raw column as datetimes and copy it to ``data[name]``.

    Entries that cannot be parsed become NaT. The parsed column is written
    back to ``raw[raw_name]``; completeness and the earliest/latest value are
    printed overall and for cohort 0 / cohort 1 of ``raw['center']``.

    Args:
        name: Column name to create in the global ``data`` frame.
        raw_name: Column name to read from (and overwrite in) the global ``raw`` frame.
    """
    parsed = pd.to_datetime(raw[raw_name], errors='coerce')
    raw[raw_name] = parsed

    cohort = raw['center']
    subsets = (
        ('All', parsed),
        ('Data', parsed[cohort == 0]),
        ('Validation', parsed[cohort == 1]),
    )
    for label, series in subsets:
        share = round(series.count() / len(series) * 100, 2)
        print(f'{label}: {share} % (Min: {series.min()}, Max: {series.max()})')

    data[name] = parsed

# Populate

In [None]:
# Identifier: Cohort
# CVK and CCM form cohort 0 and CBF forms cohort 1; the helpers report these as
# 'Data' and 'Validation'. The result is assigned back to the column: an in-place
# replace on `raw['center']` does not reach `raw` under pandas Copy-on-Write.
raw['center'] = raw['center'].replace({ 'CVK': 0, 'CCM': 0, 'CBF': 1 })
categorial_numeric_encoding('identifier_cohort', raw_name='center')

## Meta

In [None]:
# Meta columns: identifiers and attributes that the Process section further
# below uses for sorting, de-duplication and the inclusion criteria.

# Case
numeric('meta_case', raw_name='case', min=0)

In [None]:
# Patient
numeric('meta_patient', raw_name='patient', min=0)

In [None]:
# Incision
timestamp('meta_incision', raw_name='op_schnitt')

In [None]:
# Suture
timestamp('meta_suture', raw_name='op_naht')

In [None]:
# Year
# Year of the incision timestamp. 2023 is treated as unsupported and becomes
# missing, so those rows are removed by the year inclusion filter later on.
# The remaining years are stored as 0-based codes in sorted order.
raw['meta_year'] = data['meta_incision'].copy().dt.year
categorial_numeric_encoding('meta_year', raw_name='meta_year', not_supported=[2023])

In [None]:
# Meta: System
# Colorectal and Small_intestine share code 2; Other_organ_system and Transplant become missing.
raw['meta_system'] = raw['organ'].replace({ 'Esophagus': 0, 'Gastric': 1, 'Colorectal': 2, 'Small_intestine': 2, 'Liver': 3, 'Pancreas': 4 })
categorial_numeric_encoding('meta_system', raw_name='meta_system', not_supported=['Other_organ_system', 'Transplant'])

In [None]:
# Meta: OPS
numeric('meta_ops', raw_name='is_majorop')

In [None]:
# Meta: Age
# No min/max bounds here; the age inclusion filter later reads this column.
numeric('meta_age', raw_name='ageatsurgery')

In [None]:
# Meta: Urgency
# 'keine' becomes missing. The helper overwrites raw['urgency'] with the codes,
# so the later 'urgency' feature is built from the already encoded column.
categorial_numeric_encoding('meta_urgency', raw_name='urgency', not_supported=['keine'])

In [None]:
# Meta: Resurgery
categorial_numeric_encoding('meta_resurgery', raw_name='previous_op')

## Feature

### General

In [None]:
# Gender
# male -> 0, female -> 1. The result is assigned back to the column: an in-place
# replace on `raw['patient_sex']` does not reach `raw` under pandas Copy-on-Write.
raw['patient_sex'] = raw['patient_sex'].replace({ 'male': 0, 'female': 1 })
categorial_numeric_encoding('gender', raw_name='patient_sex')

In [None]:
# General patient features. For numeric() calls, values outside [min, max] are set to NaN.

# Age
# Re-uses raw['ageatsurgery'], which numeric() overwrites in place: the bounds applied
# here would also affect 'meta_age' if this cell ran before the meta cell.
numeric('age', raw_name='ageatsurgery', min=18, max=120)

In [None]:
# Height
numeric('height', raw_name='height_in_cm', min=100, max=250)

In [None]:
# Weight
numeric('weight', raw_name='pre_op_weight_in_kg', min=25, max=300)

In [None]:
# BMI
numeric('bmi', raw_name='bmi', min=5, max=100)

In [None]:
# ASA
categorial_numeric_encoding('asa', raw_name='asa_classification')

In [None]:
# ECOG
categorial_numeric_encoding('ecog', raw_name='ecog')

In [None]:
# CCI
categorial_numeric_encoding('cci', raw_name='cci')

### Condition

In [None]:
# Pre-existing conditions. Each raw column is encoded as 0-based integer codes
# (distinct values in sorted order) and stored in `data` under the given name.
# NOTE(review): the raw columns are presumably 0/1 indicators - confirm against the raw data.

# Myocardial Infarction
categorial_numeric_encoding('myocardial_infarction', raw_name='myocardial_infarction')

In [None]:
# Congestive Heart Failure
categorial_numeric_encoding('congestive_heart_failure', raw_name='congestive_heart_failure')

In [None]:
# Peripheral Vascular Disease
categorial_numeric_encoding('peripheral_vascular_disease', raw_name='peripheral_vascular_disease')

In [None]:
# Cerebrovascular Disease
categorial_numeric_encoding('cerebrovascular_disease', raw_name='cerebrovascular_disease')

In [None]:
# Dementia
categorial_numeric_encoding('dementia', raw_name='dementia')

In [None]:
# Chronic Pulmonary Disease
categorial_numeric_encoding('chronic_pulmonary_disease', raw_name='chronic_pulmonary_disease')

In [None]:
# Rheumatic Disease
# The raw column name is spelled 'rheumathic_disease'; presumably this matches the raw header.
categorial_numeric_encoding('rheumatic_disease', raw_name='rheumathic_disease')

In [None]:
# Peptic Ulcer Disease
categorial_numeric_encoding('peptic_ulcer_disease', raw_name='peptic_ulcer_disease')

In [None]:
# Liver Disease (Mild)
categorial_numeric_encoding('liver_disease_mild', raw_name='mild_liver_disease')

In [None]:
# Liver Disease (Moderate to Severe)
categorial_numeric_encoding('liver_disease_moderate_to_severe', raw_name='moderate_or_severe_liver_disease')

In [None]:
# Diabetes (without Chronic Complications)
categorial_numeric_encoding('diabetes_without_chronic_complications', raw_name='diabetes_without_chronic_complications')

In [None]:
# Diabetes (with Chronic Complications)
categorial_numeric_encoding('diabetes_with_chronic_complications', raw_name='diabetes_with_chronic_complications')

In [None]:
# Hemiplegia or Paraplegia
categorial_numeric_encoding('hemiplegia_or_paraplegia', raw_name='hemiplegia_or_paraplegia')

In [None]:
# Renal Disease
categorial_numeric_encoding('renal_disease', raw_name='renal_disease')

In [None]:
# Malignancy
categorial_numeric_encoding('malignancy', raw_name='any_malignancy')

In [None]:
# Metastatic Solid Tumor
categorial_numeric_encoding('metastatic_solid_tumor', raw_name='metastatic_solid_tumor')

In [None]:
# AIDS
categorial_numeric_encoding('aids', raw_name='aids_hiv')

In [None]:
# Cardiac Arrhythmia
categorial_numeric_encoding('cardiac_arythmia', raw_name='cardiac_arrythmias')

In [None]:
# Valvular Disease
categorial_numeric_encoding('valvular_disease', raw_name='valvular_disease')

In [None]:
# Pulmonary Circulatory Disorder
categorial_numeric_encoding('pulmonary_circulatory_disorder', raw_name='pulmonary_circulation_disorder')

In [None]:
# Arterial Hypertension
categorial_numeric_encoding('arterial_hypertension', raw_name='arterial_hypertension')

In [None]:
# Other Neurological Disorders
categorial_numeric_encoding('other_neurological_disorders', raw_name='other_neurological_disorders')

In [None]:
# Hypothyroidism
categorial_numeric_encoding('hypothyroidism', raw_name='hypothyroidism')

In [None]:
# Coagulopathy
categorial_numeric_encoding('coagulopathy', raw_name='coagulopathy')

In [None]:
# Obesity
categorial_numeric_encoding('obesity', raw_name='obesity')

In [None]:
# Weight Loss
categorial_numeric_encoding('weight_loss', raw_name='weight_loss')

In [None]:
# Fluid and Electrolyte Disorders
categorial_numeric_encoding('fluid_and_electrolyte_disorders', raw_name='fluid_and_electrolyte_disorders')

In [None]:
# Blood Loss Anemia
categorial_numeric_encoding('blood_loss_anemia', raw_name='blood_loss_anemia')

In [None]:
# Deficiency Anemia
categorial_numeric_encoding('deficiency_anemia', raw_name='deficiency_anemia')

In [None]:
# Alcohol Abuse
categorial_numeric_encoding('alcohol_abuse', raw_name='alcohol_abuse')

In [None]:
# Drug Abuse
categorial_numeric_encoding('drug_abuse', raw_name='drug_abuse')

In [None]:
# Psychoses
categorial_numeric_encoding('psychoses', raw_name='psychoses')

In [None]:
# Depression
categorial_numeric_encoding('depression', raw_name='depression')

In [None]:
# Coronary Heart Disease
categorial_numeric_encoding('coronary_heart_disease', raw_name='khk')

In [None]:
# Chronic Pancreatitis
categorial_numeric_encoding('chronic_pancreatitis', raw_name='chronic_pancreatitis')

### Surgery

In [None]:
# Surgery features: organ systems, urgency, re-surgery and calendar attributes of the incision.

# Primary System
# Label column (strings): Colorectal and Small_intestine are merged into 'intestine';
# Other_organ_system and Transplant become missing. This overwrites raw['organ'].
categorial_one_hot_encoding('primary_system', raw_name='organ', mapping={'Esophagus': 'esophagus', 'Gastric': 'stomach', 'Colorectal': 'intestine', 'Small_intestine': 'intestine', 'Liver': 'liver', 'Pancreas': 'pancreas'}, not_supported=['Other_organ_system', 'Transplant'])

In [None]:
# System Esophagus
# The surgery_system_* columns were derived from the systems table in the Setup section.
categorial_numeric_encoding('system_esophagus', raw_name='surgery_system_esophagus')

In [None]:
# System Stomach
categorial_numeric_encoding('system_stomach', raw_name='surgery_system_stomach')

In [None]:
# System Intestine
categorial_numeric_encoding('system_intestine', raw_name='surgery_system_intestine')

In [None]:
# System Liver
categorial_numeric_encoding('system_liver', raw_name='surgery_system_liver')

In [None]:
# System Pancreas
categorial_numeric_encoding('system_pancreas', raw_name='surgery_system_pancreas')

In [None]:
# System Count
# Counts outside 1..5 are set to NaN.
numeric('system_count', raw_name='surgery_system_count', min=1, max=5)

In [None]:
# Urgency
# raw['urgency'] was already encoded (and 'keine' set to missing) by the meta_urgency
# call, so this re-encodes those codes rather than the original labels.
categorial_numeric_encoding('urgency', raw_name='urgency')

In [None]:
# Resurgery
# raw['previous_op'] was likewise already encoded by the meta_resurgery call.
categorial_numeric_encoding('resurgery', raw_name='previous_op')

In [None]:
# Month
# Stored as 0-based codes of the months that occur, in sorted order.
raw['surgery_month'] = data['meta_incision'].copy().dt.month
categorial_numeric_encoding('month', raw_name='surgery_month')

In [None]:
# Weekday
raw['surgery_weekday'] = data['meta_incision'].copy().dt.weekday
categorial_numeric_encoding('weekday', raw_name='surgery_weekday')

In [None]:
# Daytime
# Bucket the incision hour: 0 = 22:00-05:59, 1 = 06:00-13:59, 2 = 14:00-21:59.
# Rows without an incision timestamp stay missing.
incision_hours = data['meta_incision'].dt.hour
daytime = incision_hours.copy()
daytime[(incision_hours >= 0) & (incision_hours < 6)] = 0
daytime[(incision_hours >= 6) & (incision_hours < 14)] = 1
daytime[(incision_hours >= 14) & (incision_hours < 22)] = 2
daytime[(incision_hours >= 22) & (incision_hours < 24)] = 0
raw['surgery_daytime'] = daytime
categorial_numeric_encoding('daytime', raw_name='surgery_daytime')

In [None]:
# Hour
# Stored as 0-based codes of the hours that occur, in sorted order.
raw['surgery_hour'] = data['meta_incision'].copy().dt.hour
categorial_numeric_encoding('hour', raw_name='surgery_hour')

In [None]:
# Mean Monthly Temperature
# The weather columns are converted to numbers without plausibility bounds.
numeric('mean_monthly_temperature', raw_name='mean_monthly_temperature[°c]')

In [None]:
# Minimum Monthly Temperature
numeric('minimum_monthly_temperature', raw_name='min_monthly_temperature[°c]')

In [None]:
# Maximum Monthly Temperature
numeric('maximum_monthly_temperature', raw_name='max_monthly_temperature[°c]')

In [None]:
# Monthly Precipitation
numeric('monthly_precipitation', raw_name='mean_monthly_rainfall[l/m2]')

In [None]:
# Monthly Sunshine Hours
numeric('monthly_sunshine_hours', raw_name='sunshine[h]')

### Lab

In [None]:
# Laboratory values. Each column is converted to numbers; values outside
# [min, max] are set to NaN by numeric().
# NOTE(review): the units behind these bounds are not visible here - confirm against the raw data.

# Sodium
numeric('sodium', raw_name='sodium', min=50, max=200)

In [None]:
# Potassium
numeric('potassium', raw_name='potassium', min=1, max=10)

In [None]:
# Albumin
numeric('albumin', raw_name='albumin', min=0, max=100)

In [None]:
# Bilirubin (Direct)
numeric('bilirubin_direct', raw_name='bilirubin_direct', min=0, max=50)

In [None]:
# Bilirubin (Indirect)
numeric('bilirubin_indirect', raw_name='bilirubin_indirect', min=0, max=50)

In [None]:
# Bilirubin
numeric('bilirubin', raw_name='bilirubin', min=0, max=50)

In [None]:
# Urea
numeric('urea', raw_name='urea', min=0, max=400)

In [None]:
# TSH
numeric('tsh', raw_name='tsh', min=0, max=50)

In [None]:
# ALT
numeric('alt', raw_name='alt', min=0, max=100000)

In [None]:
# AST
numeric('ast', raw_name='ast', min=0, max=100000)

In [None]:
# gGT
numeric('ggt', raw_name='ggt', min=0, max=100000)

In [None]:
# Lipase
numeric('lipase', raw_name='lipase', min=0, max=50000)

In [None]:
# CRP
numeric('crp', raw_name='crp', min=0, max=1000)

In [None]:
# Lactate
numeric('lactate', raw_name='lactate', min=0, max=200)

In [None]:
# HbA1C
numeric('hba1c', raw_name='hba1c', min=0, max=100)

In [None]:
# Hemoglobin
numeric('hemoglobin', raw_name='hemoglobin', min=0, max=40)

In [None]:
# WBC
numeric('wbc', raw_name='wbc', min=0, max=100)

In [None]:
# Platelets
numeric('platelets', raw_name='platelets', min=0, max=16000000)

In [None]:
# Hematocrit
numeric('hematocrit', raw_name='hematocrit', min=0, max=100)

In [None]:
# Prothrombin Time
numeric('prothrombin_time', raw_name='prothrombin_time', min=0, max=200)

In [None]:
# INR
numeric('inr', raw_name='inr', min=0, max=10)

In [None]:
# aPTT
numeric('aptt', raw_name='aptt', min=0, max=500)

In [None]:
# Erythrocytes
numeric('erythrocytes', raw_name='erythrocytes', min=0, max=10)

In [None]:
# Creatinine
numeric('creatinine', raw_name='creatinine', min=0, max=100)

In [None]:
# Glucose
numeric('glucose', raw_name='glucose', min=0, max=1000)

In [None]:
# ABG: pH
numeric('abg_ph', raw_name='abg_ph', min=0, max=10)

In [None]:
# ABG: Base Excess
numeric('abg_base_excess', raw_name='base_excess', min=-100, max=100)

In [None]:
# ABG: Bicarbonate
numeric('abg_bicarbonate', raw_name='abg_bicarbonate', min=0, max=50)

## Target

In [None]:
# Outcome columns. Each is encoded as 0-based integer codes and stored with a 'target_' prefix.

# Target: 30 Day Mortality
categorial_numeric_encoding('target_30_day_mortality', raw_name='30_day_mortality')

In [None]:
# Target: 90 Day Mortality
categorial_numeric_encoding('target_90_day_mortality', raw_name='90_day_mortality')

In [None]:
# Target: Clavien Dindo 5
categorial_numeric_encoding('target_clavien_dindo_5', raw_name='clavien_dindo_v')

In [None]:
# Target: Deceased after Discharge
categorial_numeric_encoding('target_deceased_after_discharge', raw_name='deceased_after_discharge')

In [None]:
# Target: Acute Myocardial Infarction
categorial_numeric_encoding('target_acute_myocardial_infarction', raw_name='acute_myocardial_infarction')

In [None]:
# Target: Pulmonary Embolism
# NOTE(review): the output name contains a space ('target_pulmonary embolism') while every
# other column uses underscores - looks like a typo; check downstream readers before renaming.
categorial_numeric_encoding('target_pulmonary embolism', raw_name='pulmonary_embolism')

In [None]:
# Target: Septic Shock
categorial_numeric_encoding('target_septic_shock', raw_name='septic_shock')

In [None]:
# Target: Pneumonia
categorial_numeric_encoding('target_pneumonia', raw_name='pneumonia')

In [None]:
# Target: Liver Failure
categorial_numeric_encoding('target_liver_failure', raw_name='liver_failure')

In [None]:
# Target: Cardiogenic Shock
categorial_numeric_encoding('target_cardiogenic_shock', raw_name='cardiogenic_shock')

In [None]:
# Target: Acute Pancreatitis
categorial_numeric_encoding('target_acute_pancreatitis', raw_name='acute_pancreatitis')

# Process

In [None]:
# Sort
# Chronological order by incision time.
data.sort_values(by=['meta_incision'], inplace=True)
data

In [None]:
# Duplicates
# Keeps the first row per case; after the sort above that is the earliest incision.
data.drop_duplicates(subset='meta_case', inplace=True)
data

## Inclusion Criteria

In [None]:
# Year
# Drops rows without a supported year (unparseable incision time, or 2023).
data = data[data['meta_year'].notna()]
data

In [None]:
# System
# Drops rows whose organ system is missing or was marked as not supported.
data = data[data['meta_system'].notna()]
data

In [None]:
# OPS
# Keeps only rows where the raw 'is_majorop' value equals 1.
data = data[data['meta_ops'] == 1]
data

In [None]:
# Age
# Keeps patients aged 18 or older; rows with a missing age are dropped as well.
data = data[data['meta_age'] >= 18]
data

## Column Management

In [None]:
# Remove columns that have at most one distinct value within cohort 0.
# The cohort identifier itself is always kept. Dropped column names are printed.
center0 = data[data['identifier_cohort'] == 0]
constant_columns = [
    column
    for column in center0.columns
    if column != 'identifier_cohort' and center0[column].nunique() <= 1
]
data = data.drop(columns=constant_columns)
for column in constant_columns:
    print(column)

data

In [None]:
# Bar chart: number of columns per completeness level, rounded to the nearest 10 %.
completeness = (
    data.count()
    .div(len(data))
    .mul(100)
    .apply(lambda share: round(share / 10) * 10)
    .value_counts()
    .sort_index()
)
completeness.plot.bar()

In [None]:
# Remove columns that are missing in more than half of the cohort-0 rows.
# Dropped column names are printed.
center0 = data[data['identifier_cohort'] == 0]
sparse_columns = [
    column
    for column in center0.columns
    if center0[column].isna().sum() / len(center0) > 0.5
]
data = data.drop(columns=sparse_columns)
for column in sparse_columns:
    print(column)

data

# Save

In [None]:
# Write the cleaned table to CSV without the index column.
data.to_csv('3_clean.csv', index = False)
data

In [None]:
# Per-column summary of the cleaned data (overall, cohort 0, cohort 1), written to CSV.
# Rows are collected in a list and turned into a frame once, because
# `DataFrame.append` was removed in pandas 2.0.
cohort_training = data[data['identifier_cohort'] == 0]
cohort_validation = data[data['identifier_cohort'] == 1]

description_rows = []
for i, column in enumerate(data.columns):
    data_total = data[column]
    data_training = cohort_training[column]
    data_validation = cohort_validation[column]

    description_rows.append({
        'id': i,
        'name': column,
        # Left blank here; presumably filled in by hand afterwards.
        'dimension': '',
        'lower_limit': '',
        'upper_limit': '',
        'type': data_total.dtype,
        'count': data_total.count(),
        'count (0)': data_training.count(),
        'count (1)': data_validation.count(),
        'completeness': round(data_total.count() / len(data_total) * 100, 2),
        'completeness (0)': round(data_training.count() / len(data_training) * 100, 2),
        'completeness (1)': round(data_validation.count() / len(data_validation) * 100, 2),
        'min': data_total.min(),
        'min (0)': data_training.min(),
        'min (1)': data_validation.min(),
        'max': data_total.max(),
        'max (0)': data_training.max(),
        'max (1)': data_validation.max(),
        'unique': data_total.nunique(),
        'unique (0)': data_training.nunique(),
        'unique (1)': data_validation.nunique(),
    })

description: pd.DataFrame = pd.DataFrame(description_rows)

description.to_csv('4_description.csv', index=False)
description

In [None]:
# Print the description table as Markdown (pandas needs the optional `tabulate` package for this).
print(description.to_markdown(index=False))