# Import

In [None]:
import pandas as pd

# Logic

In [None]:
def numeric(series: pd.Series) -> pd.Series:
    """Cast a column to the nullable ``Int64`` dtype and print a short summary.

    Prints the share of non-missing entries (in percent, two decimals) and
    the ``describe()`` statistics of the converted column.

    Args:
        series: Column to convert.

    Returns:
        The column as a nullable ``Int64`` series.
    """
    converted = series.astype('Int64')

    filled_percent = converted.count() / len(converted) * 100
    print('Completeness:', round(filled_percent, 2), '%')
    print(converted.describe())

    return converted

In [None]:
def categorial_numeric_encoding(series: pd.Series) -> pd.Series:
    """Encode the distinct values of a column as consecutive integer codes.

    The non-missing distinct values are sorted (or kept in order of first
    appearance when they are not mutually comparable) and numbered 0, 1, 2, ...
    Missing entries stay missing. The completeness of the column and the
    share of each code (in percent of all rows) are printed.

    The mapping is built once and applied in a single pass. Replacing value
    after value on the same series would let an already written code be
    overwritten when a later original value equals that code (e.g. the
    values -1 and 0 would both end up as 1).

    Args:
        series: Column to encode.

    Returns:
        The encoded column as a nullable ``Int64`` series.
    """
    unique = series.unique()
    unique = unique[~pd.isna(unique)]

    # Mixed, non-comparable types cannot be sorted; keep order of appearance.
    try:
        unique = sorted(unique)
    except TypeError:
        pass

    print('Completeness:', round(series.count() / len(series) * 100, 2), '%')

    codes = {value: code for code, value in enumerate(unique)}
    encoded = series.map(codes)

    for value, code in codes.items():
        print(code, value, round((encoded == code).sum() / len(series) * 100, 2), '%')

    return encoded.astype('Int64')

In [None]:
def categorial_one_hot_encoding(data: pd.DataFrame, series: pd.Series) -> pd.DataFrame:
    """Add one indicator column per distinct value of ``series`` to ``data``.

    The new columns are named ``<series.name>_<value>`` and hold the result of
    ``series == value`` cast to nullable ``Int64``. The completeness of
    ``series`` is printed. ``data`` is modified in place and also returned.

    Args:
        data: Frame that receives the indicator columns.
        series: Column whose distinct non-missing values are expanded.

    Returns:
        The same ``data`` object with the indicator columns added.
    """
    levels = series.unique()
    not_missing = ~pd.isna(levels)
    levels = levels[not_missing]

    try:
        levels = sorted(levels)
    except TypeError:
        # Values are not mutually comparable; keep order of appearance.
        pass

    filled_percent = round(series.count() / len(series) * 100, 2)
    print('Completeness:', filled_percent, '%')

    prefix = series.name
    for level in levels:
        indicator = series == level
        data[f'{prefix}_{level}'] = indicator.astype('Int64')

    return data

In [None]:
def timestamp(series: pd.Series) -> pd.Series:
    """Parse a column into datetimes using the ``%Y%m%d%H%M%S`` layout.

    Prints the share of non-missing parsed values as a percentage.

    Args:
        series: Column to parse; it is copied before parsing.

    Returns:
        The parsed datetime series.
    """
    source = series.copy()
    parsed = pd.to_datetime(source, format='%Y%m%d%H%M%S')

    filled_share = parsed.notnull().mean()
    print(f"Completeness: {filled_share:.2%}")

    return parsed

# Load

In [None]:
# Output frame that collects the cleaned and encoded columns; starts empty.
general_data = pd.DataFrame()

In [None]:
# Read the raw export; the bare `data` expression displays it as cell output.
data = pd.read_excel('5_1_raw.xlsx')
data

# Clean

## Meta

In [None]:
# meta_case: 'Fallnummer' column as nullable integer
general_data['meta_case'] = numeric(data['Fallnummer'])

In [None]:
# meta_patient: 'PatientenID' column as nullable integer
general_data['meta_patient'] = numeric(data['PatientenID'])

In [None]:
# meta_surgery: 'OPNummer' column as nullable integer
general_data['meta_surgery'] = numeric(data['OPNummer'])

In [None]:
# meta_year: calendar year of 'OP_Schnitt', encoded by rank (lowest year -> 0).
# `.dt.year` only works if 'OP_Schnitt' is already a datetime column.
year_data = data['OP_Schnitt'].copy().dt.year
general_data['meta_year'] = categorial_numeric_encoding(year_data)

In [None]:
# meta_system: organ system as integer code; 'Transplant' is treated as missing.
# NOTE(review): categorial_numeric_encoding re-ranks the distinct values, so the
# codes 0-4 assigned here are only preserved if every one of them occurs in the
# data - confirm.
systemData = data['Organsysteme'].copy().replace({ 'Esophagus': 0, 'Magen': 1, 'Kolorektal': 2, 'Leber': 3, 'Pankreas': 4, 'Transplant': pd.NA })
general_data['meta_system'] = categorial_numeric_encoding(systemData)

In [None]:
# meta_ops: encode each procedure by the index label of its 5-character OPS
# prefix in the OPS reference table; prefixes not listed there become NA.
ops = pd.read_csv('../3_ops/3_1_ops.csv')

# First index label per OPS code (same choice as taking `.index[0]` of the
# matching rows). NaN codes are dropped so a missing OPSCode can never match.
ops_lookup = ops.dropna(subset=['ops']).drop_duplicates(subset='ops')
ops_lookup = pd.Series(ops_lookup.index.to_numpy(), index=ops_lookup['ops'].to_numpy())

# `.str[:5]` keeps missing OPSCode values as NaN instead of raising, and
# `.map` resolves all rows in one pass instead of one mask per unique code.
ops_data = data['OPSCode'].str[:5].map(ops_lookup)

general_data['meta_ops'] = numeric(ops_data)

In [None]:
# meta_campus: campus as integer code; 'CBF' is treated as missing.
# NOTE(review): the codes are re-ranked by categorial_numeric_encoding, so
# 'CCM' -> 0 / 'CVK' -> 1 only holds if both occur in the data - confirm.
campus_data = data['Campus'].copy().replace({ 'CCM': 0, 'CVK': 1, 'CBF': pd.NA })
general_data['meta_campus'] = categorial_numeric_encoding(campus_data)

In [None]:
# meta_admission_ts: 'Aufnahme' parsed as datetime
general_data['meta_admission_ts'] = timestamp(data['Aufnahme'])

In [None]:
# meta_discharge_ts: 'Entlassung' parsed as datetime
general_data['meta_discharge_ts'] = timestamp(data['Entlassung'])

In [None]:
# meta_incision_ts: 'OP_Schnitt' parsed as datetime
general_data['meta_incision_ts'] = timestamp(data['OP_Schnitt'])

In [None]:
# meta_suture_ts: 'OP_Naht' parsed as datetime
general_data['meta_suture_ts'] = timestamp(data['OP_Naht'])

In [None]:
# meta_icu_admission_ts: 'Primary_ICU_Stay_from' parsed as datetime
general_data['meta_icu_admission_ts'] = timestamp(data['Primary_ICU_Stay_from'])

In [None]:
# meta_icu_discharge_ts: 'Primary_ICU_Stay_until' parsed as datetime
general_data['meta_icu_discharge_ts'] = timestamp(data['Primary_ICU_Stay_until'])

In [None]:
# meta_follow_up_ts: 'FollowUp_Date' parsed as datetime
general_data['meta_follow_up_ts'] = timestamp(data['FollowUp_Date'])

## Features

### General

In [None]:
# gender: distinct values of 'Geschlecht' encoded by rank
general_data['gender'] = categorial_numeric_encoding(data['Geschlecht'])

In [None]:
# age: 'AgetAtSurgery' (column name as spelled in the raw file) as nullable integer
general_data['age'] = numeric(data['AgetAtSurgery'])

In [None]:
# height: 'Groesse' as nullable integer
# NOTE(review): numeric() casts to Int64, which pandas refuses for non-integral
# floats - confirm height, weight and BMI are whole numbers in the raw export.
general_data['height'] = numeric(data['Groesse'])

In [None]:
# weight: 'Gewicht' as nullable integer
general_data['weight'] = numeric(data['Gewicht'])

In [None]:
# bmi: 'BMI' as nullable integer
general_data['bmi'] = numeric(data['BMI'])

In [None]:
# asa: 'ASA' as nullable integer
general_data['asa'] = numeric(data['ASA'])

In [None]:
# jones: 'Jones' as nullable integer
general_data['jones'] = numeric(data['Jones'])

In [None]:
# ecog: 'ECOG' as nullable integer
general_data['ecog'] = numeric(data['ECOG'])

In [None]:
# functional_status: distinct values of 'Functional_status' encoded by rank
general_data['functional_status'] = categorial_numeric_encoding(data['Functional_status'])

In [None]:
# charlson_comorbidity_score: 'CharlsonComorbidityScore' as nullable integer
general_data['charlson_comorbidity_score'] = numeric(data['CharlsonComorbidityScore'])

### Conditions

In [None]:
# Every cell in this section copies one raw condition column into
# `general_data` as `condition_*`, with its distinct values encoded by rank.

# condition_myocardial_infarction
general_data['condition_myocardial_infarction'] = categorial_numeric_encoding(data['Myocardial_infarction'])

In [None]:
# condition_congestive_heart_failure
general_data['condition_congestive_heart_failure'] = categorial_numeric_encoding(data['Congestive_heart_failure'])

In [None]:
# condition_peripheral_vascular_disease
general_data['condition_peripheral_vascular_disease'] = categorial_numeric_encoding(data['Peripheral_vascular_disease'])

In [None]:
# condition_cerebrovascular_disease
general_data['condition_cerebrovascular_disease'] = categorial_numeric_encoding(data['Cerebrovascular_disease'])

In [None]:
# condition_dementia
general_data['condition_dementia'] = categorial_numeric_encoding(data['Dementia'])

In [None]:
# condition_chronic_pulmonary_disease
general_data['condition_chronic_pulmonary_disease'] = categorial_numeric_encoding(data['Chronic_pulmonary_disease'])

In [None]:
# condition_rheumatic_disease
general_data['condition_rheumatic_disease'] = categorial_numeric_encoding(data['Rheumatic_disease'])

In [None]:
# condition_peptic_ulcer_disease
general_data['condition_peptic_ulcer_disease'] = categorial_numeric_encoding(data['Peptic_ulcer_disease'])

In [None]:
# condition_liver_disease_mild (raw column kept in a notebook-level variable)
liver_disease_mild = data['Liver_disease_mild']
general_data['condition_liver_disease_mild'] = categorial_numeric_encoding(liver_disease_mild)

In [None]:
# condition_liver_disease_moderate_to_severe
liver_disease_moderate_to_severe = data['Liver_disease_moderate_to_severe']
general_data['condition_liver_disease_moderate_to_severe'] = categorial_numeric_encoding(liver_disease_moderate_to_severe)

In [None]:
# condition_diabetes_without_chronic_complications
diabetes_without_chronic_complications = data['Diabetes_without_chronic_complications']
general_data['condition_diabetes_without_chronic_complications'] = categorial_numeric_encoding(diabetes_without_chronic_complications)

In [None]:
# condition_diabetes_with_chronic_complications
diabetes_with_chronic_complications = data['Diabetes_with_chronic_complications']
general_data['condition_diabetes_with_chronic_complications'] = categorial_numeric_encoding(diabetes_with_chronic_complications)

In [None]:
# condition_renal_disease_mild_to_moderate
renal_disease_mild_to_moderate = data['Renal_disease_mild_to_moderate']
general_data['condition_renal_disease_mild_to_moderate'] = categorial_numeric_encoding(renal_disease_mild_to_moderate)

In [None]:
# condition_renal_disease_severe
renal_disease_severe = data['Renal_disease_severe']
general_data['condition_renal_disease_severe'] = categorial_numeric_encoding(renal_disease_severe)

In [None]:
# condition_hemiplegia_or_paraplegia
general_data['condition_hemiplegia_or_paraplegia'] = categorial_numeric_encoding(data['Hemiplegia_or_paraplegia'])

In [None]:
# condition_malignancy (raw column is named 'Any_malignancy')
general_data['condition_malignancy'] = categorial_numeric_encoding(data['Any_malignancy'])

In [None]:
# condition_metastatic_solid_tumor
general_data['condition_metastatic_solid_tumor'] = categorial_numeric_encoding(data['Metastatic_solid_tumor'])

In [None]:
# condition_hiv_without_aids (raw column is named 'HIV_infection_no_AIDS')
hiv_without_aids = data['HIV_infection_no_AIDS']
general_data['condition_hiv_without_aids'] = categorial_numeric_encoding(hiv_without_aids)

In [None]:
# condition_aids
aids = data['AIDS']
general_data['condition_aids'] = categorial_numeric_encoding(aids)

### Surgery

In [None]:
# surgery_year: feature copy of the already encoded 'meta_year' column
general_data['surgery_year'] = general_data['meta_year'].copy()

In [None]:
# surgery_system: one indicator column per 'meta_system' code, named
# 'surgery_system_<code>' (the rename only sets the column-name prefix)
general_data = categorial_one_hot_encoding(general_data, general_data['meta_system'].copy().rename('surgery_system'))

In [None]:
# surgery_ops: one indicator column per 'meta_ops' code, named 'surgery_ops_<code>'
general_data = categorial_one_hot_encoding(general_data, general_data['meta_ops'].copy().rename('surgery_ops'))

In [None]:
# surgery_urgency: explicit ordinal code per urgency level ('N0' -> 0 ... 'N5' -> 5).
# The codes are assigned directly instead of being passed through
# categorial_numeric_encoding, which re-ranks the distinct values and would
# shift them whenever a level is absent from the data; the urgency filter
# further down compares this column against the literal code 5.
# 'keine', '<Error>' and any other unlisted value become NA.
urgency_codes = { 'N0': 0, 'N1': 1, 'N2': 2, 'N3': 3, 'N4': 4, 'N5': 5 }
urgency_data = data['Dringlichkeit'].map(urgency_codes).astype('Int64')

# Same summary layout as the encoding helpers: completeness, then share per code.
print('Completeness:', round(urgency_data.count() / len(urgency_data) * 100, 2), '%')
for urgency_label, urgency_code in urgency_codes.items():
    print(urgency_code, urgency_label, round((urgency_data == urgency_code).sum() / len(urgency_data) * 100, 2), '%')

general_data['surgery_urgency'] = urgency_data

In [None]:
# surgery_procedure: currently disabled, 'Prozedur' is not encoded
# general_data['surgery_procedure'] = categorial_numeric_encoding(data['Prozedur'])

In [None]:
# surgery_approach: labels renamed to English, then one indicator column per label.
# The first print reports the raw completeness as a fraction (0-1); the helper
# prints the completeness again in percent.
print('Completeness: ' + str(data['Approach'].notna().sum() / len(data['Approach'])))
approach_data = data['Approach'].copy().replace({ 'offen': 'open', 'konvertiert laparoskopisch-offen': 'converted', 'minimalinvasiv': 'minimalinvasive' })
approach_data.name = 'surgery_approach'
general_data = categorial_one_hot_encoding(general_data, approach_data)

In [None]:
# surgery_pancreatic_resection: 'keine' becomes missing, remaining labels are
# renamed and expanded into one indicator column each
print('Completeness: ' + str(data['Art_Pankreasresektion'].notna().sum() / len(data['Art_Pankreasresektion'])))
pancreatic_resection_data = data['Art_Pankreasresektion'].copy().replace({ 'keine': pd.NA, 'Pankreadektomie': 'pancreatectomy', 'Kopfresektion': 'head', 'Linksresektion': 'left', 'Beger': 'beger' })
pancreatic_resection_data.name = 'surgery_pancreatic_resection'
general_data = categorial_one_hot_encoding(general_data, pancreatic_resection_data)

In [None]:
# surgery_pancreatic_reconstruction: distinct values of 'Pankreasreko' encoded by rank
general_data['surgery_pancreatic_reconstruction'] = categorial_numeric_encoding(data['Pankreasreko'])

In [None]:
# surgery_liver_resection: 'keine' becomes missing, one indicator column per remaining label
print('Completeness: ' + str(data['Art-Leberresektion'].notna().sum() / len(data['Art-Leberresektion'])))
liver_resection_data = data['Art-Leberresektion'].copy().replace({ 'keine': pd.NA, 'Minor': 'minor', 'Major': 'major' })
liver_resection_data.name = 'surgery_liver_resection'
general_data = categorial_one_hot_encoding(general_data, liver_resection_data)

In [None]:
# surgery_esophageal_resection: 'keine' becomes missing, one indicator column per remaining label
print('Completeness: ' + str(data['Art_Ösophagusresektion'].notna().sum() / len(data['Art_Ösophagusresektion'])))
esophageal_resection_data = data['Art_Ösophagusresektion'].copy().replace({ 'keine': pd.NA, 'totale': 'total', 'abdominothorakale': 'abdominothorakale', 'Transhiatal erweiterte totale Gastrektomie + distale Ösophagusresektion': 'transhiatal_extended' })
esophageal_resection_data.name = 'surgery_esophageal_resection'
general_data = categorial_one_hot_encoding(general_data, esophageal_resection_data)

In [None]:
# surgery_esophageal_anastomosis: both spellings 'keine' and 'Keine' become missing,
# one indicator column per remaining label
print('Completeness: ' + str(data['Art_Ösophagusanastomose'].notna().sum() / len(data['Art_Ösophagusanastomose'])))
esophageal_anastomosis_data = data['Art_Ösophagusanastomose'].copy().replace({ 'keine': pd.NA, 'cervikal': 'cervical', 'thorakal': 'thoracic', 'transhiatal': 'transhiatal', 'Interponat': 'interponat', 'Keine': pd.NA })
esophageal_anastomosis_data.name = 'surgery_esophageal_anastomosis'
general_data = categorial_one_hot_encoding(general_data, esophageal_anastomosis_data)

In [None]:
# surgery_gastric_resection: 'keine' becomes missing, one indicator column per remaining label
print('Completeness: ' + str(data['Art_Magenresektion'].notna().sum() / len(data['Art_Magenresektion'])))
gastric_resection_data = data['Art_Magenresektion'].copy().replace({ 'keine': pd.NA, 'Partielle': 'partial', 'Atypische': 'atypical', 'Totale': 'total' })
gastric_resection_data.name = 'surgery_gastric_resection'
general_data = categorial_one_hot_encoding(general_data, gastric_resection_data)

In [None]:
# surgery_gastric_reconstruction: 'keine' becomes missing, one indicator column per remaining label
print('Completeness: ' + str(data['Art_Magenrekonstruktion'].notna().sum() / len(data['Art_Magenrekonstruktion'])))
gastric_reconstruction_data = data['Art_Magenrekonstruktion'].copy().replace({ 'keine': pd.NA, 'Gastrojejuno': 'gastrojejuno', 'Ösophagojejuno': 'esophagojejuno', 'Interponat': 'interponat', 'Ösophagogastro': 'esophagogastro', 'Gastroduodeno': 'gastroduodeno' })
gastric_reconstruction_data.name = 'surgery_gastric_reconstruction'
general_data = categorial_one_hot_encoding(general_data, gastric_reconstruction_data)

In [None]:
# surgery_small_intestine_resection: 'keine' becomes missing, one indicator column per remaining label
print('Completeness: ' + str(data['Art_Dünndarmresektion'].notna().sum() / len(data['Art_Dünndarmresektion'])))
small_intestine_resection_data = data['Art_Dünndarmresektion'].copy().replace({ 'keine': pd.NA, 'Ileum': 'ileum', 'Duodenum': 'duodenum', 'Jejunum': 'jejunum' })
small_intestine_resection_data.name = 'surgery_small_intestine_resection'
general_data = categorial_one_hot_encoding(general_data, small_intestine_resection_data)

In [None]:
# surgery_small_intestine_anastomosis: distinct values of 'Dünndarmanastomose' encoded by rank
general_data['surgery_small_intestine_anastomosis'] = categorial_numeric_encoding(data['Dünndarmanastomose'])

In [None]:
# surgery_colon_resection: 'keine' becomes missing, remaining labels are renamed
# and expanded into one indicator column each. The first print reports the raw
# completeness as a fraction (0-1).
print('Completeness: ' + str(data['Art_Kolonresektion'].notna().sum() / len(data['Art_Kolonresektion'])))
colon_resection_data = data['Art_Kolonresektion'].copy().replace({ 'keine': pd.NA, 'Hemikolektomie_rechts': 'hemicolectomy_right', 'Hemikolektomie_links': 'hemicolectomy_left', 'Segmentresektion': 'segment_resection', 'Ileocoecalresektion': 'ileocoecal_resection', 'Sigmaresektion': 'sigmoid_resection', 'Zoecumresektion': 'zoecum_resection', 'Transversumresektion': 'transverse_resection', 'Kolektomie': 'colectomy', 'Proktokolektomie': 'proctocolectomy' })
colon_resection_data.name = 'surgery_colon_resection'
general_data = categorial_one_hot_encoding(general_data, colon_resection_data)

In [None]:
# surgery_colon_anastomosis: distinct values of 'Kolonanastomose' encoded by rank
general_data['surgery_colon_anastomosis'] = categorial_numeric_encoding(data['Kolonanastomose'])

In [None]:
# surgery_rectum_resection: 'keine' becomes missing, one indicator column per remaining label
print('Completeness: ' + str(data['Art_Rektumresektion'].notna().sum() / len(data['Art_Rektumresektion'])))
rectum_resection_data = data['Art_Rektumresektion'].copy().replace({ 'keine': pd.NA, 'TME': 'tme', 'TaTME': 'tatme', 'Rektumamputation': 'amputation' })
rectum_resection_data.name = 'surgery_rectum_resection'
general_data = categorial_one_hot_encoding(general_data, rectum_resection_data)

In [None]:
# surgery_rectum_anastomosis: distinct values of 'Rektumanastomose' encoded by rank
general_data['surgery_rectum_anastomosis'] = categorial_numeric_encoding(data['Rektumanastomose'])

In [None]:
# surgery_stoma_relocation: distinct values of 'Stomarueckverlagerung' encoded by rank
general_data['surgery_stoma_relocation'] = categorial_numeric_encoding(data['Stomarueckverlagerung'])

In [None]:
# surgery_reconnection: distinct values of 'Wiederanschluss_OP' encoded by rank
general_data['surgery_reconnection'] = categorial_numeric_encoding(data['Wiederanschluss_OP'])

In [None]:
# surgery_liver_transplantation: 'keine' becomes missing, one indicator column per remaining label
print('Completeness: ' + str(data['Leber-Transplantation'].notna().sum() / len(data['Leber-Transplantation'])))
liver_transplant_data = data['Leber-Transplantation'].copy().replace({ 'keine': pd.NA, 'komplett': 'complete', 'split': 'split' })
liver_transplant_data.name = 'surgery_liver_transplantation'
general_data = categorial_one_hot_encoding(general_data, liver_transplant_data)

In [None]:
# surgery_kidney_transplantation: 'keine' becomes missing, one indicator column per remaining label
print('Completeness: ' + str(data['Nierentransplantation'].notna().sum() / len(data['Nierentransplantation'])))
kidney_transplant_data = data['Nierentransplantation'].copy().replace({ 'keine': pd.NA, 'lebend': 'alive', 'leichen': 'corpses' })
kidney_transplant_data.name = 'surgery_kidney_transplantation'
general_data = categorial_one_hot_encoding(general_data, kidney_transplant_data)

In [None]:
# Every cell below copies one raw surgery column into `general_data` as
# `surgery_*`, with its distinct values encoded by rank.

# surgery_pancreas_transplantation
general_data['surgery_pancreas_transplantation'] = categorial_numeric_encoding(data['Pankreastransplantation'])

In [None]:
# surgery_cholecystectomy
cholecystectomy = data['Cholezystektomie']
general_data['surgery_cholecystectomy'] = categorial_numeric_encoding(cholecystectomy)

In [None]:
# surgery_additional_cholecystectomy (raw column name contains a space before '_im')
additional_cholecystectomy = data['Cholezystektomie _im Rahmen']
general_data['surgery_additional_cholecystectomy'] = categorial_numeric_encoding(additional_cholecystectomy)

In [None]:
# surgery_lung_resection
general_data['surgery_lung_resection'] = categorial_numeric_encoding(data['Lungenresektion'])

In [None]:
# surgery_kidney_resection (raw column is spelled 'Niereresektion')
general_data['surgery_kidney_resection'] = categorial_numeric_encoding(data['Niereresektion'])

In [None]:
# surgery_another_organs_or_delbuking (spelling follows the raw column 'Andere Organe/Delbuking')
general_data['surgery_another_organs_or_delbuking'] = categorial_numeric_encoding(data['Andere Organe/Delbuking'])

In [None]:
# surgery_vascular_resection
general_data['surgery_vascular_resection'] = categorial_numeric_encoding(data['Gefäßresektion'])

In [None]:
# surgery_biliodigestive_anastomosis (raw column 'BDA')
general_data['surgery_biliodigestive_anastomosis'] = categorial_numeric_encoding(data['BDA'])

In [None]:
# surgery_splenectomy
general_data['surgery_splenectomy'] = categorial_numeric_encoding(data['Splenektomie'])

In [None]:
# surgery_simultaneous_hernia_management
general_data['surgery_simultaneous_hernia_management'] = categorial_numeric_encoding(data['Simultane Hernienversorgung'])

In [None]:
# surgery_simultaneous_ablation
general_data['surgery_simultaneous_ablation'] = categorial_numeric_encoding(data['Simultane Ablation'])

In [None]:
# surgery_stoma_facility (raw column 'Stoma-Anlage')
general_data['surgery_stoma_facility'] = categorial_numeric_encoding(data['Stoma-Anlage'])

In [None]:
# surgery_hipec (raw column is named 'HIPAC')
general_data['surgery_hipec'] = categorial_numeric_encoding(data['HIPAC'])

## Targets

In [None]:
# target_30_day_mortality: distinct values of '30d_mortality' encoded by rank
general_data['target_30_day_mortality'] = categorial_numeric_encoding(data['30d_mortality'])

In [None]:
# target_90_day_mortality: distinct values of '90d_mortality' encoded by rank
general_data['target_90_day_mortality'] = categorial_numeric_encoding(data['90d_mortality'])

In [None]:
# target_death_within_primary_stay: distinct values encoded by rank
general_data['target_death_within_primary_stay'] = categorial_numeric_encoding(data['Death_within_primary_stay'])

In [None]:
# target_icu_readmission: distinct values of 'Readmission_ICU_at_all' encoded by rank
general_data['target_icu_readmission'] = categorial_numeric_encoding(data['Readmission_ICU_at_all'])

In [None]:
# target_resurgery: distinct values of 'ReOP' encoded by rank
general_data['target_resurgery'] = categorial_numeric_encoding(data['ReOP'])

In [None]:
# target_number_of_surgeries: 'corr_Anzahl' as nullable integer
general_data['target_number_of_surgeries'] = numeric(data['corr_Anzahl'])

In [None]:
# target_discharge_ts: parsed from 'Entlassung', the same source column as meta_discharge_ts
general_data['target_discharge_ts'] = timestamp(data['Entlassung'])

In [None]:
# target_length_of_stay: 'LOS' as nullable integer
general_data['target_length_of_stay'] = numeric(data['LOS'])

In [None]:
# target_number_of_icu_stays: 'N_ICU-stays' as nullable integer
general_data['target_number_of_icu_stays'] = numeric(data['N_ICU-stays'])

In [None]:
# target_nights_on_primary_icu_stay: 'Nights_on_primary_ICU_stay' as nullable integer
general_data['target_nights_on_primary_icu_stay'] = numeric(data['Nights_on_primary_ICU_stay'])

In [None]:
# target_number_of_icu_readmissions: 'N_Readmissions_ICU' as nullable integer
general_data['target_number_of_icu_readmissions'] = numeric(data['N_Readmissions_ICU'])

In [None]:
# target_surgery_duration: 'OPDauer' as nullable integer
general_data['target_surgery_duration'] = numeric(data['OPDauer'])

# Check

In [None]:
# Replace every remaining -1 in the cleaned frame with pandas.NA.
# NOTE(review): columns produced by categorial_numeric_encoding were already
# rank-encoded, so a raw -1 in those columns would have become code 0 before
# this step - confirm that -1 only occurs in columns handled by numeric().
general_data = general_data.replace(-1, pd.NA)
general_data

In [None]:
# Check whether column 'meta_case' contains duplicates (displays True/False).
general_data['meta_case'].duplicated().any()

In [None]:
# Load the list of included cases.
included_cases = pd.read_csv('../4_cases/4_3_clean.csv')

# Keep only the rows of general_data whose 'meta_case' appears in
# included_cases['case'], printing row and case counts before and after.
print('Length of masterData before: ' + str(len(general_data)))
print('Number of cases before: ' + str(general_data['meta_case'].nunique()))
general_data = general_data[general_data['meta_case'].isin(included_cases['case'])]
print('Length of masterData after: ' + str(len(general_data)))
print('Number of cases after: ' + str(general_data['meta_case'].nunique()))

general_data

# Clean Export

In [None]:
# Write the cleaned frame (without the index) and its describe() summary
# to CSV, then display the frame.
general_data.to_csv('5_3_clean.csv', index = False)
general_data.describe().to_csv('5_4_clean_analysis.csv')
general_data

# Final Export

## Inclusion Criteria

In [None]:
# Year: keep only rows with a known 'meta_year'
general_data = general_data[general_data['meta_year'].notna()]
general_data

In [None]:
# System: keep only rows with a known 'meta_system'
general_data = general_data[general_data['meta_system'].notna()]
general_data

In [None]:
# OPS: keep only rows with a known 'meta_ops'
general_data = general_data[general_data['meta_ops'].notna()]
general_data

## Exclusion Criteria

In [None]:
# Age: keep only rows with age >= 18
# (rows with a missing age presumably drop out as well - verify)
general_data = general_data[general_data['age'] >= 18]
general_data

In [None]:
# Urgency: keep only rows whose encoded urgency equals 5
# NOTE(review): code 5 corresponds to the source level 'N5' only as long as the
# urgency encoding keeps the explicit codes 0-5 - confirm.
general_data = general_data[general_data['surgery_urgency'] == 5]
general_data

In [None]:
# 30 Day Mortality: keep only rows with a known 30-day mortality target
general_data = general_data[general_data['target_30_day_mortality'].notna()]
general_data

In [None]:
# 90 Day Mortality: keep only rows with a known 90-day mortality target
general_data = general_data[general_data['target_90_day_mortality'].notna()]
general_data

## Export

In [None]:
# Write the filtered frame (without the index) and its describe() summary
# to CSV, then display the frame.
general_data.to_csv('5_5_final.csv', index = False)
general_data.describe().to_csv('5_6_final_analysis.csv')
general_data