# Import

In [None]:
import pandas as pd
import numpy as np

# Setup

In [None]:
data = pd.DataFrame()

In [None]:
raw = pd.read_excel('1_raw.xlsx')
raw

In [None]:
system_data = pd.read_csv('2_systems.csv')

# Look-up of the first 'systems' entry per case. It is built once instead of
# filtering system_data twice for every row of raw; setdefault keeps the first
# occurrence, which mirrors the previous `.iloc[0]` choice for repeated cases.
first_systems_by_case = {}
for case, systems in zip(system_data['case'], system_data['systems']):
    first_systems_by_case.setdefault(case, systems)

# One systems string per raw row. Cases without an entry, and entries whose
# 'systems' cell is not a string (an empty CSV cell is read as NaN), fall back
# to '' so the substring checks below cannot raise.
row_systems = [
    systems if isinstance(systems, str) else ''
    for systems in (first_systems_by_case.get(case) for case in raw['case'])
]

# A comma-separated list of k systems contains k - 1 commas.
# Rows without system information get NaN rather than a count.
raw['surgery_system_count'] = pd.Series(
    [systems.count(',') + 1 if systems else np.nan for systems in row_systems],
    index=raw.index,
    dtype=float,
)

# Flag column -> keyword that has to occur somewhere in the systems string.
system_keywords = {
    'surgery_system_esophagus': 'Esophagus',
    'surgery_system_stomach': 'Magen',
    'surgery_system_intestine': 'Kolorektal',
    'surgery_system_liver': 'Leber',
    'surgery_system_pancreas': 'Pankreas',
}
for column, keyword in system_keywords.items():
    # Rows without system information are flagged 0, not NaN.
    raw[column] = pd.Series(
        [1 if keyword in systems else 0 for systems in row_systems],
        index=raw.index,
        dtype=float,
    )

raw

# Logic

In [None]:
def numeric(name: str, raw_name: str, min: float = np.nan, max: float = np.nan):
    """Clean a numeric column of ``raw`` and publish it as ``data[name]``.

    The column ``raw[raw_name]`` is converted to numbers (unparseable entries
    become NaN), values outside the optional bounds are replaced by NaN, and
    the cleaned column is written back to ``raw`` in place. A short report is
    printed: the unparseable values, the number of out-of-range values, and
    completeness / mean / std / min / max for all rows, for ``center == 0``
    ("Data") and for ``center == 1`` ("Validation"). Finally the column is
    stored in the module-level ``data`` frame as float.

    Args:
        name: Name of the column to create in ``data``.
        raw_name: Name of the column in ``raw`` to clean (modified in place).
        min: Lower bound; values strictly below it become NaN. NaN (the
            default) disables the check.
        max: Upper bound; values strictly above it become NaN. NaN (the
            default) disables the check.

    Note:
        ``min`` and ``max`` shadow the builtins inside this function; the
        names are kept because callers pass them as keywords.
    """
    column = raw[raw_name]
    converted = pd.to_numeric(column, errors='coerce')

    # Present in the source but not parseable as a number.
    unparseable = column[converted.isna() & column.notna()]
    print('Non Numeric unique Values:', unparseable.unique())

    if not np.isnan(min):
        too_small = converted < min
        print('Values smaller than min:', too_small.sum())
        converted = converted.mask(too_small)
    if not np.isnan(max):
        too_large = converted > max
        print('Values greater than max:', too_large.sum())
        converted = converted.mask(too_large)

    raw[raw_name] = converted

    subsets = (
        ('All', raw[raw_name]),
        ('Data', raw[raw['center'] == 0][raw_name]),
        ('Validation', raw[raw['center'] == 1][raw_name]),
    )
    for label, values in subsets:
        completeness = round(values.count() / len(values) * 100, 2)
        print(f'{label}: {completeness} % (Mean: {round(values.mean(), 2)}, Std: {round(values.std(), 2)}, Min: {round(values.min(), 2)}, Max: {round(values.max(), 2)})')

    data[name] = raw[raw_name].astype(float)

In [None]:
def categorial_numeric_encoding(name: str, raw_name: str, not_supported: list = None):
    """Encode the categories of ``raw[raw_name]`` as integer codes in ``data[name]``.

    Values listed in ``not_supported`` are treated as missing. The remaining
    distinct values are sorted (when they are mutually comparable) and
    numbered 0, 1, 2, ... in that order. The encoded column replaces
    ``raw[raw_name]`` in place and is stored in the module-level ``data``
    frame with the nullable ``Int64`` dtype. For every category the code, the
    original value and its share of all rows are printed, followed by
    completeness / mean / std / min / max for all rows, ``center == 0``
    ("Data") and ``center == 1`` ("Validation").

    Args:
        name: Name of the column to create in ``data``.
        raw_name: Name of the column in ``raw`` to encode (modified in place).
        not_supported: Values to replace by NaN before encoding. ``None``
            (the default) or an empty list leaves the column untouched.
    """
    column = raw[raw_name]
    if not_supported:
        # Assign the result instead of calling replace(..., inplace=True) on
        # raw[raw_name]: the in-place form acts on a temporary Series and does
        # not reach `raw` once pandas runs with Copy-on-Write.
        column = column.replace(not_supported, np.nan)

    categories = column.unique()
    categories = categories[~pd.isna(categories)]

    try:
        categories = sorted(categories)
    except TypeError:
        # Mixed, non-comparable types: keep the order of first appearance.
        pass

    codes = {value: code for code, value in enumerate(categories)}
    for value, code in codes.items():
        share = (column == value).sum() / len(column) * 100
        print(code, value, round(share, 2), '%')

    # Translate every value in a single pass. Replacing one category after the
    # other would let an already assigned code collide with a raw value that
    # is handled later (e.g. -1 -> 0 followed by 0 -> 1 merges both groups).
    if codes:
        column = column.map(codes)
    raw[raw_name] = column

    subsets = (
        ('All', raw[raw_name]),
        ('Data', raw[raw['center'] == 0][raw_name]),
        ('Validation', raw[raw['center'] == 1][raw_name]),
    )
    for label, values in subsets:
        completeness = round(values.count() / len(values) * 100, 2)
        print(f'{label}: {completeness} % (Mean: {round(values.mean(), 2)}, Std: {round(values.std(), 2)}, Min: {round(values.min(), 2)}, Max: {round(values.max(), 2)})')

    data[name] = raw[raw_name].astype('Int64')

In [None]:
def categorial_one_hot_encoding(name: str, raw_name: str, mapping: dict = None, not_supported: list = None):
    """Normalise a categorical column and store it as text in ``data[name]``.

    ``mapping`` renames or merges raw values, ``not_supported`` values are
    treated as missing. The cleaned column replaces ``raw[raw_name]`` in place
    and is stored as strings in the module-level ``data`` frame; the actual
    dummy columns are produced later in the notebook by ``pd.get_dummies``.
    The share of every category is printed, followed by the completeness for
    all rows, ``center == 0`` ("Data") and ``center == 1`` ("Validation").

    Args:
        name: Name of the column to create in ``data``.
        raw_name: Name of the column in ``raw`` to clean (modified in place).
        mapping: ``{raw value: category}`` replacements applied first.
            ``None`` (the default) or an empty dict applies none.
        not_supported: Values to replace by NaN after the mapping. ``None``
            (the default) or an empty list replaces none.
    """
    column = raw[raw_name]
    # Assign the results instead of calling replace(..., inplace=True) on
    # raw[raw_name]: the in-place form acts on a temporary Series and does not
    # reach `raw` once pandas runs with Copy-on-Write.
    if mapping:
        column = column.replace(mapping)
    if not_supported:
        column = column.replace(not_supported, np.nan)
    raw[raw_name] = column

    categories = column.unique()
    categories = categories[~pd.isna(categories)]

    try:
        categories = sorted(categories)
    except TypeError:
        # Mixed, non-comparable types: keep the order of first appearance.
        pass

    for value in categories:
        print(value, round((column == value).sum() / len(column) * 100, 2), '%')

    subsets = (
        ('All', raw[raw_name]),
        ('Data', raw[raw['center'] == 0][raw_name]),
        ('Validation', raw[raw['center'] == 1][raw_name]),
    )
    for label, values in subsets:
        print(f'{label}: {round(values.count() / len(values) * 100, 2)} %')

    # Keep missing values missing. A plain astype(str) can turn NaN into the
    # literal text 'nan', which then counts as a filled cell and gets its own
    # dummy column even with dummy_na=False.
    data[name] = column.astype(str).where(column.notna())

In [None]:
def timestamp(name: str, raw_name: str):
    """Parse ``raw[raw_name]`` as datetimes and publish it as ``data[name]``.

    Entries that cannot be parsed become NaT. The parsed column replaces
    ``raw[raw_name]`` in place. Completeness plus earliest and latest value
    are printed for all rows, for ``center == 0`` ("Data") and for
    ``center == 1`` ("Validation").

    Args:
        name: Name of the column to create in the module-level ``data`` frame.
        raw_name: Name of the column in ``raw`` to parse (modified in place).
    """
    raw[raw_name] = pd.to_datetime(raw[raw_name], errors='coerce')
    parsed = raw[raw_name]

    report = (
        ('All', parsed),
        ('Data', raw[raw['center'] == 0][raw_name]),
        ('Validation', raw[raw['center'] == 1][raw_name]),
    )
    for label, values in report:
        filled = round(values.count() / len(values) * 100, 2)
        print(f'{label}: {filled} % (Min: {values.min()}, Max: {values.max()})')

    data[name] = parsed

# Populate

## Meta

In [None]:
# Center: the sites CVK and CCM are pooled as 0, CBF becomes 1.
# The result is assigned back rather than using replace(..., inplace=True) on
# raw['center'], which acts on a temporary Series and leaves `raw` unchanged
# once pandas runs with Copy-on-Write.
raw['center'] = raw['center'].replace({ 'CVK': 0, 'CCM': 0, 'CBF': 1 })
categorial_numeric_encoding('meta_center', raw_name='center')

In [None]:
# Case
numeric('meta_case', raw_name='case', min=0)

In [None]:
# Patient
numeric('meta_patient', raw_name='patient', min=0)

In [None]:
# Incision
timestamp('meta_incision', raw_name='op_schnitt')

In [None]:
# Suture
# timestamp('meta_suture', raw_name='op_naht')

In [None]:
# Year
raw['meta_year'] = data['meta_incision'].copy().dt.year
categorial_numeric_encoding('meta_year', raw_name='meta_year', not_supported=[2023])

In [None]:
# Meta: System
raw['meta_system'] = raw['organ_system'].replace({ 'Esophagus': 0, 'Gastric': 1, 'Colorectal': 2, 'Small_intestine': 2, 'Liver': 3, 'Pancreas': 4 })
categorial_numeric_encoding('meta_system', raw_name='meta_system', not_supported=['Other_organ_system', 'Transplant'])

In [None]:
# Meta: OPS
numeric('meta_ops', raw_name='is_majorop')

In [None]:
# Meta: Age
numeric('meta_age', raw_name='ageatsurgery')

In [None]:
# Meta: Urgency
categorial_numeric_encoding('meta_urgency', raw_name='urgency', not_supported=['keine'])

In [None]:
# Meta: Follow Up
categorial_numeric_encoding('meta_follow_up', raw_name='previous_op')

## Feature

### General

In [None]:
# Gender: male -> 0, female -> 1.
# The result is assigned back rather than using replace(..., inplace=True) on
# raw['patient_sex'], which acts on a temporary Series and leaves `raw`
# unchanged once pandas runs with Copy-on-Write.
raw['patient_sex'] = raw['patient_sex'].replace({ 'male': 0, 'female': 1 })
categorial_numeric_encoding('gender', raw_name='patient_sex')

In [None]:
# Age
numeric('age', raw_name='ageatsurgery', min=18, max=120)

In [None]:
# Height
numeric('height', raw_name='height_in_cm', min=100, max=250)

In [None]:
# Weight
numeric('weight', raw_name='pre_op_weight_in_kg', min=25, max=300)

In [None]:
# BMI
numeric('bmi', raw_name='bmi', min=5, max=100)

In [None]:
# ASA
categorial_numeric_encoding('asa', raw_name='asa_classification')

In [None]:
# ECOG
# categorial_numeric_encoding('ecog', raw_name='ECOG')

In [None]:
# CCI
categorial_numeric_encoding('cci', raw_name='cci')

### Condition

In [None]:
# Condition: Myocardial Infarction
categorial_numeric_encoding('condition_myocardial_infarction', raw_name='myocardial_infarction')

In [None]:
# Condition: Congestive Heart Failure
categorial_numeric_encoding('condition_congestive_heart_failure', raw_name='congestive_heart_failure')

In [None]:
# Condition: Peripheral Vascular Disease
categorial_numeric_encoding('condition_peripheral_vascular_disease', raw_name='peripheral_vascular_disease')

In [None]:
# Condition: Cerebrovascular Disease
categorial_numeric_encoding('condition_cerebrovascular_disease', raw_name='cerebrovascular_disease')

In [None]:
# Condition: Dementia
categorial_numeric_encoding('condition_dementia', raw_name='dementia')

In [None]:
# Condition: Chronic Pulmonary Disease
categorial_numeric_encoding('condition_chronic_pulmonary_disease', raw_name='chronic_pulmonary_disease')

In [None]:
# Condition: Rheumatic Disease
categorial_numeric_encoding('condition_rheumatic_disease', raw_name='rheumathic_disease')

In [None]:
# Condition: Peptic Ulcer Disease
categorial_numeric_encoding('condition_peptic_ulcer_disease', raw_name='peptic_ulcer_disease')

In [None]:
# Condition: Liver Disease (Mild)
categorial_numeric_encoding('condition_liver_disease_mild', raw_name='mild_liver_disease')

In [None]:
# Condition: Liver Disease (Moderate to Severe)
categorial_numeric_encoding('condition_liver_disease_moderate_to_severe', raw_name='moderate_or_severe_liver_disease')

In [None]:
# Condition: Diabetes (without Chronic Complications)
categorial_numeric_encoding('condition_diabetes_without_chronic_complications', raw_name='diabetes_without_chronic_complications')

In [None]:
# Condition: Diabetes (with Chronic Complications)
categorial_numeric_encoding('condition_diabetes_with_chronic_complications', raw_name='diabetes_with_chronic_complications')

In [None]:
# Condition: Hemiplegia or Paraplegia
categorial_numeric_encoding('condition_hemiplegia_or_paraplegia', raw_name='hemiplegia_or_paraplegia')

In [None]:
# Condition: Renal Disease
categorial_numeric_encoding('condition_renal_disease', raw_name='renal_disease')

In [None]:
# Condition: Malignancy
categorial_numeric_encoding('condition_malignancy', raw_name='any_malignancy')

In [None]:
# Condition: Metastatic Solid Tumor
categorial_numeric_encoding('condition_metastatic_solid_tumor', raw_name='metastatic_solid_tumor')

In [None]:
# Condition: AIDS
categorial_numeric_encoding('condition_aids', raw_name='aids_hiv')

In [None]:
# Condition: Cardiac Arrhythmia
categorial_numeric_encoding('condition_cardiac_arythmia', raw_name='cardiac_arrythmias')

In [None]:
# Condition: Valvular Disease
categorial_numeric_encoding('condition_valvular_disease', raw_name='valvular_disease')

In [None]:
# Condition: Pulmonary Circulatory Disorder
categorial_numeric_encoding('condition_pulmonary_circulatory_disorder', raw_name='pulmonary_circulation_disorder')

In [None]:
# Condition: Arterial Hypertension
categorial_numeric_encoding('condition_arterial_hypertension', raw_name='arterial_hypertension')

In [None]:
# Condition: Other Neurological Disorders
categorial_numeric_encoding('condition_other_neurological_disorders', raw_name='other_neurological_disorders')

In [None]:
# Condition: Hypothyroidism
categorial_numeric_encoding('condition_hypothyroidism', raw_name='hypothyroidism')

In [None]:
# Condition: Coagulopathy
categorial_numeric_encoding('condition_coagulopathy', raw_name='coagulopathy')

In [None]:
# Condition: Obesity
categorial_numeric_encoding('condition_obesity', raw_name='obesity')

In [None]:
# Condition: Weight Loss
categorial_numeric_encoding('condition_weight_loss', raw_name='weight_loss')

In [None]:
# Condition: Fluid and Electrolyte Disorders
categorial_numeric_encoding('condition_fluid_and_electrolyte_disorders', raw_name='fluid_and_electrolyte_disorders')

In [None]:
# Condition: Blood Loss Anemia
categorial_numeric_encoding('condition_blood_loss_anemia', raw_name='blood_loss_anemia')

In [None]:
# Condition: Deficiency Anemia
categorial_numeric_encoding('condition_deficiency_anemia', raw_name='deficiency_anemia')

In [None]:
# Condition: Alcohol Abuse
categorial_numeric_encoding('condition_alcohol_abuse', raw_name='alcohol_abuse')

In [None]:
# Condition: Drug Abuse
categorial_numeric_encoding('condition_drug_abuse', raw_name='drug_abuse')

In [None]:
# Condition: Psychoses
categorial_numeric_encoding('condition_psychoses', raw_name='psychoses')

In [None]:
# Condition: Depression
categorial_numeric_encoding('condition_depression', raw_name='depression')

In [None]:
# Condition: Coronary Heart Disease
categorial_numeric_encoding('condition_coronary_heart_disease', raw_name='khk')

In [None]:
# Condition: Chronic Pancreatitis
categorial_numeric_encoding('condition_chronic_pancreatitis', raw_name='chronic_pancreatitis')

### Surgery

In [None]:
# Surgery: Primary System
categorial_one_hot_encoding('surgery_primary_system', raw_name='organ_system', mapping={'Esophagus': 'esophagus', 'Gastric': 'stomach', 'Colorectal': 'intestine', 'Small_intestine': 'intestine', 'Liver': 'liver', 'Pancreas': 'pancreas'}, not_supported=['Other_organ_system', 'Transplant'])

In [None]:
# Surgery: System Esophagus
categorial_numeric_encoding('surgery_system_esophagus', raw_name='surgery_system_esophagus')

In [None]:
# Surgery: System Stomach
categorial_numeric_encoding('surgery_system_stomach', raw_name='surgery_system_stomach')

In [None]:
# Surgery: System Intestine
categorial_numeric_encoding('surgery_system_intestine', raw_name='surgery_system_intestine')

In [None]:
# Surgery: System Liver
categorial_numeric_encoding('surgery_system_liver', raw_name='surgery_system_liver')

In [None]:
# Surgery: System Pancreas
categorial_numeric_encoding('surgery_system_pancreas', raw_name='surgery_system_pancreas')

In [None]:
# Surgery: System Count
numeric('surgery_system_count', raw_name='surgery_system_count', min=1, max=5)

In [None]:
# Surgery: Urgency
categorial_numeric_encoding('surgery_urgency', raw_name='urgency')

In [None]:
# Surgery: Follow Up
categorial_numeric_encoding('surgery_follow_up', raw_name='previous_op')

In [None]:
# Surgery: Month
raw['surgery_month'] = data['meta_incision'].copy().dt.month
categorial_numeric_encoding('surgery_month', raw_name='surgery_month')

In [None]:
# Surgery: Weekday
raw['surgery_weekday'] = data['meta_incision'].copy().dt.weekday
categorial_numeric_encoding('surgery_weekday', raw_name='surgery_weekday')

In [None]:
# Surgery: Daytime
# Bucket the incision hour: 0 = 22:00-05:59, 1 = 06:00-13:59, 2 = 14:00-21:59.
# Anything that matches no range (e.g. a missing hour) is passed through as is.
incision_hour = data['meta_incision'].copy().dt.hour
raw['surgery_daytime'] = incision_hour.apply(
    lambda hour: 1 if (hour >= 6 and hour < 14)
    else 2 if (hour >= 14 and hour < 22)
    else 0 if ((hour >= 0 and hour < 6) or (hour >= 22 and hour < 24))
    else hour
)
categorial_numeric_encoding('surgery_daytime', raw_name='surgery_daytime')

In [None]:
# Surgery: Hour
raw['surgery_hour'] = data['meta_incision'].copy().dt.hour
categorial_numeric_encoding('surgery_hour', raw_name='surgery_hour')

In [None]:
# Surgery: Mean Monthly Temperature
raw['MittlereTemp.[°C]'] = raw['MittlereTemp.[°C]'].str.replace(',', '.')
numeric('surgery_mean_monthly_temperature', raw_name='MittlereTemp.[°C]')

In [None]:
# Surgery: Minimum Monthly Temperature
raw['MinimumTemp.[°C]'] = raw['MinimumTemp.[°C]'].str.replace(',', '.')
numeric('surgery_minimum_monthly_temperature', raw_name='MinimumTemp.[°C]')

In [None]:
# Surgery: Maximum Monthly Temperature
raw['MaximumTemp.[°C]'] = raw['MaximumTemp.[°C]'].str.replace(',', '.')
numeric('surgery_maximum_monthly_temperature', raw_name='MaximumTemp.[°C]')

In [None]:
# Surgery: Monthly Precipitation
raw['Nieder-schlag[l/m2]'] = raw['Nieder-schlag[l/m2]'].str.replace(',', '.')
numeric('surgery_monthly_precipitation', raw_name='Nieder-schlag[l/m2]')

In [None]:
# Surgery: Maximum Daily Precipitation
raw['RegenreichsterTag[l/m2]'] = raw['RegenreichsterTag[l/m2]'].str.replace(',', '.')
numeric('surgery_maximum_daily_precipitation', raw_name='RegenreichsterTag[l/m2]')

In [None]:
# Surgery: Monthly Sunshine Hours
raw['Sonnen-scheindauer[h]'] = raw['Sonnen-scheindauer[h]'].str.replace(',', '.')
numeric('surgery_monthly_sunshine_hours', raw_name='Sonnen-scheindauer[h]')

In [None]:
# Surgery: Monthly Summer Days
numeric('surgery_monthly_summer_days', raw_name='Sommer-tage')

In [None]:
# Surgery: Monthly Hot Days
numeric('surgery_monthly_hot_days', raw_name='HeißeTage')

In [None]:
# Surgery: Monthly Frost days
numeric('surgery_monthly_frost_days', raw_name='Frost-tage')

In [None]:
# Surgery: Monthly Ice Days
numeric('surgery_monthly_ice_days', raw_name='Eis-tage')

### Lab

In [None]:
# Lab: Sodium
numeric('lab_sodium', raw_name='pre_op_sodium')

In [None]:
# Lab: Potassium
numeric('lab_potassium', raw_name='pre_op_potassium')

In [None]:
# Lab: Albumin
numeric('lab_albumin', raw_name='pre_op_albumin')

In [None]:
# Lab: Bilirubin (Direct)
numeric('lab_bilirubin_direct', raw_name='pre_op_bilirubin_direct')

In [None]:
# Lab: Bilirubin (Indirect)
numeric('lab_bilirubin_indirect', raw_name='pre_op_bilirubin_indirect')

In [None]:
# Lab: Bilirubin
numeric('lab_bilirubin', raw_name='pre_op_bilirubin_total')

In [None]:
# Lab: Urea
numeric('lab_urea', raw_name='pre_op_urea')

In [None]:
# Lab: TSH
numeric('lab_tsh', raw_name='pre_op_tsh')

In [None]:
# Lab: ALT
numeric('lab_alt', raw_name='pre_op_alt')

In [None]:
# Lab: AST
numeric('lab_ast', raw_name='pre_op_ast')

In [None]:
# Lab: gGT
numeric('lab_ggt', raw_name='pre_op_ggt')

In [None]:
# Lab: Lipase
numeric('lab_lipase', raw_name='pre_op_lipase')

In [None]:
# Lab: CRP
numeric('lab_crp', raw_name='pre_op_crp')

In [None]:
# Lab: Lactate
numeric('lab_lactate', raw_name='pre_op_lactate')

In [None]:
# Lab: HbA1C
numeric('lab_hba1c', raw_name='pre_op_hba1c')

In [None]:
# Lab: Hemoglobin
numeric('lab_hemoglobin', raw_name='pre_op_hemoglobin')

In [None]:
# Lab: WBC
numeric('lab_wbc', raw_name='pre_op_wbc')

In [None]:
# Lab: Platelets
numeric('lab_platelets', raw_name='pre_op_platelets')

In [None]:
# Lab: Hematocrit
numeric('lab_hematocrit', raw_name='pre_op_hematocrit')

In [None]:
# Lab: Prothrombin Time
numeric('lab_prothrombin_time', raw_name='pre_op_prothrombin_time')

In [None]:
# Lab: INR
numeric('lab_inr', raw_name='pre_op_inr')

In [None]:
# Lab: aPTT
numeric('lab_aptt', raw_name='pre_op_aptt')

In [None]:
# Lab: RBC
numeric('lab_rbc', raw_name='pre_op_erythrocytes', min=0)

In [None]:
# Lab: Creatinine
numeric('lab_creatinine', raw_name='pre_op_creatinine')

In [None]:
# Lab: Glucose
numeric('lab_glucose', raw_name='pre_op_glucose')

In [None]:
# Lab: Alpha Fetoprotein
numeric('lab_alpha_fetoprotein', raw_name='pre_op_alpha-fetoprotein')

In [None]:
# Lab: CA 19-9
numeric('lab_ca_19_9', raw_name='pre_op_ca_19-9')

In [None]:
# Lab: CA 125
numeric('lab_ca_125', raw_name='pre_op_ca_125')

In [None]:
# Lab: CEA
numeric('lab_cea', raw_name='pre_op_cea')

In [None]:
# Lab: BGA (pH)
numeric('lab_abg_ph', raw_name='pre_op_abg_ph')

In [None]:
# Lab: BGA (Base Excess)
numeric('lab_bga_be', raw_name='pre_op_abg_base_excess')

In [None]:
# Lab: BGA (Bicarbonate)
numeric('lab_bga_bicarbonate', raw_name='pre_op_abg_bicarbonate')

## Target

In [None]:
# Target: 30 Day Mortality
categorial_numeric_encoding('target_30_day_mortality', raw_name='30_day_mortality')

In [None]:
# Target: 90 Day Mortality
categorial_numeric_encoding('target_90_day_mortality', raw_name='90_day_mortality')

In [None]:
# Target: Clavien Dindo 5
# categorial_numeric_encoding('target_clavien_dindo_5', raw_name='clavien_dindo_v')

In [None]:
# Target: Deceased after Discharge
# categorial_numeric_encoding('target_deceased_after_discharge', raw_name='deceased_after_discharge')

In [None]:
# Target: Acute Myocardial Infarction
# categorial_numeric_encoding('target_acute_myocardial_infarction', raw_name='acute_myocardial_infarction')

In [None]:
# Target: Pulmonary Embolism
# categorial_numeric_encoding('target_pulmonary embolism', raw_name='pulmonary_embolism')

In [None]:
# Target: Septic Shock
# categorial_numeric_encoding('target_septic_shock', raw_name='setic_shock')

In [None]:
# Target: Pneumonia
# categorial_numeric_encoding('target_pneumonia', raw_name='pneumonia')

In [None]:
# Target: Liver Failure
# categorial_numeric_encoding('target_liver_failure', raw_name='liver_failure')

In [None]:
# Target: Cardiogenic Shock
# categorial_numeric_encoding('target_cardiogenic_shock', raw_name='cardiogenic_shock')

In [None]:
# Target: Acute Pancreatitis
# categorial_numeric_encoding('target_acute_pancreatitis', raw_name='acute_pancreatitis')

In [None]:
# Target: Anastomotic Leakage
# categorial_numeric_encoding('target_anastomotic_leackage', raw_name='anastomotic_leackage')

# Process

In [None]:
data = data[data['meta_follow_up'] == 0]
data

## Inclusion Criteria

In [None]:
# Year
data = data[data['meta_year'].notna()]
data

In [None]:
# System
data = data[data['meta_system'].notna()]
data

In [None]:
# OPS
data = data[data['meta_ops'] == 1]
data

In [None]:
# Age
data = data[data['meta_age'] >= 18]
data

## Column Management

In [None]:
# Drop every column that has at most one distinct non-missing value among the
# center-0 rows. 'meta_center' itself is exempt, since it is constant there by
# construction.
center0 = data[data['meta_center'] == 0]
constant_columns = [
    column
    for column in center0.columns
    if column != 'meta_center' and center0[column].nunique() <= 1
]
data.drop(columns=constant_columns, inplace=True)
for column in constant_columns:
    print(column)

data

In [None]:
completeness = data.count() / len(data) * 100
completeness = completeness.apply(lambda x: round(x / 10) * 10)
completeness = completeness.value_counts().sort_index()
completeness.plot.bar()

In [None]:
# Drop every column in which more than half of the center-0 rows are missing.
center0 = data[data['meta_center'] == 0]
sparse_columns = [
    column
    for column in center0.columns
    if center0[column].isna().sum() / len(center0) > 0.5
]
data.drop(columns=sparse_columns, inplace=True)
for column in sparse_columns:
    print(column)

data

## Exclusion Criteria

In [None]:
# Urgency
data = data[data['meta_urgency'] >= 4]
data

In [None]:
# 30 Day Mortality
data = data[data['target_30_day_mortality'].notna()]
data

In [None]:
# 90 Day Mortality
data = data[data['target_90_day_mortality'].notna()]
data

In [None]:
# Visualize the completeness of the different cases.
completeness = data.count(axis=1) / len(data.columns) * 100
completeness = completeness.apply(lambda x: round(x / 10) * 10)
completeness = completeness.value_counts().sort_index()
completeness.plot.bar()

In [None]:
# 75% Completeness
data.dropna(thresh=len(data.columns) * 0.75, inplace=True)
data

## Management

In [None]:
# Sort
data.sort_values(by=['meta_incision'], inplace=True)
data

In [None]:
# Duplicates
data.drop_duplicates(subset='meta_case', inplace=True)
data

In [None]:
# Create Dummy Features for One Hot Encoding for all columns not being numeric
data = pd.get_dummies(data, dummy_na=False)
data

# Export

In [None]:
data.to_csv('4_clean.csv', index = False)
data

In [None]:
# Per-column completeness of the exported data: overall, for center 0
# ("training") and for center 1 ("validation").
# The center subsets do not depend on the column, so they are filtered once.
training_rows = data[data['meta_center'] == 0]
validation_rows = data[data['meta_center'] == 1]

# Rows are collected in a list and turned into a frame in one go;
# DataFrame.append no longer exists in pandas 2.
completeness_rows = []
for column in data.columns:
    data_total = data[column]
    data_training = training_rows[column]
    data_validation = validation_rows[column]

    completeness_rows.append({
        'column': column,
        'count': data_total.count(),
        'data_percentage': round(data_total.count() / len(data_total) * 100, 2),
        'training_percentage': round(data_training.count() / len(data_training) * 100, 2),
        'validation_percentage': round(data_validation.count() / len(data_validation) * 100, 2)
    })

# 'percentage' is never filled; it is kept so the layout of 5_analysis.csv
# stays the same as before.
completeness: pd.DataFrame = pd.DataFrame(
    completeness_rows,
    columns=['column', 'count', 'percentage', 'data_percentage', 'training_percentage', 'validation_percentage'],
)

completeness.to_csv('5_analysis.csv')
completeness