# Challenges in Predicting Kidney Transplant Rejection Using Clinical and Donor Data

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from typing import List, Tuple, Dict, Any

from IPython.display import display, HTML

pd.set_option('display.max_columns', None)

In [2]:
df_origin = pd.read_csv('data/rejeicao_todos.csv', sep=';')
df = df_origin.copy()
df

Unnamed: 0,No,Idade,data_nascimento,Peso,Altura,superfície corporal,Cor,SEXO,TIPO DIAL,D. BASE,PRA I,re_tx,Doador vivo ou falecido,Idade Doador,SEXO Doador,COR Doador,Peso Doador,Altura Doador,IMC DOADOR,superficie corporal doador,Causa Morte do doador,HAS doador,DM doador,Na final Doador,CPK final Doador,Cr final do doador,dvadoador,HCV,Data Tx,TIF,Tempo de anastomose total,Indução,ImunINIC,Mismatch,KDPI,KDRI,Rejeição,DGF,obs,Id
0,1.0,32,10-03-77 0:00,87.2,167.0,0.0,1,2,1.0,0,0.0,0.0,0,26,1.0,0.0,,,,,,0,0,,,1.36,0,,01/13/1200,21.6,35,1.0,3.0,6.0,,,1.0,0.0,,0
1,2.0,25,03-07-84 0:00,46.6,156.0,1.43,0,2,1.0,2,,0.0,0,29,1.0,0.0,,,,,,0,0,,,1.36,0,,01/27/2010,21.6,35,0.0,3.0,3.0,,,1.0,0.0,,1
2,3.0,30,06/21/79,49.0,175.0,1.59,0,2,1.0,2,0.0,0.0,1,50,1.0,2.0,75.0,172.0,25.4,,1.0,1,0,146.0,181.0,0.87,1,0.0,28-01-10 0:00,22.0,65,1.0,3.0,3.0,0.75,1.28,0.0,0.0,,2
3,4.0,45,06/28/64,,,0.0,1,2,1.0,2,98.0,1.0,1,18,2.0,0.0,,,,,,0,0,,,1.36,0,,02-07-10 0:00,21.6,48,2.0,3.0,0.0,,,0.0,0.0,,3
4,5.0,55,Palni,94.0,176.0,2.1,0,1,1.0,1,,0.0,0,45,2.0,0.0,,,,,,0,0,,,1.36,0,,02-10-10 0:00,21.6,50,0.0,3.0,6.0,,,1.0,0.0,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,,44,02-09-76 0:00,,,0.0,1,1,,0,0.0,,1,19,,,,,,,0.0,0,0,,,1.10,0,,05-09-20 0:00,19.0,48,2.0,2.0,2.0,,,0.0,,,1250
1251,,34,10-07-86 0:00,,,0.0,0,2,,2,0.0,,1,35,,,,,,,0.0,0,0,,,1.90,0,,09-09-20 0:00,22.0,48,2.0,2.0,4.0,,,0.0,,,1251
1252,,59,07-10-61 0:00,,,0.0,0,1,,0,0.0,,1,35,,,,,,,1.0,0,0,,,1.67,1,,10-09-20 0:00,13.0,48,2.0,2.0,3.0,,,0.0,,,1252
1253,,35,04-09-85 0:00,,,0.0,0,2,,3,83.0,,1,27,,,,,,,1.0,0,0,,,0.80,0,,12-09-20 0:00,11.0,48,,,3.0,,,0.0,,,1253


In [3]:
rename_dict = {
    'No': 'id_',
    'Id': 'id',
    'Idade': 'recipientAge',
    'data_nascimento': 'recipientBirthdate',
    'Peso': 'recipientWeight',
    'Altura': 'recipientHeight',
    'superfície corporal': 'recipientBodySurface',
    'Cor': 'recipientColor',
    'SEXO': 'recipientSex',
    'TIPO DIAL': 'dialysisType',
    'D. BASE': 'underlyingDisease',
    'PRA I': 'praI', # class I antibody reactivity panel
    're_tx': 'retransplant',
    'Doador vivo ou falecido': 'isDonorAlive',
    'Idade Doador': 'donorAge',
    'SEXO Doador': 'donorSex',
    'COR Doador': 'donorColor',
    'Peso Doador': 'donorWeight',
    'Altura Doador': 'donorHeight',
    'IMC DOADOR': 'donorBmi',
    'superficie corporal doador': 'donorBodySurface',
    'Causa Morte do doador': 'donorDeathCause',
    'HAS doador': 'donorHypertension', # Systemic arterial hypertension
    'DM doador': 'donorDiabetesMellitus', # Diabetes mellitus
    'Na final Doador': 'donorFinalSodium',
    'CPK final Doador': 'donorFinalCpk', # creatine phosphokinase
    'Cr final do doador': 'donorFinalCreatinine',
    'dvadoador': 'donorUsedVasoactiveDrug',
    'HCV': 'donorHepatitisCVirus', # Hepatitis C virus
    'Data Tx': 'transplantDate',
    'TIF': 'coldIschemiaTimeMin', # Total ischemia time (min)
    'Tempo de anastomose total': 'totalAnastomosisTimeMin', # Total anastomosis time (min)
    'Indução': 'inductionType',
    'ImunINIC': 'initialImmunization',
    'Mismatch': 'mismatch',
    'KDPI': 'kidneyDonorProfileIndex', # Kidney Donor Profile Index
    'KDRI': 'kidneyDonorRiskIndex', # Kidney Donor Risk Index
    'Rejeição': 'rejected',
    'DGF': 'delayedGraftFunction', # Delayed graft function
    'obs': 'obs',
}

df = df.rename(columns=rename_dict)
df

Unnamed: 0,id_,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorColor,donorWeight,donorHeight,donorBmi,donorBodySurface,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,kidneyDonorProfileIndex,kidneyDonorRiskIndex,rejected,delayedGraftFunction,obs,id
0,1.0,32,10-03-77 0:00,87.2,167.0,0.0,1,2,1.0,0,0.0,0.0,0,26,1.0,0.0,,,,,,0,0,,,1.36,0,,01/13/1200,21.6,35,1.0,3.0,6.0,,,1.0,0.0,,0
1,2.0,25,03-07-84 0:00,46.6,156.0,1.43,0,2,1.0,2,,0.0,0,29,1.0,0.0,,,,,,0,0,,,1.36,0,,01/27/2010,21.6,35,0.0,3.0,3.0,,,1.0,0.0,,1
2,3.0,30,06/21/79,49.0,175.0,1.59,0,2,1.0,2,0.0,0.0,1,50,1.0,2.0,75.0,172.0,25.4,,1.0,1,0,146.0,181.0,0.87,1,0.0,28-01-10 0:00,22.0,65,1.0,3.0,3.0,0.75,1.28,0.0,0.0,,2
3,4.0,45,06/28/64,,,0.0,1,2,1.0,2,98.0,1.0,1,18,2.0,0.0,,,,,,0,0,,,1.36,0,,02-07-10 0:00,21.6,48,2.0,3.0,0.0,,,0.0,0.0,,3
4,5.0,55,Palni,94.0,176.0,2.1,0,1,1.0,1,,0.0,0,45,2.0,0.0,,,,,,0,0,,,1.36,0,,02-10-10 0:00,21.6,50,0.0,3.0,6.0,,,1.0,0.0,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,,44,02-09-76 0:00,,,0.0,1,1,,0,0.0,,1,19,,,,,,,0.0,0,0,,,1.10,0,,05-09-20 0:00,19.0,48,2.0,2.0,2.0,,,0.0,,,1250
1251,,34,10-07-86 0:00,,,0.0,0,2,,2,0.0,,1,35,,,,,,,0.0,0,0,,,1.90,0,,09-09-20 0:00,22.0,48,2.0,2.0,4.0,,,0.0,,,1251
1252,,59,07-10-61 0:00,,,0.0,0,1,,0,0.0,,1,35,,,,,,,1.0,0,0,,,1.67,1,,10-09-20 0:00,13.0,48,2.0,2.0,3.0,,,0.0,,,1252
1253,,35,04-09-85 0:00,,,0.0,0,2,,3,83.0,,1,27,,,,,,,1.0,0,0,,,0.80,0,,12-09-20 0:00,11.0,48,,,3.0,,,0.0,,,1253


In [4]:
map_categories = {
    "recipientColor": {0: 'white', 1: 'brown', 2: 'black', 3: 'yellow'},
    "donorColor": {0: 'white', 1: 'brown', 2: 'black', 3: 'yellow'},
    "recipientSex": {1: 'male', 2: 'female'},
    "donorSex": {1: 'male', 2: 'female'},
    "dialysisType": {0: 'conservador', 1: 'hemodialise', 2: 'dialise_peritoneal'},
    "underlyingDisease": {0: 'hypertension', 1: 'diabetes', 2: 'glomerulopathy', 3: 'indeterminate', 4: 'urological', 5: 'others'},
    "retransplant": {0: False, 1: True},
    "isDonorAlive": {0: True, 1: False},
    "donorDeathCause": {0: 'tce', 1: 'avci', 2: 'hsa', 3: 'others'},
    "donorHypertension": {0: False, 1: True},
    "donorDiabetesMellitus": {0: False, 1: True},
    "donorUsedVasoactive_drug": {0: False, 1: True},
    "donorHepatitisCVirus": {0: False, 1: True},
    "inductionType": {0: 'no_induction', 1: 'basiliximab (simulet)', 2: 'thymoglobulin'},
    "initialImmunization": {1: 'CNI+AZA+PRED', 2: 'CNI+imTOR+PRED', 3: 'CNI+MMF+PRED', 4: 'others'},
    "rejected": {0: False, 1: True},
    "delayedGraftFunction": {0: False, 1: True},
}

df = df.replace(map_categories)
df

Unnamed: 0,id_,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorColor,donorWeight,donorHeight,donorBmi,donorBodySurface,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,kidneyDonorProfileIndex,kidneyDonorRiskIndex,rejected,delayedGraftFunction,obs,id
0,1.0,32,10-03-77 0:00,87.2,167.0,0.0,brown,female,hemodialise,hypertension,0.0,False,True,26,male,white,,,,,,False,False,,,1.36,0,,01/13/1200,21.6,35,basiliximab (simulet),CNI+MMF+PRED,6.0,,,True,False,,0
1,2.0,25,03-07-84 0:00,46.6,156.0,1.43,white,female,hemodialise,glomerulopathy,,False,True,29,male,white,,,,,,False,False,,,1.36,0,,01/27/2010,21.6,35,no_induction,CNI+MMF+PRED,3.0,,,True,False,,1
2,3.0,30,06/21/79,49.0,175.0,1.59,white,female,hemodialise,glomerulopathy,0.0,False,False,50,male,black,75.0,172.0,25.4,,avci,True,False,146.0,181.0,0.87,1,False,28-01-10 0:00,22.0,65,basiliximab (simulet),CNI+MMF+PRED,3.0,0.75,1.28,False,False,,2
3,4.0,45,06/28/64,,,0.0,brown,female,hemodialise,glomerulopathy,98.0,True,False,18,female,white,,,,,,False,False,,,1.36,0,,02-07-10 0:00,21.6,48,thymoglobulin,CNI+MMF+PRED,0.0,,,False,False,,3
4,5.0,55,Palni,94.0,176.0,2.1,white,male,hemodialise,diabetes,,False,True,45,female,white,,,,,,False,False,,,1.36,0,,02-10-10 0:00,21.6,50,no_induction,CNI+MMF+PRED,6.0,,,True,False,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,,44,02-09-76 0:00,,,0.0,brown,male,,hypertension,0.0,,False,19,,,,,,,tce,False,False,,,1.10,0,,05-09-20 0:00,19.0,48,thymoglobulin,CNI+imTOR+PRED,2.0,,,False,,,1250
1251,,34,10-07-86 0:00,,,0.0,white,female,,glomerulopathy,0.0,,False,35,,,,,,,tce,False,False,,,1.90,0,,09-09-20 0:00,22.0,48,thymoglobulin,CNI+imTOR+PRED,4.0,,,False,,,1251
1252,,59,07-10-61 0:00,,,0.0,white,male,,hypertension,0.0,,False,35,,,,,,,avci,False,False,,,1.67,1,,10-09-20 0:00,13.0,48,thymoglobulin,CNI+imTOR+PRED,3.0,,,False,,,1252
1253,,35,04-09-85 0:00,,,0.0,white,female,,indeterminate,83.0,,False,27,,,,,,,avci,False,False,,,0.80,0,,12-09-20 0:00,11.0,48,,,3.0,,,False,,,1253


## Fix some values

In [5]:
df.loc[99, 'donorHeight'] = df.loc[99, 'donorHeight'] * 100 # convert to cm

In [6]:
# swap donorHeight and donorWeight for index 734
height_i = df.loc[734, 'donorHeight']
df.loc[734, 'donorHeight'] =  df.loc[734, 'donorWeight']
df.loc[734, 'donorWeight'] = height_i

## Null Values

In [7]:
df.dropna(subset=['rejected'], inplace=True)
df['rejected'] = df['rejected'].astype(bool)

In [8]:
df_null = pd.DataFrame({'Null sum': df.isnull().sum(), 'Null mean': df.isnull().mean()})
display(df_null)

# plot for missing values
px.bar(df_null, x=df_null.index, y='Null mean', title='Missing values Before', labels={'index': 'Columns', 'Null mean': 'Null mean'}).update_layout(yaxis_range=[0,1]).show()

# drop columns with more than 30% of missing values (keep at least 70% of non-null values)
min_non_nulls = int(df.shape[0] * 0.7) 
df = df.dropna(thresh=min_non_nulls, axis='columns')

df_null = pd.DataFrame({'Null sum': df.isnull().sum(), 'Null mean': df.isnull().mean()})
px.bar(df_null, x=df_null.index, y='Null mean', title='Missing values After', labels={'index': 'Columns', 'Null mean': 'Null mean'}).update_layout(yaxis_range=[0,1]).show()

Unnamed: 0,Null sum,Null mean
id_,421,0.357689
recipientAge,0,0.0
recipientBirthdate,0,0.0
recipientWeight,60,0.050977
recipientHeight,60,0.050977
recipientBodySurface,2,0.001699
recipientColor,0,0.0
recipientSex,0,0.0
dialysisType,48,0.040782
underlyingDisease,0,0.0


## Dates

In [9]:
def clean_date(df, date_col: str):
    dates = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col] = pd.to_datetime(dates.dt.strftime('%Y-%m-%d'))
    df[date_col] = df[date_col].apply(lambda x: x - pd.DateOffset(years=100) if x.year > 2021 else x)
    return df[date_col]


In [10]:
import datetime

px.histogram(df['recipientBirthdate'], title='Recipient Birthdate Before Cleaning').update_layout(xaxis_range=[datetime.date(1930,1,1), datetime.date(2030,12,31)]).show()
df['recipientBirthdate'] = clean_date(df, 'recipientBirthdate')
px.histogram(df['recipientBirthdate'], title='Recipient Birthdate After Cleaning').update_layout(xaxis_range=[datetime.date(1930,1,1), datetime.date(2030,12,31)]).show()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
px.histogram(df['transplantDate'], title='Transplant Date Before Cleaning').update_layout(xaxis_range=[datetime.date(2001,1,1), datetime.date(2031,12,31)]).show()
df['transplantDate'] = clean_date(df, 'transplantDate')
px.histogram(df['transplantDate'], title='Transplant Date After Cleaning').update_layout(xaxis_range=[datetime.date(2001,1,1), datetime.date(2031,12,31)]).show()

px.histogram(df['transplantDate'], title='Transplant by Month').update_traces(xbins_size="M1").show()


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
df['transplantDate'].sort_values()

1      2010-01-27
2      2010-01-28
3      2010-02-07
4      2010-02-10
5      2010-02-20
          ...    
1235   2020-12-08
1234   2020-12-08
1237   2020-12-08
1253   2020-12-09
0             NaT
Name: transplantDate, Length: 1177, dtype: datetime64[ns]

## Correct data types


In [13]:
df1 = df.copy()

In [14]:
px.histogram(df, x='coldIschemiaTimeMin', title='Cold Ischemia Time (min) - Before Cleaning').update_layout(xaxis_range=[0,40], yaxis_range=[0,330]).show()

# coldIschemiaTimeMin
for index, row in df.iterrows():
    try:
        float(row['coldIschemiaTimeMin'])
    except ValueError:
        df.at[index, 'coldIschemiaTimeMin'] = np.nan
df['coldIschemiaTimeMin'] = df['coldIschemiaTimeMin'].str.replace(',', '.').astype(float)

px.histogram(df, x='coldIschemiaTimeMin', title='Cold Ischemia Time (min) - After Cleaning').update_layout(xaxis_range=[0,40], yaxis_range=[0,330]).show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
px.histogram(df, x='recipientBodySurface', title='Recipient Body Surface - Before Cleaning').update_layout(xaxis_range=[0,3], yaxis_range=[0,220]).update_xaxes(categoryorder='total ascending').show()

# recipientBodySurface
# for each row ignore after the second dot
for index, row in df.iterrows():
    try:
        float(row['recipientBodySurface'])
    except ValueError:
        splitted = str(row['recipientBodySurface']).split('.')
        value = float('.'.join(splitted[:2]))
        if value > 3:
            df.at[index, 'recipientBodySurface'] = np.nan
        else:
            df.at[index, 'recipientBodySurface'] = value

df['recipientBodySurface'] = df['recipientBodySurface'].astype(float)
px.histogram(df, x='recipientBodySurface', title='Recipient Body Surface - After Cleaning').update_layout(xaxis_range=[0,3], yaxis_range=[0,220]).update_xaxes(categoryorder='total ascending').show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
# donorAge
for index, row in df.iterrows():
    try:
        float(row['donorAge'])
    except ValueError:
        df.at[index, 'donorAge'] = np.nan
df['donorAge'] = df['donorAge'].astype(float)

# donorBmi
for index, row in df.iterrows():
    try:
        float(row['donorBmi'])
    except ValueError:
        df.at[index, 'donorBmi'] = np.nan
df['donorBmi'] = df['donorBmi'].astype(float)

df['donorUsedVasoactiveDrug'] = df['donorUsedVasoactiveDrug'].astype(bool)
df['donorSex'] = df['donorSex'].replace({0: np.nan})
df['inductionType'] = df['inductionType'].replace({3: np.nan})



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

## Outliers

Apply univariate outlier detection using IQR, Z-score, modified Z-score, Isolation Forest, DBSCAN

In [17]:
# identify outliers using IQR and add a column to the dataframe to mark them
df['outlier_iqr'] = [[]] * len(df)

def identify_outliers(df, column: str):
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    df.loc[outliers.index, 'outlier_iqr'] = df.loc[outliers.index, 'outlier_iqr'].apply(lambda x: x + [column])
    return df
    

numerical_columns = df.select_dtypes(include=[np.number]).columns
for column in numerical_columns:
    df = identify_outliers(df, column)

df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,id,outlier_iqr
0,32,1977-10-03,87.2,167.0,0.00,brown,female,hemodialise,hypertension,0.0,False,True,26.0,male,,,,,False,False,,,1.36,False,,NaT,21.6,35,basiliximab (simulet),CNI+MMF+PRED,6.0,True,False,0,[recipientBodySurface]
1,25,1984-03-07,46.6,156.0,1.43,white,female,hemodialise,glomerulopathy,,False,True,29.0,male,,,,,False,False,,,1.36,False,,2010-01-27,21.6,35,no_induction,CNI+MMF+PRED,3.0,True,False,1,[]
2,30,1979-06-21,49.0,175.0,1.59,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.4,avci,True,False,146.0,181.0,0.87,True,False,2010-01-28,22.0,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,2,[]
3,45,1964-06-28,,,0.00,brown,female,hemodialise,glomerulopathy,98.0,True,False,18.0,female,,,,,False,False,,,1.36,False,,2010-02-07,21.6,48,thymoglobulin,CNI+MMF+PRED,0.0,False,False,3,"[recipientBodySurface, praI]"
4,55,NaT,94.0,176.0,2.10,white,male,hemodialise,diabetes,,False,True,45.0,female,,,,,False,False,,,1.36,False,,2010-02-10,21.6,50,no_induction,CNI+MMF+PRED,6.0,True,False,4,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,44,1976-02-09,,,0.00,brown,male,,hypertension,0.0,,False,19.0,,,,,tce,False,False,,,1.10,False,,2020-05-09,19.0,48,thymoglobulin,CNI+imTOR+PRED,2.0,False,,1250,[recipientBodySurface]
1251,34,1986-10-07,,,0.00,white,female,,glomerulopathy,0.0,,False,35.0,,,,,tce,False,False,,,1.90,False,,2020-09-09,22.0,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1251,[recipientBodySurface]
1252,59,1961-07-10,,,0.00,white,male,,hypertension,0.0,,False,35.0,,,,,avci,False,False,,,1.67,True,,2020-10-09,13.0,48,thymoglobulin,CNI+imTOR+PRED,3.0,False,,1252,"[recipientBodySurface, coldIschemiaTimeMin]"
1253,35,1985-04-09,,,0.00,white,female,,indeterminate,83.0,,False,27.0,,,,,avci,False,False,,,0.80,False,,2020-12-09,11.0,48,,,3.0,False,,1253,"[recipientBodySurface, praI, coldIschemiaTimeMin]"


In [18]:
# identify outliers using Z-score and add a column to the dataframe to mark them
df['outlier_zscore'] = [[]] * len(df)

def identify_outliers_zscore(df, column: str):
    z = np.abs((df[column] - df[column].mean()) / df[column].std())
    outliers = df[z > 3]
    df.loc[outliers.index, 'outlier_zscore'] = df.loc[outliers.index, 'outlier_zscore'].apply(lambda x: x + [column])
    return df

for column in numerical_columns:
    df = identify_outliers_zscore(df, column)

df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,id,outlier_iqr,outlier_zscore
0,32,1977-10-03,87.2,167.0,0.00,brown,female,hemodialise,hypertension,0.0,False,True,26.0,male,,,,,False,False,,,1.36,False,,NaT,21.6,35,basiliximab (simulet),CNI+MMF+PRED,6.0,True,False,0,[recipientBodySurface],[recipientBodySurface]
1,25,1984-03-07,46.6,156.0,1.43,white,female,hemodialise,glomerulopathy,,False,True,29.0,male,,,,,False,False,,,1.36,False,,2010-01-27,21.6,35,no_induction,CNI+MMF+PRED,3.0,True,False,1,[],[]
2,30,1979-06-21,49.0,175.0,1.59,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.4,avci,True,False,146.0,181.0,0.87,True,False,2010-01-28,22.0,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,2,[],[]
3,45,1964-06-28,,,0.00,brown,female,hemodialise,glomerulopathy,98.0,True,False,18.0,female,,,,,False,False,,,1.36,False,,2010-02-07,21.6,48,thymoglobulin,CNI+MMF+PRED,0.0,False,False,3,"[recipientBodySurface, praI]","[recipientBodySurface, praI]"
4,55,NaT,94.0,176.0,2.10,white,male,hemodialise,diabetes,,False,True,45.0,female,,,,,False,False,,,1.36,False,,2010-02-10,21.6,50,no_induction,CNI+MMF+PRED,6.0,True,False,4,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,44,1976-02-09,,,0.00,brown,male,,hypertension,0.0,,False,19.0,,,,,tce,False,False,,,1.10,False,,2020-05-09,19.0,48,thymoglobulin,CNI+imTOR+PRED,2.0,False,,1250,[recipientBodySurface],[recipientBodySurface]
1251,34,1986-10-07,,,0.00,white,female,,glomerulopathy,0.0,,False,35.0,,,,,tce,False,False,,,1.90,False,,2020-09-09,22.0,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1251,[recipientBodySurface],[recipientBodySurface]
1252,59,1961-07-10,,,0.00,white,male,,hypertension,0.0,,False,35.0,,,,,avci,False,False,,,1.67,True,,2020-10-09,13.0,48,thymoglobulin,CNI+imTOR+PRED,3.0,False,,1252,"[recipientBodySurface, coldIschemiaTimeMin]",[recipientBodySurface]
1253,35,1985-04-09,,,0.00,white,female,,indeterminate,83.0,,False,27.0,,,,,avci,False,False,,,0.80,False,,2020-12-09,11.0,48,,,3.0,False,,1253,"[recipientBodySurface, praI, coldIschemiaTimeMin]",[recipientBodySurface]


In [19]:
# identify outliers using Isolation Forest and add a column to the dataframe to mark them
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer

df['outlier_isolation_forest'] = [[]] * len(df)

def identify_outliers_isolation_forest(df, column: str):
    clf = IsolationForest(contamination=0.05)
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
    col_values = imp_median.fit_transform(df[column].values.reshape(-1, 1))

    clf.fit(col_values)
    outliers = clf.predict(col_values)
    outliers = df[outliers == -1]
    df.loc[outliers.index, 'outlier_isolation_forest'] = df.loc[outliers.index, 'outlier_isolation_forest'].apply(lambda x: x + [column])
    return df

for column in numerical_columns:
    df = identify_outliers_isolation_forest(df, column)

df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,id,outlier_iqr,outlier_zscore,outlier_isolation_forest
0,32,1977-10-03,87.2,167.0,0.00,brown,female,hemodialise,hypertension,0.0,False,True,26.0,male,,,,,False,False,,,1.36,False,,NaT,21.6,35,basiliximab (simulet),CNI+MMF+PRED,6.0,True,False,0,[recipientBodySurface],[recipientBodySurface],"[mismatch, id]"
1,25,1984-03-07,46.6,156.0,1.43,white,female,hemodialise,glomerulopathy,,False,True,29.0,male,,,,,False,False,,,1.36,False,,2010-01-27,21.6,35,no_induction,CNI+MMF+PRED,3.0,True,False,1,[],[],[id]
2,30,1979-06-21,49.0,175.0,1.59,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.4,avci,True,False,146.0,181.0,0.87,True,False,2010-01-28,22.0,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,2,[],[],[id]
3,45,1964-06-28,,,0.00,brown,female,hemodialise,glomerulopathy,98.0,True,False,18.0,female,,,,,False,False,,,1.36,False,,2010-02-07,21.6,48,thymoglobulin,CNI+MMF+PRED,0.0,False,False,3,"[recipientBodySurface, praI]","[recipientBodySurface, praI]","[praI, id]"
4,55,NaT,94.0,176.0,2.10,white,male,hemodialise,diabetes,,False,True,45.0,female,,,,,False,False,,,1.36,False,,2010-02-10,21.6,50,no_induction,CNI+MMF+PRED,6.0,True,False,4,[],[],"[mismatch, id]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,44,1976-02-09,,,0.00,brown,male,,hypertension,0.0,,False,19.0,,,,,tce,False,False,,,1.10,False,,2020-05-09,19.0,48,thymoglobulin,CNI+imTOR+PRED,2.0,False,,1250,[recipientBodySurface],[recipientBodySurface],[id]
1251,34,1986-10-07,,,0.00,white,female,,glomerulopathy,0.0,,False,35.0,,,,,tce,False,False,,,1.90,False,,2020-09-09,22.0,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1251,[recipientBodySurface],[recipientBodySurface],[id]
1252,59,1961-07-10,,,0.00,white,male,,hypertension,0.0,,False,35.0,,,,,avci,False,False,,,1.67,True,,2020-10-09,13.0,48,thymoglobulin,CNI+imTOR+PRED,3.0,False,,1252,"[recipientBodySurface, coldIschemiaTimeMin]",[recipientBodySurface],[id]
1253,35,1985-04-09,,,0.00,white,female,,indeterminate,83.0,,False,27.0,,,,,avci,False,False,,,0.80,False,,2020-12-09,11.0,48,,,3.0,False,,1253,"[recipientBodySurface, praI, coldIschemiaTimeMin]",[recipientBodySurface],"[coldIschemiaTimeMin, id]"


In [20]:
# identify outliers using DBSCAN and add a column to the dataframe to mark them
from sklearn.cluster import DBSCAN

df['outlier_dbscan'] = [[]] * len(df)

def identify_outliers_dbscan(df, column: str):
    dbscan = DBSCAN(eps=0.5, min_samples=5)
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
    col_values = imp_median.fit_transform(df[column].values.reshape(-1, 1))

    outliers = dbscan.fit_predict(col_values)
    outliers = df[outliers == -1]
    df.loc[outliers.index, 'outlier_dbscan'] = df.loc[outliers.index, 'outlier_dbscan'].apply(lambda x: x + [column])
    return df

for column in numerical_columns:
    df = identify_outliers_dbscan(df, column)

df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,id,outlier_iqr,outlier_zscore,outlier_isolation_forest,outlier_dbscan
0,32,1977-10-03,87.2,167.0,0.00,brown,female,hemodialise,hypertension,0.0,False,True,26.0,male,,,,,False,False,,,1.36,False,,NaT,21.6,35,basiliximab (simulet),CNI+MMF+PRED,6.0,True,False,0,[recipientBodySurface],[recipientBodySurface],"[mismatch, id]",[id]
1,25,1984-03-07,46.6,156.0,1.43,white,female,hemodialise,glomerulopathy,,False,True,29.0,male,,,,,False,False,,,1.36,False,,2010-01-27,21.6,35,no_induction,CNI+MMF+PRED,3.0,True,False,1,[],[],[id],[id]
2,30,1979-06-21,49.0,175.0,1.59,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.4,avci,True,False,146.0,181.0,0.87,True,False,2010-01-28,22.0,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,2,[],[],[id],"[donorFinalCpk, id]"
3,45,1964-06-28,,,0.00,brown,female,hemodialise,glomerulopathy,98.0,True,False,18.0,female,,,,,False,False,,,1.36,False,,2010-02-07,21.6,48,thymoglobulin,CNI+MMF+PRED,0.0,False,False,3,"[recipientBodySurface, praI]","[recipientBodySurface, praI]","[praI, id]",[id]
4,55,NaT,94.0,176.0,2.10,white,male,hemodialise,diabetes,,False,True,45.0,female,,,,,False,False,,,1.36,False,,2010-02-10,21.6,50,no_induction,CNI+MMF+PRED,6.0,True,False,4,[],[],"[mismatch, id]",[id]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,44,1976-02-09,,,0.00,brown,male,,hypertension,0.0,,False,19.0,,,,,tce,False,False,,,1.10,False,,2020-05-09,19.0,48,thymoglobulin,CNI+imTOR+PRED,2.0,False,,1250,[recipientBodySurface],[recipientBodySurface],[id],[id]
1251,34,1986-10-07,,,0.00,white,female,,glomerulopathy,0.0,,False,35.0,,,,,tce,False,False,,,1.90,False,,2020-09-09,22.0,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1251,[recipientBodySurface],[recipientBodySurface],[id],[id]
1252,59,1961-07-10,,,0.00,white,male,,hypertension,0.0,,False,35.0,,,,,avci,False,False,,,1.67,True,,2020-10-09,13.0,48,thymoglobulin,CNI+imTOR+PRED,3.0,False,,1252,"[recipientBodySurface, coldIschemiaTimeMin]",[recipientBodySurface],[id],[id]
1253,35,1985-04-09,,,0.00,white,female,,indeterminate,83.0,,False,27.0,,,,,avci,False,False,,,0.80,False,,2020-12-09,11.0,48,,,3.0,False,,1253,"[recipientBodySurface, praI, coldIschemiaTimeMin]",[recipientBodySurface],"[coldIschemiaTimeMin, id]",[id]


In [21]:
gaussian_features = ['recipientAge', 'recipientWeight', 'recipientHeight', 'recipientBodySurface', 'donorAge', 'donorWeight', 'donorHeight', 'donorBmi', 'donorFinalSodium', 'donorFinalCpk', 'donorFinalCreatinine', 'coldIschemiaTimeMin', 'totalAnastomosisTimeMin', 'donorBmi', 'donorFinalSodium', 'coldIschemiaTimeMin', 'totalAnastomosisTimeMin', 'mismatch']
skewed_features = ['praI', 'donorFinalCpk', 'donorFinalCreatinine']

In [22]:
df['donorFinalCreatinine'].sort_values()

1006    0.19
369     0.20
280     0.20
97      0.20
246     0.21
        ... 
1173    5.70
871     6.60
617     7.40
616     7.40
1155    9.80
Name: donorFinalCreatinine, Length: 1175, dtype: float64

In [23]:
df['donorBmi'].sort_values().unique()

array([1.160000e+01, 1.300000e+01, 1.360000e+01, 1.370000e+01,
       1.400000e+01, 1.410000e+01, 1.430000e+01, 1.460000e+01,
       1.490000e+01, 1.510000e+01, 1.530000e+01, 1.600000e+01,
       1.640000e+01, 1.690000e+01, 1.700000e+01, 1.760000e+01,
       1.800000e+01, 1.820000e+01, 1.830000e+01, 1.850000e+01,
       1.860000e+01, 1.870000e+01, 1.900000e+01, 1.910000e+01,
       1.920000e+01, 1.940000e+01, 1.950000e+01, 1.960000e+01,
       1.970000e+01, 1.980000e+01, 2.000000e+01, 2.020000e+01,
       2.030000e+01, 2.040000e+01, 2.050000e+01, 2.070000e+01,
       2.080000e+01, 2.090000e+01, 2.100000e+01, 2.110000e+01,
       2.120000e+01, 2.130000e+01, 2.150000e+01, 2.160000e+01,
       2.170000e+01, 2.180000e+01, 2.200000e+01, 2.210000e+01,
       2.220000e+01, 2.230000e+01, 2.240000e+01, 2.250000e+01,
       2.260000e+01, 2.290000e+01, 2.300000e+01, 2.310000e+01,
       2.330000e+01, 2.340000e+01, 2.350000e+01, 2.360000e+01,
       2.370000e+01, 2.380000e+01, 2.390000e+01, 2.4000

## Distribution with outliers

In [24]:
num_cols =  df.select_dtypes(include='number').columns.to_list()
num_cols = [col for col in num_cols if col not in ['id_', 'id']]
cat_cols = df.select_dtypes(include='object').columns.to_list()
bin_cols = df.select_dtypes(include='bool').columns.to_list()

In [25]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

plot_num_cols = num_cols.copy()
num_cols_per_row = 2
num_rows = (len(plot_num_cols) + num_cols_per_row - 1) // num_cols_per_row

# Create subplots
fig = make_subplots(rows=num_rows, cols=num_cols_per_row, subplot_titles=plot_num_cols)

# Plot histograms for each numeric column
for i, col in enumerate(plot_num_cols):
    row_i = i // num_cols_per_row + 1
    col_i = i % num_cols_per_row + 1
    fig.add_trace(go.Histogram(x=df[col], name=col), row=row_i, col=col_i)


fig.update_layout(height=1400, width=1400, title_text="Histograms of Numeric Columns", showlegend=False)
fig.show()


In [26]:
plot_cat_cols = cat_cols.copy()
num_cols_per_row = 2
num_rows = (len(plot_cat_cols) + num_cols_per_row - 1) // num_cols_per_row

# Create subplots
fig = make_subplots(rows=num_rows, cols=num_cols_per_row, subplot_titles=plot_cat_cols)

# Plot histograms for each numeric column
for i, col in enumerate(plot_cat_cols):
    row_i = i // num_cols_per_row + 1
    col_i = i % num_cols_per_row + 1
    fig.add_trace(go.Histogram(x=df[col], name=col), row=row_i, col=col_i)


fig.update_layout(height=1400, width=1400, title_text="Histograms of Category Columns", showlegend=False)
fig.show()

## Distribution without outliers

In [27]:
df_no_outliers = df.copy()
for index, row in df.iterrows():
    if row['outlier_iqr']:
        df_no_outliers.drop(index, inplace=True)


plot_num_cols = num_cols.copy()
num_cols_per_row = 2
num_rows = (len(plot_num_cols) + num_cols_per_row - 1) // num_cols_per_row

# Create subplots
fig = make_subplots(rows=num_rows, cols=num_cols_per_row, subplot_titles=plot_num_cols)

# Plot histograms for each numeric column
for i, col in enumerate(plot_num_cols):
    row_i = i // num_cols_per_row + 1
    col_i = i % num_cols_per_row + 1
    fig.add_trace(go.Histogram(x=df_no_outliers[col], name=col), row=row_i, col=col_i)


fig.update_layout(height=1400, width=1400, title_text="Histograms of Numeric Columns", showlegend=False)
fig.show()


## Correlation

In [28]:
num_cols =  ['recipientAge', 'recipientWeight', 'recipientHeight', 'recipientBodySurface', 'praI', 'donorAge', 'donorWeight', 'donorHeight', 'donorBmi', 'donorFinalSodium', 'donorFinalCpk', 'donorFinalCreatinine', 'coldIschemiaTimeMin', 'totalAnastomosisTimeMin', 'mismatch']
cat_cols = ['recipientColor', 'recipientSex', 'dialysisType', 'underlyingDisease', 'donorSex', 'donorDeathCause', 'inductionType', 'initialImmunization']
bin_cols = ['retransplant', 'donorHepatitisCVirus', 'delayedGraftFunction', 'isDonorAlive', 'donorHypertension', 'donorDiabetesMellitus', 'donorUsedVasoactiveDrug', 'rejected']

In [29]:
data = df[num_cols + bin_cols]

fig = go.Figure(data=go.Heatmap(z=data.corr(), x=data.corr().columns, y=data.corr().columns))
fig.update_layout(title='Correlation Heatmap for Numeric Columns', yaxis_nticks=len(data.corr().columns), height=1000, width=1000)
fig.show()

In [30]:
df_dummies = pd.get_dummies(df[num_cols + bin_cols + cat_cols], columns=cat_cols)

fig = go.Figure(data=go.Heatmap(z=df_dummies.corr(), x=df_dummies.corr().columns, y=df_dummies.corr().columns))
fig.update_layout(title='Correlation Heatmap for Numeric Columns', yaxis_nticks=len(df_dummies.columns), height=1600, width=1600)
fig.show()

In [31]:
df_dummies = pd.get_dummies(df[num_cols + bin_cols + cat_cols], columns=cat_cols).corr()

# filter values above 0.5
df_dummies = df_dummies[abs(df_dummies) > 0.3]
df_dummies = df_dummies.dropna(how='all', axis=0)
df_dummies = df_dummies.dropna(how='all', axis=1)
df_dummies

fig = go.Figure(data=go.Heatmap(z=df_dummies, x=df_dummies.columns, y=df_dummies.columns))
fig.update_layout(title='Correlation Heatmap for Numeric Columns', yaxis_nticks=len(df_dummies.columns), height=1500, width=1500)
fig.show()

## Tag nulls

In [32]:
df['null_columns'] = df.isnull().apply(lambda x: df.columns[x].tolist(), axis=1)
df



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,id,outlier_iqr,outlier_zscore,outlier_isolation_forest,outlier_dbscan,null_columns
0,32,1977-10-03,87.2,167.0,0.00,brown,female,hemodialise,hypertension,0.0,False,True,26.0,male,,,,,False,False,,,1.36,False,,NaT,21.6,35,basiliximab (simulet),CNI+MMF+PRED,6.0,True,False,0,[recipientBodySurface],[recipientBodySurface],"[mismatch, id]",[id],"[donorWeight, donorHeight, donorBmi, donorDeat..."
1,25,1984-03-07,46.6,156.0,1.43,white,female,hemodialise,glomerulopathy,,False,True,29.0,male,,,,,False,False,,,1.36,False,,2010-01-27,21.6,35,no_induction,CNI+MMF+PRED,3.0,True,False,1,[],[],[id],[id],"[praI, donorWeight, donorHeight, donorBmi, don..."
2,30,1979-06-21,49.0,175.0,1.59,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.4,avci,True,False,146.0,181.0,0.87,True,False,2010-01-28,22.0,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,2,[],[],[id],"[donorFinalCpk, id]",[]
3,45,1964-06-28,,,0.00,brown,female,hemodialise,glomerulopathy,98.0,True,False,18.0,female,,,,,False,False,,,1.36,False,,2010-02-07,21.6,48,thymoglobulin,CNI+MMF+PRED,0.0,False,False,3,"[recipientBodySurface, praI]","[recipientBodySurface, praI]","[praI, id]",[id],"[recipientWeight, recipientHeight, donorWeight..."
4,55,NaT,94.0,176.0,2.10,white,male,hemodialise,diabetes,,False,True,45.0,female,,,,,False,False,,,1.36,False,,2010-02-10,21.6,50,no_induction,CNI+MMF+PRED,6.0,True,False,4,[],[],"[mismatch, id]",[id],"[recipientBirthdate, praI, donorWeight, donorH..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,44,1976-02-09,,,0.00,brown,male,,hypertension,0.0,,False,19.0,,,,,tce,False,False,,,1.10,False,,2020-05-09,19.0,48,thymoglobulin,CNI+imTOR+PRED,2.0,False,,1250,[recipientBodySurface],[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp..."
1251,34,1986-10-07,,,0.00,white,female,,glomerulopathy,0.0,,False,35.0,,,,,tce,False,False,,,1.90,False,,2020-09-09,22.0,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1251,[recipientBodySurface],[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp..."
1252,59,1961-07-10,,,0.00,white,male,,hypertension,0.0,,False,35.0,,,,,avci,False,False,,,1.67,True,,2020-10-09,13.0,48,thymoglobulin,CNI+imTOR+PRED,3.0,False,,1252,"[recipientBodySurface, coldIschemiaTimeMin]",[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp..."
1253,35,1985-04-09,,,0.00,white,female,,indeterminate,83.0,,False,27.0,,,,,avci,False,False,,,0.80,False,,2020-12-09,11.0,48,,,3.0,False,,1253,"[recipientBodySurface, praI, coldIschemiaTimeMin]",[recipientBodySurface],"[coldIschemiaTimeMin, id]",[id],"[recipientWeight, recipientHeight, dialysisTyp..."


In [33]:
# function for getting the dataset filtering out the null_columns
def get_dataset(df: pd.DataFrame, null_columns: List[str]):
    return df.dropna(subset=null_columns)

get_dataset(df, ['recipientBodySurface', 'donorAge', 'donorBmi', 'donorSex', 'inductionType'])

Unnamed: 0,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,id,outlier_iqr,outlier_zscore,outlier_isolation_forest,outlier_dbscan,null_columns
2,30,1979-06-21,49.0,175.0,1.590,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.4,avci,True,False,146.0,181.0,0.87,True,False,2010-01-28,22.00,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,2,[],[],[id],"[donorFinalCpk, id]",[]
5,26,1983-04-07,60.0,177.0,1.750,brown,male,hemodialise,hypertension,0.0,False,False,22.0,female,55.0,160.0,21.5,tce,False,False,135.0,35.0,0.40,True,False,2010-02-20,32.08,45,basiliximab (simulet),CNI+MMF+PRED,4.0,False,True,5,[coldIschemiaTimeMin],[],[id],"[donorFinalCpk, id]",[]
6,40,1970-11-02,43.5,143.0,1.300,brown,female,hemodialise,glomerulopathy,0.0,False,False,10.0,female,22.0,130.0,13.0,others,False,False,162.0,34742.0,1.50,True,False,2010-02-22,28.33,40,basiliximab (simulet),CNI+MMF+PRED,4.0,True,True,6,"[donorWeight, donorHeight, donorBmi, donorFina...","[donorWeight, donorHeight, donorFinalCpk]","[recipientHeight, donorAge, donorWeight, donor...","[donorWeight, donorHeight, donorBmi, donorFina...",[]
7,66,1944-02-28,70.0,158.0,1.720,brown,female,dialise_peritoneal,indeterminate,0.0,False,False,25.0,male,70.0,180.0,21.6,tce,False,False,142.0,1262.0,1.10,False,False,2010-02-23,21.92,55,basiliximab (simulet),CNI+AZA+PRED,1.0,False,False,7,[],[],[id],"[donorFinalCpk, id]",[]
8,34,1975-11-26,68.0,152.0,1.650,white,female,hemodialise,glomerulopathy,0.0,False,False,25.0,male,70.0,180.0,21.6,tce,False,False,142.0,1262.0,1.10,False,False,2010-02-23,26.00,85,basiliximab (simulet),CNI+MMF+PRED,1.0,False,False,8,[totalAnastomosisTimeMin],[],"[totalAnastomosisTimeMin, id]","[donorFinalCpk, id]",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1242,53,1968-01-16,59.0,156.0,1.581,white,female,hemodialise,indeterminate,8.0,False,False,24.0,female,53.0,149.0,24.0,tce,False,False,127.0,408.0,0.50,False,False,2020-08-19,16.00,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1242,[],[],"[donorHeight, donorFinalSodium, id]","[praI, donorHeight, donorFinalSodium, donorFin...",[delayedGraftFunction]
1243,44,1976-03-24,69.8,166.0,1.776,white,female,hemodialise,indeterminate,0.0,False,False,18.0,male,68.0,165.0,25.0,tce,False,False,151.0,541.0,0.80,True,False,2020-08-20,18.00,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1243,[],[],[id],"[donorFinalCpk, id]",[delayedGraftFunction]
1244,62,1958-07-26,75.3,164.0,1.818,white,male,hemodialise,hypertension,0.0,False,False,54.0,male,75.0,172.0,25.0,avci,False,False,164.0,514.0,1.50,False,False,2020-08-21,19.00,46,thymoglobulin,CNI+imTOR+PRED,3.0,False,False,1244,[],[],[id],"[donorFinalCpk, id]",[]
1246,64,1956-04-26,79.0,170.0,1.905,white,male,hemodialise,hypertension,0.0,False,False,21.0,male,80.0,175.0,26.0,tce,False,False,155.0,752.0,0.70,True,False,2020-08-22,24.00,45,thymoglobulin,CNI+imTOR+PRED,3.0,False,,1246,[],[],[id],"[donorFinalCpk, id]",[delayedGraftFunction]


In [34]:
# filter outliers
df_no_outliers = df.copy()
for index, row in df.iterrows():
    if row['outlier_iqr']:
        df_no_outliers.drop(index, inplace=True)

df_no_outliers

Unnamed: 0,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,id,outlier_iqr,outlier_zscore,outlier_isolation_forest,outlier_dbscan,null_columns
1,25,1984-03-07,46.6,156.0,1.430,white,female,hemodialise,glomerulopathy,,False,True,29.0,male,,,,,False,False,,,1.36,False,,2010-01-27,21.60,35,no_induction,CNI+MMF+PRED,3.0,True,False,1,[],[],[id],[id],"[praI, donorWeight, donorHeight, donorBmi, don..."
2,30,1979-06-21,49.0,175.0,1.590,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.4,avci,True,False,146.0,181.0,0.87,True,False,2010-01-28,22.00,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,2,[],[],[id],"[donorFinalCpk, id]",[]
4,55,NaT,94.0,176.0,2.100,white,male,hemodialise,diabetes,,False,True,45.0,female,,,,,False,False,,,1.36,False,,2010-02-10,21.60,50,no_induction,CNI+MMF+PRED,6.0,True,False,4,[],[],"[mismatch, id]",[id],"[recipientBirthdate, praI, donorWeight, donorH..."
7,66,1944-02-28,70.0,158.0,1.720,brown,female,dialise_peritoneal,indeterminate,0.0,False,False,25.0,male,70.0,180.0,21.6,tce,False,False,142.0,1262.0,1.10,False,False,2010-02-23,21.92,55,basiliximab (simulet),CNI+AZA+PRED,1.0,False,False,7,[],[],[id],"[donorFinalCpk, id]",[]
9,56,1953-01-05,57.0,165.0,1.620,white,female,hemodialise,diabetes,10.0,False,True,44.0,male,,,,,False,False,,,1.36,False,,2010-02-24,21.60,60,no_induction,CNI+MMF+PRED,3.0,True,True,9,[],[],[id],[id],"[donorWeight, donorHeight, donorBmi, donorDeat..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,58,1962-11-22,69.6,167.0,1.782,brown,male,hemodialise,diabetes,0.0,False,False,50.0,female,65.0,155.0,27.0,others,True,False,179.0,302.0,1.30,False,False,2020-08-16,18.00,42,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1238,[],[],"[donorFinalSodium, id]","[donorFinalCpk, id]",[delayedGraftFunction]
1242,53,1968-01-16,59.0,156.0,1.581,white,female,hemodialise,indeterminate,8.0,False,False,24.0,female,53.0,149.0,24.0,tce,False,False,127.0,408.0,0.50,False,False,2020-08-19,16.00,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1242,[],[],"[donorHeight, donorFinalSodium, id]","[praI, donorHeight, donorFinalSodium, donorFin...",[delayedGraftFunction]
1243,44,1976-03-24,69.8,166.0,1.776,white,female,hemodialise,indeterminate,0.0,False,False,18.0,male,68.0,165.0,25.0,tce,False,False,151.0,541.0,0.80,True,False,2020-08-20,18.00,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1243,[],[],[id],"[donorFinalCpk, id]",[delayedGraftFunction]
1244,62,1958-07-26,75.3,164.0,1.818,white,male,hemodialise,hypertension,0.0,False,False,54.0,male,75.0,172.0,25.0,avci,False,False,164.0,514.0,1.50,False,False,2020-08-21,19.00,46,thymoglobulin,CNI+imTOR+PRED,3.0,False,False,1244,[],[],[id],"[donorFinalCpk, id]",[]


In [35]:
# get_dataset(df, df.columns).to_parquet('rejection.parquet')
get_dataset(df, []).to_parquet('rejection.parquet')

In [36]:
get_dataset(df, [])

Unnamed: 0,recipientAge,recipientBirthdate,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,transplantDate,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,id,outlier_iqr,outlier_zscore,outlier_isolation_forest,outlier_dbscan,null_columns
0,32,1977-10-03,87.2,167.0,0.00,brown,female,hemodialise,hypertension,0.0,False,True,26.0,male,,,,,False,False,,,1.36,False,,NaT,21.6,35,basiliximab (simulet),CNI+MMF+PRED,6.0,True,False,0,[recipientBodySurface],[recipientBodySurface],"[mismatch, id]",[id],"[donorWeight, donorHeight, donorBmi, donorDeat..."
1,25,1984-03-07,46.6,156.0,1.43,white,female,hemodialise,glomerulopathy,,False,True,29.0,male,,,,,False,False,,,1.36,False,,2010-01-27,21.6,35,no_induction,CNI+MMF+PRED,3.0,True,False,1,[],[],[id],[id],"[praI, donorWeight, donorHeight, donorBmi, don..."
2,30,1979-06-21,49.0,175.0,1.59,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.4,avci,True,False,146.0,181.0,0.87,True,False,2010-01-28,22.0,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,2,[],[],[id],"[donorFinalCpk, id]",[]
3,45,1964-06-28,,,0.00,brown,female,hemodialise,glomerulopathy,98.0,True,False,18.0,female,,,,,False,False,,,1.36,False,,2010-02-07,21.6,48,thymoglobulin,CNI+MMF+PRED,0.0,False,False,3,"[recipientBodySurface, praI]","[recipientBodySurface, praI]","[praI, id]",[id],"[recipientWeight, recipientHeight, donorWeight..."
4,55,NaT,94.0,176.0,2.10,white,male,hemodialise,diabetes,,False,True,45.0,female,,,,,False,False,,,1.36,False,,2010-02-10,21.6,50,no_induction,CNI+MMF+PRED,6.0,True,False,4,[],[],"[mismatch, id]",[id],"[recipientBirthdate, praI, donorWeight, donorH..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,44,1976-02-09,,,0.00,brown,male,,hypertension,0.0,,False,19.0,,,,,tce,False,False,,,1.10,False,,2020-05-09,19.0,48,thymoglobulin,CNI+imTOR+PRED,2.0,False,,1250,[recipientBodySurface],[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp..."
1251,34,1986-10-07,,,0.00,white,female,,glomerulopathy,0.0,,False,35.0,,,,,tce,False,False,,,1.90,False,,2020-09-09,22.0,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,1251,[recipientBodySurface],[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp..."
1252,59,1961-07-10,,,0.00,white,male,,hypertension,0.0,,False,35.0,,,,,avci,False,False,,,1.67,True,,2020-10-09,13.0,48,thymoglobulin,CNI+imTOR+PRED,3.0,False,,1252,"[recipientBodySurface, coldIschemiaTimeMin]",[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp..."
1253,35,1985-04-09,,,0.00,white,female,,indeterminate,83.0,,False,27.0,,,,,avci,False,False,,,0.80,False,,2020-12-09,11.0,48,,,3.0,False,,1253,"[recipientBodySurface, praI, coldIschemiaTimeMin]",[recipientBodySurface],"[coldIschemiaTimeMin, id]",[id],"[recipientWeight, recipientHeight, dialysisTyp..."


## Feature engineering

Body Surface experiment
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4250987/

In [37]:
# BSA [m2] = Weight [kg]0.425 × height (cm)0.725 × 0.007184
df['recipientBodySurface'] = df['recipientWeight'] ** 0.425 * df['recipientHeight'] ** 0.725 * 0.007184
df['donorBodySurface'] = df['donorWeight'] ** 0.425 * df['donorHeight'] ** 0.725 * 0.007184
df['recipientBmi'] = df['recipientWeight'] / (df['recipientHeight'] / 100) ** 2
df['donorBmi'] = df['donorWeight'] / (df['donorHeight'] / 100) ** 2



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [38]:
drop_cols = ['recipientBirthdate', 'transplantDate', 'id']
df.drop(columns=drop_cols, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [39]:
df

Unnamed: 0,recipientAge,recipientWeight,recipientHeight,recipientBodySurface,recipientColor,recipientSex,dialysisType,underlyingDisease,praI,retransplant,isDonorAlive,donorAge,donorSex,donorWeight,donorHeight,donorBmi,donorDeathCause,donorHypertension,donorDiabetesMellitus,donorFinalSodium,donorFinalCpk,donorFinalCreatinine,donorUsedVasoactiveDrug,donorHepatitisCVirus,coldIschemiaTimeMin,totalAnastomosisTimeMin,inductionType,initialImmunization,mismatch,rejected,delayedGraftFunction,outlier_iqr,outlier_zscore,outlier_isolation_forest,outlier_dbscan,null_columns,donorBodySurface,recipientBmi
0,32,87.2,167.0,1.961351,brown,female,hemodialise,hypertension,0.0,False,True,26.0,male,,,,,False,False,,,1.36,False,,21.6,35,basiliximab (simulet),CNI+MMF+PRED,6.0,True,False,[recipientBodySurface],[recipientBodySurface],"[mismatch, id]",[id],"[donorWeight, donorHeight, donorBmi, donorDeat...",,31.266808
1,25,46.6,156.0,1.430361,white,female,hemodialise,glomerulopathy,,False,True,29.0,male,,,,,False,False,,,1.36,False,,21.6,35,no_induction,CNI+MMF+PRED,3.0,True,False,[],[],[id],[id],"[praI, donorWeight, donorHeight, donorBmi, don...",,19.148586
2,30,49.0,175.0,1.588189,white,female,hemodialise,glomerulopathy,0.0,False,False,50.0,male,75.0,172.0,25.351541,avci,True,False,146.0,181.0,0.87,True,False,22.0,65,basiliximab (simulet),CNI+MMF+PRED,3.0,False,False,[],[],[id],"[donorFinalCpk, id]",[],1.879427,16.000000
3,45,,,,brown,female,hemodialise,glomerulopathy,98.0,True,False,18.0,female,,,,,False,False,,,1.36,False,,21.6,48,thymoglobulin,CNI+MMF+PRED,0.0,False,False,"[recipientBodySurface, praI]","[recipientBodySurface, praI]","[praI, id]",[id],"[recipientWeight, recipientHeight, donorWeight...",,
4,55,94.0,176.0,2.103499,white,male,hemodialise,diabetes,,False,True,45.0,female,,,,,False,False,,,1.36,False,,21.6,50,no_induction,CNI+MMF+PRED,6.0,True,False,[],[],"[mismatch, id]",[id],"[recipientBirthdate, praI, donorWeight, donorH...",,30.346074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1250,44,,,,brown,male,,hypertension,0.0,,False,19.0,,,,,tce,False,False,,,1.10,False,,19.0,48,thymoglobulin,CNI+imTOR+PRED,2.0,False,,[recipientBodySurface],[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp...",,
1251,34,,,,white,female,,glomerulopathy,0.0,,False,35.0,,,,,tce,False,False,,,1.90,False,,22.0,48,thymoglobulin,CNI+imTOR+PRED,4.0,False,,[recipientBodySurface],[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp...",,
1252,59,,,,white,male,,hypertension,0.0,,False,35.0,,,,,avci,False,False,,,1.67,True,,13.0,48,thymoglobulin,CNI+imTOR+PRED,3.0,False,,"[recipientBodySurface, coldIschemiaTimeMin]",[recipientBodySurface],[id],[id],"[recipientWeight, recipientHeight, dialysisTyp...",,
1253,35,,,,white,female,,indeterminate,83.0,,False,27.0,,,,,avci,False,False,,,0.80,False,,11.0,48,,,3.0,False,,"[recipientBodySurface, praI, coldIschemiaTimeMin]",[recipientBodySurface],"[coldIschemiaTimeMin, id]",[id],"[recipientWeight, recipientHeight, dialysisTyp...",,


## Save output

In [40]:
df.to_parquet('rejection.parquet')