# Modelagem

In [1]:
import numpy as np
import pandas as pd

# Show every column when a wide frame is rendered.
# To lift the row limit as well, enable: pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Load the prepared waitlist table; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    'waitlist_kidney_brazil_prepared.csv',
    encoding='iso-8859-1',
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,date_registered,age_registered,dialysis_session_count,sex,underlying_disease,diabetes,chagas,blood_type,transfusion_count,gestation,prior_transplant,c_pra,hla_a1,hla_a2,hla_b1,hla_b2,hla_dr1,hla_dr2,dr_00,b_00,a_00,anti_hbc,anti_hcv,hbs_ag,event,days_waiting
0,2017-06-29,67,1.0,M,other,True,False,A,0,False,False,0,1,26,44,51,3,7,heterozygous,heterozygous,heterozygous,False,False,False,waiting,392
1,2008-11-03,58,4.0,M,diabetes,False,False,A,0,False,False,0,1,24,18,35,11,0,homozygous,heterozygous,heterozygous,False,False,False,died_waiting,2066
2,2010-07-13,51,2.0,M,hypertension,True,False,O,0,False,False,64,24,25,14,18,1,15,heterozygous,heterozygous,heterozygous,False,False,False,removed,365
3,2011-10-03,52,17.0,M,diabetes,False,False,O,0,False,True,2,24,25,14,18,1,15,heterozygous,heterozygous,heterozygous,False,False,False,removed,365
4,2006-07-05,67,68.0,M,hypertension,True,False,A,0,False,False,0,24,68,14,27,13,15,heterozygous,heterozygous,heterozygous,False,False,False,died_waiting,194


## Limpeza dos dados

In [4]:
# Derive a new frame without the registration-date column; `df` itself is left untouched.
df_clean = df.drop(labels='date_registered', axis='columns')

In [5]:
# Number of rows per outcome in the `event` column.
df_clean.loc[:, 'event'].value_counts()

event
removed         14356
transplanted    13732
waiting         10933
died_waiting     9132
Name: count, dtype: int64

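Originalmente, mantínhamos apenas os registros com evento `transplanted`, removendo `waiting`, `removed` e `died_waiting`, porque representam casos em que um transplante não aconteceu. Esse filtro está desativado (código comentado abaixo): o modelo de Cox mais adiante usa esses registros como observações sem evento.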

In [6]:
# NOTE(review): this filter is disabled, so rows for every event type
# (not only 'transplanted') remain in df_clean.
# df_clean = df_clean[df_clean['event'] == 'transplanted']
# df_clean['event'].value_counts()

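A remoção da coluna de evento (que seria redundante após o filtro acima) também está desativada: a coluna `event` é mantida e usada como indicador de evento no ajuste do modelo de Cox.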

In [7]:
# NOTE(review): disabled; the 'event' column is kept and is later used as the
# event indicator when fitting the Cox model.
# df_clean = df_clean.drop(columns=['event'])

## Engenharia de Features

In [8]:
# One-hot encode the listed columns; drop_first=True omits the first level of
# each, leaving k-1 indicator columns for a k-level column.
df_onehot = pd.get_dummies(
    df_clean,
    columns=['sex', 'underlying_disease', 'blood_type', 'dr_00', 'b_00', 'a_00'],
    drop_first=True,
)
df_onehot.head()

Unnamed: 0,age_registered,dialysis_session_count,diabetes,chagas,transfusion_count,gestation,prior_transplant,c_pra,hla_a1,hla_a2,hla_b1,hla_b2,hla_dr1,hla_dr2,anti_hbc,anti_hcv,hbs_ag,event,days_waiting,sex_M,underlying_disease_glomerulonephritis,underlying_disease_hypertension,underlying_disease_other,underlying_disease_pyelonephritis,blood_type_AB,blood_type_B,blood_type_O,dr_00_homozygous,b_00_homozygous,a_00_homozygous
0,67,1.0,True,False,0,False,False,0,1,26,44,51,3,7,False,False,False,waiting,392,True,False,False,True,False,False,False,False,False,False,False
1,58,4.0,False,False,0,False,False,0,1,24,18,35,11,0,False,False,False,died_waiting,2066,True,False,False,False,False,False,False,False,True,False,False
2,51,2.0,True,False,0,False,False,64,24,25,14,18,1,15,False,False,False,removed,365,True,False,True,False,False,False,False,True,False,False,False
3,52,17.0,False,False,0,False,True,2,24,25,14,18,1,15,False,False,False,removed,365,True,False,False,False,False,False,False,True,False,False,False
4,67,68.0,True,False,0,False,False,0,24,68,14,27,13,15,False,False,False,died_waiting,194,True,False,True,False,False,False,False,False,False,False,False


In [9]:
# Name of the column used as the regression target.
target = "days_waiting"

In [10]:
# Split the encoded frame into the target series and the remaining columns.
# NOTE(review): df_onehot still carries the string-valued `event` column, so it
# ends up in X as well -- confirm whether it should be dropped or encoded
# before X is fed to a numeric estimator.
y = df_onehot[target]
X = df_onehot.drop(columns=[target])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1729,
)

## Métricas de avaliação

In [12]:
from sklearn.metrics import mean_squared_error, explained_variance_score

## Modelos

### Risk-minimization

Modelo de SVR.

In [13]:
# NOTE(review): disabled SVR baseline, kept for reference. As the notebook
# stands, X still carries the string-valued `event` column, so StandardScaler
# would presumably fail on it if this cell were re-enabled as-is -- confirm
# and drop/encode `event` first.
# from sklearn.svm import SVR
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler

# regr = make_pipeline(StandardScaler(), SVR(kernel='linear', max_iter=1000000))
# regr.fit(X_train, y_train)

In [14]:
# y_pred = regr.predict(X_test)

In [15]:
# y_pred

In [16]:
# mean_squared_error(y_test, y_pred)

In [17]:
# explained_variance_score(y_test, y_pred)

In [18]:
!pip3 install lifelines

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ifoodcorp.com.br

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [19]:
from lifelines.fitters.coxph_fitter import CoxPHFitter

In [20]:
# Build the Cox input frame without touching df_onehot: `event` becomes a
# boolean flag that is True only for 'transplanted' rows, and the `chagas` and
# `diabetes` columns are left out.
df_cph = (
    df_onehot
    .assign(event=df_onehot['event'] == 'transplanted')
    .drop(columns=['chagas', 'diabetes'])
)

In [21]:
# Fit a Cox proportional-hazards model: durations are read from `days_waiting`
# and `event` flags the rows where the event was observed.
cph = CoxPHFitter()
cph.fit(
    df_cph,
    duration_col='days_waiting',
    event_col='event',
    show_progress=True,
)
cph.print_summary()

Iteration 1: norm_delta = 6.03e-01, step_size = 0.9500, log_lik = -137423.72662, newton_decrement = 2.32e+03, seconds_since_start = 0.2
Iteration 2: norm_delta = 1.67e-01, step_size = 0.9500, log_lik = -135003.43127, newton_decrement = 1.38e+02, seconds_since_start = 0.3
Iteration 3: norm_delta = 4.08e-02, step_size = 0.9500, log_lik = -134854.71341, newton_decrement = 5.79e+00, seconds_since_start = 0.5
Iteration 4: norm_delta = 2.27e-03, step_size = 1.0000, log_lik = -134848.74987, newton_decrement = 1.48e-02, seconds_since_start = 0.6
Iteration 5: norm_delta = 7.73e-06, step_size = 1.0000, log_lik = -134848.73502, newton_decrement = 1.68e-07, seconds_since_start = 0.8
Iteration 6: norm_delta = 9.09e-11, step_size = 1.0000, log_lik = -134848.73502, newton_decrement = 2.32e-17, seconds_since_start = 0.9
Convergence success after 6 iterations.


0,1
model,lifelines.CoxPHFitter
duration col,'days_waiting'
event col,'event'
baseline estimation,breslow
number of observations,48153
number of events observed,13732
partial log-likelihood,-134848.74
time fit was run,2023-10-15 20:54:58 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
age_registered,-0.02,0.98,0.0,-0.02,-0.02,0.98,0.98,0.0,-35.41,<0.005,910.02
dialysis_session_count,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.65,0.01,6.96
transfusion_count,0.06,1.06,0.01,0.04,0.09,1.04,1.09,0.0,4.5,<0.005,17.18
gestation,0.15,1.17,0.03,0.1,0.21,1.1,1.23,0.0,5.34,<0.005,23.36
prior_transplant,-0.08,0.92,0.03,-0.14,-0.02,0.87,0.98,0.0,-2.74,0.01,7.33
c_pra,-0.01,0.99,0.0,-0.01,-0.01,0.99,0.99,0.0,-33.58,<0.005,818.88
hla_a1,-0.0,1.0,0.0,-0.01,-0.0,0.99,1.0,0.0,-5.89,<0.005,27.94
hla_a2,-0.0,1.0,0.0,-0.0,-0.0,1.0,1.0,0.0,-7.49,<0.005,43.75
hla_b1,-0.0,1.0,0.0,-0.01,-0.0,0.99,1.0,0.0,-7.65,<0.005,45.49
hla_b2,-0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,-1.84,0.07,3.92

0,1
Concordance,0.67
Partial AIC,269749.47
log-likelihood ratio test,5149.98 on 26 df
-log2(p) of ll-ratio test,inf
