In [91]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder


from sksurv.preprocessing import OneHotEncoder
from sksurv.ensemble import RandomSurvivalForest

# Preparado de Datos con Imputaciones de outliers y missing values

En este caso se trabaja con el archivo de datos tras el procesado de valores faltantes y outliers tal como se explica en la memoria.

In [92]:
Data=pd.read_csv('df_con_imputaciones.csv')
Data.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
0,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1,1.771485
1,LUNG1-002,83.8001,2.0,0,0,I,squamous cell carcinoma,male,155,1,1.881307
2,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1,1.376259
3,LUNG1-004,70.8802,2.0,1,0,II,squamous cell carcinoma,male,141,1,1.73471
4,LUNG1-005,80.4819,4.0,2,0,IIIb,squamous cell carcinoma,male,353,1,1.581113


In [93]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         421 non-null    object 
 1   age               421 non-null    float64
 2   clinical.T.Stage  421 non-null    float64
 3   Clinical.N.Stage  421 non-null    int64  
 4   Clinical.M.Stage  421 non-null    int64  
 5   Overall.Stage     421 non-null    object 
 6   Histology         421 non-null    object 
 7   gender            421 non-null    object 
 8   Survival.time     421 non-null    int64  
 9   deadstatus.event  421 non-null    int64  
 10  F.analysis        421 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 36.3+ KB


## Eliminación valores erróneos de Stage

In [94]:
Datos_NoNulos=Data.copy()
Datos_NoNulos.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
0,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1,1.771485
1,LUNG1-002,83.8001,2.0,0,0,I,squamous cell carcinoma,male,155,1,1.881307
2,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1,1.376259
3,LUNG1-004,70.8802,2.0,1,0,II,squamous cell carcinoma,male,141,1,1.73471
4,LUNG1-005,80.4819,4.0,2,0,IIIb,squamous cell carcinoma,male,353,1,1.581113


### Clinical T Stage

In [95]:
T_rows = Datos_NoNulos[Datos_NoNulos['clinical.T.Stage'] > 4]
T_rows

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
55,LUNG1-056,68.0365,5.0,2,0,IIIa,squamous cell carcinoma,female,547,0,1.672505


In [96]:
Datos_NoNulos=Datos_NoNulos.drop([55], axis=0, inplace=False)
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         420 non-null    object 
 1   age               420 non-null    float64
 2   clinical.T.Stage  420 non-null    float64
 3   Clinical.N.Stage  420 non-null    int64  
 4   Clinical.M.Stage  420 non-null    int64  
 5   Overall.Stage     420 non-null    object 
 6   Histology         420 non-null    object 
 7   gender            420 non-null    object 
 8   Survival.time     420 non-null    int64  
 9   deadstatus.event  420 non-null    int64  
 10  F.analysis        420 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 39.4+ KB


### Clinical N Stage

In [97]:
N_rows = Datos_NoNulos[Datos_NoNulos['Clinical.N.Stage'] > 3]
N_rows

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
166,LUNG1-167,74.9295,4.0,4,0,IIIb,squamous cell carcinoma,male,1357,1,0.872282
231,LUNG1-232,73.2868,4.0,4,0,IIIb,large cell,male,2521,1,1.588498
290,LUNG1-292,66.2149,4.0,4,0,IIIb,squamous cell carcinoma,male,232,1,1.671252


In [98]:
Datos_NoNulos=Datos_NoNulos.drop([166], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([231], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([290], axis=0, inplace=False)
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 417 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         417 non-null    object 
 1   age               417 non-null    float64
 2   clinical.T.Stage  417 non-null    float64
 3   Clinical.N.Stage  417 non-null    int64  
 4   Clinical.M.Stage  417 non-null    int64  
 5   Overall.Stage     417 non-null    object 
 6   Histology         417 non-null    object 
 7   gender            417 non-null    object 
 8   Survival.time     417 non-null    int64  
 9   deadstatus.event  417 non-null    int64  
 10  F.analysis        417 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 39.1+ KB


### Clinical M Stage

In [99]:
M_rows = Datos_NoNulos[Datos_NoNulos['Clinical.M.Stage'] >2]
M_rows

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
71,LUNG1-072,71.4743,4.0,3,3,IIIb,nos,male,377,1,1.542416
255,LUNG1-256,53.0842,4.0,2,3,IIIb,large cell,male,291,1,1.658656
268,LUNG1-269,73.0595,3.0,3,3,IIIa,large cell,male,193,1,1.382715
331,LUNG1-333,63.6988,1.0,2,3,IIIa,adenocarcinoma,male,2985,1,1.223261


In [100]:
Datos_NoNulos=Datos_NoNulos.drop([71], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([255], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([268], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([331], axis=0, inplace=False)
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         413 non-null    object 
 1   age               413 non-null    float64
 2   clinical.T.Stage  413 non-null    float64
 3   Clinical.N.Stage  413 non-null    int64  
 4   Clinical.M.Stage  413 non-null    int64  
 5   Overall.Stage     413 non-null    object 
 6   Histology         413 non-null    object 
 7   gender            413 non-null    object 
 8   Survival.time     413 non-null    int64  
 9   deadstatus.event  413 non-null    int64  
 10  F.analysis        413 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 38.7+ KB


De este modo se elimina un 2.13 % de los datos, entre los datos no imputados (1) y erróneos (8)

## Valores erróneos Histología

### Histology nos
Datos sin Histología definida

In [101]:
H_rows = Datos_NoNulos[Datos_NoNulos['Histology'] =='nos']
H_rows

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
12,LUNG1-013,65.3635,2.0,0,0,I,nos,male,547,1,1.278645
15,LUNG1-016,79.1129,2.0,0,0,I,nos,male,101,1,1.799757
18,LUNG1-019,74.8200,2.0,0,0,I,nos,male,336,1,1.682059
50,LUNG1-051,51.6906,2.0,0,0,I,nos,male,210,1,1.693693
56,LUNG1-057,74.9076,2.0,2,0,IIIa,nos,male,98,1,1.688905
...,...,...,...,...,...,...,...,...,...,...,...
329,LUNG1-331,69.8645,3.0,2,0,IIIa,nos,male,549,1,1.854966
342,LUNG1-344,67.7837,4.0,0,0,IIIb,nos,male,587,1,1.747037
346,LUNG1-348,62.4641,4.0,2,0,IIIb,nos,female,1672,1,1.792885
348,LUNG1-350,77.4018,2.0,3,0,IIIb,nos,female,444,1,1.454578


In [102]:
H_rows.count()

PatientID           62
age                 62
clinical.T.Stage    62
Clinical.N.Stage    62
Clinical.M.Stage    62
Overall.Stage       62
Histology           62
gender              62
Survival.time       62
deadstatus.event    62
F.analysis          62
dtype: int64

Si se eliminaran estos pacientes de Histología estaríamos hablando de una reducción del 16,82% de los datos por lo que inicialmente se dejan. Se evaluará el rendimiento del modelo con y sin esa columna, para contar con las mismas condiciones que en el caso del modelo sin imputaciones

##  Encoding de datos categóricos

### Overall.Stage

In [103]:
 from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()

In [104]:
Datos_NoNulos['Overall.Stage']= enc.fit_transform(Datos_NoNulos[['Overall.Stage']])

In [105]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         413 non-null    object 
 1   age               413 non-null    float64
 2   clinical.T.Stage  413 non-null    float64
 3   Clinical.N.Stage  413 non-null    int64  
 4   Clinical.M.Stage  413 non-null    int64  
 5   Overall.Stage     413 non-null    float64
 6   Histology         413 non-null    object 
 7   gender            413 non-null    object 
 8   Survival.time     413 non-null    int64  
 9   deadstatus.event  413 non-null    int64  
 10  F.analysis        413 non-null    float64
dtypes: float64(4), int64(4), object(3)
memory usage: 38.7+ KB


### Histology

In [106]:
dummies = pd.get_dummies(Datos_NoNulos['Histology'])
dummies

Unnamed: 0,adenocarcinoma,large cell,nos,squamous cell carcinoma
0,0,1,0,0
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
416,1,0,0,0
417,0,0,0,1
418,0,0,0,1
419,0,0,0,1


In [107]:
Datos_NoNulos = pd.concat([
    Datos_NoNulos.drop("Histology", axis = 1),
    dummies
], axis = 1)
Datos_NoNulos.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,gender,Survival.time,deadstatus.event,F.analysis,adenocarcinoma,large cell,nos,squamous cell carcinoma
0,LUNG1-001,78.7515,2.0,3,0,3.0,male,2165,1,1.771485,0,1,0,0
1,LUNG1-002,83.8001,2.0,0,0,0.0,male,155,1,1.881307,0,0,0,1
2,LUNG1-003,68.1807,2.0,3,0,3.0,male,256,1,1.376259,0,1,0,0
3,LUNG1-004,70.8802,2.0,1,0,1.0,male,141,1,1.73471,0,0,0,1
4,LUNG1-005,80.4819,4.0,2,0,3.0,male,353,1,1.581113,0,0,0,1


In [108]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PatientID                413 non-null    object 
 1   age                      413 non-null    float64
 2   clinical.T.Stage         413 non-null    float64
 3   Clinical.N.Stage         413 non-null    int64  
 4   Clinical.M.Stage         413 non-null    int64  
 5   Overall.Stage            413 non-null    float64
 6   gender                   413 non-null    object 
 7   Survival.time            413 non-null    int64  
 8   deadstatus.event         413 non-null    int64  
 9   F.analysis               413 non-null    float64
 10  adenocarcinoma           413 non-null    uint8  
 11  large cell               413 non-null    uint8  
 12  nos                      413 non-null    uint8  
 13  squamous cell carcinoma  413 non-null    uint8  
dtypes: float64(4), int64(4), o

### gender

In [109]:
dummies = pd.get_dummies(Datos_NoNulos['gender'])
dummies

Unnamed: 0,female,male
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
416,0,1
417,0,1
418,0,1
419,1,0


In [110]:
Datos_NoNulos = pd.concat([
    Datos_NoNulos.drop("gender", axis = 1),
    dummies
], axis = 1)
Datos_NoNulos.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Survival.time,deadstatus.event,F.analysis,adenocarcinoma,large cell,nos,squamous cell carcinoma,female,male
0,LUNG1-001,78.7515,2.0,3,0,3.0,2165,1,1.771485,0,1,0,0,0,1
1,LUNG1-002,83.8001,2.0,0,0,0.0,155,1,1.881307,0,0,0,1,0,1
2,LUNG1-003,68.1807,2.0,3,0,3.0,256,1,1.376259,0,1,0,0,0,1
3,LUNG1-004,70.8802,2.0,1,0,1.0,141,1,1.73471,0,0,0,1,0,1
4,LUNG1-005,80.4819,4.0,2,0,3.0,353,1,1.581113,0,0,0,1,0,1


In [111]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PatientID                413 non-null    object 
 1   age                      413 non-null    float64
 2   clinical.T.Stage         413 non-null    float64
 3   Clinical.N.Stage         413 non-null    int64  
 4   Clinical.M.Stage         413 non-null    int64  
 5   Overall.Stage            413 non-null    float64
 6   Survival.time            413 non-null    int64  
 7   deadstatus.event         413 non-null    int64  
 8   F.analysis               413 non-null    float64
 9   adenocarcinoma           413 non-null    uint8  
 10  large cell               413 non-null    uint8  
 11  nos                      413 non-null    uint8  
 12  squamous cell carcinoma  413 non-null    uint8  
 13  female                   413 non-null    uint8  
 14  male                     4

## Normalización de los datos

In [112]:
df_normalizar=Datos_NoNulos[["age" , "clinical.T.Stage","Clinical.N.Stage","Clinical.M.Stage","Overall.Stage", "F.analysis"]]
col_norm=df_normalizar.columns
col_norm

Index(['age', 'clinical.T.Stage', 'Clinical.N.Stage', 'Clinical.M.Stage',
       'Overall.Stage', 'F.analysis'],
      dtype='object')

In [113]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaled_df = scaler.fit_transform(df_normalizar)

In [114]:
scaled_df = pd.DataFrame(scaled_df, columns=col_norm)

In [115]:
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413 entries, 0 to 412
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               413 non-null    float64
 1   clinical.T.Stage  413 non-null    float64
 2   Clinical.N.Stage  413 non-null    float64
 3   Clinical.M.Stage  413 non-null    float64
 4   Overall.Stage     413 non-null    float64
 5   F.analysis        413 non-null    float64
dtypes: float64(6)
memory usage: 19.5 KB


In [116]:
filas_nonulos=list()
filas_norm=list()
for indice_fila,fila in Datos_NoNulos.iterrows():
      filas_nonulos.append(indice_fila)
for indice_fil_norm, fila_norm in scaled_df.iterrows():
    filas_norm.append(indice_fil_norm)
    

In [117]:
filas_nonulos

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187

In [118]:
filas_norm

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [119]:
for i,j in zip(filas_nonulos,filas_norm):
    Datos_NoNulos.at[i,'age']=scaled_df.at[j,'age']
    Datos_NoNulos.at[i,'clinical.T.Stage']=scaled_df.at[j,'clinical.T.Stage']
    Datos_NoNulos.at[i,'Clinical.N.Stage']=scaled_df.at[j,'Clinical.N.Stage']
    Datos_NoNulos.at[i,'Clinical.M.Stage']=scaled_df.at[j,'Clinical.M.Stage']
    Datos_NoNulos.at[i,'Overall.Stage']=scaled_df.at[j,'Overall.Stage']
    Datos_NoNulos.at[i,'F.analysis']=scaled_df.at[j,'F.analysis']
   
    


In [120]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PatientID                413 non-null    object 
 1   age                      413 non-null    float64
 2   clinical.T.Stage         413 non-null    float64
 3   Clinical.N.Stage         413 non-null    int64  
 4   Clinical.M.Stage         413 non-null    int64  
 5   Overall.Stage            413 non-null    float64
 6   Survival.time            413 non-null    int64  
 7   deadstatus.event         413 non-null    int64  
 8   F.analysis               413 non-null    float64
 9   adenocarcinoma           413 non-null    uint8  
 10  large cell               413 non-null    uint8  
 11  nos                      413 non-null    uint8  
 12  squamous cell carcinoma  413 non-null    uint8  
 13  female                   413 non-null    uint8  
 14  male                     4

In [121]:
Datos_NoNulos.describe()

Unnamed: 0,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Survival.time,deadstatus.event,F.analysis,adenocarcinoma,large cell,nos,squamous cell carcinoma,female,male
count,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
mean,0.520965,0.48184,0.200969,0.002421,0.622276,746.961259,0.883777,0.566964,0.121065,0.266344,0.150121,0.46247,0.317191,0.682809
std,0.197697,0.370873,0.401211,0.049207,0.394021,679.767413,0.320881,0.195464,0.326599,0.442582,0.357623,0.499194,0.465947,0.465947
min,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.390216,0.333333,0.0,0.0,0.333333,261.0,1.0,0.428128,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.518859,0.333333,0.0,0.0,0.666667,547.0,1.0,0.5784,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.670452,1.0,0.0,0.0,1.0,1013.0,1.0,0.717933,0.0,1.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,3040.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [122]:
Datos_NoNulos

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Survival.time,deadstatus.event,F.analysis,adenocarcinoma,large cell,nos,squamous cell carcinoma,female,male
0,LUNG1-001,0.736684,0.333333,1,0,1.000000,2165,1,0.770741,0,1,0,0,0,1
1,LUNG1-002,0.839316,0.333333,0,0,0.000000,155,1,0.842723,0,0,0,1,0,1
2,LUNG1-003,0.521791,0.333333,1,0,1.000000,256,1,0.511693,0,1,0,0,0,1
3,LUNG1-004,0.576668,0.333333,0,0,0.333333,141,1,0.746637,0,0,0,1,0,1
4,LUNG1-005,0.771861,1.000000,0,0,1.000000,353,1,0.645963,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,LUNG1-418,0.226828,0.333333,0,0,0.000000,346,1,0.686210,1,0,0,0,0,1
417,LUNG1-419,0.487819,1.000000,0,0,1.000000,2772,0,0.665886,0,0,0,1,0,1
418,LUNG1-420,0.627503,0.333333,0,0,0.333333,2429,1,0.514863,0,0,0,1,0,1
419,LUNG1-421,0.390128,0.333333,0,0,0.666667,369,1,0.622309,0,0,0,1,1,0


## CONVERSION DATAFRAME A CSV

In [123]:
Datos_NoNulos.to_csv('df_con_imputaciones_norm.csv')

## MODELO RANDOM SURVIVAL FOREST

In [124]:
X=Datos_NoNulos[['age','clinical.T.Stage','Clinical.N.Stage','Clinical.M.Stage','F.analysis','adenocarcinoma','nos','squamous cell carcinoma','large cell','Overall.Stage','female','male']]

In [125]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      413 non-null    float64
 1   clinical.T.Stage         413 non-null    float64
 2   Clinical.N.Stage         413 non-null    int64  
 3   Clinical.M.Stage         413 non-null    int64  
 4   F.analysis               413 non-null    float64
 5   adenocarcinoma           413 non-null    uint8  
 6   nos                      413 non-null    uint8  
 7   squamous cell carcinoma  413 non-null    uint8  
 8   large cell               413 non-null    uint8  
 9   Overall.Stage            413 non-null    float64
 10  female                   413 non-null    uint8  
 11  male                     413 non-null    uint8  
dtypes: float64(4), int64(2), uint8(6)
memory usage: 41.2 KB


In [126]:
Y_df=Datos_NoNulos[['deadstatus.event','Survival.time']]

In [127]:
Y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deadstatus.event  413 non-null    int64
 1   Survival.time     413 non-null    int64
dtypes: int64(2)
memory usage: 25.8 KB


In [128]:
Y_df['deadstatus.event']=Y_df['deadstatus.event'].astype('bool')
Y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deadstatus.event  413 non-null    bool 
 1   Survival.time     413 non-null    int64
dtypes: bool(1), int64(1)
memory usage: 23.0 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_df['deadstatus.event']=Y_df['deadstatus.event'].astype('bool')


In [129]:
Y=Y_df.to_records(index=False)
Y

rec.array([( True, 2165), ( True,  155), ( True,  256), ( True,  141),
           ( True,  353), ( True,  173), ( True,  137), ( True,   77),
           ( True,  131), (False, 2119), ( True,  515), ( True,   85),
           ( True,  547), ( True, 1247), ( True, 1238), ( True,  101),
           ( True,  220), ( True, 1926), ( True,  336), ( True,  139),
           ( True,  326), ( True,  442), ( True,  245), ( True, 1141),
           ( True, 1883), ( True,   25), (False, 1972), ( True,  479),
           ( True,  257), ( True,  303), ( True,  999), ( True,  543),
           ( True,  456), ( True,  597), ( True,   98), ( True,  366),
           ( True,  464), ( True,  370), ( True,  342), ( True,  558),
           ( True,  136), ( True,  134), ( True,  183), ( True,  170),
           ( True, 1070), ( True,   73), (False, 1810), (False,  547),
           ( True, 1670), ( True,  208), ( True,  210), ( True,   73),
           ( True,   78), ( True, 1076), ( True,  192), ( True,   98),
      

In [130]:
random_state = 20

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=random_state)

In [131]:
rsf = RandomSurvivalForest(
    n_estimators=1000, min_samples_split=10, min_samples_leaf=15, n_jobs=-1, random_state=random_state
)
rsf.fit(X_train, y_train)

In [132]:
rsf.score(X_test, y_test)

0.5413139862958485

In [133]:
from sklearn.inspection import permutation_importance

result = permutation_importance(rsf, X_test, y_test, n_repeats=15, random_state=random_state)

In [134]:
pd.DataFrame(
    {
        k: result[k]
        for k in (
            "importances_mean",
            "importances_std",
        )
    },
    index=X_test.columns,
).sort_values(by="importances_mean", ascending=False)

Unnamed: 0,importances_mean,importances_std
F.analysis,0.027334,0.037024
age,0.012065,0.014885
clinical.T.Stage,0.003641,0.004359
Clinical.N.Stage,0.001706,0.003504
Overall.Stage,0.001223,0.002479
nos,0.000134,0.001144
adenocarcinoma,1.3e-05,0.000316
Clinical.M.Stage,0.0,0.0
large cell,-0.00043,0.00165
male,-0.000927,0.003692


In [135]:
X_test_sorted = X_test.sort_values(by=[ "F.analysis","age"])
X_test_sel = pd.concat((X_test_sorted.head(3), X_test_sorted.tail(3)))

X_test_sel

Unnamed: 0,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,F.analysis,adenocarcinoma,nos,squamous cell carcinoma,large cell,Overall.Stage,female,male
97,0.886904,0.333333,0,0,0.0,0,0,1,0,0.0,0,1
177,0.379752,0.666667,1,0,0.206167,0,0,0,1,1.0,0,1
381,0.503525,0.0,1,0,0.221336,0,0,1,0,0.666667,0,1
64,0.68765,0.333333,1,0,0.90968,0,1,0,0,1.0,0,1
394,0.338442,1.0,0,0,0.946216,0,0,1,0,1.0,1,0
195,0.865978,0.333333,0,0,1.0,0,0,0,1,0.666667,1,0


In [136]:
pd.Series(rsf.predict(X_test_sel))

0    243.771368
1    185.422091
2    187.305233
3    327.964662
4    264.941734
5    301.919178
dtype: float64

 ### Modelo RSF quitando columnas

In [137]:
#X=Datos_NoNulos[['age','clinical.T.Stage','Clinical.N.Stage','Clinical.M.Stage','F.analysis','adenocarcinoma','nos','squamous cell carcinoma','large cell','Overall.Stage']]
#X=Datos_NoNulos[['age','clinical.T.Stage','Clinical.N.Stage','Clinical.M.Stage','F.analysis','Overall.Stage']]
#X=Datos_NoNulos[['age','clinical.T.Stage','Clinical.N.Stage','Clinical.M.Stage','F.analysis']]
#X=Datos_NoNulos[['age','F.analysis']]
X=Datos_NoNulos[['age','F.analysis','clinical.T.Stage']]
#X=Datos_NoNulos[['age']]
#X=Datos_NoNulos[['F.analysis']]

In [138]:
Y_df=Datos_NoNulos[['deadstatus.event','Survival.time']]
Y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deadstatus.event  413 non-null    int64
 1   Survival.time     413 non-null    int64
dtypes: int64(2)
memory usage: 25.8 KB


In [139]:
Y_df['deadstatus.event']=Y_df['deadstatus.event'].astype('bool')
Y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 0 to 420
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deadstatus.event  413 non-null    bool 
 1   Survival.time     413 non-null    int64
dtypes: bool(1), int64(1)
memory usage: 23.0 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_df['deadstatus.event']=Y_df['deadstatus.event'].astype('bool')


In [140]:
Y=Y_df.to_records(index=False)
#Y

In [141]:
random_state = 20

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=random_state)

In [142]:
rsf = RandomSurvivalForest(
    n_estimators=1000, min_samples_split=10, min_samples_leaf=15, n_jobs=-1, random_state=random_state
)
rsf.fit(X_train, y_train)

In [143]:
rsf.score(X_test, y_test)

0.5630794034663442

In [144]:
from sklearn.inspection import permutation_importance

result = permutation_importance(rsf, X_test, y_test, n_repeats=15, random_state=random_state)

In [145]:
pd.DataFrame(
    {
        k: result[k]
        for k in (
            "importances_mean",
            "importances_std",
        )
    },
    index=X_test.columns,
).sort_values(by="importances_mean", ascending=False)

Unnamed: 0,importances_mean,importances_std
F.analysis,0.031311,0.037168
age,0.018192,0.019098
clinical.T.Stage,0.006892,0.010816


In [146]:
X_test_sorted = X_test.sort_values(by=[ "F.analysis"])
X_test_sel = pd.concat((X_test_sorted.head(3), X_test_sorted.tail(3)))

X_test_sel

Unnamed: 0,age,F.analysis,clinical.T.Stage
97,0.886904,0.0,0.333333
177,0.379752,0.206167,0.666667
381,0.503525,0.221336,0.0
64,0.68765,0.90968,0.333333
394,0.338442,0.946216,1.0
195,0.865978,1.0,0.333333


In [147]:
pd.Series(rsf.predict(X_test_sel))

0    245.130937
1    159.998832
2    120.491748
3    371.229338
4    294.715408
5    375.474032
dtype: float64