In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder


from sksurv.preprocessing import OneHotEncoder
from sksurv.ensemble import RandomSurvivalForest

# Preparado de Datos Sin Imputaciones

En este caso se trabaja con el archivo de datos sin procesar, es decir, los datos descargados directamente de la base de datos, simplemente con una columna a mayores : F.analisis, que se corresponde con la dimensión fractal calculada previamente en MATLAB.

In [73]:
Data=pd.read_csv('df_sin_imputaciones.csv')
Data.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
0,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1,1.771485
1,LUNG1-002,83.8001,2.0,0,0,I,squamous cell carcinoma,male,155,1,1.881307
2,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1,1.376259
3,LUNG1-004,70.8802,2.0,1,0,II,squamous cell carcinoma,male,141,1,1.73471
4,LUNG1-005,80.4819,4.0,2,0,IIIb,squamous cell carcinoma,male,353,1,1.581113


In [74]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422 entries, 0 to 421
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         422 non-null    object 
 1   age               400 non-null    float64
 2   clinical.T.Stage  421 non-null    float64
 3   Clinical.N.Stage  422 non-null    int64  
 4   Clinical.M.Stage  422 non-null    int64  
 5   Overall.Stage     421 non-null    object 
 6   Histology         380 non-null    object 
 7   gender            422 non-null    object 
 8   Survival.time     422 non-null    int64  
 9   deadstatus.event  422 non-null    int64  
 10  F.analysis        421 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 36.4+ KB


## Eliminación datos faltantes

In [75]:
Datos_NoNulos=Data.dropna()


In [76]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 364 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         364 non-null    object 
 1   age               364 non-null    float64
 2   clinical.T.Stage  364 non-null    float64
 3   Clinical.N.Stage  364 non-null    int64  
 4   Clinical.M.Stage  364 non-null    int64  
 5   Overall.Stage     364 non-null    object 
 6   Histology         364 non-null    object 
 7   gender            364 non-null    object 
 8   Survival.time     364 non-null    int64  
 9   deadstatus.event  364 non-null    int64  
 10  F.analysis        364 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 34.1+ KB


## Eliminación valores erróneos de Stage

### Clinical T Stage

In [77]:
T_rows = Datos_NoNulos[Datos_NoNulos['clinical.T.Stage'] > 4]
T_rows

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis


### Clinical N Stage

In [78]:
N_rows = Datos_NoNulos[Datos_NoNulos['Clinical.N.Stage'] > 3]
N_rows

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
166,LUNG1-167,74.9295,4.0,4,0,IIIb,squamous cell carcinoma,male,1357,1,0.872282
231,LUNG1-232,73.2868,4.0,4,0,IIIb,large cell,male,2521,1,1.588498
291,LUNG1-292,66.2149,4.0,4,0,IIIb,squamous cell carcinoma,male,232,1,1.671252


In [79]:
Datos_NoNulos=Datos_NoNulos.drop([166], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([231], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([291], axis=0, inplace=False)
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 361 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         361 non-null    object 
 1   age               361 non-null    float64
 2   clinical.T.Stage  361 non-null    float64
 3   Clinical.N.Stage  361 non-null    int64  
 4   Clinical.M.Stage  361 non-null    int64  
 5   Overall.Stage     361 non-null    object 
 6   Histology         361 non-null    object 
 7   gender            361 non-null    object 
 8   Survival.time     361 non-null    int64  
 9   deadstatus.event  361 non-null    int64  
 10  F.analysis        361 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 33.8+ KB


### Clinical M Stage

In [80]:
M_rows = Datos_NoNulos[Datos_NoNulos['Clinical.M.Stage'] >2]
M_rows

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
71,LUNG1-072,71.4743,4.0,3,3,IIIb,nos,male,377,1,1.542416
255,LUNG1-256,53.0842,4.0,2,3,IIIb,large cell,male,291,1,1.658656
268,LUNG1-269,73.0595,3.0,3,3,IIIa,large cell,male,193,1,1.382715
332,LUNG1-333,63.6988,1.0,2,3,IIIa,adenocarcinoma,male,2985,1,1.223261


In [81]:
Datos_NoNulos=Datos_NoNulos.drop([71], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([255], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([268], axis=0, inplace=False)
Datos_NoNulos=Datos_NoNulos.drop([332], axis=0, inplace=False)
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         357 non-null    object 
 1   age               357 non-null    float64
 2   clinical.T.Stage  357 non-null    float64
 3   Clinical.N.Stage  357 non-null    int64  
 4   Clinical.M.Stage  357 non-null    int64  
 5   Overall.Stage     357 non-null    object 
 6   Histology         357 non-null    object 
 7   gender            357 non-null    object 
 8   Survival.time     357 non-null    int64  
 9   deadstatus.event  357 non-null    int64  
 10  F.analysis        357 non-null    float64
dtypes: float64(3), int64(4), object(4)
memory usage: 33.5+ KB


De este modo se elimina un 15.4 % de los datos, entre datos nulos y erróneos

## Valores erróneos Histología

### Histology nos
Datos sin Histología definida

In [82]:
H_rows = Datos_NoNulos[Datos_NoNulos['Histology'] =='nos']
H_rows

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
12,LUNG1-013,65.3635,2.0,0,0,I,nos,male,3614,1,1.278645
15,LUNG1-016,79.1129,2.0,0,0,I,nos,male,101,1,1.799757
18,LUNG1-019,74.82,2.0,0,0,I,nos,male,336,1,1.682059
50,LUNG1-051,51.6906,2.0,0,0,I,nos,male,210,1,1.693693
56,LUNG1-057,74.9076,2.0,2,0,IIIa,nos,male,98,1,1.688905
58,LUNG1-059,49.2594,2.0,2,0,IIIa,nos,male,670,1,1.638558
61,LUNG1-062,52.7118,4.0,2,0,IIIb,nos,female,220,1,1.140228
64,LUNG1-065,76.3395,2.0,3,0,IIIb,nos,male,131,1,1.983463
66,LUNG1-067,75.9671,1.0,2,0,IIIa,nos,male,1596,0,2.602347
68,LUNG1-069,56.8214,4.0,0,0,IIIb,nos,female,325,1,1.848644


In [83]:
H_rows.count()

PatientID           56
age                 56
clinical.T.Stage    56
Clinical.N.Stage    56
Clinical.M.Stage    56
Overall.Stage       56
Histology           56
gender              56
Survival.time       56
deadstatus.event    56
F.analysis          56
dtype: int64

Si se eliminaran estos pacientes de Histología estaríamos hablando de una reducción del 28,67% de los datos por lo que inicialmente se dejan. Se evaluará el rendimiento del modelo con y sin esa columna.

##  Encoding de datos categóricos

### Overall.Stage

In [84]:
 from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()

In [85]:
Datos_NoNulos['Overall.Stage']= enc.fit_transform(Datos_NoNulos[['Overall.Stage']])

In [86]:
Datos_NoNulos.head()


Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event,F.analysis
0,LUNG1-001,78.7515,2.0,3,0,3.0,large cell,male,2165,1,1.771485
1,LUNG1-002,83.8001,2.0,0,0,0.0,squamous cell carcinoma,male,155,1,1.881307
2,LUNG1-003,68.1807,2.0,3,0,3.0,large cell,male,256,1,1.376259
3,LUNG1-004,70.8802,2.0,1,0,1.0,squamous cell carcinoma,male,141,1,1.73471
4,LUNG1-005,80.4819,4.0,2,0,3.0,squamous cell carcinoma,male,353,1,1.581113


In [87]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   PatientID         357 non-null    object 
 1   age               357 non-null    float64
 2   clinical.T.Stage  357 non-null    float64
 3   Clinical.N.Stage  357 non-null    int64  
 4   Clinical.M.Stage  357 non-null    int64  
 5   Overall.Stage     357 non-null    float64
 6   Histology         357 non-null    object 
 7   gender            357 non-null    object 
 8   Survival.time     357 non-null    int64  
 9   deadstatus.event  357 non-null    int64  
 10  F.analysis        357 non-null    float64
dtypes: float64(4), int64(4), object(3)
memory usage: 33.5+ KB


### Histology

In [88]:
dummies = pd.get_dummies(Datos_NoNulos['Histology'])
dummies

Unnamed: 0,adenocarcinoma,large cell,nos,squamous cell carcinoma
0,0,1,0,0
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
416,0,0,0,1
417,1,0,0,0
418,0,0,0,1
419,0,0,0,1


In [89]:
Datos_NoNulos = pd.concat([
    Datos_NoNulos.drop("Histology", axis = 1),
    dummies
], axis = 1)
Datos_NoNulos.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,gender,Survival.time,deadstatus.event,F.analysis,adenocarcinoma,large cell,nos,squamous cell carcinoma
0,LUNG1-001,78.7515,2.0,3,0,3.0,male,2165,1,1.771485,0,1,0,0
1,LUNG1-002,83.8001,2.0,0,0,0.0,male,155,1,1.881307,0,0,0,1
2,LUNG1-003,68.1807,2.0,3,0,3.0,male,256,1,1.376259,0,1,0,0
3,LUNG1-004,70.8802,2.0,1,0,1.0,male,141,1,1.73471,0,0,0,1
4,LUNG1-005,80.4819,4.0,2,0,3.0,male,353,1,1.581113,0,0,0,1


In [90]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PatientID                357 non-null    object 
 1   age                      357 non-null    float64
 2   clinical.T.Stage         357 non-null    float64
 3   Clinical.N.Stage         357 non-null    int64  
 4   Clinical.M.Stage         357 non-null    int64  
 5   Overall.Stage            357 non-null    float64
 6   gender                   357 non-null    object 
 7   Survival.time            357 non-null    int64  
 8   deadstatus.event         357 non-null    int64  
 9   F.analysis               357 non-null    float64
 10  adenocarcinoma           357 non-null    uint8  
 11  large cell               357 non-null    uint8  
 12  nos                      357 non-null    uint8  
 13  squamous cell carcinoma  357 non-null    uint8  
dtypes: float64(4), int64(4), o

### gender

In [91]:
dummies = pd.get_dummies(Datos_NoNulos['gender'])
dummies

Unnamed: 0,female,male
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
416,0,1
417,0,1
418,0,1
419,0,1


In [92]:
Datos_NoNulos = pd.concat([
    Datos_NoNulos.drop("gender", axis = 1),
    dummies
], axis = 1)
Datos_NoNulos.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Survival.time,deadstatus.event,F.analysis,adenocarcinoma,large cell,nos,squamous cell carcinoma,female,male
0,LUNG1-001,78.7515,2.0,3,0,3.0,2165,1,1.771485,0,1,0,0,0,1
1,LUNG1-002,83.8001,2.0,0,0,0.0,155,1,1.881307,0,0,0,1,0,1
2,LUNG1-003,68.1807,2.0,3,0,3.0,256,1,1.376259,0,1,0,0,0,1
3,LUNG1-004,70.8802,2.0,1,0,1.0,141,1,1.73471,0,0,0,1,0,1
4,LUNG1-005,80.4819,4.0,2,0,3.0,353,1,1.581113,0,0,0,1,0,1


In [93]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PatientID                357 non-null    object 
 1   age                      357 non-null    float64
 2   clinical.T.Stage         357 non-null    float64
 3   Clinical.N.Stage         357 non-null    int64  
 4   Clinical.M.Stage         357 non-null    int64  
 5   Overall.Stage            357 non-null    float64
 6   Survival.time            357 non-null    int64  
 7   deadstatus.event         357 non-null    int64  
 8   F.analysis               357 non-null    float64
 9   adenocarcinoma           357 non-null    uint8  
 10  large cell               357 non-null    uint8  
 11  nos                      357 non-null    uint8  
 12  squamous cell carcinoma  357 non-null    uint8  
 13  female                   357 non-null    uint8  
 14  male                     3

## Normalización de los datos

In [94]:
df_normalizar=Datos_NoNulos[["age" , "clinical.T.Stage","Clinical.N.Stage","Clinical.M.Stage","Overall.Stage", "F.analysis"]]
col_norm=df_normalizar.columns
col_norm

Index(['age', 'clinical.T.Stage', 'Clinical.N.Stage', 'Clinical.M.Stage',
       'Overall.Stage', 'F.analysis'],
      dtype='object')

In [95]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

scaled_df = scaler.fit_transform(df_normalizar)

In [96]:
scaled_df = pd.DataFrame(scaled_df, columns=col_norm)

In [97]:
scaled_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357 entries, 0 to 356
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               357 non-null    float64
 1   clinical.T.Stage  357 non-null    float64
 2   Clinical.N.Stage  357 non-null    float64
 3   Clinical.M.Stage  357 non-null    float64
 4   Overall.Stage     357 non-null    float64
 5   F.analysis        357 non-null    float64
dtypes: float64(6)
memory usage: 16.9 KB


In [98]:
filas_nonulos=list()
filas_norm=list()
for indice_fila,fila in Datos_NoNulos.iterrows():
      filas_nonulos.append(indice_fila)
for indice_fil_norm, fila_norm in scaled_df.iterrows():
    filas_norm.append(indice_fil_norm)
    

In [99]:
filas_nonulos

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 21,
 23,
 24,
 25,
 26,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 39,
 40,
 41,
 42,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 66,
 67,
 68,
 69,
 70,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 80,
 81,
 83,
 85,
 87,
 88,
 90,
 91,
 92,
 94,
 95,
 96,
 97,
 98,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 128,
 129,
 130,
 131,
 133,
 134,
 136,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 149,
 150,
 151,
 152,
 154,
 155,
 157,
 158,
 159,
 161,
 162,
 163,
 164,
 165,
 167,
 168,
 169,
 171,
 172,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 2

In [100]:
filas_norm

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [101]:
for i,j in zip(filas_nonulos,filas_norm):
    Datos_NoNulos.at[i,'age']=scaled_df.at[j,'age']
    Datos_NoNulos.at[i,'clinical.T.Stage']=scaled_df.at[j,'clinical.T.Stage']
    Datos_NoNulos.at[i,'Clinical.N.Stage']=scaled_df.at[j,'Clinical.N.Stage']
    Datos_NoNulos.at[i,'Clinical.M.Stage']=scaled_df.at[j,'Clinical.M.Stage']
    Datos_NoNulos.at[i,'Overall.Stage']=scaled_df.at[j,'Overall.Stage']
    Datos_NoNulos.at[i,'F.analysis']=scaled_df.at[j,'F.analysis']
   
    


In [102]:
Datos_NoNulos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PatientID                357 non-null    object 
 1   age                      357 non-null    float64
 2   clinical.T.Stage         357 non-null    float64
 3   Clinical.N.Stage         357 non-null    int64  
 4   Clinical.M.Stage         357 non-null    int64  
 5   Overall.Stage            357 non-null    float64
 6   Survival.time            357 non-null    int64  
 7   deadstatus.event         357 non-null    int64  
 8   F.analysis               357 non-null    float64
 9   adenocarcinoma           357 non-null    uint8  
 10  large cell               357 non-null    uint8  
 11  nos                      357 non-null    uint8  
 12  squamous cell carcinoma  357 non-null    uint8  
 13  female                   357 non-null    uint8  
 14  male                     3

In [103]:
Datos_NoNulos.describe()

Unnamed: 0,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Survival.time,deadstatus.event,F.analysis,adenocarcinoma,large cell,nos,squamous cell carcinoma,female,male
count,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0,357.0
mean,0.594321,0.515406,0.212885,0.0,0.663866,1002.453782,0.879552,0.465397,0.134454,0.296919,0.156863,0.411765,0.322129,0.677871
std,0.175639,0.363987,0.409922,0.0,0.370146,1032.290965,0.325942,0.152431,0.341618,0.457542,0.364182,0.492844,0.467948,0.467948
min,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.475076,0.333333,0.0,0.0,0.333333,265.0,1.0,0.364619,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.604324,0.333333,0.0,0.0,0.666667,575.0,1.0,0.481216,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.734377,1.0,0.0,0.0,1.0,1490.0,1.0,0.575383,0.0,1.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,0.0,1.0,4328.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [104]:
Datos_NoNulos

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Survival.time,deadstatus.event,F.analysis,adenocarcinoma,large cell,nos,squamous cell carcinoma,female,male
0,LUNG1-001,0.776751,0.333333,1,0,1.000000,2165,1,0.600743,0,1,0,0,0,1
1,LUNG1-002,0.863766,0.333333,0,0,0.000000,155,1,0.653516,0,0,0,1,0,1
2,LUNG1-003,0.594556,0.333333,1,0,1.000000,256,1,0.410823,0,1,0,0,0,1
3,LUNG1-004,0.641084,0.333333,0,0,0.333333,141,1,0.583071,0,0,0,1,0,1
4,LUNG1-005,0.806575,1.000000,0,0,1.000000,353,1,0.509262,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,LUNG1-417,0.549415,1.000000,0,0,1.000000,648,1,0.541115,0,0,0,1,0,1
417,LUNG1-418,0.344476,0.333333,0,0,0.000000,346,1,0.538769,1,0,0,0,0,1
418,LUNG1-419,0.565754,1.000000,0,0,1.000000,2772,0,0.523868,0,0,0,1,0,1
419,LUNG1-420,0.684183,0.333333,0,0,0.333333,2429,1,0.413147,0,0,0,1,0,1


## CONVERSION DATAFRAME A CSV

In [105]:
Datos_NoNulos.to_csv('df_sin_imputaciones_norm.csv')

## MODELO RANDOM SURVIVAL FOREST

In [106]:
X=Datos_NoNulos[['age','clinical.T.Stage','Clinical.N.Stage','Clinical.M.Stage','F.analysis','adenocarcinoma','nos','squamous cell carcinoma','large cell','Overall.Stage','female','male']]

In [107]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      357 non-null    float64
 1   clinical.T.Stage         357 non-null    float64
 2   Clinical.N.Stage         357 non-null    int64  
 3   Clinical.M.Stage         357 non-null    int64  
 4   F.analysis               357 non-null    float64
 5   adenocarcinoma           357 non-null    uint8  
 6   nos                      357 non-null    uint8  
 7   squamous cell carcinoma  357 non-null    uint8  
 8   large cell               357 non-null    uint8  
 9   Overall.Stage            357 non-null    float64
 10  female                   357 non-null    uint8  
 11  male                     357 non-null    uint8  
dtypes: float64(4), int64(2), uint8(6)
memory usage: 29.7 KB


In [108]:
Y_df=Datos_NoNulos[['deadstatus.event','Survival.time']]

In [109]:
Y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deadstatus.event  357 non-null    int64
 1   Survival.time     357 non-null    int64
dtypes: int64(2)
memory usage: 16.5 KB


In [110]:
Y_df['deadstatus.event']=Y_df['deadstatus.event'].astype('bool')
Y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deadstatus.event  357 non-null    bool 
 1   Survival.time     357 non-null    int64
dtypes: bool(1), int64(1)
memory usage: 14.0 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_df['deadstatus.event']=Y_df['deadstatus.event'].astype('bool')


In [111]:
Y=Y_df.to_records(index=False)
Y

rec.array([( True, 2165), ( True,  155), ( True,  256), ( True,  141),
           ( True,  353), ( True,  173), ( True,  137), ( True,   77),
           ( True,  131), (False, 2119), ( True,  515), ( True,   85),
           ( True, 3614), ( True, 1247), ( True, 1238), ( True,  101),
           ( True,  220), ( True, 1926), ( True,  336), ( True,  442),
           ( True, 1141), ( True, 1883), ( True,   25), (False, 1972),
           ( True,  257), ( True,  303), ( True,  999), ( True,  543),
           ( True,  456), ( True,  597), ( True,   98), ( True,  366),
           ( True,  464), ( True,  370), ( True,  558), ( True,  136),
           ( True,  134), ( True,  183), ( True, 1070), ( True,   73),
           (False, 1810), (False, 4328), ( True, 1670), ( True,  208),
           ( True,  210), ( True,   73), ( True,   78), ( True, 1076),
           ( True,  192), ( True,   98), ( True,  673), ( True,  670),
           ( True,   51), (False, 1573), ( True,  220), (False, 1630),
      

In [112]:
random_state = 20

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=random_state)

In [113]:
rsf = RandomSurvivalForest(
    n_estimators=1000, min_samples_split=10, min_samples_leaf=15, n_jobs=-1, random_state=random_state
)
rsf.fit(X_train, y_train)

In [114]:
rsf.score(X_test, y_test)

0.5443169968717414

In [115]:
from sklearn.inspection import permutation_importance

result = permutation_importance(rsf, X_test, y_test, n_repeats=15, random_state=random_state)

In [116]:
pd.DataFrame(
    {
        k: result[k]
        for k in (
            "importances_mean",
            "importances_std",
        )
    },
    index=X_test.columns,
).sort_values(by="importances_mean", ascending=False)

Unnamed: 0,importances_mean,importances_std
age,0.017744,0.024127
F.analysis,0.004101,0.027462
squamous cell carcinoma,0.000139,0.008383
Clinical.M.Stage,0.0,0.0
adenocarcinoma,-0.001025,0.000854
nos,-0.004049,0.002312
Clinical.N.Stage,-0.004206,0.003842
large cell,-0.004362,0.002592
female,-0.008985,0.004969
Overall.Stage,-0.009402,0.0036


In [117]:
X_test_sorted = X_test.sort_values(by=[ "F.analysis","age"])
X_test_sel = pd.concat((X_test_sorted.head(3), X_test_sorted.tail(3)))

X_test_sel

Unnamed: 0,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,F.analysis,adenocarcinoma,nos,squamous cell carcinoma,large cell,Overall.Stage,female,male
280,0.436888,0.333333,0,0,0.237172,0,0,0,1,0.333333,0,1
14,0.655619,0.0,0,0,0.248322,0,0,0,1,0.0,0,1
111,0.657694,0.333333,0,0,0.273854,0,1,0,0,0.0,0,1
395,0.439107,1.0,0,0,0.729391,0,0,1,0,1.0,1,0
195,0.886371,0.333333,0,0,0.768823,0,0,0,1,0.666667,1,0
209,0.842722,0.333333,0,0,0.963879,0,0,1,0,0.0,0,1


In [118]:
pd.Series(rsf.predict(X_test_sel))

0    120.751570
1    124.866751
2    120.950278
3    189.925252
4    224.263052
5    258.367546
dtype: float64

## Modelo RSF quitando  columnas

In [119]:
#X=Datos_NoNulos[['age','clinical.T.Stage','Clinical.N.Stage','Clinical.M.Stage','F.analysis','adenocarcinoma','nos','squamous cell carcinoma','large cell','Overall.Stage']]
#X=Datos_NoNulos[['age','clinical.T.Stage','Clinical.N.Stage','Clinical.M.Stage','F.analysis','Overall.Stage']]
#X=Datos_NoNulos[['age','clinical.T.Stage','Clinical.N.Stage','Clinical.M.Stage','F.analysis']]
X=Datos_NoNulos[['age','F.analysis','Clinical.N.Stage']]
#X=Datos_NoNulos[['age']]
#X=Datos_NoNulos[['F.analysis']]

In [120]:
Y_df=Datos_NoNulos[['deadstatus.event','Survival.time']]
Y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deadstatus.event  357 non-null    int64
 1   Survival.time     357 non-null    int64
dtypes: int64(2)
memory usage: 16.5 KB


In [121]:
Y_df['deadstatus.event']=Y_df['deadstatus.event'].astype('bool')
Y_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357 entries, 0 to 420
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   deadstatus.event  357 non-null    bool 
 1   Survival.time     357 non-null    int64
dtypes: bool(1), int64(1)
memory usage: 14.0 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_df['deadstatus.event']=Y_df['deadstatus.event'].astype('bool')


In [122]:
Y=Y_df.to_records(index=False)
#Y

In [123]:
random_state = 20

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=random_state)

In [124]:
rsf = RandomSurvivalForest(
    n_estimators=1000, min_samples_split=10, min_samples_leaf=15, n_jobs=-1, random_state=random_state
)
rsf.fit(X_train, y_train)

In [125]:
rsf.score(X_test, y_test)

0.5650417101147028

In [126]:
from sklearn.inspection import permutation_importance

result = permutation_importance(rsf, X_test, y_test, n_repeats=15, random_state=random_state)

In [127]:
pd.DataFrame(
    {
        k: result[k]
        for k in (
            "importances_mean",
            "importances_std",
        )
    },
    index=X_test.columns,
).sort_values(by="importances_mean", ascending=False)

Unnamed: 0,importances_mean,importances_std
age,0.025365,0.024733
F.analysis,0.016041,0.029285
Clinical.N.Stage,0.005518,0.009209


In [128]:
X_test_sorted = X_test.sort_values(by=[ "F.analysis","age"])
X_test_sel = pd.concat((X_test_sorted.head(3), X_test_sorted.tail(3)))

X_test_sel

Unnamed: 0,age,F.analysis,Clinical.N.Stage
280,0.436888,0.237172,0
14,0.655619,0.248322,0
111,0.657694,0.273854,0
395,0.439107,0.729391,0
195,0.886371,0.768823,0
209,0.842722,0.963879,0


In [129]:
pd.Series(rsf.predict(X_test_sel))

0    122.762074
1    114.056492
2    114.745637
3    205.583925
4    267.224379
5    266.868948
dtype: float64