In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training dataset from the local datasets folder.
train_csv_path = './datasets/hospitalizaciones_train.csv'
hosp_train = pd.read_csv(train_csv_path)


Analizamos el dataset

In [3]:
# Display the loaded training frame for a first visual inspection.
hosp_train

Unnamed: 0,Available Extra Rooms in Hospital,Department,Ward_Facility_Code,doctor_name,staff_available,patientid,Age,gender,Type of Admission,Severity of Illness,health_conditions,Visitors with Patient,Insurance,Admission_Deposit,Stay (in days)
0,4,gynecology,D,Dr Sophia,0,33070,41-50,Female,Trauma,Extreme,Diabetes,4,Yes,2966.408696,8
1,4,gynecology,B,Dr Sophia,2,34808,31-40,Female,Trauma,Minor,Heart disease,2,No,3554.835677,9
2,2,gynecology,B,Dr Sophia,8,44577,21-30,Female,Trauma,Extreme,Diabetes,2,Yes,5624.733654,7
3,4,gynecology,D,Dr Olivia,7,3695,31-40,Female,Urgent,Moderate,,4,No,4814.149231,8
4,2,anesthesia,E,Dr Mark,10,108956,71-80,Male,Trauma,Moderate,Diabetes,2,No,5169.269637,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409995,1,gynecology,D,Dr Sarah,1,63105,31-40,Female,Trauma,Minor,Asthama,3,Yes,3902.291076,8
409996,5,gynecology,D,Dr Nathan,5,129292,21-30,Female,Trauma,Extreme,Other,4,No,4771.995223,9
409997,4,gynecology,D,Dr Sarah,9,11399,21-30,Female,Emergency,Minor,Asthama,4,Yes,3816.994210,9
409998,10,gynecology,D,Dr Olivia,10,75003,21-30,Female,Trauma,Moderate,Other,2,Yes,3841.577491,9


In [4]:
# Show each column's dtype and non-null count.
hosp_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410000 entries, 0 to 409999
Data columns (total 15 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Available Extra Rooms in Hospital  410000 non-null  int64  
 1   Department                         410000 non-null  object 
 2   Ward_Facility_Code                 410000 non-null  object 
 3   doctor_name                        410000 non-null  object 
 4   staff_available                    410000 non-null  int64  
 5   patientid                          410000 non-null  int64  
 6   Age                                410000 non-null  object 
 7   gender                             410000 non-null  object 
 8   Type of Admission                  410000 non-null  object 
 9   Severity of Illness                410000 non-null  object 
 10  health_conditions                  410000 non-null  object 
 11  Visitors with Patient              4100

In [5]:
# Count the missing values in every column (isna is the same check as isnull).
hosp_train.isna().sum()

Available Extra Rooms in Hospital    0
Department                           0
Ward_Facility_Code                   0
doctor_name                          0
staff_available                      0
patientid                            0
Age                                  0
gender                               0
Type of Admission                    0
Severity of Illness                  0
health_conditions                    0
Visitors with Patient                0
Insurance                            0
Admission_Deposit                    0
Stay (in days)                       0
dtype: int64

'La columna "Stay (in days)" permite definir si un paciente tuvo una "estancia hospitalaria prolongada" (hospitalizado): se considera prolongada cuando es mayor a 8 días. Por ello se genera un nuevo dataframe de 'entrenamiento' en el que se ingresa la información que se describe a continuación.'

In [4]:
# Work on an independent copy: a plain assignment would only alias hosp_train,
# so adding the target column to hosp_pr would also modify the raw frame
# inspected in the cells above.
hosp_pr = hosp_train.copy()

In [5]:
# Binary target: 1 when the stay is 9 days or longer (i.e. more than 8 days), 0 otherwise.
# A vectorized comparison replaces the row-by-row apply/lambda; the resulting values are the same.
hosp_pr['prediccion'] = hosp_pr['Stay (in days)'].ge(9).astype('int64')

In [6]:
# "Stay (in days)" has already been turned into "prediccion", so the original column is removed.
hosp_pr = hosp_pr.drop(columns='Stay (in days)')


In [7]:
# Remove the columns considered unnecessary for the model:
# 1) 'patientid' is just an identifier (author's later note: it shows no useful correlation for the model).
# 2) 'Ward_Facility_Code', 'staff_available', 'Insurance', 'Visitors with Patient' and
#    'Admission_Deposit' are assumed a priori to be the ones that affect the predictions least.
unused_columns = [
    'patientid',
    'Ward_Facility_Code',
    'staff_available',
    'Insurance',
    'Admission_Deposit',
    'Visitors with Patient',
]
hosp_pr = hosp_pr.drop(columns=unused_columns)

In [8]:
# Encode the categorical values as indicator columns: a '1' marks the column that matches the row's value.
# For example, the first 4 rows have a '1' under 'Department_gynecology' because that is their department.
hosp_pr = pd.get_dummies(hosp_pr)

In [11]:
# Display the encoded frame to show the indicator columns described above.
hosp_pr

Unnamed: 0,Available Extra Rooms in Hospital,prediccion,Department_TB & Chest disease,Department_anesthesia,Department_gynecology,Department_radiotherapy,Department_surgery,doctor_name_Dr Isaac,doctor_name_Dr John,doctor_name_Dr Mark,...,Type of Admission_Urgent,Severity of Illness_Extreme,Severity of Illness_Minor,Severity of Illness_Moderate,health_conditions_Asthama,health_conditions_Diabetes,health_conditions_Heart disease,health_conditions_High Blood Pressure,health_conditions_None,health_conditions_Other
0,4,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
1,4,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,2,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
3,4,0,0,0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
4,2,1,0,1,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409995,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
409996,5,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
409997,4,1,0,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
409998,10,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [12]:
# Compute the pairwise correlation matrix between all columns (including the target 'prediccion').
hosp_pr.corr()

Unnamed: 0,Available Extra Rooms in Hospital,prediccion,Department_TB & Chest disease,Department_anesthesia,Department_gynecology,Department_radiotherapy,Department_surgery,doctor_name_Dr Isaac,doctor_name_Dr John,doctor_name_Dr Mark,...,Type of Admission_Urgent,Severity of Illness_Extreme,Severity of Illness_Minor,Severity of Illness_Moderate,health_conditions_Asthama,health_conditions_Diabetes,health_conditions_Heart disease,health_conditions_High Blood Pressure,health_conditions_None,health_conditions_Other
Available Extra Rooms in Hospital,1.0,0.053894,-0.007705,-0.003296,0.045092,-0.047634,-0.005273,-0.004803,-0.019889,-0.005534,...,0.048693,-0.037849,0.02337,0.008172,-0.001906,-0.001202,0.000288,0.002685,-0.000194,0.000254
prediccion,0.053894,1.0,0.154842,0.242282,-0.230424,-0.005953,0.07874,0.066519,0.099091,0.246904,...,-0.005982,0.004707,-0.037693,0.029866,0.005211,-0.003826,0.000423,0.006233,-0.007809,0.002432
Department_TB & Chest disease,-0.007705,0.154842,1.0,-0.06832,-0.324921,-0.098794,-0.022232,-0.018774,0.152679,0.463299,...,0.01236,0.006611,-0.010984,0.004699,-0.001433,0.001009,-0.000473,-0.000148,-0.000953,0.001846
Department_anesthesia,-0.003296,0.242282,-0.06832,1.0,-0.461274,-0.140252,-0.031561,-0.026652,0.216345,0.658104,...,-0.003358,0.014155,-0.040028,0.02472,0.002549,-0.002649,-0.000869,0.002262,-0.00145,0.000372
Department_gynecology,0.045092,-0.230424,-0.324921,-0.461274,1.0,-0.667025,-0.150102,-0.126756,-0.449541,-0.485577,...,-0.023138,-0.008677,0.078188,-0.062783,0.001113,-0.001335,0.000706,-0.00019,-4.1e-05,8.6e-05
Department_radiotherapy,-0.047634,-0.005953,-0.098794,-0.140252,-0.667025,1.0,-0.045639,-0.038541,0.315666,-0.147642,...,0.029678,-0.014,-0.050535,0.05556,-0.002081,0.002496,0.000669,-0.001621,0.002164,-0.001962
Department_surgery,-0.005273,0.07874,-0.022232,-0.031561,-0.150102,-0.045639,1.0,0.844467,-0.030758,-0.033224,...,-0.020091,0.038515,-0.036656,0.003114,-0.001602,0.002248,-0.002313,0.000836,-0.001792,0.00202
doctor_name_Dr Isaac,-0.004803,0.066519,-0.018774,-0.026652,-0.126756,-0.038541,0.844467,1.0,-0.025975,-0.028057,...,-0.018229,0.029672,-0.029826,0.003807,-0.003815,0.000433,0.004849,0.000239,-0.003745,0.004341
doctor_name_Dr John,-0.019889,0.099091,0.152679,0.216345,-0.449541,0.315666,-0.030758,-0.025975,1.0,-0.099503,...,0.012222,0.00568,-0.030473,0.022713,0.010781,0.009088,-0.068756,-0.004733,0.021504,0.000235
doctor_name_Dr Mark,-0.005534,0.246904,0.463299,0.658104,-0.485577,-0.147642,-0.033224,-0.028057,-0.099503,1.0,...,0.004711,0.012022,-0.034695,0.021616,-0.004339,-0.004769,0.027426,0.003704,-0.010307,0.001356


'En base a nuestra correlación con la columna 'prediccion', a pesar de que hay correlaciones muy bajas, no se borran algunas columnas porque estamos hablando de datos importantes en cuanto a salud; serían las columnas de las "precondiciones de salud" tales como 'enfermedades al corazón', 'diabetes', 'presión alta' y 'asma'. Por el contrario, un dato innecesario sería 'health_conditions_None' ya que tiene correlación negativa, por lo que se dropeará.'

In [9]:
# Remove the 'health_conditions_None' indicator column.
hosp_pr = hosp_pr.drop(columns='health_conditions_None')

'Ahora se comienza el trabajo de ML'

In [10]:
# Split the frame into the target vector 'y' and the feature matrix 'X'.
target_column = "prediccion"

y = hosp_pr[target_column]
X = hosp_pr.drop(columns=target_column)

In [25]:
# Import 'train_test_split' (scikit-learn) and partition the data into train and test
# subsets; the model itself is fitted in a later cell. 'test_size' and 'random_state' may be tuned.
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.25, "random_state": 33}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [26]:
# Import 'LogisticRegression' (scikit-learn) and create the classifier; it is fitted in the next cell.
from sklearn.linear_model import LogisticRegression
loregr = LogisticRegression(max_iter=100_000)

In [27]:
# Fit the logistic regression on the training split.
loregr.fit(X_train, y_train)

In [28]:
# Predict the target for the held-out test split.
y_pred = loregr.predict(X_test)

In [29]:
# Build the confusion matrix of the held-out split and print it under its title.
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_pred)
print('Matriz de Confusión:', matrix, sep='\n')

Matriz de Confusión:
[[15968 22693]
 [12561 51278]]


In [30]:
# Import 'accuracy_score' and 'recall_score' to compute the model's accuracy and recall.
from sklearn.metrics import accuracy_score, recall_score

# accuracy_score returns the accuracy (share of correct predictions), not the precision,
# so the value is labelled as accuracy when printed.
exactitud = accuracy_score(y_test, y_pred)
precision = exactitud  # backward-compatible alias: the original cell stored the accuracy under this name
print('exactitud (accuracy) del modelo:')
print(exactitud)

# Recall: share of the actual positive cases that the model detected.
exhaustividad = recall_score(y_test, y_pred)
print('exhaustividad del modelo:')
print(exhaustividad)

precision del modelo:
0.6560585365853658
exhaustividad del modelo:
0.8032393991133946


'Preparamos nuestro dataset de test y generamos las predicciones'

In [21]:
# Load the file on which the final predictions are produced.
test_csv_path = "./datasets/hospitalizaciones_test.csv"
hosp_test = pd.read_csv(test_csv_path)

In [22]:
# Remove from the test frame the same columns that were removed from the training frame,
# so both frames can be compared and used with the same model.
unused_test_columns = [
    'patientid',
    'Ward_Facility_Code',
    'staff_available',
    'Insurance',
    'Admission_Deposit',
    'Visitors with Patient',
]
hosp_test = hosp_test.drop(columns=unused_test_columns)

In [23]:
# Encode the categorical values as '0'/'1' indicator columns, exactly as was done for the
# training frame (see the explanation next to the training-frame encoding).
hosp_test = pd.get_dummies(hosp_test)

In [24]:
# Display the encoded test frame to check its content.
hosp_test

Unnamed: 0,Available Extra Rooms in Hospital,Department_TB & Chest disease,Department_anesthesia,Department_gynecology,Department_radiotherapy,Department_surgery,doctor_name_Dr Isaac,doctor_name_Dr John,doctor_name_Dr Mark,doctor_name_Dr Nathan,...,Type of Admission_Urgent,Severity of Illness_Extreme,Severity of Illness_Minor,Severity of Illness_Moderate,health_conditions_Asthama,health_conditions_Diabetes,health_conditions_Heart disease,health_conditions_High Blood Pressure,health_conditions_None,health_conditions_Other
0,3,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,2,1,0,0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
2,2,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
3,2,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,2,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,4,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
89996,13,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
89997,2,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
89998,2,0,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0


In [25]:
# Align the test features with the exact columns (and column order) of the training
# features X. This removes 'health_conditions_None', which was dropped from the training
# frame too, and it also guards against a category that is absent from the test file:
# its indicator column is created and filled with 0 instead of leaving the two frames
# with different columns at prediction time.
hosp_test = hosp_test.reindex(columns=X.columns, fill_value=0)

In [26]:
# Predict on the prepared test frame and wrap the result in a DataFrame.
test_pred = pd.DataFrame(loregr.predict(hosp_test))

In [27]:
# Rename column 0 to 'pred'.
test_pred = test_pred.rename(columns={0:"pred"})

In [28]:
# Cast the 'pred' column to int (this converts the dtype; it does not merely check it).
test_pred["pred"] = test_pred["pred"].astype(int)

In [30]:
# Export the predictions (not the model itself) to a '.csv' file, without the index column.
test_pred.to_csv("faesko.csv",index=False)