# ANÁLISIS DE LA PREDICCIÓN DE ICTUS

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


**Dataset:** healthcare-dataset-stroke-data.csv  
**Descripción:** Datos de 5110 personas distintas. Para cada una se recoge su ID, sexo, edad, tipo de trabajo
y de residencia, si ha estado casada, su hábito de fumar, si tiene hipertensión o enfermedad cardiaca,
su nivel medio de glucosa, su BMI (IMC) y si ha sufrido un ictus.  
**Periodo:** No se especifica.


| Nombre del campo | Descripción                                      | Tipo de variable | Importancia Inicial | Notas |
|------------------|--------------------------------------------------|------------------|---------------------|-------|
| id               | id de cada persona                               | Índice único     |                     |       |
| gender           | sexo de la persona (3 categorías en los datos)   | Categórica       |                     |       |
| age              | edad de la persona                               | Numérica discreta|                     |       |
| hypertension     | si tiene hipertensión o no                       | Binaria          |                     |       |
| heart_disease    | si tiene una cardiopatía o no                    | Binaria          |                     |       |
| ever_married     | si ha estado casado/a alguna vez o no            | Binaria          |                     |       |
| work_type        | tipo de trabajo                                  | Categórica       |                     |       |
| Residence_type   | tipo de residencia (Urban/Rural)                 | Binaria          |                     |       |
| avg_glucose_level| valor promedio de glucosa                        | Numérica continua|                     |       |
| bmi              | (Body Mass Index) - IMC, índice de Masa Corporal | Numérica discreta|                     |       |
| smoking_status   | hábito de fumar (fuma, fumó, nunca, desconocido) | Categórica       |                     |       |
| stroke           | si ha tenido ictus o no                          | Binaria          |                     |       |



In [4]:
# Load the raw stroke dataset and display it for a first look.
stroke_csv_path = "../data/healthcare-dataset-stroke-data.csv"
df_stroke_prediction = pd.read_csv(stroke_csv_path)
df_stroke_prediction

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [6]:
# Compute the cardinality of every column to help classify the variables.
# The summary is built from a dict of Series rather than a transposed list of
# Series: transposing mixed-type rows coerces every column to object dtype,
# whereas this keeps "Card" and "%_Card" numeric. nunique() is also evaluated
# only once.

cardinality = df_stroke_prediction.nunique()
df_tipos_variables = pd.DataFrame(
    {
        # number of distinct non-null values in each column
        "Card": cardinality,
        # cardinality expressed as a percentage of the number of rows
        "%_Card": cardinality / len(df_stroke_prediction) * 100,
        # pandas dtype of each column
        "Tipo": df_stroke_prediction.dtypes,
    }
)
df_tipos_variables

Unnamed: 0,Card,%_Card,Tipo
id,5110,100.0,int64
gender,3,0.058708,object
age,104,2.035225,float64
hypertension,2,0.039139,int64
heart_disease,2,0.039139,int64
ever_married,2,0.039139,object
work_type,5,0.097847,object
Residence_type,2,0.039139,object
avg_glucose_level,3979,77.866928,float64
bmi,418,8.180039,float64


In [7]:
# Classify each variable from its cardinality. The rules are applied in order
# and later rules override earlier ones, so the sequence below matters.
# NOTE(review): "id" has 100% cardinality, so this heuristic labels it
# "Numerica Continua" even though it is an identifier — confirm downstream use.

card = df_tipos_variables["Card"]
pct_card = df_tipos_variables["%_Card"]

# Start from the assumption that almost every variable looks categorical.
labels = pd.Series("Categorica", index=df_tipos_variables.index)
labels = labels.mask(card == 2, "Binaria")
labels = labels.mask(card > 10, "Numerica Discreta")
labels = labels.mask(pct_card > 30, "Numerica Continua")

df_tipos_variables["Clasificada_como"] = labels
df_tipos_variables

Unnamed: 0,Card,%_Card,Tipo,Clasificada_como
id,5110,100.0,int64,Numerica Continua
gender,3,0.058708,object,Categorica
age,104,2.035225,float64,Numerica Discreta
hypertension,2,0.039139,int64,Binaria
heart_disease,2,0.039139,int64,Binaria
ever_married,2,0.039139,object,Binaria
work_type,5,0.097847,object,Categorica
Residence_type,2,0.039139,object,Binaria
avg_glucose_level,3979,77.866928,float64,Numerica Continua
bmi,418,8.180039,float64,Numerica Discreta


In [None]:
# Summarise column dtypes and non-null counts of the raw frame.
df_stroke_prediction.info()
# Only one column has nulls: bmi
# bmi is a numeric variable (labelled "Numerica Discreta" by the cardinality heuristic)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [None]:
# Look for the nulls: dropna=False makes value_counts list NaN as its own entry.
df_stroke_prediction["bmi"].value_counts(dropna=False)
# There are 201 null values

bmi
NaN     201
28.7     41
28.4     38
27.6     37
26.7     37
       ... 
11.5      1
40.6      1
53.9      1
97.6      1
14.9      1
Name: count, Length: 419, dtype: int64

In [11]:
# Confirm the nulls by listing the rows whose bmi is missing.
bmi_missing = df_stroke_prediction["bmi"].isna()
df_stroke_prediction.loc[bmi_missing]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
13,8213,Male,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1
19,25226,Male,57.0,0,1,No,Govt_job,Urban,217.08,,Unknown,1
27,61843,Male,58.0,0,0,Yes,Private,Rural,189.84,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5039,42007,Male,41.0,0,0,No,Private,Rural,70.15,,formerly smoked,0
5048,28788,Male,40.0,0,0,Yes,Private,Urban,191.15,,smokes,0
5093,32235,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,,smokes,0
5099,7293,Male,40.0,0,0,Yes,Private,Rural,83.94,,smokes,0


In [37]:
# Compute the mean bmi, which will be used to replace the nulls. The work is
# done on a copy so the raw frame stays untouched.

df_stroke = df_stroke_prediction.copy()
bmi_mean = df_stroke["bmi"].mean().round(1)  # mean() skips the NaN entries by default

# Display the value instead of hard-coding it in a comment. The recorded
# outputs show 28.9 as the fill value: that is the rounded mean, not the mode.
bmi_mean

In [42]:
# Replace the missing bmi values with the mean, then display the result.
df_stroke["bmi"] = df_stroke["bmi"].fillna(bmi_mean)
df_stroke

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,28.9,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [43]:
# The age column is stored as float; convert it to int.
# NOTE(review): astype(int) truncates toward zero, so any fractional age loses
# its decimals — confirm the column holds none, or that truncation is intended.
age_as_int = df_stroke["age"].astype(int)
df_stroke["age"] = age_as_int

In [44]:
# Confirm that bmi no longer has nulls and that age is now int:
df_stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   int64  
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 479.2+ KB


In [45]:
# Preview the first 10 rows after the bmi fill and the age conversion.
df_stroke.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,31112,Male,80,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59,0,0,Yes,Private,Rural,76.15,28.9,Unknown,1
9,60491,Female,78,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1
