In [49]:
import pandas as pd
import plotly.express as px

from plotly.subplots import make_subplots
import plotly.graph_objects as go


from sklearn.model_selection import train_test_split

In [6]:
# Load the stroke dataset from the relative data/ directory into a DataFrame.
stroke_df = pd.read_csv(filepath_or_buffer="data/stroke-data.csv")

In [7]:
# Preview the first rows of the raw data (head() defaults to n=5).
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [35]:
# Hold out 20% of the rows as a test split. Stratifying on the target asks
# scikit-learn to preserve the class proportions of "stroke" in both parts;
# the fixed random_state makes the partition reproducible.
features = stroke_df.drop(columns="stroke")
target = stroke_df["stroke"]

x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=123,
)

In [36]:
# Combine training features and target into one frame for exploration.
# `assign` returns a NEW DataFrame (aligned on the shared index), so
# `x_train` is not silently given a "stroke" column through an alias and
# re-running this cell always yields the same result.
train_df = x_train.assign(stroke=y_train)

In [37]:
# Preview the first rows of the training frame (head() defaults to n=5).
train_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
795,60777,Female,31.0,0,0,Yes,Govt_job,Rural,103.55,20.5,formerly smoked,0
4106,50545,Male,41.0,0,0,Yes,Govt_job,Urban,84.1,29.3,never smoked,0
1318,7195,Male,50.0,0,1,No,Private,Urban,85.82,31.9,never smoked,0
4846,27801,Female,34.0,0,0,Yes,Private,Urban,113.26,27.6,never smoked,0
532,31564,Female,25.0,0,0,Yes,Private,Rural,90.65,20.9,Unknown,0


In [38]:
# Combine test features and target into one frame.
# `assign` returns a NEW DataFrame (aligned on the shared index), so
# `x_test` is not silently given a "stroke" column through an alias and
# re-running this cell always yields the same result.
test_df = x_test.assign(stroke=y_test)

In [39]:
# Print dtypes and non-null counts per column of the test frame; a non-null
# count below the number of entries indicates missing values in that column.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1022 entries, 2245 to 1173
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1022 non-null   int64  
 1   gender             1022 non-null   object 
 2   age                1022 non-null   float64
 3   hypertension       1022 non-null   int64  
 4   heart_disease      1022 non-null   int64  
 5   ever_married       1022 non-null   object 
 6   work_type          1022 non-null   object 
 7   Residence_type     1022 non-null   object 
 8   avg_glucose_level  1022 non-null   float64
 9   bmi                986 non-null    float64
 10  smoking_status     1022 non-null   object 
 11  stroke             1022 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 103.8+ KB


In [69]:
# Mean of the bmi column in the training frame (pandas skips NaN by default).
# This is the value used to impute missing bmi entries in the next cell.
train_df["bmi"].mean()

28.93163395360697

In [70]:
# Impute missing bmi values with the training-set mean.
# Assign the result back explicitly: calling fillna(..., inplace=True) on a
# column selection is chained assignment, which is deprecated and does not
# update the parent frame under pandas Copy-on-Write.
bmi_mean = train_df["bmi"].mean()
train_df["bmi"] = train_df["bmi"].fillna(bmi_mean)

# NOTE(review): no visible cell applies this mean to test_df — confirm how
# missing bmi values in the test frame are meant to be handled.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4088 non-null   int64  
 1   gender             4088 non-null   object 
 2   age                4088 non-null   float64
 3   hypertension       4088 non-null   int64  
 4   heart_disease      4088 non-null   int64  
 5   ever_married       4088 non-null   object 
 6   work_type          4088 non-null   object 
 7   Residence_type     4088 non-null   object 
 8   avg_glucose_level  4088 non-null   float64
 9   bmi                4088 non-null   float64
 10  smoking_status     4088 non-null   object 
 11  stroke             4088 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 415.2+ KB


In [42]:
# Re-inspect the test frame.
# NOTE(review): the saved output below reports fewer entries than the earlier
# test_df.info() output, yet no cell in this notebook removes rows from
# test_df — presumably a since-deleted cell dropped the rows with missing bmi.
# Confirm and restore that step so the notebook survives Restart & Run All.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 2245 to 1173
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 986 non-null    int64  
 1   gender             986 non-null    object 
 2   age                986 non-null    float64
 3   hypertension       986 non-null    int64  
 4   heart_disease      986 non-null    int64  
 5   ever_married       986 non-null    object 
 6   work_type          986 non-null    object 
 7   Residence_type     986 non-null    object 
 8   avg_glucose_level  986 non-null    float64
 9   bmi                986 non-null    float64
 10  smoking_status     986 non-null    object 
 11  stroke             986 non-null    int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 100.1+ KB


In [46]:
# Histogram of the age column in the training frame, with a title and a
# fixed 400x400 figure size.
age_hist_layout = {"title": "Age histogram", "width": 400, "height": 400}

fig = px.histogram(data_frame=train_df, x="age")
fig.update_layout(**age_hist_layout)
fig.show()

In [47]:
# Box plot of the age column over the whole training frame.
fig = px.box(data_frame=train_df, y="age")
fig.show()

In [48]:
# Box plot of age, with one box per value of the stroke column.
fig = px.box(data_frame=train_df, x="stroke", y="age")
fig.show()

In [54]:
# Contingency table: rows are smoking_status values, columns are stroke
# values, cells are row counts from the training frame.
data = pd.crosstab(
    index=train_df["smoking_status"],
    columns=train_df["stroke"],
)
data.head()

stroke,0,1
smoking_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Unknown,1194,33
formerly smoked,663,58
never smoked,1443,70
smokes,589,38


In [66]:
# Start an empty figure with a single (1x1) subplot cell; traces are added
# to it in the following cell.
fig = make_subplots(rows=1, cols=1)


In [67]:
# Add one bar trace per stroke outcome (crosstab column 0 and column 1),
# using the crosstab's row labels as x positions.
# `add_trace(..., row=, col=)` replaces `append_trace`, which plotly has
# deprecated. The loop keeps both traces built the same way and, like the
# original, ends the cell without displaying anything.
for outcome, label in ((0, "no_stroke"), (1, "stroke")):
    trace = go.Bar(x=data.index, y=data[outcome], name=label)
    fig.add_trace(trace, row=1, col=1)

In [68]:
# Render the figure assembled in the preceding cells.
fig.show()

In [65]:
# Show the counts in crosstab column 0 (rows where stroke == 0).
data.loc[:, 0]

smoking_status
Unknown            1194
formerly smoked     663
never smoked       1443
smokes              589
Name: 0, dtype: int64

In [71]:
# Contingency table: rows are gender values, columns are stroke values,
# cells are row counts from the training frame.
data = pd.crosstab(
    index=train_df["gender"],
    columns=train_df["stroke"],
)
data.head()

stroke,0,1
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,2279,115
Male,1609,84
Other,1,0


In [90]:
# Start an empty figure with a single (1x1) subplot cell; traces are added
# to it in the following cell.
fig = make_subplots(rows=1, cols=1)

In [91]:
# Add one bar trace per stroke outcome (crosstab column 0 and column 1),
# using the crosstab's row labels as x positions.
# `add_trace(..., row=, col=)` replaces `append_trace`, which plotly has
# deprecated. The loop keeps both traces built the same way and, like the
# original, ends the cell without displaying anything.
for outcome, label in ((0, "no_stroke"), (1, "stroke")):
    trace = go.Bar(x=data.index, y=data[outcome], name=label)
    fig.add_trace(trace, row=1, col=1)

In [92]:
# Render the figure assembled in the preceding cells.
fig.show()

In [79]:
# Contingency table: rows are ever_married values, columns are stroke
# values, cells are row counts from the training frame.
data = pd.crosstab(
    index=train_df["ever_married"],
    columns=train_df["stroke"],
)
data.head()

stroke,0,1
ever_married,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1372,23
Yes,2517,176


In [80]:
# Start an empty figure with a single (1x1) subplot cell; traces are added
# to it in the following cell.
fig = make_subplots(rows=1, cols=1)


In [81]:
# Add one bar trace per stroke outcome (crosstab column 0 and column 1),
# using the crosstab's row labels as x positions.
# `add_trace(..., row=, col=)` replaces `append_trace`, which plotly has
# deprecated. The loop keeps both traces built the same way and, like the
# original, ends the cell without displaying anything.
for outcome, label in ((0, "no_stroke"), (1, "stroke")):
    trace = go.Bar(x=data.index, y=data[outcome], name=label)
    fig.add_trace(trace, row=1, col=1)

In [82]:
# Render the figure assembled in the preceding cells.
fig.show()

In [94]:
# Box plot of bmi, with one box per value of the stroke column.
fig = px.box(data_frame=train_df, x="stroke", y="bmi")
fig.show()