In [24]:
import pandas as pd
import plotly.express as px

from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split

In [2]:
stroke_df = pd.read_csv("data/stroke-data.csv")

In [15]:
stroke_df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    stroke_df.drop("stroke", axis=1, inplace=False),
    stroke_df["stroke"],
    test_size = 0.2,
    stratify = stroke_df["stroke"],
    random_state = 123
)

In [33]:
train_df = X_train
train_df["stroke"] = y_train

In [35]:
test_df = X_test
test_df["stroke"] = y_test

In [68]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4088 non-null   int64  
 1   gender             4088 non-null   object 
 2   age                4088 non-null   float64
 3   hypertension       4088 non-null   int64  
 4   heart_disease      4088 non-null   int64  
 5   ever_married       4088 non-null   object 
 6   work_type          4088 non-null   object 
 7   Residence_type     4088 non-null   object 
 8   avg_glucose_level  4088 non-null   float64
 9   bmi                3923 non-null   float64
 10  smoking_status     4088 non-null   object 
 11  stroke             4088 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 415.2+ KB


In [70]:
train_df["bmi"].mean()

28.93163395360697

In [71]:
train_df["bmi"].fillna(train_df["bmi"].mean(), inplace = True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4088 non-null   int64  
 1   gender             4088 non-null   object 
 2   age                4088 non-null   float64
 3   hypertension       4088 non-null   int64  
 4   heart_disease      4088 non-null   int64  
 5   ever_married       4088 non-null   object 
 6   work_type          4088 non-null   object 
 7   Residence_type     4088 non-null   object 
 8   avg_glucose_level  4088 non-null   float64
 9   bmi                4088 non-null   float64
 10  smoking_status     4088 non-null   object 
 11  stroke             4088 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 415.2+ KB


In [36]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1022 entries, 2245 to 1173
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1022 non-null   int64  
 1   gender             1022 non-null   object 
 2   age                1022 non-null   float64
 3   hypertension       1022 non-null   int64  
 4   heart_disease      1022 non-null   int64  
 5   ever_married       1022 non-null   object 
 6   work_type          1022 non-null   object 
 7   Residence_type     1022 non-null   object 
 8   avg_glucose_level  1022 non-null   float64
 9   bmi                986 non-null    float64
 10  smoking_status     1022 non-null   object 
 11  stroke             1022 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 103.8+ KB


In [37]:
test_df = test_df.dropna()

In [38]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 2245 to 1173
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 986 non-null    int64  
 1   gender             986 non-null    object 
 2   age                986 non-null    float64
 3   hypertension       986 non-null    int64  
 4   heart_disease      986 non-null    int64  
 5   ever_married       986 non-null    object 
 6   work_type          986 non-null    object 
 7   Residence_type     986 non-null    object 
 8   avg_glucose_level  986 non-null    float64
 9   bmi                986 non-null    float64
 10  smoking_status     986 non-null    object 
 11  stroke             986 non-null    int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 100.1+ KB


In [77]:
fig = px.histogram(train_df, x="age", marker={color:"red"})
fig.update_layout(
    title = "Age Histogram",
    width = 400,
    height = 400,
)
print(fig)
fig.show()

NameError: name 'color' is not defined

In [48]:
fig = px.box(train_df, y="age")
fig.show()

In [49]:
fig = px.box(train_df, x="stroke", y="age")
fig.show()

In [50]:
data  = pd.crosstab(train_df["smoking_status"], train_df["stroke"])
data.head(5)

stroke,0,1
smoking_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Unknown,1194,33
formerly smoked,663,58
never smoked,1443,70
smokes,589,38


In [62]:
fig = make_subplots(
    rows = 1,
    cols = 1
)

trace = go.Bar(x = ["unknown", "formerly smoked", "never_smoked","smokes"], y = [1194, 663, 1443, 589], name="no_stroke")
fig.append_trace(trace, 1, 1)
trace = go.Bar(x = ["unknown", "formerly smoked", "never_smoked","smokes"], y = [33, 58, 70, 38], name="stroke")
fig.append_trace(trace, 1, 1)

fig.show()

In [63]:
data.index

Index(['Unknown', 'formerly smoked', 'never smoked', 'smokes'], dtype='object', name='smoking_status')

In [64]:
data.columns

Int64Index([0, 1], dtype='int64', name='stroke')

In [66]:
data[0]

smoking_status
Unknown            1194
formerly smoked     663
never smoked       1443
smokes              589
Name: 0, dtype: int64

In [67]:
fig = make_subplots(
    rows = 1,
    cols = 1
)

trace = go.Bar(x = data.index, y = data[0], name="no_stroke")
fig.append_trace(trace, 1, 1)
trace = go.Bar(x = data.index, y = data[1], name="stroke")
fig.append_trace(trace, 1, 1)

fig.show()

In [74]:
fig = px.histogram(train_df, x="smoking_status")
fig.show()