In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
# Read the CSV into `df`, the frame every later cell works from.
path = 'dataset.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
df.head() # preview the first rows (head() defaults to 5); pass n, e.g. df.head(20), to see more

In [None]:
# Draw 10 random rows for a quick look at the data.
# A fixed random_state makes the same rows come back on every
# Restart & Run All, so the displayed sample is reproducible.
df.sample(10, random_state=42)

In [None]:
df.columns  # column labels of the frame; tolist() is not necessary just to inspect them

In [None]:
df.shape # tuple of (number of rows, number of columns)

In [None]:
# Display the 'Stroke' column on its own.
df['Stroke']

In [None]:
# Number of rows for each distinct value of 'Stroke'.
df['Stroke'].value_counts()

In [None]:
# Number of rows for each distinct value of 'Sex'.
df['Sex'].value_counts()

In [None]:
# Pie chart of how the rows split across the distinct 'Stroke' values.
out = df['Stroke'].value_counts()
# Keyword arguments make explicit which array labels the slices (names) and
# which sizes them (values); the title lets the figure stand on its own.
px.pie(out, names=out.index, values=out.values, title="Rows by 'Stroke' value")

In [None]:
# Pie chart of how the rows split across the distinct 'Diabetes' values.
out = df['Diabetes'].value_counts()
# Keyword arguments make explicit which array labels the slices (names) and
# which sizes them (values); the title lets the figure stand on its own.
px.pie(out, names=out.index, values=out.values, title="Rows by 'Diabetes' value")

In [None]:
# BMI plotted against Age, one marker per row, coloured by the 'Stroke' column.
px.scatter(data_frame=df, x='Age', y='BMI', color='Stroke')

In [None]:
# Keep only the rows where 'Stroke' equals 1.0; later cells reuse stroke_df.
stroke_df = df[df['Stroke'] == 1.0]
# Within that subset, count the rows for each distinct 'Diabetes' value.
out = stroke_df['Diabetes'].value_counts()
# Donut chart (hole=.7). Keyword arguments make explicit which array labels
# the slices and which sizes them; the title names the subset being shown.
px.pie(
    out,
    names=out.index,
    values=out.values,
    hole=.7,
    title="'Diabetes' values among rows with Stroke == 1.0",
)

In [None]:
# Rows of stroke_df ordered from the highest BMI to the lowest.
stroke_df.sort_values(by='BMI', ascending=False)

In [None]:
# Boolean mask, True where 'Sex' is 0 (the value this notebook uses for female).
df['Sex'].eq(0)

In [None]:
# Select the rows where 'Sex' is 0 and preview the first few of them.
female_df = df.loc[df['Sex'].eq(0)]
female_df.head()

In [None]:
# Mean BMI over all rows, followed by a boolean mask that is True
# for the rows whose BMI is strictly below that mean.
avg_bmi = df['BMI'].mean()
df['BMI'].lt(avg_bmi)

In [None]:
# Only the rows whose BMI is strictly below avg_bmi.
belowavg_bmi_df = df.loc[df['BMI'].lt(avg_bmi)]
belowavg_bmi_df

In [None]:
# Male patients with diabetes: 'Sex' equal to 1 AND a non-zero 'Diabetes' value.
f1 = df['Sex'].eq(1)
f2 = df['Diabetes'].ne(0)
# Both masks must hold for a row to be kept.
out_df = df.loc[f1 & f2]
out_df

In [None]:
# Number of rows for each distinct value of 'GenHlth'.
df['GenHlth'].value_counts()

In [None]:
# Patients whose 'GenHlth' is 1, 2 or 3: build one boolean mask per level
# (f1, f2, f3) and combine them with the element-wise OR operator.
f1, f2, f3 = (df['GenHlth'].eq(level) for level in (1, 2, 3))
df.loc[f1 | f2 | f3]

In [None]:
# Patients whose 'GenHlth' is NOT 1, 2 or 3.
# The mask is built here from the column itself instead of reusing f1/f2/f3:
# those names are assigned with different meanings in more than one cell, so
# depending on them makes the result hinge on which cell happened to run last.
df[~df['GenHlth'].isin([1, 2, 3])]

Grouping and pivot table

In [None]:
# Violin plot of the 'BMI' column over all rows.
px.violin(data_frame=df, x='BMI')

In [None]:
# One BMI violin per 'GenHlth' value, drawn with an explicit colour sequence.
colors = [
    'red',
    'blue',
    'green',
    'yellow',
    'black',
]
px.violin(
    data_frame=df,
    x='BMI',
    color='GenHlth',
    color_discrete_sequence=colors,
)

In [None]:
# Mean BMI for each distinct value of 'Sex'.
df['BMI'].groupby(df['Sex']).mean()

In [None]:
# Mean BMI per 'GenHlth' value, shown as a bar chart.
out_df = df.groupby('GenHlth')['BMI'].agg('mean')
# x is the group label, y the group mean; the y axis is logarithmic.
px.bar(out_df, x=out_df.index, y=out_df.values, log_y=True)

In [None]:
# Mean of 'BMI' and 'Age' per 'GenHlth' value (one row per group).
out_df = df.groupby('GenHlth')[['BMI', 'Age']].agg('mean')
out_df

In [None]:
# Bar chart of out_df: the frame's index on x, every column of the frame on y.
px.bar(out_df, x=out_df.index, y=out_df.columns)

Pivot table

In [None]:
# Sum of 'BMI' for every ('GenHlth', 'Income') combination:
# one row per 'GenHlth' value, one column per 'Income' value.
out_df = pd.pivot_table(
    df,
    values='BMI',       # the column being aggregated
    index='GenHlth',    # its distinct values become the rows
    columns='Income',   # its distinct values become the columns
    aggfunc='sum',
)
out_df

In [None]:
# Relabel the float column keys 1.0 .. 8.0 as the strings '1' .. '8'.
# Labels that are not in the mapping are left as they are.
income_labels = {float(level): str(level) for level in range(1, 9)}
out_df = out_df.rename(columns=income_labels)
out_df

In [None]:
# Bar chart of out_df using pandas' built-in plotting accessor.
out_df.plot.bar()