In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Global plot styling: white figure/axes backgrounds and black tick labels.
# All overrides are passed in ONE sns.set(...) call. Every call to sns.set()
# first re-applies seaborn's default "darkgrid" style, so splitting the
# overrides over two calls lets the second call reset 'axes.facecolor' back
# to the grey default and silently drop the white background.
sns.set(rc={
    'figure.facecolor': 'white',
    'axes.facecolor': 'white',
    'xtick.labelcolor': 'black',
    'ytick.labelcolor': 'black',
})

<h1> EDA </h1>

In [None]:
# Load the raw dataset; 'data.csv' is resolved relative to the current working directory.
df = pd.read_csv('data.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

In [None]:
# For every column, print its distinct values followed by how many there are.
for col, values in df.items():
    distinct = values.unique()
    n_distinct = values.nunique()
    print('Unique values in column %s' %col,'\n', distinct,
        '\nNumber of unique values in column %s :' %col, n_distinct)

<h3> Doing one-hot encoding </h3>

In [None]:
# One-hot encode each categorical column into its own frame; the dummy
# columns are prefixed with the name of the column they come from.
dummies_sex, dummies_smoker, dummies_region = (
    pd.get_dummies(df[name], prefix=name)
    for name in ('sex', 'smoker', 'region')
)

In [None]:
# Append the dummy columns to the raw frame, then drop the original
# categorical columns they replace.
cols = ['sex', 'smoker', 'region']
dfs = [df, dummies_sex, dummies_smoker, dummies_region]
df_with_dummies = pd.concat(dfs, axis=1).drop(columns=cols)

<h2> Visualizations </h2>

In [None]:
# Distribution plot of every column, one figure per column.
# Each plot gets a freshly created figure and an explicit Axes. Numbered
# figures (plt.figure(i)) are reused whenever a figure with that number is
# still open, so re-running the cell (or using a non-inline backend) would
# draw on top of an older plot.
for col in df.columns:
    fig, ax = plt.subplots()
    sns.histplot(x=col, data=df, ax=ax)
    ax.set_title('Distribution of %s' % col)

In [None]:
# Box plot of every numeric column, one figure per column.
# select_dtypes picks up all numeric dtypes instead of only the exact
# 'int64'/'float64' pair. Each plot is drawn on a new figure: reusing figure
# numbers via plt.figure(i) can draw over a still-open figure with the same
# number (e.g. one created by an earlier cell).
for col in df.select_dtypes(include='number').columns:
    fig, ax = plt.subplots()
    sns.boxplot(x=col, data=df, ax=ax)
    ax.set_title('Box plot of %s' % col)

In [None]:
# Pairwise relationships between the columns of the raw data.
sns.pairplot(data=df)

In [None]:
# Violin plot of every numeric column, one figure per column.
# select_dtypes picks up all numeric dtypes instead of only the exact
# 'int64'/'float64' pair. Each plot is drawn on a new figure: reusing figure
# numbers via plt.figure(i) can draw over a still-open figure with the same
# number (e.g. one created by an earlier cell).
for col in df.select_dtypes(include='number').columns:
    fig, ax = plt.subplots()
    sns.violinplot(x=col, data=df, ax=ax)
    ax.set_title('Violin plot of %s' % col)

In [None]:
# Per-age summary: median charges and mean BMI, one row per age.
# Aggregations are named by string ('median', 'mean') rather than passed as
# NumPy callables: pandas >= 2.1 deprecates np.median/np.mean in agg() and
# will stop mapping them to the optimized groupby implementations.
information_by_age = df_with_dummies.groupby('age').agg(
    median_charges = ('charges', 'median'),
    mean_bmi = ('bmi', 'mean')
    ).reset_index()

In [None]:
# Share of smokers per age, added to information_by_age as a whole-number percentage.
# The smoker / non-smoker counts are computed with a single groupby.
smoker_counts_by_age = df_with_dummies.groupby('age')[['smoker_yes', 'smoker_no']].sum()

# Fraction of smokers for each age (Series indexed by age).
percent_of_smokers_age = smoker_counts_by_age['smoker_yes'] / smoker_counts_by_age.sum(axis=1)

# Match on the age key instead of relying on both frames happening to share
# the same row order. Scale before rounding so the stored values are whole
# percentages without float artifacts such as 28.999999999999996.
information_by_age['percent_of_smokers'] = information_by_age['age'].map(
    percent_of_smokers_age.mul(100).round()
)

In [None]:
# Preview the per-age summary table.
information_by_age.head()

In [None]:
# Median charges for each age.
sns.scatterplot(data=information_by_age, x='age', y='median_charges')

In [None]:
# One age group stands out with the highest median charges. The smoker-share
# scatter plot further down shows that this age group also has the largest
# proportion of smokers.
information_by_age.loc[
    lambda frame: frame['median_charges'] == frame['median_charges'].max()
]

In [None]:
# Mean BMI for each age.
sns.scatterplot(data=information_by_age, x='age', y='mean_bmi')

In [None]:
# Percentage of smokers for each age.
sns.scatterplot(data=information_by_age, x='age', y='percent_of_smokers')

In [None]:
# Per-region summary: median charges, mean BMI and mean number of children.
# Aggregations are named by string ('median', 'mean') rather than passed as
# NumPy callables: pandas >= 2.1 deprecates np.median/np.mean in agg() and
# will stop mapping them to the optimized groupby implementations.
information_by_region = df.groupby('region').agg(
    median_charges = ('charges', 'median'),
    mean_bmi = ('bmi', 'mean'),
    mean_children = ('children', 'mean')
    ).reset_index()

In [None]:
# Median charges per region.
sns.barplot(data=information_by_region, x='region', y='median_charges')

In [None]:
# Mean BMI per region.
sns.barplot(data=information_by_region, x='region', y='mean_bmi')

In [None]:
# Mean number of children per region.
sns.barplot(data=information_by_region, x='region', y='mean_children')

In [None]:
# Number of rows in each region.
df['region'].groupby(df['region']).count()

In [None]:
# Preview the first rows of the raw data again before the grouped counts below.
df.head()

In [None]:
# Non-null 'age' count for every (sex, smoker) combination.
(
    df
    .groupby(by=['sex', 'smoker'])['age']
    .count()
)

In [None]:
# Non-null 'age' count for every (region, smoker) combination.
(
    df
    .groupby(by=['region', 'smoker'])['age']
    .count()
)

In [None]:
# Mean charges per sex.
# The aggregation is named by string ('mean') rather than passed as np.mean:
# pandas >= 2.1 deprecates NumPy callables in agg() and will stop mapping
# them to the optimized groupby implementation.
df.groupby('sex').agg(
    mean_charges = ('charges', 'mean')).reset_index()

<h2>  Thoughts after performing EDA </h2>
 Our dataset is from Kaggle so as expected it is really clean and 'pretty'. <br> 
    
   1. There are many more younger patients in our dataset <br>

   2. We can observe a linear relationship between age and both mean BMI and median charges. Older people probably have more advanced illnesses, which is why their treatment is more expensive <br>

   3. There are no serious outliers in our dataset. I think we can call the more extreme observations "natural outliers", because in real life we also observe situations where a single person spends a lot more on treatment <br>
    or has a larger BMI. There is no human error in the dataset. <br>

   4. One age group (age = 43) stands out when it comes to the median of charges. This age group also has the largest proportion of smokers <br>
    
   5. When we analyze the information grouped by region, we can see slight differences (people from the northeast pay more, people from the southeast have a higher BMI) <br>

   6. On average, men pay more for treatment




