In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# Load the insurance dataset from 'insurance.csv' (path is relative to the
# notebook's working directory) and preview the first rows.
data = pd.read_csv('insurance.csv')
data.head()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the frame's columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Number of rows that are exact repeats of an earlier row.
data.duplicated().sum()

In [None]:
# Show every row that takes part in a duplicate; keep=False flags all copies,
# not only the later occurrences.
data[data.duplicated(keep=False)]

In [None]:
# drop_duplicates() returns a new frame and leaves the original untouched, so the
# result has to be assigned back; otherwise the duplicated rows remain in `data`
# and are counted in every summary and plot that follows.
data = data.drop_duplicates()

# Confirm the clean-up: remaining shape (rows, columns).
data.shape

In [None]:
# Encode the two binary categorical columns as 0/1 integers
# (smoker: no -> 0, otherwise 1; sex: male -> 0, otherwise 1).
#
# An explicit mapping replaces np.where(col == "no", 0, 1): the catch-all else
# branch silently coded *any* other value as 1 - including NaN, differently cased
# strings, and the already-encoded 0 when this cell is run a second time.
# NOTE(review): the positive categories are assumed to be 'yes' and 'female';
# anything else raises a ValueError listing the offending values.
BINARY_CODES = {
    'smoker': {'no': 0, 'yes': 1},
    'sex': {'male': 0, 'female': 1},
}

for column, codes in BINARY_CODES.items():
    # Let already-encoded values map to themselves so re-running the cell is a no-op.
    lookup = {**codes, **{code: code for code in codes.values()}}
    encoded = data[column].map(lookup)
    if encoded.isna().any():
        unexpected = sorted(data.loc[encoded.isna(), column].astype(str).unique())
        raise ValueError(f"Unexpected values in column '{column}': {unexpected}")
    data[column] = encoded.astype(int)

data.head()

# EDA of Age Groups

In [None]:
# Bucket ages into three bands (this always (re)creates the 'age_group' column).
# pd.cut builds right-closed intervals, so the edges [17, 24, 44, 65] give
# (17, 24], (24, 44] and (44, 65]; include_lowest=True additionally admits 17
# itself into the first band. Ages outside the outer edges become NaN.
# For whole-number ages the third band therefore starts at 45, not 44 - the
# label now says so and no longer overlaps with 'Adult (25–44)'.
bins = [17, 24, 44, 65]
labels = ['Young Adult (18–24)', 'Adult (25–44)', 'Middle-Aged (45–65)']
data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels, include_lowest=True)

# Summary of charges by age group.
# observed=False is spelled out because 'age_group' is categorical: it keeps
# every band in the table (even an empty one) and avoids the pandas warning
# about the changing default.
age_summary = (
    data.groupby('age_group', observed=False)['charges']
        .agg(count='count', mean='mean', median='median', std='std', min='min', max='max')
        .round(2)
)
print(age_summary)

In [None]:
# Histogram of policyholder ages (20 equal-width bins), drawn on an explicit Axes.
age_hist_fig, age_hist_ax = plt.subplots(figsize=(8, 6))
age_hist_ax.hist(data['age'], bins=20, color='steelblue', edgecolor='black')
age_hist_ax.set_title('Distribution of Age')
age_hist_ax.set_xlabel('Age')
age_hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter of charges against age with a fitted regression line (red) on top of
# semi-transparent points.
age_reg_fig, age_reg_ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    data=data,
    x='age',
    y='charges',
    scatter_kws=dict(alpha=0.5),
    line_kws=dict(color='red'),
    ax=age_reg_ax,
)
age_reg_ax.set_title('Correlation Between Age and Charges')
age_reg_ax.set_xlabel('Age')
age_reg_ax.set_ylabel('Charges')
plt.show()


In [None]:
# Box plot of charges per age band; one colour per band, sampled evenly from the
# reversed viridis colormap.
viridis_colors = cm.viridis_r(np.linspace(0, 1, len(labels)))
age_box_fig, age_box_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=data,
    x='age_group',
    y='charges',
    palette=viridis_colors,
    ax=age_box_ax,
)
age_box_ax.set_title('Distribution of Insurance Charges by Age Group', fontsize=12)
age_box_ax.set_xlabel('Age Group')
age_box_ax.set_ylabel('Charges')
plt.show()


In [None]:
# Age-group categories in their defined order; one panel is drawn per category.
groups = data['age_group'].cat.categories

# 1 x N grid (one row, one column per age group) sharing the y-axis so charges
# are directly comparable across panels. squeeze=False + ravel() keeps `axes` a
# 1-D array of Axes regardless of how many groups there are.
fig, axes = plt.subplots(1, len(groups), figsize=(18, 6), sharey=True, squeeze=False)
axes = axes.ravel()

# One colour per group from the reversed viridis colormap, computed once
# instead of on every loop iteration.
group_colors = cm.viridis_r(np.linspace(0, 1, len(groups)))

# Draw each group's points and its own regression line on a dedicated axis.
for ax, group, group_color in zip(axes, groups, group_colors):
    subset = data[data['age_group'] == group]
    sns.scatterplot(ax=ax, data=subset, x='age', y='charges',
                    color=group_color, alpha=0.6)

    # Regression line only (scatter=False) - the points are already drawn above.
    sns.regplot(ax=ax, data=subset, x='age', y='charges',
                scatter=False, color='red', line_kws={'lw': 2})

    ax.set_title(f"{group}", fontsize=12)
    ax.set_xlabel("Age")
    ax.set_ylabel("Charges")

# Leave room at the top of the figure for the overall title.
plt.suptitle("Scatterplots of Age vs Charges by Age Group", fontsize=14)
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


# EDA of Smoker vs Non-smoker

In [None]:
# Charge statistics split by the 'smoker' column, rounded to two decimals.
charge_stats = ['count', 'mean', 'median', 'std', 'min', 'max']
charges_by_smoker = data.groupby('smoker')['charges']
smoker_summary = charges_by_smoker.agg(charge_stats).round(2)
print(smoker_summary)

In [None]:
# Two-colour viridis palette; index 0 is used for non-smokers, index 1 for smokers.
palette = sns.color_palette("viridis", 2)

# Overlaid histograms (with KDE curves) of charges for smokers and non-smokers.
#
# The readable labels are attached to the data itself and used as the hue, so
# seaborn builds the legend from the actual hue mapping. The previous
# plt.legend(labels=[...]) call replaced that legend and assigned the labels
# purely by artist order, which can silently swap 'Smoker' and 'Non-Smoker'.
smoker_labels = {0: 'Non-Smoker', 1: 'Smoker'}
smoker_plot_data = data.assign(smoker_status=data['smoker'].map(smoker_labels))

smoker_hist_fig, smoker_hist_ax = plt.subplots(figsize=(15, 7))
sns.histplot(
    data=smoker_plot_data,
    x='charges',
    hue='smoker_status',
    hue_order=['Non-Smoker', 'Smoker'],  # same order as the 0/1 codes -> same colours
    bins=30,
    kde=True,
    stat='count',
    common_norm=False,
    palette=palette,
    ax=smoker_hist_ax,
)
smoker_hist_ax.set_title('Charges Distribution by Smoker Status')
smoker_hist_ax.set_xlabel('Charges')
smoker_hist_ax.set_ylabel('Count')

# Keep seaborn's legend (correct handles and labels) and only retitle it.
smoker_legend = smoker_hist_ax.get_legend()
if smoker_legend is not None:
    smoker_legend.set_title('Smoker Status')

plt.tight_layout()
plt.show()

In [None]:
# Box plot of charges for the two 'smoker' codes; the 0/1 tick positions are
# relabelled with readable names and the x-axis title is left blank.
smoker_box_fig, smoker_box_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='smoker', y='charges', data=data, palette=palette, ax=smoker_box_ax)
smoker_box_ax.set_title('Charges by Smoker Status')
smoker_box_ax.set_xlabel('')
smoker_box_ax.set_xticks([0, 1])
smoker_box_ax.set_xticklabels(['Non-smoker', 'Smoker'])
smoker_box_ax.set_ylabel('Charges')
plt.tight_layout()
plt.show()