# Exploratory Data Analysis (EDA): Insurance Charges Prediction


In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw CSV. Set the INSURANCE_CSV environment variable to point
# at your own copy of the file; the fallback is the original local download path.
DATA_PATH = os.environ.get("INSURANCE_CSV", "C:/Users/DELL/Downloads/insurance.csv")

df = pd.read_csv(DATA_PATH)
# NOTE: df2 is a second name for the SAME DataFrame object (not a copy), so a
# column added through either name is visible through both.
df2 = df
df2

In [None]:
# Column names, dtypes and non-null counts: a quick check for missing values.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Histogram of insurance charges with a KDE curve overlaid

fig, ax = plt.subplots()
sns.histplot(data=df2, x='charges', bins=100, kde=True, ax=ax)
ax.set_title('Premium Distribution')
plt.show()

In [None]:
# Frequency of every level of the categorical features, printed and plotted

# One entry per panel: (column, printed label, seaborn palette, panel title)
count_panels = [
    ('sex', 'Sex', 'pastel', 'Count Of Sex'),
    ('smoker', 'Smoker', 'Set2', 'Count Of Smokers'),
    ('region', 'Region', 'Set2', 'Count Of Region'),
]

# Print all tables first so the text output precedes the figure
for column, label, _, _ in count_panels:
    print(f'{label}:\n', df2[column].value_counts(), '\n')

# Side-by-side count plots, one axis per feature
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (column, _, palette, title) in zip(axes, count_panels):
    sns.countplot(data=df2, x=column, hue=column, palette=palette, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# Value counts and a pie chart for each categorical feature

# A list (not a set) keeps the iteration order, and therefore the order of the
# printed tables and charts, identical on every run.
categorical_features = ['sex', 'smoker', 'region']
catagorical_features = categorical_features  # old misspelled name, kept for compatibility

for col in categorical_features:
    # Compute the counts once and reuse them for both the printout and the chart
    counts = df2[col].value_counts()
    print('value count for', col)
    print(counts)

    # Pie chart of each category's share
    counts.plot.pie(
        autopct='%1.1f%%',
        startangle=90,
        shadow=True,
        figsize=(5, 5),
    )
    plt.title(f'{col.capitalize()} Distribution')
    plt.ylabel('')  # hide the y-axis label pandas puts on pie charts
    plt.show()

In [None]:
# Age against charges, with points coloured by smoking status

fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(x='age', y='charges', hue='smoker', data=df, alpha=0.7, ax=ax)
ax.set_title('Age vs Charges (colored by smoker)')
plt.show()

###  Insight: Age vs Insurance Charges

The scatter plot shows a clear positive relationship between age and insurance charges, 
indicating that insurance costs generally increase as age increases.

Additionally, individuals who are smokers (`smoker = yes`) have significantly higher 
insurance charges compared to non-smokers across all age groups. The difference becomes 
more pronounced at higher ages, suggesting a compounding effect of age and smoking on 
medical insurance costs.

In [None]:
# BMI against charges, with points coloured by smoking status

fig, ax = plt.subplots(figsize=(7, 5))
sns.scatterplot(x="bmi", y="charges", hue="smoker", data=df, alpha=0.7, ax=ax)
ax.set_title("BMI vs Charges (colored by Smoker)")
plt.show()

###  Insight: BMI vs Insurance Charges

The scatter plot indicates a positive relationship between Body Mass Index (BMI) and 
insurance charges, particularly for individuals with higher BMI values.

Smokers (`smoker = yes`) consistently incur significantly higher insurance charges 
compared to non-smokers across almost all BMI ranges. The increase in charges becomes 
more pronounced for smokers with higher BMI, suggesting that the combined effect of 
smoking and elevated BMI leads to substantially higher medical risk and insurance costs.


In [None]:
# Distribution of charges for smokers and non-smokers

fig, ax = plt.subplots(figsize=(6, 5))
sns.boxplot(x="smoker", y="charges", hue='smoker', data=df, palette="Set2", ax=ax)
ax.set_title("Smoker vs Charges")
plt.show()

###  Insight: Smoking Status vs Insurance Charges

The boxplot clearly shows that individuals who are smokers incur substantially higher 
insurance charges compared to non-smokers.

Smokers not only have a higher median insurance cost, but also exhibit a much wider 
spread in charges, indicating greater variability in medical expenses. This suggests 
that smoking status is a strong and influential factor in determining insurance charges 
and should be treated as a key predictor during model development.


In [None]:
# Distribution of charges split by sex

fig, ax = plt.subplots(figsize=(6, 5))
sns.boxplot(x="sex", y="charges", hue='sex', data=df, palette="Set1", ax=ax)
ax.set_title("Sex vs Charges")
plt.show()

###  Insight: Sex vs Insurance Charges

The boxplot shows that while the median insurance charges for males and females are 
relatively similar, male individuals exhibit a wider spread in insurance charges 
compared to females.

This greater variability suggests that certain high-cost cases are more frequent among 
males, possibly due to differences in lifestyle, health risk factors, or smoking 
behavior. However, sex alone does not appear to be as strong a predictor of insurance 
charges as variables like smoking status or BMI.


In [None]:
# Correlation heatmap (numerical features vs charges)

# Encode smoker as 0/1 so it can take part in the correlation matrix; any value
# other than 'yes'/'no' would become NaN. The encoded column lives on a derived
# frame built with .assign(), so the raw data in df is left untouched and this
# cell gives the same result however many times it is re-run.
corr_input = df.assign(smoker_encoded=df['smoker'].map({'yes': 1, 'no': 0}))
numerical_features = ['age', 'bmi', 'children', 'smoker_encoded', 'charges']
corr = corr_input[numerical_features].corr()

plt.figure(figsize=(8, 9))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Heatmap (Numerical Features vs Charges)")
plt.show()

# Insights Extraction


* Smokers clearly have higher charges.

* Higher BMI correlates with higher charges.

* Charges rise gradually with age, and smokers are charged markedly more than non-smokers at every age.

* Region is unlikely to be as impactful as smoking, but its relationship with charges was not plotted above and still needs to be checked.