# Exploratory Data Analysis - Insurance Risk Analytics

This notebook contains exploratory data analysis for the insurance risk analytics project.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add src to path
sys.path.append(str(Path('..') / 'src'))

from utils.config import RAW_DATA_DIR, PROCESSED_DATA_DIR


## Load Data


In [None]:
# Locate the raw insurance dataset inside the configured raw-data directory.
data_path = RAW_DATA_DIR / 'insurance.csv'

if not data_path.exists():
    # The CSV is absent: report where it was expected. `df` is not assigned
    # in this branch.
    print(f"Data file not found at {data_path}")
    print("Please add insurance.csv to data/raw/ and track with DVC")
else:
    # Read the CSV into `df` and show its shape plus the first rows.
    df = pd.read_csv(data_path)
    print(f"Data loaded: {df.shape}")
    print(df.head())


## Data Overview


In [None]:
# Print a structural and statistical overview, but only when a non-empty
# `df` exists in the notebook namespace.
if 'df' in locals() and not df.empty:
    print("Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would emit a stray "None" line after the report.
    df.info()
    print("\nSummary Statistics:")
    print(df.describe())
    print("\nMissing Values:")
    # Per-column count of null entries.
    print(df.isnull().sum())


## Visualizations

Create your EDA visualizations here. Aim for at least 3 creative plots that provide insights.


In [None]:
# Example: histogram of the `charges` column.
# Runs only when a non-empty `df` with a 'charges' column is available.
if 'df' in locals() and not df.empty and 'charges' in df.columns:
    # Explicit figure/axes handles instead of the implicit pyplot state.
    fig, ax = plt.subplots(figsize=(12, 6))
    df['charges'].hist(bins=50, edgecolor='black', ax=ax)
    ax.set_title('Distribution of Insurance Charges', fontsize=14, fontweight='bold')
    ax.set_xlabel('Charges ($)', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    fig.tight_layout()
    plt.show()


## Hypothesis Testing

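Perform hypothesis tests to identify low-risk groups, e.g. compare mean charges between customer segments and check whether the difference is statistically significant.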


In [None]:
from scipy import stats

# Significance threshold used to label the example test result.
ALPHA = 0.05


def welch_ttest(sample_a, sample_b):
    """Compare the means of two samples with Welch's t-test.

    Missing values are dropped from each sample before testing, so a NaN in
    the data does not turn the whole result into NaN.

    Parameters
    ----------
    sample_a, sample_b : array-like of numbers
        The two independent samples to compare.

    Returns
    -------
    tuple of (float, float) or None
        ``(t_statistic, p_value)``, or ``None`` when either sample has fewer
        than two non-missing observations (the sample variance is undefined
        for a single observation, so the test cannot be computed).
    """
    cleaned = [pd.Series(sample).dropna() for sample in (sample_a, sample_b)]
    if any(len(sample) < 2 for sample in cleaned):
        return None
    # equal_var=False selects Welch's test, which does not assume that both
    # groups share the same variance.
    t_statistic, p_val = stats.ttest_ind(cleaned[0], cleaned[1], equal_var=False)
    return float(t_statistic), float(p_val)


# Example hypothesis test: do smokers and non-smokers differ in mean charges?
if 'df' in locals() and not df.empty:
    if 'smoker' in df.columns and 'charges' in df.columns:
        # A boolean mask yields an empty Series when a label is absent, so no
        # separate membership check is needed.
        smoker_charges = df.loc[df['smoker'] == 'yes', 'charges']
        non_smoker_charges = df.loc[df['smoker'] == 'no', 'charges']

        outcome = welch_ttest(smoker_charges, non_smoker_charges)
        if outcome is None:
            print("T-test skipped: need at least 2 non-missing charges in each smoker group")
        else:
            t_stat, p_value = outcome
            print("T-test: Smoker vs Non-smoker charges")
            print(f"T-statistic: {t_stat:.4f}")
            # General format keeps very small p-values visible instead of
            # rounding them to 0.0000.
            print(f"P-value: {p_value:.4g}")
            print(f"Significant: {p_value < ALPHA}")


## Summary

Document your findings and insights here.
