# Exploratory Data Analysis - Insurance Risk Analytics

This notebook contains exploratory data analysis for the insurance risk analytics project. The central metric is the loss ratio, computed as `TotalClaims / TotalPremium`.


In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Add src to path
sys.path.append(str(Path('..') / 'src'))

from utils.config import RAW_DATA_DIR, PROCESSED_DATA_DIR


## Load Data


In [None]:
# Read the raw insurance extract and derive the per-row loss ratio.
data_path = RAW_DATA_DIR / 'insurance.csv'

if not data_path.exists():
    print(f"Data file not found at {data_path}")
    print("Please add insurance.csv to data/raw/ and track with DVC")
else:
    df = pd.read_csv(data_path)
    available = set(df.columns)

    # Convert TransactionMonth to datetimes when the column is present.
    if 'TransactionMonth' in available:
        df['TransactionMonth'] = pd.to_datetime(df['TransactionMonth'])

    # Zero premiums are turned into NaN first, so the ratio comes out missing
    # instead of infinite for those rows.
    if {'TotalClaims', 'TotalPremium'} <= available:
        premium_nonzero = df['TotalPremium'].replace(0, np.nan)
        df['loss_ratio'] = df['TotalClaims'] / premium_nonzero

    print(f"Data loaded: {df.shape}")
    print(df.head())
    print(f"\nColumns: {list(df.columns)}")


## Data Overview


In [None]:
# Structural and statistical overview of the loaded frame (skipped when no data was loaded).
if 'df' in locals() and not df.empty:
    print("Dataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    print("\nSummary Statistics:")
    print(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())


## Visualizations

Create your EDA visualizations here. Aim for at least 3 creative plots that provide insights.


In [None]:
# Creative Plot 1: Loss Ratio by Province
def province_loss_ratio_table(frame):
    """Aggregate claims and premium per province and derive the loss ratio.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain 'Province', 'TotalClaims' and 'TotalPremium' columns.

    Returns
    -------
    pd.DataFrame
        Indexed by province with columns 'TotalClaims', 'TotalPremium' and
        'loss_ratio', sorted ascending by 'loss_ratio'. Provinces whose total
        premium is zero are omitted because their ratio is undefined.
    """
    totals = frame.groupby('Province').agg({'TotalClaims': 'sum', 'TotalPremium': 'sum'})
    # Zero total premium -> NaN (not inf), mirroring the row-level loss_ratio.
    totals['loss_ratio'] = totals['TotalClaims'] / totals['TotalPremium'].replace(0, np.nan)
    return totals.dropna(subset=['loss_ratio']).sort_values('loss_ratio')


# Guard on the columns this cell actually reads.
plot_columns = {'Province', 'TotalClaims', 'TotalPremium'}
if 'df' in locals() and not df.empty and plot_columns.issubset(df.columns):
    prov_sorted = province_loss_ratio_table(df)

    if prov_sorted.empty:
        print("No province has a non-zero total premium; nothing to plot.")
    else:
        fig, ax = plt.subplots(figsize=(12, 8))
        # One colour per bar, sampled from the reversed red-yellow-green map.
        colors = plt.cm.RdYlGn_r(np.linspace(0.2, 0.8, len(prov_sorted)))
        ax.barh(prov_sorted.index, prov_sorted['loss_ratio'], color=colors)
        ax.set_xlabel('Loss Ratio', fontsize=12, fontweight='bold')
        ax.set_ylabel('Province', fontsize=12, fontweight='bold')
        ax.set_title('Loss Ratio by Province (Lower is Better)', fontsize=14, fontweight='bold')
        ax.axvline(prov_sorted['loss_ratio'].median(), color='red', linestyle='--', label='Median')
        ax.legend()
        fig.tight_layout()
        plt.show()

        print("Lowest loss ratio provinces:")
        print(prov_sorted.head())


## Hypothesis Testing

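Perform hypothesis tests to identify low-risk groups. The cell below ranks provinces by aggregate loss ratio and runs a two-sample t-test on the row-level loss ratios of the lowest- and highest-ranked provinces.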


In [None]:
# Hypothesis test: Loss ratio by Province
def compare_extreme_provinces(frame, min_obs=2):
    """Welch t-test of row-level loss ratios: lowest- vs highest-ratio province.

    Provinces are ranked by aggregate loss ratio (sum of claims / sum of
    premium). Provinces with zero total premium have no defined ratio and are
    left out of the ranking, so they cannot be picked as an extreme.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain 'Province', 'TotalClaims', 'TotalPremium' and 'loss_ratio'.
    min_obs : int, default 2
        Minimum number of non-missing loss-ratio rows required in each of the
        two provinces; a t-test on a single observation yields NaN.

    Returns
    -------
    dict or None
        Keys 'lowest_prov', 'highest_prov', 't_stat', 'p_value', 'lowest_mean',
        'highest_mean'; None when fewer than two provinces can be ranked or
        either province has fewer than `min_obs` observations.
    """
    totals = frame.groupby('Province').agg({'TotalClaims': 'sum', 'TotalPremium': 'sum'})
    ranked = (
        (totals['TotalClaims'] / totals['TotalPremium'].replace(0, np.nan))
        .dropna()
        .sort_values()
    )
    if len(ranked) < 2:
        return None

    lowest_prov, highest_prov = ranked.index[0], ranked.index[-1]
    lowest_data = frame.loc[frame['Province'] == lowest_prov, 'loss_ratio'].dropna()
    highest_data = frame.loc[frame['Province'] == highest_prov, 'loss_ratio'].dropna()
    if len(lowest_data) < min_obs or len(highest_data) < min_obs:
        return None

    # equal_var=False selects Welch's test: the two provinces are not assumed
    # to share a variance (or a sample size).
    t_stat, p_value = stats.ttest_ind(lowest_data, highest_data, equal_var=False)
    return {
        'lowest_prov': lowest_prov,
        'highest_prov': highest_prov,
        't_stat': t_stat,
        'p_value': p_value,
        'lowest_mean': lowest_data.mean(),
        'highest_mean': highest_data.mean(),
    }


# Guard on every column the comparison reads.
ttest_columns = {'Province', 'TotalClaims', 'TotalPremium', 'loss_ratio'}
if 'df' in locals() and not df.empty and ttest_columns.issubset(df.columns):
    ttest_result = compare_extreme_provinces(df)

    if ttest_result is None:
        print("Not enough data to compare provinces "
              "(need two rankable provinces with at least 2 loss-ratio rows each).")
    else:
        lowest_prov = ttest_result['lowest_prov']
        highest_prov = ttest_result['highest_prov']
        t_stat = ttest_result['t_stat']
        p_value = ttest_result['p_value']

        print(f"T-test: {lowest_prov} (lowest) vs {highest_prov} (highest) loss ratio")
        print(f"T-statistic: {t_stat:.4f}")
        print(f"P-value: {p_value:.4f}")
        print(f"Significant: {p_value < 0.05}")
        # Note: these are means of row-level ratios, whereas the ranking above
        # uses the aggregate ratio; the two can differ.
        print(f"\n{lowest_prov} mean loss ratio: {ttest_result['lowest_mean']:.4f}")
        print(f"{highest_prov} mean loss ratio: {ttest_result['highest_mean']:.4f}")


## Summary

Document your findings and insights here.
