# Task 1: Development Setup & Initial Analysis (EDA)

## Project Overview
**AlphaCare Insurance Solutions (ACIS) - Insurance Risk Analytics & Predictive Modeling**

### Objectives
- Analyze historical claims data to identify low-risk customer segments
- Develop predictive models for premium optimization
- Provide data-driven recommendations for business strategy

### Key Business Questions
1. What are the risk differences across provinces?
2. What are the risk differences between zip codes?
3. What are the risk differences between women and men?
4. What are the risk differences between different automobile makes?

---

## 1. Environment Setup

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os
from datetime import datetime

# Notebook-wide configuration.
# All warnings are silenced to keep cell output readable.
warnings.filterwarnings('ignore')

# Apply the seaborn theme first, then override the Matplotlib defaults,
# so the figure size and font size set here are the ones that stick.
sns.set_theme(style='whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Confirm the environment is ready and record the core library versions.
for status_line in (
    '✅ Libraries imported successfully',
    f'📊 Pandas version: {pd.__version__}',
    f'🔢 NumPy version: {np.__version__}',
):
    print(status_line)

## 2. Data Loading & Initial Inspection

In [None]:
# Load the insurance data
# Note: Replace with actual data path when available
#
# Until then, a synthetic policy table is generated. The seed is fixed so
# every run of the notebook produces the same records.
np.random.seed(42)
n_samples = 10000

# NOTE: every draw below consumes the same global NumPy random stream, so the
# statements must stay in this exact order to reproduce the same data.
provinces = np.random.choice(
    ['Western Cape', 'Gauteng', 'KwaZulu-Natal', 'Eastern Cape', 'Free State'],
    n_samples,
    p=[0.3, 0.25, 0.2, 0.15, 0.1],
)
postal_codes = np.random.randint(1000, 9999, n_samples)  # upper bound is exclusive
genders = np.random.choice(['Male', 'Female'], n_samples, p=[0.52, 0.48])
vehicle_types = np.random.choice(
    ['Sedan', 'SUV', 'Hatchback', 'Truck', 'Sports'],
    n_samples,
    p=[0.4, 0.25, 0.2, 0.1, 0.05],
)
vehicle_makes = np.random.choice(
    ['Toyota', 'Volkswagen', 'Ford', 'BMW', 'Mercedes', 'Nissan', 'Hyundai'],
    n_samples,
)
engine_sizes = np.random.normal(2.0, 0.5, n_samples)
value_estimates = np.random.lognormal(12, 0.5, n_samples)
premiums = np.random.lognormal(8.5, 0.3, n_samples)
# Claims = exponential amount multiplied by a 0/1 indicator drawn with p=0.15,
# so most policies end up with a claim total of exactly zero.
claim_amounts = np.random.exponential(3000, n_samples)
claim_flags = np.random.binomial(1, 0.15, n_samples)
# One consecutive calendar day per policy, starting 2014-02-01.
start_dates = pd.date_range(start='2014-02-01', periods=n_samples, freq='D')

df = pd.DataFrame({
    'PolicyID': range(1, n_samples + 1),
    'Province': provinces,
    'PostalCode': postal_codes,
    'Gender': genders,
    'VehicleType': vehicle_types,
    'VehicleMake': vehicle_makes,
    'EngineSize': engine_sizes,
    'CustomValueEstimate': value_estimates,
    'TotalPremium': premiums,
    'TotalClaims': claim_amounts * claim_flags,
    'PolicyStartDate': start_dates,
})

# Clamp the continuous features to fixed (lower, upper) bounds.
clip_bounds = {
    'EngineSize': (1.0, 6.0),
    'CustomValueEstimate': (50000, 2000000),
    'TotalPremium': (1000, 50000),
}
for bounded_col, (lower, upper) in clip_bounds.items():
    df[bounded_col] = df[bounded_col].clip(lower, upper)

print(f'📈 Dataset created with {df.shape[0]:,} records and {df.shape[1]} features')
print(f'📅 Date range: {df["PolicyStartDate"].min()} to {df["PolicyStartDate"].max()}')

In [None]:
# Dataset overview: size, memory footprint, schema, and missing values.
section_rule = '=' * 50

print('📊 DATASET OVERVIEW')
print(section_rule)
print(f'Shape: {df.shape}')
print(f'Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB')
print()

print('📋 COLUMN INFORMATION')
print(section_rule)
df.info()  # writes its report directly to stdout
print()

print('🔍 MISSING VALUES')
print(section_rule)
# Per-column null count and the share of rows it represents.
missing_data = df.isna().sum()
missing_percent = missing_data / len(df) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent,
})
# Show only the columns that actually contain nulls.
has_missing = missing_df['Missing Count'] > 0
print(missing_df.loc[has_missing])

In [None]:
# Preview the first ten records, then summarise the columns with describe().
sample_rows = df.head(10)
stats_summary = df.describe()
divider = '=' * 50

print('📋 SAMPLE DATA')
print(divider)
display(sample_rows)
print()

print('📊 STATISTICAL SUMMARY')
print(divider)
display(stats_summary)

## 3. Exploratory Data Analysis

### 3.1 Univariate Analysis

In [None]:
# Bar charts of category frequencies, one panel per categorical column.
categorical_cols = ['Province', 'Gender', 'VehicleType', 'VehicleMake']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Distribution of Categorical Variables', fontsize=16, fontweight='bold')

# Flatten the 2x3 grid row by row so panels can be paired with columns directly.
panels = axes.ravel()

for ax, col in zip(panels, categorical_cols):
    counts = df[col].value_counts()
    bar_colors = sns.color_palette('viridis', len(counts))

    ax.bar(counts.index, counts.values, color=bar_colors)
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

    # Print each count just above its bar (offset = 1% of the tallest bar).
    label_gap = counts.values.max() * 0.01
    for bar_pos, bar_height in enumerate(counts.values):
        ax.text(bar_pos, bar_height + label_gap, str(bar_height), ha='center', va='bottom')

# The grid has six slots but only four are used; drop the leftover axes.
for spare_ax in panels[len(categorical_cols):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

In [None]:
# Histograms of the numerical columns, each annotated with its mean and median.
numerical_cols = ['EngineSize', 'CustomValueEstimate', 'TotalPremium', 'TotalClaims']

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Distribution of Numerical Variables', fontsize=16, fontweight='bold')

# Walk the 2x2 grid row by row, pairing one panel with one column.
for ax, col in zip(axes.ravel(), numerical_cols):
    values = df[col]

    ax.hist(values, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

    # Dashed reference lines: mean in red, median in green.
    mean_val = values.mean()
    median_val = values.median()
    ax.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.2f}')
    ax.axvline(median_val, color='green', linestyle='--', label=f'Median: {median_val:.2f}')
    ax.legend()

plt.tight_layout()
plt.show()

### 3.2 Risk Analysis by Key Segments

In [None]:
# Calculate loss ratio (Claims/Premium) for risk assessment.
#
# Per-policy ratio: a zero premium is treated as missing, so the ratio becomes
# NaN instead of +/-inf. pandas skips NaN in mean/std by default, whereas a
# single inf would make every downstream average of this column infinite.
policy_premium = df['TotalPremium'].replace(0, np.nan)
df['LossRatio'] = df['TotalClaims'] / policy_premium

# 1 if the policy has any positive claim amount, else 0.
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

# Portfolio-level loss ratio: total claims paid per unit of premium collected.
# This is deliberately NOT the mean of the per-policy ratios, which weights a
# small-premium policy the same as a large one and therefore does not describe
# the book as a whole.
overall_loss_ratio = df['TotalClaims'].sum() / df['TotalPremium'].sum()

# Average severity is taken over policies that actually claimed.
claimed_amounts = df.loc[df['TotalClaims'] > 0, 'TotalClaims']

print('📊 RISK METRICS SUMMARY')
print('=' * 50)
print(f'Overall Loss Ratio: {overall_loss_ratio:.3f}')
print(f'Claim Frequency: {df["HasClaim"].mean():.3f}')
print(f'Average Claim Amount: ${claimed_amounts.mean():,.2f}')
print(f'Average Premium: ${df["TotalPremium"].mean():,.2f}')

In [None]:
# Risk analysis by Province.
# Named aggregation produces the final column labels directly, one output
# column per (source column, statistic) pair, in the order listed.
province_risk = (
    df.groupby('Province')
    .agg(
        Avg_Loss_Ratio=('LossRatio', 'mean'),
        Loss_Ratio_Std=('LossRatio', 'std'),
        Claim_Frequency=('HasClaim', 'mean'),
        Avg_Premium=('TotalPremium', 'mean'),
        Avg_Claims=('TotalClaims', 'mean'),
        Policy_Count=('PolicyID', 'count'),
    )
    .round(3)
    # Highest average loss ratio first.
    .sort_values('Avg_Loss_Ratio', ascending=False)
)

print('🌍 RISK ANALYSIS BY PROVINCE')
print('=' * 60)
display(province_risk)

In [None]:
# Risk analysis by Gender.
# Named aggregation produces the final column labels directly, one output
# column per (source column, statistic) pair, in the order listed.
gender_risk = (
    df.groupby('Gender')
    .agg(
        Avg_Loss_Ratio=('LossRatio', 'mean'),
        Loss_Ratio_Std=('LossRatio', 'std'),
        Claim_Frequency=('HasClaim', 'mean'),
        Avg_Premium=('TotalPremium', 'mean'),
        Avg_Claims=('TotalClaims', 'mean'),
        Policy_Count=('PolicyID', 'count'),
    )
    .round(3)
    # Highest average loss ratio first.
    .sort_values('Avg_Loss_Ratio', ascending=False)
)

print('👥 RISK ANALYSIS BY GENDER')
print('=' * 50)
display(gender_risk)

In [None]:
# Risk analysis by Vehicle Make.
# Named aggregation produces the final column labels directly, one output
# column per (source column, statistic) pair, in the order listed.
vehicle_risk = (
    df.groupby('VehicleMake')
    .agg(
        Avg_Loss_Ratio=('LossRatio', 'mean'),
        Loss_Ratio_Std=('LossRatio', 'std'),
        Claim_Frequency=('HasClaim', 'mean'),
        Avg_Premium=('TotalPremium', 'mean'),
        Avg_Claims=('TotalClaims', 'mean'),
        Policy_Count=('PolicyID', 'count'),
    )
    .round(3)
    # Highest average loss ratio first.
    .sort_values('Avg_Loss_Ratio', ascending=False)
)

print('🚗 RISK ANALYSIS BY VEHICLE MAKE')
print('=' * 60)
display(vehicle_risk)

### 3.3 Advanced Visualizations

In [None]:
# Interactive risk heatmap by Province and Vehicle Type.
# Rows are provinces, columns are vehicle types, cells hold the mean loss
# ratio. Province/type pairs with no rows in df are filled with 0.
mean_loss_by_segment = df.groupby(['Province', 'VehicleType'])['LossRatio'].mean()
risk_matrix = mean_loss_by_segment.unstack(fill_value=0)

# The same matrix drives both the cell colour (z) and the printed label (text).
cell_values = risk_matrix.values
heatmap_trace = go.Heatmap(
    z=cell_values,
    x=risk_matrix.columns,
    y=risk_matrix.index,
    colorscale='RdYlBu_r',
    text=cell_values,
    texttemplate='%{text:.3f}',
    textfont={'size': 12},
    hoverongaps=False,
)

fig = go.Figure(data=heatmap_trace)
fig.update_layout(
    title='Risk Heatmap: Loss Ratio by Province and Vehicle Type',
    xaxis_title='Vehicle Type',
    yaxis_title='Province',
    width=800,
    height=500,
)

fig.show()

In [None]:
# Premium vs Claims scatter plot with trend analysis.
# Points are coloured by province and sized by the vehicle value estimate;
# trendline='ols' asks Plotly Express to overlay fitted regression lines.
axis_labels = {'TotalPremium': 'Total Premium ($)', 'TotalClaims': 'Total Claims ($)'}
hover_columns = ['VehicleMake', 'VehicleType', 'Gender']

fig = px.scatter(
    df,
    x='TotalPremium',
    y='TotalClaims',
    color='Province',
    size='CustomValueEstimate',
    hover_data=hover_columns,
    title='Premium vs Claims Analysis by Province',
    labels=axis_labels,
    trendline='ols',
)

fig.update_layout(width=1000, height=600)
fig.show()

## 4. Key Insights & Initial Findings

### Summary of EDA Results

**Risk Differences Across Provinces:**
- [To be filled based on analysis results]

**Risk Differences by Gender:**
- [To be filled based on analysis results]

**Risk Differences by Vehicle Make:**
- [To be filled based on analysis results]

**Low-Risk Segments Identified:**
- [To be filled based on analysis results]

### Next Steps
1. Statistical hypothesis testing to validate findings
2. Feature engineering for predictive modeling
3. Model development and validation
4. Business recommendations

In [None]:
# Save processed data for next tasks.
# The target directory is created on demand, so the cell also works on a
# fresh checkout where ../data/processed/ does not exist yet.
output_path = '../data/processed/'
os.makedirs(output_path, exist_ok=True)

# Build file paths with os.path.join so they stay valid even if output_path
# is later changed to a value without a trailing separator.
processed_csv_path = os.path.join(output_path, 'insurance_data_processed.csv')
summary_xlsx_path = os.path.join(output_path, 'eda_summary_statistics.xlsx')

df.to_csv(processed_csv_path, index=False)
print(f'✅ Processed data saved to {processed_csv_path}')

# Save summary statistics: one worksheet per segment-level risk table.
# NOTE: writing .xlsx requires an Excel engine (e.g. openpyxl) to be installed.
summary_stats = {
    'province_risk': province_risk,
    'gender_risk': gender_risk,
    'vehicle_risk': vehicle_risk,
}

with pd.ExcelWriter(summary_xlsx_path) as writer:
    for sheet_name, data in summary_stats.items():
        data.to_excel(writer, sheet_name=sheet_name)

print('✅ EDA Summary statistics saved to Excel file')
# The leading newline is written as an escape sequence; a literal line break
# inside a single-quoted string is a SyntaxError and kept this cell from running.
print('\n🎉 Task 1: EDA and Setup completed successfully!')