# Exploratory Data Analysis for Insurance Premium Prediction

This notebook performs a comprehensive exploratory data analysis on the insurance premium dataset to understand the data characteristics, identify patterns, and inform feature engineering and modeling decisions.

**Author:** Erick K. Yegon, PhD (keyegon@gmail.com)

In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression

# Set plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases removed the old names, so use whichever variant this install ships.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    # Neither style sheet is bundled: let seaborn apply its own white-grid theme.
    sns.set_style('whitegrid')
sns.set_palette('viridis')

# NOTE: this silences every warning for the rest of the notebook (including
# deprecation and numerical warnings); narrow the filter when debugging.
warnings.filterwarnings('ignore')

# Add the parent directory to the path so we can import the package
sys.path.append(os.path.abspath('..'))

## 1. Data Loading and Initial Inspection

In [None]:
# Read the raw premium data from the Excel workbook
data_path = '../data/premiums.xlsx'
df = pd.read_excel(data_path)

# Replace the spaces in these two column names with underscores for consistency.
# DataFrame.rename ignores labels that are not present, so a single call covers
# both the "already renamed" and the "raw" layouts.
df = df.rename(columns={
    'Number Of Dependants': 'Number_Of_Dependants',
    'Medical History': 'Medical_History',
})

# Report the basic dimensions and the column names
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of samples: {n_rows}")
print(f"Number of features: {n_cols}")
print(f"\nColumns: {df.columns.tolist()}")

In [None]:
# Preview the top five rows (rendered as the cell's rich output)
df.head(n=5)

In [None]:
# Column dtypes first, as plain text
print("Data Types:")
print(df.dtypes)

# Then describe() over every column (numeric and non-numeric), transposed so each
# feature is a row; the trailing expression is rendered as the cell output.
print("\nSummary Statistics:")
summary_statistics = df.describe(include='all').transpose()
summary_statistics

## 2. Data Quality Assessment

In [None]:
# Count nulls per column and express each count as a share of all rows
missing_values = df.isna().sum()
missing_percentage = (missing_values / len(df)) * 100

missing_df = pd.concat(
    [missing_values.rename('Missing Values'), missing_percentage.rename('Percentage')],
    axis=1,
)

# Only list the columns that actually contain nulls
has_missing = missing_df['Missing Values'] > 0
print("Missing Values Analysis:")
print(missing_df.loc[has_missing])

# Bar chart of the missing share per feature, drawn only when something is missing
if not has_missing.any():
    print("No missing values found in the dataset.")
else:
    plt.figure(figsize=(12, 6))
    plt.bar(missing_df.index, missing_df['Percentage'])
    plt.title('Percentage of Missing Values by Feature')
    plt.xlabel('Features')
    plt.ylabel('Percentage Missing')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

In [None]:
# Check for duplicates: a row is flagged when it repeats an earlier row exactly
duplicate_mask = df.duplicated(keep='first')
duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicates}")
if duplicates > 0:
    print("Sample of duplicate rows:")
    # Jupyter only renders the last *top-level* expression of a cell, so a bare
    # expression inside this `if` body would be discarded; print it explicitly.
    print(df[duplicate_mask].head())

In [None]:
# Collect the names of the 64-bit integer and float columns; these are treated as
# the numerical features throughout the rest of the notebook.
numeric_dtypes = ['int64', 'float64']
numerical_features = df.select_dtypes(include=numeric_dtypes).columns

def detect_outliers_iqr(df, column):
    """Flag rows whose value in ``column`` lies outside the 1.5 * IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the numeric column to inspect.

    Returns
    -------
    tuple
        ``(outliers, lower_bound, upper_bound, count)`` where ``outliers`` is the
        subset of ``df`` strictly below the lower fence or strictly above the upper
        fence, and ``count`` is the number of such rows.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # Missing values compare False on both sides and are therefore never flagged.
    beyond_fences = values.lt(lower_bound) | values.gt(upper_bound)
    outliers = df[beyond_fences]
    return outliers, lower_bound, upper_bound, outliers.shape[0]

def detect_outliers_zscore(df, column, threshold=3):
    """Flag rows whose absolute z-score in ``column`` exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the numeric column to inspect.
    threshold : float, default 3
        Absolute z-score above which a row counts as an outlier.

    Returns
    -------
    tuple
        ``(outliers, count)`` where ``outliers`` is the matching subset of ``df``
        and ``count`` is its number of rows.
    """
    values = df[column].astype(float).to_numpy()
    # With the default nan_policy a single missing value turns every z-score into
    # NaN and the function would silently report zero outliers. 'omit' computes the
    # mean/std from the observed values only; rows that are missing keep a NaN
    # z-score, which compares False below and is therefore never flagged.
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
    outliers = df[z_scores > threshold]
    return outliers, len(outliers)

# Report, per feature, how many rows each detection method flags.
# The target variable is left out of this pass.
print("Outlier Analysis:")
feature_columns = [name for name in numerical_features if name != 'Annual_Premium_Amount']
for column in feature_columns:
    iqr_result = detect_outliers_iqr(df, column)
    lower, upper, count_iqr = iqr_result[1], iqr_result[2], iqr_result[3]
    count_zscore = detect_outliers_zscore(df, column)[1]
    print(f"\n{column}:")
    print(f"  IQR Method: {count_iqr} outliers (bounds: {lower:.2f}, {upper:.2f})")
    print(f"  Z-score Method: {count_zscore} outliers (threshold: 3)")

In [None]:
# Visualize outliers using box plots (target variable skipped for now).
# Filter the target out *before* enumerating so the panels are numbered
# consecutively (no empty slot where the target sits), and size the grid from the
# number of features instead of assuming at most six.
boxplot_columns = [name for name in numerical_features if name != 'Annual_Premium_Amount']
grid_cols = 3
grid_rows = max(1, -(-len(boxplot_columns) // grid_cols))  # ceiling division

plt.figure(figsize=(15, 5 * grid_rows))
for position, column in enumerate(boxplot_columns, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.boxplot(y=df[column])
    plt.title(f'Boxplot of {column}')
plt.tight_layout()
plt.show()

## 3. Distribution Analysis

In [None]:
# Analyze distributions of numerical features: histogram + KDE per feature, each
# annotated with a Shapiro-Wilk p-value.
# The grid is sized from the number of features rather than assuming at most nine.
grid_cols = 3
grid_rows = max(1, -(-len(numerical_features) // grid_cols))  # ceiling division

plt.figure(figsize=(15, 4 * grid_rows))
for position, column in enumerate(numerical_features, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    sns.histplot(df[column], kde=True)
    plt.title(f'Distribution of {column}')

    # Shapiro-Wilk needs at least 3 observations and cannot handle NaN, so test the
    # observed values only. At most 5000 rows are tested; the sample is seeded so
    # the annotated p-value is the same on every run.
    observed = df[column].dropna()
    if len(observed) >= 3:
        shapiro_sample = observed.sample(min(5000, len(observed)), random_state=42)
        stat, p = stats.shapiro(shapiro_sample)
        plt.annotate(f'Shapiro-Wilk: p={p:.4f}', xy=(0.05, 0.95), xycoords='axes fraction')
plt.tight_layout()
plt.show()

In [None]:
# QQ plots for numerical features to check normality.
# The grid is sized from the number of features rather than assuming at most nine.
grid_cols = 3
grid_rows = max(1, -(-len(numerical_features) // grid_cols))  # ceiling division

plt.figure(figsize=(15, 4 * grid_rows))
for position, column in enumerate(numerical_features, start=1):
    plt.subplot(grid_rows, grid_cols, position)
    # Missing values would propagate into the sample quantiles and the fitted
    # reference line, so plot the observed values only.
    qqplot(df[column].dropna(), line='s', ax=plt.gca())
    plt.title(f'QQ Plot of {column}')
plt.tight_layout()
plt.show()

In [None]:
# Categorical features are the object-dtype columns; each one gets a count bar
# chart, a share pie chart and a printed count/percentage table.
categorical_features = df.select_dtypes(include=['object']).columns

for column in categorical_features:
    value_counts = df[column].value_counts()
    fig, (ax_counts, ax_share) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: horizontal bars, most frequent category first
    sns.countplot(y=column, data=df, order=value_counts.index, ax=ax_counts)
    ax_counts.set_title(f'Count of {column}')
    ax_counts.set_xlabel('Count')

    # Right panel: share of each category
    ax_share.pie(value_counts, labels=value_counts.index, autopct='%1.1f%%')
    ax_share.set_title(f'Percentage of {column}')

    plt.tight_layout()
    plt.show()

    # Tabular version; percentages are relative to all rows of df
    print(f"\n{column} - Category Distribution:")
    category_df = value_counts.to_frame('Count').assign(
        Percentage=(value_counts / len(df)) * 100
    )
    print(category_df)

## 4. Target Variable Analysis

In [None]:
# Analyze the target variable (Annual_Premium_Amount)
target = 'Annual_Premium_Amount'

plt.figure(figsize=(15, 5))

# Histogram
plt.subplot(1, 3, 1)
sns.histplot(df[target], kde=True)
plt.title(f'Distribution of {target}')

# Box plot
plt.subplot(1, 3, 2)
sns.boxplot(y=df[target])
plt.title(f'Boxplot of {target}')

# Log transformation to check if it normalizes the distribution
plt.subplot(1, 3, 3)
sns.histplot(np.log1p(df[target]), kde=True)
plt.title(f'Log Distribution of {target}')

plt.tight_layout()
plt.show()

# Summary statistics for the target variable
print(f"\n{target} - Summary Statistics:")
target_stats = df[target].describe()
print(target_stats)

# Additional statistics
print(f"\nSkewness: {df[target].skew():.4f}")
print(f"Kurtosis: {df[target].kurtosis():.4f}")

# Normality test on at most 5000 observed values.
# Missing values are dropped first: a NaN in the sample yields a NaN p-value, and
# `p < 0.05` is then False, which would wrongly print the "normally distributed"
# message. The sample is seeded so the reported statistic is reproducible.
observed_target = df[target].dropna()
shapiro_sample = observed_target.sample(min(5000, len(observed_target)), random_state=42)
stat, p = stats.shapiro(shapiro_sample)
print(f"Shapiro-Wilk Test: statistic={stat:.4f}, p-value={p:.4f}")
if p < 0.05:
    print("The target variable is not normally distributed.")
else:
    print("The target variable appears to be normally distributed.")

## 5. Relationship Analysis

In [None]:
# Pairwise correlations between the numerical columns, drawn as an annotated heatmap
correlation_matrix = df[numerical_features].corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Pairplot for numerical features.
# At most 1000 rows are plotted to keep rendering fast; the sample is seeded so the
# figure is identical on every run of the notebook.
numerical_sample = df[numerical_features].sample(min(1000, len(df)), random_state=42)
sns.pairplot(numerical_sample)
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

In [None]:
# Relationship between categorical features and target
for column in categorical_features:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x=column, y=target, data=df)
    plt.title(f'Relationship between {column} and {target}')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

    # ANOVA test to check if the means are significantly different.
    # Rows with a missing category or target are dropped first: comparing against a
    # NaN category selects nothing (an empty group), and a NaN target value turns
    # the F-statistic into NaN. groupby then yields one series per observed category.
    complete_rows = df[[column, target]].dropna()
    groups = [values for _, values in complete_rows.groupby(column)[target]]
    if len(groups) < 2:
        # f_oneway needs at least two groups to compare.
        print(f"ANOVA test for {column}: skipped (fewer than two categories with data).\n")
        continue

    f_stat, p_value = stats.f_oneway(*groups)
    print(f"ANOVA test for {column}: F-statistic={f_stat:.4f}, p-value={p_value:.4f}")
    if p_value < 0.05:
        print(f"The mean {target} is significantly different across {column} categories.\n")
    else:
        print(f"No significant difference in mean {target} across {column} categories.\n")

In [None]:
# Relationship between numerical features and target
for column in numerical_features:
    if column == target:
        continue

    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=column, y=target, data=df, alpha=0.5)
    plt.title(f'Relationship between {column} and {target}')

    # Add regression line
    sns.regplot(x=column, y=target, data=df, scatter=False, color='red')

    # Calculate correlation on rows where both values are present; pearsonr does
    # not accept missing values, and it needs at least two observations.
    complete_rows = df[[column, target]].dropna()
    if len(complete_rows) >= 2:
        corr, p = stats.pearsonr(complete_rows[column], complete_rows[target])
        plt.annotate(f'Pearson r: {corr:.4f} (p={p:.4f})', xy=(0.05, 0.95), xycoords='axes fraction')

    plt.tight_layout()
    plt.show()

## 6. Feature Importance Analysis

In [None]:
# Calculate mutual information for numerical features
X_numeric = df[numerical_features].drop(columns=[target])
y = df[target]

# Calculate mutual information.
# The estimator adds small random noise to continuous variables, so without a fixed
# seed the scores (and possibly the ranking) change from run to run.
mi_scores = mutual_info_regression(X_numeric, y, random_state=42)
mi_df = pd.DataFrame({'Feature': X_numeric.columns, 'Mutual Information': mi_scores})
mi_df = mi_df.sort_values('Mutual Information', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='Mutual Information', y='Feature', data=mi_df)
plt.title('Feature Importance (Mutual Information)')
plt.tight_layout()
plt.show()

In [None]:
# One-hot encode categorical features for further analysis
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Calculate VIF for numerical features to check multicollinearity
X_numeric = df[numerical_features].drop(columns=[target])

# variance_inflation_factor regresses each column on the remaining columns exactly
# as supplied and does not add an intercept. Without a constant column the
# auxiliary regressions are forced through the origin, which inflates the VIF of
# any feature whose mean is far from zero. Prepend a column of ones and report the
# VIF of the real features only (indices 1..k).
design_matrix = np.column_stack([
    np.ones(len(X_numeric)),
    X_numeric.to_numpy(dtype=float),
])

vif_data = pd.DataFrame()
vif_data['Feature'] = X_numeric.columns
vif_data['VIF'] = [
    variance_inflation_factor(design_matrix, i)
    for i in range(1, design_matrix.shape[1])
]

print("Variance Inflation Factor (VIF) for Numerical Features:")
print(vif_data.sort_values('VIF', ascending=False))

## 7. Dimensionality Reduction and Visualization

In [None]:
# PCA for visualization
# Prepare data for PCA (standardize and include only numerical features)
X_for_pca = df[numerical_features].drop(columns=[target])
X_scaled = StandardScaler().fit_transform(X_for_pca)

# Apply PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Add target variable for coloring.
# Assign by position: pca_df has a fresh 0..n-1 index, so assigning the Series
# directly would align on index labels and misplace values if df's index differs.
pca_df['Annual_Premium_Amount'] = df[target].to_numpy()

# Visualize PCA results.
# plt.colorbar() needs a mappable (an artist carrying a colormap and data array).
# seaborn's scatterplot with `hue` does not register one, so plt.colorbar() raises
# "No mappable was found". Draw the points with plt.scatter and pass its return
# value to colorbar explicitly.
plt.figure(figsize=(10, 8))
points = plt.scatter(
    pca_df['PC1'],
    pca_df['PC2'],
    c=pca_df['Annual_Premium_Amount'],
    cmap='viridis',
    alpha=0.7,
)
plt.title('PCA of Numerical Features')
plt.xlabel(f'Principal Component 1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
plt.ylabel(f'Principal Component 2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
plt.colorbar(points, label='Annual Premium Amount')
plt.tight_layout()
plt.show()

# Print explained variance ratio
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {sum(pca.explained_variance_ratio_):.2%}")

## 8. Key Insights and Recommendations

### Key Insights from EDA:

1. **Data Quality**:
   - The dataset contains [X] samples and [Y] features
   - Missing values: [summary of missing values findings]
   - Outliers: [summary of outlier analysis]

2. **Feature Distributions**:
   - Numerical features: [summary of distribution analysis]
   - Categorical features: [summary of category distributions]
   - Target variable: [summary of target distribution]

3. **Relationships**:
   - Strong correlations between [feature pairs]
   - Significant relationships between [categorical features] and the target
   - Most important features based on mutual information: [top features]

4. **Potential Issues**:
   - Multicollinearity between [features]
   - Skewed distributions in [features]
   - Imbalanced category frequencies (rare levels) in [categorical features]

### Recommendations for Data Preprocessing:

1. **Handling Missing Values**:
   - [Specific recommendations based on findings]

2. **Outlier Treatment**:
   - [Specific recommendations based on findings]

3. **Feature Transformations**:
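   - If the target is right-skewed (check the skewness value and the log-scale histogram in Section 4), apply a log transformation (e.g. `log1p`) to the target variable to bring its distribution closer to normal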
   - Apply appropriate transformations to skewed numerical features

4. **Feature Engineering Opportunities**:
   - Create interaction terms between [features]
   - Develop composite risk scores based on [features]
   - Bin continuous variables like Age into meaningful categories

5. **Feature Selection**:
   - Consider removing highly correlated features
   - Focus on features with high mutual information scores

These insights and recommendations will guide our feature engineering and modeling approaches in the subsequent notebooks.