# A/B Hypothesis Testing
**Objective**: Statistically validate or reject key hypotheses about risk drivers, which will form the basis of our new segmentation strategy.

In [1]:
import sys
import os
import pandas as pd
from glob import glob

In [2]:
from pathlib import Path
from importlib import reload
# Make the parent of the current working directory importable so that the
# `src` package used by later cells can be found.
project_root = Path("..").resolve()
root_str = str(project_root)
# Prepend only when absent, so re-running the cell does not add duplicates.
if root_str not in sys.path:
    sys.path.insert(0, root_str)

In [3]:
# In this task, we will prepare and clean the dataset for further analysis.
# First, we load the module for data loading.
# The module is reloaded BEFORE the class is imported from it: reload() does not
# rebind names that were already imported with `from ... import ...`, so importing
# first would leave `DataLoader` pointing at the stale, pre-reload class.
import src.data_loader
reload(src.data_loader)
from src.data_loader import DataLoader

<module 'src.data_loader' from 'D:\\Research & Project\\10academy\\week 3\\challenge\\insurisk-analytics-and-predictive-modeling\\src\\data_loader.py'>

In [4]:
# Directory holding the raw data, given relative to the current working directory.
raw_data_dir = '../data'
loader = DataLoader(raw_data_dir)
# Load the dataset into `df`. The warning echoed in this cell's output points at
# pd.read_csv(file_path, sep="|") inside the loader, i.e. the file is read as
# pipe-delimited text.
df = loader.load_data('MachineLearningRating_v3.txt')
# Display the first few rows of the dataset
df.head()

  self.data = pd.read_csv(file_path, sep="|")


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


### Data Cleaning and Handling Missing Values

In [6]:
# Load the preprocessing module.
# Reload first and import the class afterwards: reload() does not rebind names
# previously imported with `from ... import ...`, so the original order would
# build `preprocessing` from the stale, pre-reload Preprocessing class.
import src.analysis.preprocessing
reload(src.analysis.preprocessing)
from src.analysis.preprocessing import Preprocessing

# Wrap the raw frame loaded earlier in the notebook.
preprocessing = Preprocessing(df)

In [7]:
# Run the preprocessing pipeline and keep its result as clean_df.
# Per this cell's recorded output, the pipeline drops the columns with more than
# 5% missing values; the pandas warnings it emits also show missing values being
# filled with the column mean or mode.
clean_df = preprocessing.preprocess()

Dropping columns with > 5.0% missing values:
['Bank', 'CustomValueEstimate', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet']



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[column].fillna(self.df[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df[column].fillna(self.df[column].mode()[0], inplace=True)


In [8]:
# Calculate profit margin (premium minus claims) if not already present.
# The guard inspects clean_df -- the frame that is actually written to -- rather
# than the raw df, so an existing clean_df['Margin'] is never silently overwritten
# and the check stays meaningful even if the two frames' columns diverge.
if 'Margin' not in clean_df.columns:
    clean_df['Margin'] = clean_df['TotalPremium'] - clean_df['TotalClaims']

In [9]:
# Summary statistics of the Margin column; as the cell's last expression it is
# rendered as the cell output.
clean_df['Margin'].describe()

count    817156.000000
mean         30.976067
std          81.921270
min       -2684.210526
25%           0.000000
50%           2.101930
75%          21.929825
max         635.480000
Name: Margin, dtype: float64

In [10]:
# Define KPIs: names of the columns used as key performance indicators.
kpi_claims = 'TotalClaims'
# The margin column created earlier in this notebook is named 'Margin'
# (TotalPremium - TotalClaims); no 'ProfitMargin' column is ever created.
kpi_margin = 'Margin'

### Reject or fail to reject the following null hypotheses:

In [58]:
# Import the hypothesis-testing module.
# Reload first and import the class afterwards: reload() does not rebind names
# previously imported with `from ... import ...`, so the original order would
# leave `HypothesisTesting` pointing at the stale, pre-reload class.
import src.hypotheses_testing
reload(src.hypotheses_testing)
from src.hypotheses_testing import HypothesisTesting


<module 'src.hypotheses_testing' from 'D:\\Research & Project\\10academy\\week 3\\challenge\\insurisk-analytics-and-predictive-modeling\\src\\hypotheses_testing.py'>

In [59]:
# Instantiate the HypothesisTesting class on the cleaned frame (clean_df);
# the test cells that follow call methods on this object.
hypothesis_tester = HypothesisTesting(clean_df)

1. **H0**: There are no risk differences across provinces.

In [60]:
print("--- Running Hypothesis Tests ---")
# Test the null hypothesis of no risk differences across provinces.
# The call is the cell's last expression, so its return value is displayed; in the
# recorded output that is a dict holding the statistic and p-value of a
# Kruskal-Wallis test on loss ratio and a chi-squared test on claim frequency.
hypothesis_tester.test_province_risk()

--- Running Hypothesis Tests ---

--- Hypothesis: No risk differences across Provinces ---
Loss Ratio (Kruskal-Wallis): Stat=65.5597, P=0.0000
  --> Reject H₀ for Loss Ratio. Significant differences exist across provinces (p=0.0000).
  --> (Further post-hoc analysis needed to identify specific differing provinces).
Claim Frequency (Chi-squared): Stat=67.7716, P=0.0000
  --> Reject H₀ for Claim Frequency. Claim frequency is dependent on province (p=0.0000).


{'LossRatio_Kruskal': {'statistic': np.float64(65.55969101078352),
  'p_value': np.float64(3.740059252035797e-11)},
 'ClaimFrequency_Chi2': {'statistic': np.float64(67.77157643516094),
  'p_value': np.float64(1.3629288217970929e-11)}}

2. **H0**: There are no risk differences between zip codes.

In [61]:
# Hypothesis 2: Risk Differences Between Zip Codes
# Tests the null hypothesis of no risk differences between zip codes. The return
# value is displayed as the cell output; in the recorded output it is a dict with
# the statistic and p-value of a Kruskal-Wallis test on loss ratio and a
# chi-squared test on claim frequency.
hypothesis_tester.test_zipcode_risk()


--- Hypothesis: No risk differences across Zip Codes ---
Loss Ratio (Kruskal-Wallis): Stat=932.8659, P=0.0001
  --> Reject H₀ for Loss Ratio. Significant differences exist across zip codes (p=0.0001).
  --> (Further post-hoc analysis needed to identify specific differing zip codes).
Claim Frequency (Chi-squared): Stat=1171.6264, P=0.0000
  --> Reject H₀ for Claim Frequency. Claim frequency is dependent on zip code (p=0.0000).


{'LossRatio_Kruskal': {'statistic': np.float64(932.8658889722387),
  'p_value': np.float64(0.00011472666205091963)},
 'ClaimFrequency_Chi2': {'statistic': np.float64(1171.6263842462238),
  'p_value': np.float64(6.97724075135732e-14)}}