### A/B Hypothesis Testing

In [1]:
import os
import sys
import pandas as pd
import scipy.stats as stats #It has all the probability distributions available along with many statistical functions.
# sns.set(style="darkgrid") # set the background for the graphs
# Get the current working directory
current_dir = os.getcwd()

# Append the parent directory to sys.path
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

# ignore warrnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
from scripts.AB_haypothesis_tester import ABHypothesisTester

In [3]:
#Reading the txt file MachineLearningRating_v3.txt
file_path = '../data/MachineLearningRating_v3.txt'
df=pd.read_csv(file_path, delimiter='|')

In [4]:
df.head(5)

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [5]:
df.columns

Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet',
       'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium',
       'TotalClaims'],
      dtype='object')

In [6]:
ab_tester=ABHypothesisTester(df)

## Hypotheses to Test
### 1. No Risk Differences Across Provinces:

- ***Null Hypothesis (H0)***: There are no significant risk differences between provinces.
- **Alternative Hypothesis (H1)**: There are significant risk differences between provinces.

In [7]:
# Define the provinces for each group
control_provinces = ['Gauteng', 'Western Cape', 'KwaZulu-Natal']
test_provinces = ['Eastern Cape', 'Mpumalanga', 'Limpopo', 'North West', 'Free State', 'Northern Cape']

In [8]:
# 1. Risk Differences Across Provinces
group_A, group_B = ab_tester.create_groups(df, 'Province', control_provinces, test_provinces)
p_value_provinces = ab_tester.hypothesis_test(group_A, group_B, 'TotalClaims', test_type='t')

In [9]:
p_value_provinces

np.float64(9.13223703854227e-10)

In [10]:
# Reporting for Risk Differences Across Provinces
ab_tester.report_results(p_value_provinces, "Risk Differences Across Provinces")

Risk Differences Across Provinces: p-value = 0.0000 -> Reject the null hypothesis


### A/B hypotesis testing using ZipCode

In [10]:
# 2. Risk Differences Between Zip Codes
group_A, group_B = ab_tester.create_groups(df, 'ZipCode', 'ZipCode_A', 'ZipCode_B')
p_value_zipcodes = ab_tester.hypothesis_test(group_A, group_B, 'Risk', test_type='t')

KeyError: 'ZipCode'

In [14]:
# 3. Margin (Profit) Differences Between Zip Codes
group_A, group_B = ab_tester.create_groups(df, 'ZipCode', 'ZipCode_A', 'ZipCode_B')
p_value_margin = ab_tester.hypothesis_test(group_A, group_B, 'Margin', test_type='t')

KeyError: 'ZipCode'

#### AB hypotesis testing using Gender

##### 4. No Significant Risk Differences Between Women and Men:

- **Null Hypothesis (H0)**: There are no significant risk differences between women and men.
- **Alternative Hypothesis (H1)**: There are significant risk differences between women and men.

In [15]:
# Data Cleaning
df['Gender'] = df['Gender'].replace({'Not specified': 'Unknown'})
df = df.dropna(subset=['Gender'])

In [16]:
# 4. Risk Differences Between Women and Men
group_A, group_B = ab_tester.create_gender_groups(df)
# group_A, group_B = ab_tester.create_groups(df, 'Gender', 'Female', 'Male')
p_value_gender = ab_tester.hypothesis_test(group_A, group_B, 'TotalClaims', test_type='t')

In [17]:
ab_tester.report_results(p_value_gender, "Risk Differences Between Genders")

Risk Differences Between Genders: p-value = 0.8041 -> Fail to reject the null hypothesis
