# VICA Technical Assessment - Task 1

Name of candidate: Chan Choon Kong

## Import libraries

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Load dataset

The dataset is loaded into a pandas DataFrame. The data types of each column of the DataFrame are as follows:

- `insuree#: int`
- `gender: str`
- `is45OrOlder: bool`
- `isMarried: bool`
- `hasKids: bool`
- `insuredMonths: int`
- `termLifeInsurance: bool`
- `multipleTermLifePolicies: bool`
- `healthInsurance: str`
- `healthRiders: List[int]`
- `premiumFrequency: int`
- `eStatements: bool`
- `monthlyPremium: float`
- `totalPremium: float`
- `renewal: bool`

Simple preprocessing is done to convert the original data into the data types above. Empty values are preserved and will be handled during the data preprocessing phase afterwards.

In [2]:
# Path to the semicolon-separated CSV file, relative to the current working directory.
dataset_dir = os.path.join(os.curdir, 'mol-vica-ds-challenge-dataset', 'insurance_data.csv')

# Define converters
# They are written as named functions rather than lambdas bound to names so that
# each one can carry a docstring and shows a meaningful name in tracebacks.

def float_to_bool(x):
    """Convert a float-formatted flag to a bool.

    Returns True for the exact text '1.0', False for any other non-empty
    value, and None for an empty value so that missing data is preserved.
    """
    if not x:
        return None
    return x == '1.0'


def yesno_to_bool(x):
    """Convert a yes/no flag to a bool.

    Returns True for 'Yes' or 'Y', False for any other non-empty value, and
    None for an empty value so that missing data is preserved.
    """
    if not x:
        return None
    return x in ('Yes', 'Y')


def csfloat_to_dec(x):
    """Convert a number written with a decimal comma (e.g. '19,65') to a float.

    Blank or whitespace-only text becomes NaN. Values that are not strings
    (None, NaN or numbers already parsed by pandas) are accepted as well
    instead of raising AttributeError on the missing ``str.replace``.
    """
    if x is None:
        return np.nan
    if not isinstance(x, str):
        # Already numeric (including float NaN): nothing to re-format.
        return float(x)
    normalized = x.replace(',', '.').strip()
    return float(normalized) if normalized else np.nan


def csint_to_lst(x):
    """Convert a comma-separated list of integers (e.g. '3,4') to a list of ints.

    Returns an empty list when the first comma-separated item is empty, which
    covers the empty string.
    """
    parts = x.split(',')
    return [int(part) for part in parts] if parts[0] else []

# Columns holding yes/no flags all share the same converter.
yesno_columns = [
    'isMarried',
    'hasKids',
    'termLifeInsurance',
    'multipleTermLifePolicies',
    'eStatements',
    'renewal',
]
column_converters = {'is45OrOlder': float_to_bool, 'healthRiders': csint_to_lst}
column_converters.update({column: yesno_to_bool for column in yesno_columns})

# The file is semicolon-separated; converters run on the raw text of each cell.
df = pd.read_csv(dataset_dir, sep=';', converters=column_converters)

# Note: the nullable Int64 dtype shows <NA> for empty integer fields
df['premiumFrequency'] = df['premiumFrequency'].astype('Int64')

# Premium amounts are converted after loading, one column at a time.
for premium_column in ('monthlyPremium', 'totalPremium'):
    df[premium_column] = df[premium_column].apply(csfloat_to_dec)

df.head(5)

Unnamed: 0,insuree#,gender,is45OrOlder,isMarried,hasKids,insuredMonths,termLifeInsurance,multipleTermLifePolicies,healthInsurance,healthRiders,premiumFrequency,eStatements,monthlyPremium,totalPremium,renewal
0,1,F,False,True,True,23,True,False,No,[],12,True,19.65,451.55,True
1,2,F,True,False,False,42,True,True,Class A,[3],1,True,84.65,3541.35,False
2,3,F,False,True,False,72,True,False,No,[],12,False,19.4,1496.45,True
3,4,F,False,True,True,13,True,False,No,[],12,False,19.55,265.3,True
4,5,F,False,False,False,37,True,True,Class A,"[3, 4]",1,False,100.3,3541.4,True


## Data Processing

### Validity of Data

I will check whether any rows contain invalid data. In this task, I assume that a row is invalid if it meets any of the following criteria:
- Insuree who has multiple term life policies but not a term life policy
- Insuree who has neither term life insurance nor health insurance
- Insuree who does not have health insurance but has riders
- Insuree whose total premium is less than their monthly premium

In [None]:
# Build one boolean mask per invalidity criterion. The comparisons against
# True/False are explicit because the converted flag columns may hold None
# for empty fields, and None must not count as either value.
no_term_life = df['termLifeInsurance'] == False
# healthInsurance holds strings ('No', 'Class A', ...), so "no health insurance"
# has to be tested against 'No'; comparing the column with False never matches.
no_health_insurance = df['healthInsurance'] == 'No'

multiple_without_term_life = (df['multipleTermLifePolicies'] == True) & no_term_life
no_insurance_at_all = no_term_life & no_health_insurance
# astype(bool) is True for a non-empty rider list and False for an empty one.
riders_without_health = no_health_insurance & df['healthRiders'].astype(bool)
total_below_monthly = df['totalPremium'] < df['monthlyPremium']

# OR-ing the masks selects each offending row exactly once, so a row that
# violates several criteria is not listed (and counted) more than once.
invalid_df = df.loc[
    multiple_without_term_life
    | no_insurance_at_all
    | riders_without_health
    | total_below_monthly
]
if invalid_df.shape[0] != 0:
    print("Found {0} invalid row: ".format(invalid_df.shape[0]))
else:
    print("Data is valid")
invalid_df

Based on the criteria above, I found 1 invalid row where the total premium is less than the monthly premium. Therefore this row will be removed from the dataset.

In [None]:
# Remove the rows flagged as invalid from the dataset.
to_drop = invalid_df.index.values.tolist()
# errors='ignore' keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the labels are no longer in df.
df = df.drop(index=to_drop, errors='ignore')