# VICA Technical Assessment - Task 2

Name of candidate: Chan Choon Kong

## Import libraries

In [14]:
# !pip install pymongo
import os
import pymongo
import pandas as pd
import numpy as np
from pymongo import MongoClient
from collections import OrderedDict

## Set up database
### Making a connection with MongoClient

In [6]:
# Connect to the MongoDB server on this machine and select the 'test' database.
client = MongoClient(host='localhost', port=27017)
db = client.get_database('test')
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'test')

### Create collection for data

In [7]:
# Create the collection only when it is absent: create_collection raises
# CollectionInvalid if the name already exists, which would break a re-run
# of this cell (or of the whole notebook) against the same database.
if "insuranceData" not in db.list_collection_names():
    db.create_collection("insuranceData")
db.list_collection_names()

['insuranceData']

### Define schema validation rules

In [8]:
# Top-level fields that every document must contain.
required_fields = [
    "insuree#", "isMarried", "hasKids", "insuredMonths", "termLifeInsurance",
    "healthInsurance", "eStatements", "monthlyPremium", "renewal",
]

# Embedded document describing the insuree's term life cover.
term_life_schema = {
    "bsonType": "object",
    "required": ["hasPolicy", "hasMultiplePolicies"],
    "properties": {
        "hasPolicy": {
            "bsonType": "bool",
            "description": "Does insuree have term life policy?",
        },
        "hasMultiplePolicies": {
            "bsonType": "bool",
            "description": "Does insuree hold multiple term life policies?",
        },
    },
}

# Embedded document describing the insuree's health cover and its riders.
health_schema = {
    "bsonType": "object",
    "required": ["hasPolicy", "riders"],
    "properties": {
        "hasPolicy": {
            "bsonType": "bool",
            "description": "Does insuree have health insurance?",
        },
        "riders": {
            "bsonType": "array",
            "items": {"enum": [1, 2, 3, 4]},
            "description": "Does insuree have riders on this health policy?",
        },
    },
}

# Per-field rules, listed in the same order as the columns of the dataset.
field_rules = {
    "insuree#": {"bsonType": "int", "description": "ID  of the insuree"},
    "gender": {"enum": ["M", "F"], "description": "M or F"},
    "is45OrOlder": {"bsonType": "bool", "description": "Is insuree >= 45 years old?"},
    "isMarried": {"bsonType": "bool", "description": "Is insuree married?"},
    "hasKids": {"bsonType": "bool", "description": "Does the insuree have kids?"},
    "insuredMonths": {"bsonType": "int", "description": "Months of active insurance"},
    "termLifeInsurance": term_life_schema,
    "healthInsurance": health_schema,
    "premiumFrequency": {
        "enum": [1, 3, 12],
        "description": "Premium due monthly, quarterly, annually",
    },
    "eStatements": {
        "bsonType": "bool",
        "description": "Opted in for e-statements and e-policies?",
    },
    "monthlyPremium": {"bsonType": "double", "description": "Premium amount monthly"},
    "totalPremium": {"bsonType": "double", "description": "Total premium amount"},
    "renewal": {
        "bsonType": "bool",
        "description": "Does insuree renew at next premium cycle?",
    },
}

schema = {
    "$jsonSchema": {
        "bsonType": "object",
        "required": required_fields,
        "properties": field_rules,
    }
}

# The command name has to be the first key, hence the ordered mapping.
cmd = OrderedDict()
cmd['collMod'] = 'insuranceData'
cmd['validator'] = schema
db.command(cmd)

{'ok': 1.0}

## Load dataset

In [20]:
dataset_dir = os.path.join(os.curdir, 'mol-vica-ds-challenge-dataset', 'insurance_data.csv')


# Define converters
def float_to_bool(x):
    """Return True for the text '1.0', False for any other non-empty text, None for empty text."""
    return x == '1.0' if x else None


def yesno_to_bool(x):
    """Return True for 'Yes' or 'Y', False for any other non-empty text, None for empty text."""
    return (x == 'Yes' or x == 'Y') if x else None


def csfloat_to_dec(x):
    """Parse a number written with a decimal comma (e.g. '19,65') into a float.

    Blank or whitespace-only text becomes NaN. Values that are not strings
    (an already-parsed number, None, NaN, <NA>) are accepted as well instead
    of failing on the missing ``.replace`` method: missing values become NaN
    and numbers are passed through ``float``.
    """
    if not isinstance(x, str):
        return float(x) if pd.notna(x) else np.nan
    normalized = x.replace(',', '.').strip()
    return float(normalized) if normalized else np.nan


def csint_to_lst(x):
    """Parse comma-separated integers (e.g. '3,4') into a list; text whose first item is empty gives []."""
    parts = x.split(',')
    return [int(part) for part in parts] if parts[0] else []


df = pd.read_csv(dataset_dir, sep=';', converters={
    'is45OrOlder': float_to_bool,
    'isMarried': yesno_to_bool,
    'hasKids': yesno_to_bool,
    'termLifeInsurance': yesno_to_bool,
    'multipleTermLifePolicies': yesno_to_bool,
    'eStatements': yesno_to_bool,
    'renewal': yesno_to_bool,
    'healthRiders': csint_to_lst
})

# Note: the nullable Int64 dtype keeps the values as integers and shows <NA>
# for empty integer fields
df['premiumFrequency'] = df['premiumFrequency'].astype('Int64')
df['monthlyPremium'] = df['monthlyPremium'].apply(csfloat_to_dec)
df['totalPremium'] = df['totalPremium'].apply(csfloat_to_dec)
df.head(5)

Unnamed: 0,insuree#,gender,is45OrOlder,isMarried,hasKids,insuredMonths,termLifeInsurance,multipleTermLifePolicies,healthInsurance,healthRiders,premiumFrequency,eStatements,monthlyPremium,totalPremium,renewal
0,1,F,False,True,True,23,True,False,No,[],12,True,19.65,451.55,True
1,2,F,True,False,False,42,True,True,Class A,[3],1,True,84.65,3541.35,False
2,3,F,False,True,False,72,True,False,No,[],12,False,19.4,1496.45,True
3,4,F,False,True,True,13,True,False,No,[],12,False,19.55,265.3,True
4,5,F,False,False,False,37,True,True,Class A,"[3, 4]",1,False,100.3,3541.4,True


In [34]:
# Flag every column that contains at least one missing value.
has_missing = df.isnull().any()
cols_missing = list(df.columns[has_missing])
print("Columns with missing inputs: {0}".format(cols_missing))

Columns with missing inputs: ['gender', 'is45OrOlder', 'premiumFrequency', 'totalPremium']


In [38]:
def process_row_record(dic):
    """Reshape one flat row record into the nested document layout.

    The record is modified in place and also returned.

    - 'termLifeInsurance' becomes a sub-document with 'hasPolicy' (the old
      flat value) and 'hasMultiplePolicies' (taken from
      'multipleTermLifePolicies').
    - 'healthInsurance' becomes a sub-document with 'hasPolicy' (True unless
      the old value is 'No') and 'riders' (taken from 'healthRiders').
    - 'multipleTermLifePolicies' and 'healthRiders' are removed.
    - Every remaining top-level field holding a missing scalar (None, NaN,
      <NA>) is removed, so the field is absent from the document rather than
      stored as a null.

    Missing values are detected from the record itself, so the function does
    not depend on any notebook-level list of columns.

    Raises KeyError if one of the four source fields is absent.
    """
    dic['termLifeInsurance'] = {
        'hasPolicy': dic['termLifeInsurance'],
        'hasMultiplePolicies': dic.pop('multipleTermLifePolicies')
    }
    dic['healthInsurance'] = {
        'hasPolicy': dic['healthInsurance'] != 'No',
        'riders': dic.pop('healthRiders')
    }

    # Iterate over a snapshot because keys are deleted inside the loop.
    # Only scalars are tested: pd.isna on a list would return an array.
    for key, value in list(dic.items()):
        if pd.api.types.is_scalar(value) and pd.isna(value):
            del dic[key]
    return dic

## Push data into MongoDB

In [39]:
# Reshape every row first, then insert them with one bulk write instead of
# one server round-trip per row.
records = [process_row_record(dic) for dic in df.to_dict(orient='records')]
# insert_many rejects an empty list, so skip the call when there are no rows.
if records:
    db.insuranceData.insert_many(records)

In [48]:
# Spot check: fetch one stored document that has no premiumFrequency field.
no_frequency_filter = {'premiumFrequency': {"$exists": False}}
db['insuranceData'].find_one(no_frequency_filter)

{'_id': ObjectId('62f65c4cde12319ef35fc315'),
 'insuree#': 21,
 'gender': 'F',
 'is45OrOlder': True,
 'isMarried': False,
 'hasKids': False,
 'insuredMonths': 4,
 'termLifeInsurance': {'hasPolicy': True, 'hasMultiplePolicies': True},
 'healthInsurance': {'hasPolicy': True, 'riders': []},
 'eStatements': True,
 'monthlyPremium': 74.45,
 'totalPremium': 294.45,
 'renewal': True}