In [1]:
#!pip install sagemaker

In [2]:
import sagemaker
import pandas as pd
import numpy as np
from platform import python_version
import zipfile

In [3]:
# Show the Python and NumPy versions this notebook was run with.
python_version(), np.__version__

('3.7.10', '1.16.5')

## The project: Predict travel insurance claims

We use the "Travel Insurance" dataset from Zahier Nasrudin, published on Kaggle. It contains data from a third-party insurance servicing company based in Singapore. The data contains information on travel insurance holders, some of the holder's attributes, and some attributes of the insurance products purchased by the holders. The target is a binary variable, stating whether a policyholder filed a claim against the insurance company. <br>
Link to data: https://www.kaggle.com/datasets/mhdzahier/travel-insurance

### Download data from Kaggle

(1) Download the authentication json file ('kaggle.json') from Kaggle & upload it to the notebook file directory <br>
(2) Run the following commands in a bash terminal to download the travel insurance dataset from Kaggle

In [4]:
# pip install kaggle
# mkdir -p ~/.kaggle
# cp kaggle.json ~/.kaggle/
# chmod 600 ~/.kaggle/kaggle.json
# cd ml_eng_capstone
# kaggle datasets download -d mhdzahier/travel-insurance

### Data prep

In [5]:
# Unpack every member of the downloaded archive into the current working
# directory (extractall() with no arguments targets the cwd).
with zipfile.ZipFile('travel-insurance.zip', 'r') as zip_ref:
    zip_ref.extractall()
# Load the CSV from the working directory; presumably this is the file just
# extracted from the archive (note the space in its name).
travel_insurance = pd.read_csv('travel insurance.csv')

Describe data:

In [6]:
# Preview the first rows of the raw data.
travel_insurance.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,,41


In [7]:
# Column dtypes and non-null counts; a non-null count below the row count marks missing values.
travel_insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63326 entries, 0 to 63325
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Agency                63326 non-null  object 
 1   Agency Type           63326 non-null  object 
 2   Distribution Channel  63326 non-null  object 
 3   Product Name          63326 non-null  object 
 4   Claim                 63326 non-null  object 
 5   Duration              63326 non-null  int64  
 6   Destination           63326 non-null  object 
 7   Net Sales             63326 non-null  float64
 8   Commision (in value)  63326 non-null  float64
 9   Gender                18219 non-null  object 
 10  Age                   63326 non-null  int64  
dtypes: float64(2), int64(2), object(7)
memory usage: 5.3+ MB


Describe numerical values:

In [8]:
# Summary statistics for the numerical columns, one titled section per column,
# sections separated by a blank line.
print('\n\n'.join(
    f"{name}:\n{travel_insurance[name].describe()}"
    for name in ('Duration', 'Commision (in value)', 'Age')
))

Duration:
count    63326.000000
mean        49.317074
std        101.791566
min         -2.000000
25%          9.000000
50%         22.000000
75%         53.000000
max       4881.000000
Name: Duration, dtype: float64

Commision (in value):
count    63326.000000
mean         9.809992
std         19.804388
min          0.000000
25%          0.000000
50%          0.000000
75%         11.550000
max        283.500000
Name: Commision (in value), dtype: float64

Age:
count    63326.000000
mean        39.969981
std         14.017010
min          0.000000
25%         35.000000
50%         36.000000
75%         43.000000
max        118.000000
Name: Age, dtype: float64


Drop rows with negative duration:

In [9]:
# Number of rows whose duration is negative (counted from the boolean mask).
int((travel_insurance['Duration'] < 0).sum())

5

In [10]:
# Index labels of the rows with a negative (invalid) duration.
index_neg_duration = travel_insurance.index[travel_insurance['Duration'] < 0]
# Drop those rows without mutating the loaded frame in place, then renumber the
# rows 0..n-1. reset_index(drop=True) discards the old index directly instead of
# materialising it as an 'index' column and dropping that column afterwards
# (which would remove a genuine column if one were ever named 'index').
travel_insurance = (
    travel_insurance
    .drop(index=index_neg_duration)
    .reset_index(drop=True)
)

Replace NAs (only in Gender column) by string 'UNKNOWN'

In [11]:
# Only 'Gender' contains missing values; fill that column alone so that a NaN in
# any numeric column can never be silently replaced by the string 'UNKNOWN'.
travel_insurance['Gender'] = travel_insurance['Gender'].fillna('UNKNOWN')

In [12]:
##Remove rows with missing data:
#travel_insurance = travel_insurance.dropna()
#travel_insurance = travel_insurance.reset_index().drop(labels='index', axis=1)

Overview of the data:

In [13]:
# Number of rows in the cleaned data.
no_instances = len(travel_insurance)
# Every column except the target ('Claim') is a feature.
no_features = travel_insurance.shape[1] - 1
# Relative frequency of each target value, rounded to three decimals.
target_shares = (travel_insurance['Claim'].value_counts() / no_instances).round(3)
print(f"No. of instances: {no_instances:,}")
# The value printed is the feature count (columns minus target), so label it as such.
print(f"No. of features: {no_features}")
print("Share of targets: \n" + str(target_shares))
travel_insurance.head()

No. of instances: 63,321
No. of columns: 10
Share of targets: 
No     0.985
Yes    0.015
Name: Claim, dtype: float64


Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commision (in value),Gender,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,81
1,CBH,Travel Agency,Offline,Comprehensive Plan,No,186,MALAYSIA,-29.0,9.57,F,71
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,65,AUSTRALIA,-49.5,29.7,UNKNOWN,32
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,60,AUSTRALIA,-39.6,23.76,UNKNOWN,32
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,No,79,ITALY,-19.8,11.88,UNKNOWN,41


## Feature preparation

Recode target ('Claim') into numerical variable:

In [14]:
# Mapping from the textual target values to a binary label.
dict_label = {'Yes' : 1, 'No' : 0}
# Recode the target and cast it to an integer dtype explicitly. Relying on
# replace() to downcast the object column to integers is deprecated in newer
# pandas; an object-typed 'Claim' would later be picked up by the one-hot
# encoding step. The cast also fails loudly if any value other than
# 'Yes'/'No' (or an already recoded 0/1) is present.
travel_insurance['Claim'] = travel_insurance['Claim'].replace(dict_label).astype('int64')

Replace categorical features through one-hot encoding:

In [15]:
def one_hot(df):
    """One-hot encode every string-valued column of ``df``.

    Columns whose dtype is ``object`` or a pandas string dtype are replaced by
    dummy columns named ``<column>_<level>``. The FIRST level of each encoded
    column is omitted (``drop_first=True``) to avoid perfect collinearity.

    NOTE: Categorical features already encoded as integers are NOT identified
    by this function!

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns in their original order, followed by the dummy
        columns of each encoded column (in original column order). If no column
        qualifies for encoding, ``df`` itself is returned unchanged.
    """
    # Inspect dtypes only (not values): object and string dtypes qualify.
    categ_list = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_string_dtype(dtype)
    ]
    if not categ_list:
        return df
    # A single get_dummies call encodes all selected columns at once and appends
    # the dummies after the remaining columns, avoiding one full-frame
    # join/drop copy per categorical column.
    return pd.get_dummies(df, columns=categ_list, drop_first=True)

In [16]:
# Swap the categorical (string) columns of the frame for their dummy columns.
travel_insurance = one_hot(travel_insurance)

In [17]:
# Inspect the size and dtypes of the frame after one-hot encoding.
travel_insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63321 entries, 0 to 63320
Columns: 197 entries, Claim to Gender_UNKNOWN
dtypes: float64(2), int64(3), uint8(192)
memory usage: 14.0 MB


Split data into label series and features dataframe:

In [18]:
# Features: every column except the target.
X = travel_insurance.drop(columns='Claim')
# Label: the binary target column.
Y = travel_insurance['Claim']

In [19]:
# Sanity check: feature matrix and label vector should have the same number of rows.
X.shape, Y.shape

((63321, 196), (63321,))