# 0.0. Mission

## Context

<p> Alpha Insurance is a company that provides health insurance to its customers. With the goal of increasing <br> its revenue, the product team is considering 
offering car insurance to these customers.</p>

<p>The company surveyed over 380,000 customers to learn whether they intend
to purchase the new <br> service. The product team has selected another 127,000 customers
to whom the sales team will try to sell the new service. </p>
    
<p>As the sales team cannot get in touch with all of the selected customers during the campaign,
they need to <br> give priority to the clients that are most likely to purchase the service. So 
they need a data scientist to select the <br> 20,000 most interested customers. </p>

## Challenge

<b>Select the 20,000 customers who are most interested in buying the car insurance.</b>

# 1.0. Preparation Process

## 1.1. IMPORTS

In [None]:
import math
import datetime
import inflection
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from IPython.display         import Image
import matplotlib.patches as mpatches
from scipy                   import stats as ss
from sklearn.preprocessing   import RobustScaler, MinMaxScaler, LabelEncoder
from boruta                  import BorutaPy
from sklearn.metrics         import mean_absolute_error, mean_squared_error
from sklearn.ensemble        import RandomForestClassifier
from sklearn.linear_model    import LinearRegression, Lasso
import xgboost as xgb


###--------- PANDAS - SHOW ALL COLUMNS ----###

# Never truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


###--------- NOTEBOOK STYLING ---------###

# `display` and `HTML` are public names of `IPython.display`; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Each entry is injected as its own <style> element, in this order.
notebook_styles = [
    # remove the notebook margin
    "<style>.container { width:100% !important; margin-left:0px !important }</style>",
    # font size of the markdown text
    "<style>h1 { font-size:23px !important; }</style>",
    "<style>h2 { font-size:20px !important; }</style>",
    "<style>h3 { font-size:17px !important; }</style>",
    "<style>h4 { font-size:16px !important; }</style>",
    "<style>p { font-size:16px !important; }</style>",
    # font size of the tables
    "<style>th { font-size:15px !important; }</style>",
    "<style>td { font-size:15px !important; }</style>",
    # font size of the code
    "<style>span { font-size:16px !important; }</style>",
]

for style in notebook_styles:
    display(HTML(style))

## 1.2. Helper Functions

### Cramer's V

In [None]:
def cramer_v( var1, var2 ):
    """Return the bias-corrected Cramér's V association between two categorical variables.

    The contingency table of `var1` x `var2` is built with `pd.crosstab` and the
    chi-squared statistic is taken from `scipy.stats.chi2_contingency` with its
    default settings.

    Parameters
    ----------
    var1, var2 : array-like
        Categorical observations of equal length.

    Returns
    -------
    float
        Bias-corrected Cramér's V. 0.0 is returned for degenerate inputs (fewer
        than two observations, or a variable with a single category), where
        the statistic would otherwise be an undefined 0/0.
    """
    cm = pd.crosstab( var1, var2 ).to_numpy()
    n = cm.sum()
    r, k = cm.shape

    # With fewer than two observations or a single row/column there is no
    # association to measure and the correction terms would divide by zero.
    if n < 2 or min( r, k ) < 2:
        return 0.0

    chi2 = ss.chi2_contingency( cm )[0]
    chi2corr = max( 0.0, (chi2/n) - (k-1)*(r-1)/(n-1) )

    kcorr = k - (k-1)**2/(n-1)
    rcorr = r - (r-1)**2/(n-1)

    denominator = min( kcorr-1, rcorr-1 )
    # The bias correction can shrink the denominator to zero on tiny samples.
    if denominator <= 0:
        return 0.0

    return float( np.sqrt( chi2corr / denominator ) )

### Measuring Errors

In [None]:
def accuracy(y, yhat):
    """Return the fraction of positions where `y` and `yhat` are equal.

    Both inputs are converted to flat NumPy arrays first, so a pandas Series is
    compared by position and not by index label (a Series whose index is not
    0..n-1 would otherwise fail or compare the wrong rows).

    Parameters
    ----------
    y : array-like
        True labels.
    yhat : array-like
        Predicted labels, same length as `y`.

    Returns
    -------
    float
        Share of correct predictions in [0, 1]; 0.0 for empty input.

    Raises
    ------
    ValueError
        If `y` and `yhat` do not have the same number of elements.
    """
    y_true = np.asarray(y).ravel()
    y_pred = np.asarray(yhat).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError('y and yhat must have the same number of elements')
    if y_true.size == 0:
        return 0.0

    return float(np.mean(y_true == y_pred))

def calc_f1_score(y, yhat):
    """Return the F1-score of binary predictions, with 1 as the positive class.

    Inputs are converted to flat NumPy arrays so that a pandas Series is
    compared by position rather than by index label.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    yhat : array-like
        Predicted labels (0 or 1), same length as `y`.

    Returns
    -------
    float
        2*TP / (2*TP + FP + FN), which equals 2*precision*recall /
        (precision + recall); 0.0 when there are no true positives.

    Raises
    ------
    ValueError
        If `y` and `yhat` do not have the same number of elements.
    """
    y_true = np.asarray(y).ravel()
    y_pred = np.asarray(yhat).ravel()

    if y_true.shape != y_pred.shape:
        raise ValueError('y and yhat must have the same number of elements')

    tp = int(np.sum((y_true == 1) & (y_pred == 1)))  # true positives
    fp = int(np.sum((y_true == 0) & (y_pred == 1)))  # false positives
    fn = int(np.sum((y_true == 1) & (y_pred == 0)))  # false negatives

    # Without true positives precision and recall are both 0 (or undefined),
    # so the score is defined as 0.
    if tp == 0:
        return 0.0

    return 2 * tp / (2 * tp + fp + fn)

def ml_error( model_name, y, yhat ):
    """Summarise a model's accuracy and F1-score as a one-row DataFrame.

    The row has the columns 'Model Name', 'Accuracy' and 'F1-Score' and the
    index 0.
    """
    metrics = {
        'Model Name': model_name,
        'Accuracy': accuracy(y, yhat),
        'F1-Score': calc_f1_score(y, yhat),
    }
    return pd.DataFrame( metrics, index=[0] )

### Cross Validation

In [None]:
def cross_val_err( x_train, kfold, model_name, model, verbose=False ):
    """Time-based cross validation over `kfold` consecutive validation windows.

    For k = kfold .. 1 the rows whose 'date' lies in a 6-week window counted
    back from the latest date form the validation set, and all earlier rows
    form the training set. `model` is fitted on each training set and scored
    with `ml_error`.

    NOTE(review): this function does not fit the rest of this file and looks
    like it was carried over from a sales-regression workflow — confirm and
    adapt or remove:
      * it reads the global `cols_selected_boruta`, which is not defined in the
        visible part of the file;
      * it requires 'date' and 'sales' columns, while the columns renamed in
        this file are id, gender, age, ..., response;
      * it reads 'MAE', 'MAPE' and 'RMSE' from the result of `ml_error`, but
        the `ml_error` defined in this file returns 'Model Name', 'Accuracy'
        and 'F1-Score', so these lookups raise KeyError.

    Parameters
    ----------
    x_train : DataFrame containing the selected features plus 'date' and 'sales'.
    kfold : number of validation windows.
    model_name : label written to the 'Model name' column of the result.
    model : estimator exposing `fit` and `predict`.
    verbose : when True, print the number of the fold being processed.

    Returns
    -------
    One-row DataFrame with 'mean +/- std' strings for each metric.
    """
    
    mae_list = []
    mape_list = []
    rmse_list = []
    # NOTE(review): relies on a global that must be defined before the call
    x_training = x_train[ cols_selected_boruta ]

    # iterate from the oldest validation window (k = kfold) to the most recent (k = 1)
    for k in reversed( range(1, kfold+1) ):
        
        if verbose:
            print('\nKfold number: ' + str(k))
        # start and end date of the 6-week validation window (7 * 6 days per fold)
        validation_start_date = x_training['date'].max() - datetime.timedelta( days= 7 * 6 * k )
        validation_end_date = x_training['date'].max() - datetime.timedelta( days= 7 * 6 * (k-1) )

        # filtering dataset: train on everything before the window, validate inside it
        training = x_training[ (x_training['date'] < validation_start_date) ]
        validation = x_training[ (x_training['date'] >= validation_start_date) & (x_training['date'] <= validation_end_date) ]

        # training dataset
        xtraining = training.drop( ['date', 'sales'], axis=1 )
        ytraining = training['sales']

        # validation dataset
        xvalidation = validation.drop( ['date', 'sales'], axis=1 )
        yvalidation = validation['sales']

        # model
        md = model.fit( xtraining, ytraining )

        # prediction
        yhat_md = md.predict( xvalidation )

        # performance; expm1 is applied to both the target and the prediction
        # (presumably the target was log1p-transformed upstream — TODO confirm)
        resp_error = ml_error(model_name, np.expm1( yvalidation ), np.expm1( yhat_md ) )

        # adding the error to the list
        # NOTE(review): these columns are not produced by this file's ml_error
        mae_list.append( resp_error['MAE'].values )
        mape_list.append( resp_error['MAPE'].values )
        rmse_list.append( resp_error['RMSE'].values )
        
    # mean and standard deviation of each metric over the folds, formatted as text
    return pd.DataFrame({ 'Model name': model_name,
                          'MAE CV' : np.round( np.mean( np.array( mae_list ) ), 2).astype(str) + ' +/- ' + np.round( np.std( np.array( mae_list ) ) ).astype(str),
                          'MAPE CV': np.round( np.mean( np.array( mape_list ) ), 2).astype(str) + ' +/- ' + np.round( np.std( np.array( mape_list ) ), decimals=4 ).astype(str),
                          'RMSE CV': np.round( np.mean( np.array( rmse_list ) ), 2).astype(str) + ' +/- ' + np.round( np.std( np.array( rmse_list ) ) ).astype(str)
                         }, index=[0])

### CATEGORICAL CHART CREATOR

In [None]:
# categories - list containing all the categories of the feature
# text_list - list of label templates, one per category, shown below each bar;
#             each template contains one '{}' placeholder for the percentage
def createCategoryPlots(fig_width, fig_height, df, feature_name, title1, title2, categories, text_list):
    """Plot the category counts of a feature next to its per-category acceptance rate.

    Left panel: count plot of `df[feature_name]`.
    Right panel: for each entry of `categories`, the share of rows with
    `response == 1` among the rows of that category with response 0 or 1.

    Parameters
    ----------
    fig_width, fig_height : figure size passed to `plt.figure`.
    df : DataFrame containing `feature_name` and a 'response' column.
    feature_name : name of the categorical column to analyse.
    title1, title2 : titles of the left and right panels.
    categories : category values, in the order the bars are drawn.
    text_list : bar label templates, aligned with `categories`.
    """
    plt.figure(figsize=(fig_width, fig_height))
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    # plot a countplot; the data is passed by keyword because newer seaborn
    # releases no longer accept it as a positional `x`
    plt.subplot( 1, 2, 1 )
    plt.title( title1 )
    sns.countplot( x=df[feature_name] )

    accepted = df['response'] == 1
    rejected = df['response'] == 0

    percentage_list = []
    for category in categories:
        in_category = df[feature_name] == category
        feature_num_accept = int( ( in_category & accepted ).sum() )
        feature_num_reject = int( ( in_category & rejected ).sum() )
        total = feature_num_accept + feature_num_reject
        # a category that is absent from df gets a rate of 0 instead of 0/0
        percentage_list.append( feature_num_accept / total if total else 0.0 )

    bar_labels = [ template.format( np.round( 100 * percentage, 3 ) )
                   for template, percentage in zip( text_list, percentage_list ) ]

    plt.subplot( 1, 2, 2 )
    plt.title( title2 )
    plt.ylabel( 'Percentage' )
    plt.xlabel( feature_name )
    sns.barplot( x = bar_labels, y = percentage_list )

In [None]:
# categories - list containing all the categories of the feature
# text_list - list of label templates, one per category, shown below each bar;
#             each template contains one '{}' placeholder for the percentage
def createCategoryPlots2(fig_width, fig_height, df, feature_name, title1, title2, categories, text_list):
    """Plot the category counts of a feature next to each category's share of all acceptances.

    Left panel: count plot of `df[feature_name]`.
    Right panel: for each entry of `categories`, the number of rows of that
    category with `response == 1` divided by the total number of rows with
    `response == 1`. This is a share of the accepting customers, not an
    acceptance rate within the category.

    Parameters
    ----------
    fig_width, fig_height : figure size passed to `plt.figure`.
    df : DataFrame containing `feature_name` and a 'response' column.
    feature_name : name of the categorical column to analyse.
    title1, title2 : titles of the left and right panels.
    categories : category values, in the order the bars are drawn.
    text_list : bar label templates, aligned with `categories`.
    """
    plt.figure(figsize=(fig_width, fig_height))
    plt.subplots_adjust(wspace=0.3, hspace=0.3)

    # plot a countplot; the data is passed by keyword because newer seaborn
    # releases no longer accept it as a positional `x`
    plt.subplot( 1, 2, 1 )
    plt.title( title1 )
    sns.countplot( x=df[feature_name] )

    accepted = df['response'] == 1
    num_accept = int( accepted.sum() )

    percentage_list = []
    for category in categories:
        feature_num_accept = int( ( accepted & ( df[feature_name] == category ) ).sum() )
        # without any acceptance every share is reported as 0 instead of 0/0
        percentage_list.append( feature_num_accept / num_accept if num_accept else 0.0 )

    bar_labels = [ template.format( np.round( 100 * percentage, 3 ) )
                   for template, percentage in zip( text_list, percentage_list ) ]

    plt.subplot( 1, 2, 2 )
    plt.title( title2 )
    plt.ylabel( 'Percentage' )
    plt.xlabel( feature_name )
    sns.barplot( x = bar_labels, y = percentage_list )

## 1.3. Loading Data

In [None]:
# Read the train and test CSV files with the same parser options.
read_options = {'low_memory': False}
train_raw_data = pd.read_csv('data/train.csv', **read_options)
test_raw_data = pd.read_csv('data/test.csv', **read_options)

In [None]:
# Show one randomly drawn row of the raw training table.
train_raw_data.sample()

# 2.0. Data Description

In [None]:
# Work on a copy so the raw training table stays untouched.
df2 = train_raw_data.copy()

## 2.1. Rename Columns

In [None]:
# Original column names, converted to snake_case in the next cell.
# NOTE(review): assumed to match the column order of data/train.csv, because
# the converted names are assigned to df2.columns by position — confirm.
cols_old = ['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
               'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
               'Policy_Sales_Channel', 'Vintage', 'Response']
cols_old

In [None]:
# Convert every original column name to snake_case and apply the new names.
snakecase = inflection.underscore

cols_new = [ snakecase( name ) for name in cols_old ]

df2.columns = cols_new

## 2.2. Data Dimension

In [None]:
# Report the size of the table.
n_rows, n_cols = df2.shape
print('The number of rows is: {}'.format( n_rows ))
print('The number of columns is: {}'.format( n_cols ))

## 2.3. Data Types

In [None]:
# Data type of every column.
df2.dtypes

## 2.4. Check NA

In [None]:
# Number of missing values per column.
df2.isna().sum()

## 2.5. Change Types

In [None]:
# Check if can convert region_code from float to int
aux1 = np.array( df2['region_code'] )
aux2 = aux1.astype(int)
np.sum( aux1 - aux2 )

# Converting region_code from float to int
df2['region_code'] = df2['region_code'].astype(int)

### vehicle_age

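vehicle_age is stored as an object (text) column, so it has to be converted before it can be used by a model.

One option is to assume a numerical value for each of the 3 categories, as listed below. Note that the code cell below currently only renames the categories to `new`, `almost_new` and `old`; it does not apply these numerical values.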

< 1 Year  -->   0.5,  we assumed the mean value
    
1-2 Year  -->   1.5,  we assumed the mean value

\> 2 Year  -->   7.5, it's difficult to consider some mean value, so lets consider a number 5 times bigger than the previous


In [None]:
# Changing categories' names of vehicle age
# Relabel the vehicle age categories; any value other than the two listed
# below falls back to 'old'
vehicle_age_labels = {'< 1 Year': 'new', '1-2 Year': 'almost_new'}
df2['vehicle_age'] = df2['vehicle_age'].map( lambda age: vehicle_age_labels.get( age, 'old' ) )

# Cast annual_premium and policy_sales_channel to int
for numeric_col in ['annual_premium', 'policy_sales_channel']:
    df2[numeric_col] = df2[numeric_col].astype(int)

# Encode vehicle_damage as int: 'No' -> 0, anything else -> 1
df2['vehicle_damage'] = ( df2['vehicle_damage'] != 'No' ).astype(int)

In [None]:
# Column types after the conversions above.
df2.dtypes

## 2.6. Split data between Categorical and Numerical data

In [None]:
# Show one randomly drawn row.
df2.sample()

In [None]:
# Columns treated as numerical and as categorical in the analyses below
numerical_cols = ['age', 'annual_premium', 'vintage']
categorical_cols = ['gender', 'vehicle_age', 'driving_license', 'region_code',
                    'previously_insured', 'vehicle_damage', 'policy_sales_channel']

num_data = df2[numerical_cols]
cat_data = df2[categorical_cols]

## 2.7. Descriptive Statistics

### Relevant metrics

In [None]:
# One single-row DataFrame per statistic; the columns are the numerical features.
# Central tendency and dispersion
num_mean = pd.DataFrame( num_data.apply( np.mean ) ).T
# NOTE(review): depending on the pandas version, `apply( np.std )` may compute the
# population (ddof=0) or the sample (ddof=1) standard deviation — confirm which is intended
num_std = pd.DataFrame( num_data.apply( np.std ) ).T
num_median = pd.DataFrame( num_data.apply( np.median ) ).T
num_min = pd.DataFrame( num_data.apply( min ) ).T
num_max = pd.DataFrame( num_data.apply( max ) ).T
num_range = pd.DataFrame( num_data.apply( lambda x: x.max() - x.min() ) ).T
# Shape of the distribution
num_skew = pd.DataFrame( num_data.apply( lambda x: x.skew() ) ).T
num_kurtosis = pd.DataFrame( num_data.apply( lambda x: x.kurtosis() ) ).T

# Stack the statistics and transpose: one row per feature, one column per statistic
analysis_table = pd.concat( [num_mean, num_std, num_median, num_min, num_max, num_range, num_skew, num_kurtosis] ).T
analysis_table.columns = ['mean', 'std', 'median', 'min', 'max', 'range', 'skew', 'kurtosis']

In [None]:
# Display the descriptive statistics table.
analysis_table

In [None]:
# Distribution of annual_premium with its mean and median marked
plt.figure(figsize=(8,8))
plt.xticks( np.arange(0, 100000, 5000), rotation=-30 )
plt.axvline(x=num_mean['annual_premium'][0], ymin=0, ymax=1, label='mean', color='r')
plt.axvline(x=num_median['annual_premium'][0], ymin=0, ymax=1, label='median', color='g')
plt.legend()
# `sns.distplot` is deprecated; a density histogram with a KDE overlay is its
# documented replacement
sns.histplot( x=df2['annual_premium'], kde=True, stat='density' ).set_xlim(1,100000)

In [None]:
# Number of distinct values in each categorical variable (a missing value counts as one)
cat_data.nunique( dropna=False )

#### Count plots

In [None]:
# Response counts split by gender (top row) and by previous vehicle damage (bottom row).
# The data is passed to countplot by keyword because newer seaborn releases
# no longer accept it as a positional `x`.
plt.figure(figsize=(12,20))
plt.subplots_adjust(right=2)

plt.subplot(2,2,1)
aux = df2[ df2['gender'] == 'Male' ]
sns.countplot( x=aux['response'] ).set_title('Men\'s insurance acceptance')

plt.subplot(2,2,2)
aux2 = df2[ df2['gender'] == 'Female' ]
sns.countplot( x=aux2['response'] ).set_title('Women\'s insurance acceptance')

plt.subplot(2,2,3)
aux3 = df2[ df2['vehicle_damage'] == 1 ]
sns.countplot( x=aux3['response'] ).set_title('Insurance acceptance of people who already had a vehicle damage')

plt.subplot(2,2,4)
aux4 = df2[ df2['vehicle_damage'] == 0 ]
sns.countplot( x=aux4['response'] ).set_title('Insurance acceptance of people who have never had a vehicle damage')


In [None]:
# Display the categorical columns.
cat_data

# 3.0. Feature Engineering

In [None]:
# Checkpoint: copy of the described data for the feature engineering step.
df3 = df2.copy()

In [None]:
# Display the car-insurance mind map image stored under img/.
Image('img/CAR_INSURANCE_MIND_MAP.png')

## 3.1. Hypothesis creation

### 3.1.1. Hypothesis about the client

**1.** Clients with higher VINTAGE have higher chances of accepting a new service from the company.

**2.** Clients that already have car insurance have lower chances of accepting another car insurance.

**3.** Clients that have a driving license are more prone to accept car insurance than those that don't have one.

**4.** Clients that are young have higher chances of purchasing car insurance.

**5.** Clients who live in dangerous areas are more prone to purchase car insurance.

**6.** Clients whose annual premium is expensive tend not to purchase car insurance.

### 3.1.2. Hypothesis about the vehicle

**1.** Clients whose vehicle is new are more prone to purchase car insurance.

**2.** Clients whose vehicle has already been damaged tend to purchase car insurance.

## 3.2. Final hypothesis list

Keep only the hypotheses for which the required data is available.

**1.** Clients with higher VINTAGE have higher chances of accepting a new service from the company.

**2.** Clients that already have car insurance have lower chances of accepting another car insurance.

**3.** Clients that have a driving license are more prone to accept car insurance than those that don't have one.

**4.** Clients that are young have higher chances of purchasing car insurance.

**5.** Clients who live in dangerous areas are more prone to purchase car insurance.

**6.** Clients whose annual premium is expensive tend not to purchase car insurance.

**7.** Clients whose vehicle is new are more prone to purchase car insurance.

**8.** Clients whose vehicle has already been damaged tend to purchase car insurance.

In [None]:
# Show one randomly drawn row.
df3.sample()

## 3.3. Deriving New Variables

# 4.0. Exploratory Data Analysis

In [None]:
# Checkpoint: copy of the data for the exploratory analysis.
df4 = df3.copy()

## 4.1. Univariate Analysis

### 4.1.1. Response Variable

In [None]:
# Class balance of the target; the data is passed by keyword because newer
# seaborn releases no longer accept it as a positional `x`
sns.countplot( x=df4['response'] )

### 4.1.2. Numerical Variables

In [None]:
def _numerical_histogram(position, column, title, ticks, rotation, limits=None):
    """Draw the histogram of df4[column] in cell `position` of the 2x2 grid."""
    plt.subplot(2, 2, position)
    plt.title(title)
    plt.xticks( ticks, rotation=rotation )
    if limits is not None:
        plt.xlim(*limits)
    return sns.histplot(df4[column])


plt.figure(figsize=(17,11))
_numerical_histogram(1, 'age', 'Distribution of ages', np.arange(0, 100, 5), 0)
_numerical_histogram(2, 'annual_premium', 'Distribution of annual premium',
                     np.arange(0, 70000, 5000), -30, limits=(0, 70000))
_numerical_histogram(3, 'vintage', 'Distribution of vintage', np.arange(0, 320, 20), -20)

### 4.1.3. Categorical Variables

In [None]:
# Names of the categorical columns analysed below.
cat_data.columns

#### Gender

In [None]:
plt.figure(figsize=(13,7))
plt.subplots_adjust(wspace=0.3, hspace=0.3)

# Amount of data collected from each gender. The data is passed by keyword
# because newer seaborn releases no longer accept it as a positional `x`.
plt.subplot(1,2,1)
plt.title('Amount of data collected from each gender')
sns.countplot( x=df4['gender'] )


# Percentage of purchase intention among male customers.
# Rows are counted with boolean masks instead of `.count()[0]`, which depended
# on the first column having no missing values and on positional Series indexing.
df_male = df4[ df4['gender'] == 'Male' ]
male_num_accept = ( df_male['response'] == 1 ).sum()
male_num_reject = ( df_male['response'] == 0 ).sum()
male_accept_perc = (male_num_accept) / (male_num_accept + male_num_reject)

# Percentage of purchase intention among female customers
plt.subplot(1,2,2)
df_female = df4[ df4['gender'] == 'Female' ]
female_num_accept = ( df_female['response'] == 1 ).sum()
female_num_reject = ( df_female['response'] == 0 ).sum()
female_accept_perc = (female_num_accept) / (female_num_accept + female_num_reject)

plt.title('Percentage of purchases intention per GENDER')
sns.barplot( x=['\nMALE ({} %)'.format( np.round ( 100 * male_accept_perc, 3 ) ),
                '\nFEMALE ({} %)'.format( np.round (100 * female_accept_perc, 3 ) )],
             y=[male_accept_perc, female_accept_perc] );


#### Vehicle's Age

In [None]:
# Vehicle age: counts per category and acceptance rate within each category
categories = ['new', 'almost_new', 'old']
textos = ['\nNew \n  ({} %)', '\nAlmost new \n  ({} %)\n', '\nOld \n ({} %)']
title1 = 'Amount of cars per age category'
title2 = 'Percentage of purchases intention \n by CAR\'S AGE'

createCategoryPlots(
    fig_width=13,
    fig_height=7,
    df=df4,
    feature_name='vehicle_age',
    title1=title1,
    title2=title2,
    categories=categories,
    text_list=textos,
)


#### Driving License

In [None]:
# Driving license: counts per category and acceptance rate within each category
categories = [1, 0]
textos = ['\nHas a driving license \n  ({} %)', '\nDoesn\'t have a driving license  \n ({} %)']
title1 = 'Amount of people that have or not a driving license'
title2 = 'Percentage of purchases intention \n by having or not a DRIVING LICENSE'

createCategoryPlots(
    fig_width=15,
    fig_height=7,
    df=df4,
    feature_name='driving_license',
    title1=title1,
    title2=title2,
    categories=categories,
    text_list=textos,
)

In [None]:
# Show one randomly drawn row.
df4.sample()

#### Region Code

In [None]:
plt.figure(figsize=(15,15))
plt.subplots_adjust(wspace=0.3, hspace=0.3)

# The data is passed to countplot by keyword because newer seaborn releases
# no longer accept it as a positional `x`.
plt.subplot(3,1,1)
plt.title('Amount of interviewed customers per region code')
sns.countplot( x=df4['region_code'] )

# Calculating the percentage of interested customers per region code
df_accept_per_region_code = df4[ df4['response'] == 1 ]
df_reject_per_region_code = df4[ df4['response'] == 0 ]

# Both counts are re-indexed on the full sorted list of region codes, so a
# region missing from one of the two groups gets a count of 0 instead of
# shifting (or breaking) the element-wise division below.
region_codes = np.sort( df4['region_code'].unique() )
region_code_accept_num = df_accept_per_region_code.groupby('region_code').size().reindex( region_codes, fill_value=0 ).to_numpy()
region_code_reject_num = df_reject_per_region_code.groupby('region_code').size().reindex( region_codes, fill_value=0 ).to_numpy()

region_code_perc_accept = region_code_accept_num / (region_code_accept_num + region_code_reject_num)

plt.subplot(3,1,2)
plt.title('Percentage of purchasing interested by region code')
sns.barplot(x= region_codes, y=region_code_perc_accept )


#### previously_insured

In [None]:
# Previously insured: counts per category and acceptance rate within each category
categories = [1, 0]
textos = ['\nHas been already insured \n  ({} %)', '\nHas never been insured \n ({} %)']
title1 = 'Amount of interviewed customers that were \n previously insured or not'
title2 = 'Percentage of purchases intention \n by have being or not PREVIOUSLY INSURED'

createCategoryPlots(
    fig_width=15,
    fig_height=7,
    df=df4,
    feature_name='previously_insured',
    title1=title1,
    title2=title2,
    categories=categories,
    text_list=textos,
)

In [None]:
# Show one randomly drawn row.
df4.sample()


#### vehicle_damage

In [None]:
# Vehicle damage: counts per category and acceptance rate within each category
categories = [1, 0]
textos = ['\nThe car has been \n already damaged \n ({} %)', '\nThe car has never been damaged \n ({} %)']
title1 = 'Amount of interviewed customers that the car was \n previously damaged'
title2 = 'Percentage of purchases intention \n if the car has been already damage or not'

createCategoryPlots(
    fig_width=15,
    fig_height=7,
    df=df4,
    feature_name='vehicle_damage',
    title1=title1,
    title2=title2,
    categories=categories,
    text_list=textos,
)

#### policy_sales_channel 

In [None]:
plt.figure(figsize=(20,15))
plt.subplots_adjust(wspace=0.3, hspace=0.3)

# The data is passed to countplot by keyword because newer seaborn releases
# no longer accept it as a positional `x`.
plt.subplot(2,1,1)
plt.title('Amount of policy sales channel')
sns.countplot( x=df4['policy_sales_channel'] )

# Calculating the percentage of interested customers per policy sales channel
df_accept_per_channel = df4[ df4['response'] == 1 ]
df_reject_per_channel = df4[ df4['response'] == 0 ]

# Both counts are re-indexed on the full sorted list of channels, so a channel
# that appears in only one of the two groups gets a count of 0 in the other.
sales_channels = np.sort( df4['policy_sales_channel'].unique() )
channel_accept_num = df_accept_per_channel.groupby('policy_sales_channel').size().reindex( sales_channels, fill_value=0 ).to_numpy()
channel_reject_num = df_reject_per_channel.groupby('policy_sales_channel').size().reindex( sales_channels, fill_value=0 ).to_numpy()

channel_perc_accept = channel_accept_num / (channel_accept_num + channel_reject_num)

plt.subplot(2,1,2)
plt.title('Percentage of purchasing interested by policy sales channel')
plt.xticks(rotation=90)
sns.barplot(x= sales_channels, y=channel_perc_accept );

## 4.2. Bivariate Analysis

### HIP 1. Clients with higher VINTAGE, have higher chances of accepting a new service from the company.

FALSE

In [None]:
aux_accept = df4[ df4['response'] == 1 ]
aux_reject = df4[ df4['response'] == 0 ]

plt.figure(figsize=(15,12))
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.3, hspace=0.3)

# (title, data, colour) for the accepted and the rejected customers
vintage_groups = [
    ('Distribution of vintage with insurance acceptance', aux_accept, 'green'),
    ('Distribution of vintage with insurance rejection', aux_reject, 'red'),
]

# Top row: histograms with a KDE overlay
for position, (panel_title, panel_data, colour) in enumerate( vintage_groups, start=1 ):
    plt.subplot(2, 2, position)
    plt.title(panel_title)
    sns.histplot(data=panel_data, x='vintage', color=colour, kde=True)

# Bottom row: KDE curves only
for position, (panel_title, panel_data, colour) in enumerate( vintage_groups, start=3 ):
    plt.subplot(2, 2, position)
    plt.title(panel_title)
    sns.kdeplot(data=panel_data, x='vintage', color=colour)


### HIP 2. Clients that already have a car's insurance have lower chances of accepting another car's insurance.

TRUE

In [None]:
# Previously insured: counts per category and each category's share of all acceptances
categories = [1, 0]
textos = ['\nHas been already insured \n  ({} %)', '\nHas never been insured \n ({} %)']
title1 = 'Amount of interviewed customers that were \n previously insured or not'
title2 = 'Percentage of purchases intention \n by have being or not PREVIOUSLY INSURED'

createCategoryPlots2(
    fig_width=15,
    fig_height=7,
    df=df4,
    feature_name='previously_insured',
    title1=title1,
    title2=title2,
    categories=categories,
    text_list=textos,
)

### HIP 3. Clients that have a driving license are more prone to accept car insurance than those that don't have one.

TRUE

In [None]:
# Driving license: counts per category and acceptance rate within each category
categories = [1, 0]
textos = ['\nHas a driving license \n  ({} %)', '\nDoesnt have a driving license \n ({} %)']
title1 = 'Amount of interviewed customers that have \n or not a DRIVING LICENSE'
title2 = 'Percentage of purchases intention \n by having or not a DRIVING LICENSE'

createCategoryPlots(
    fig_width=15,
    fig_height=7,
    df=df4,
    feature_name='driving_license',
    title1=title1,
    title2=title2,
    categories=categories,
    text_list=textos,
)

### HIP 4. Clients that are young have higher chances of purchasing a car's insurance.

FALSE

In [None]:
aux_accept = df4[ df4['response'] == 1 ]
aux_reject = df4[ df4['response'] == 0 ]

plt.figure(figsize=(15,12))
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.3, hspace=0.3)

# Top row: one histogram with a KDE overlay per population
top_row_panels = [
    (1, 'Distribution of age', df4, {}),
    (2, 'Distribution of age with \n insurance acceptance', aux_accept, {'color': 'green'}),
    (3, 'Distribution of age with \n insurance rejection', aux_reject, {'color': 'red'}),
]
for position, panel_title, panel_data, colour_kwargs in top_row_panels:
    plt.subplot(2, 3, position)
    plt.title(panel_title)
    sns.histplot(data=panel_data, x='age', kde=True, **colour_kwargs)

# Legend entries shared by the two overlay panels of the bottom row
blue_patch = mpatches.Patch(color='blue', label='data')
green_patch = mpatches.Patch(color='green', label='acceptance')
red_patch = mpatches.Patch(color='red', label='rejection')
overlay_handles = [blue_patch, green_patch, red_patch]

# Bottom row, first panel: the three KDE curves drawn on the same axes
plt.subplot(2,3,4)
plt.title('Distribution of age with \n insurance acceptance')
plt.legend(handles=overlay_handles)

line1 = sns.kdeplot(data=df4, x='age', color='blue')
line2 = sns.kdeplot(data=aux_accept, x='age', color='green')
line3 = sns.kdeplot(data=aux_reject, x='age', color='red')

# Bottom row, second panel: the three histograms drawn on the same axes
plt.subplot(2,3,5)
plt.title('Distribution of age with \n insurance acceptance')
plt.legend(handles=overlay_handles)

sns.histplot(data=df4, x='age', color='blue')
sns.histplot(data=aux_reject, x='age', color='red')
sns.histplot(data=aux_accept, x='age', color='green')

#plt.subplot(2,3,5)
#plt.title('Distribution of vintage with insurance rejection')


### HIP 5. Clients who live in dangerous areas are more prone to purchase car insurance.

INCONCLUSIVE - We don't have information about the violence in each region; analysing the data, we only noticed<br>
that some regions are more prone to purchase car insurance than other regions.

In [None]:
plt.figure(figsize=(15,15))
plt.subplots_adjust(wspace=0.3, hspace=0.3)

# The data is passed to countplot by keyword because newer seaborn releases
# no longer accept it as a positional `x`.
plt.subplot(3,1,1)
plt.title('Amount of interviewed customers per region code')
sns.countplot( x=df4['region_code'] )

# Calculating the percentage of interested customers per region code
df_accept_per_region_code = df4[ df4['response'] == 1 ]
df_reject_per_region_code = df4[ df4['response'] == 0 ]

# Both counts are re-indexed on the full sorted list of region codes, so a
# region missing from one of the two groups gets a count of 0 instead of
# shifting (or breaking) the element-wise division below.
region_codes = np.sort( df4['region_code'].unique() )
region_code_accept_num = df_accept_per_region_code.groupby('region_code').size().reindex( region_codes, fill_value=0 ).to_numpy()
region_code_reject_num = df_reject_per_region_code.groupby('region_code').size().reindex( region_codes, fill_value=0 ).to_numpy()

region_code_perc_accept = region_code_accept_num / (region_code_accept_num + region_code_reject_num)

plt.subplot(3,1,2)
plt.title('Percentage of purchasing interested by region code')
sns.barplot(x= region_codes, y=region_code_perc_accept );


### HIP 6. Clients whose annual premium is expensive, tends to not purchase a car's insurance.

FALSE - Clients whose annual premium is expensive tend to purchase car insurance

In [None]:
plt.figure(figsize=(15,12))
plt.subplots_adjust(wspace=0.3, hspace=0.3)

premium = df4['annual_premium']
accepted = df4['response'] == 1
rejected = df4['response'] == 0
low_premium = premium <= 10000
high_premium = premium > 10000

# (position, title, row mask, x ticks, tick rotation, upper x limit)
histogram_panels = [
    (1, 'Distribution of acceptance having an \n annual premium (<= 10,000)',
     accepted & low_premium, np.arange(0, 10000, 1000), 0, 10000),
    (2, 'Distribution of acceptance having an \n annual premium (> 10,000)',
     accepted & high_premium, np.arange(0, 70000, 5000), -30, 70000),
    (3, 'Distribution of rejection having an \n annual premium (<= 10,000)',
     rejected & low_premium, np.arange(0, 10000, 1000), -30, 10000),
    (4, 'Distribution of rejection having an \n annual premium (> 10,000)',
     rejected & high_premium, np.arange(0, 70000, 5000), -30, 70000),
]

for position, panel_title, mask, ticks, rotation, upper_limit in histogram_panels:
    plt.subplot(3, 2, position)
    plt.title(panel_title)
    plt.xticks( ticks, rotation=rotation )
    plt.xlim(0, upper_limit)
    sns.histplot( premium[mask] )

# Bottom row: acceptance and rejection densities on the same axes
plt.subplot(3,2,(5,6))
plt.title('Distribution of acceptance and rejection having an \n annual premium (> 10,000)')
plt.xticks( np.arange(0, 70000, 3000), rotation=-30 )
plt.xlim(0,70000)
green_patch = mpatches.Patch(color='green', label='acceptance')
red_patch = mpatches.Patch(color='red', label='rejection')
plt.legend(handles=[green_patch, red_patch])

# `clip` is a (lower, upper) pair; it was previously given in reverse order
sns.kdeplot( premium[accepted & high_premium], clip=(5, 70000), color='green' )
sns.kdeplot( premium[rejected & high_premium], clip=(5, 70000), color='red' );

### HIP 7. Clients whose vehicle is new are more prone to purchase car insurance.

FALSE - Clients whose vehicle is new are less prone to purchase car insurance

In [None]:
# The titles used to be copied from the driving-license plot; they now describe
# the feature that is actually plotted (vehicle_age).
title1 = 'Amount of interviewed customers per \n VEHICLE AGE category'
title2 = 'Percentage of purchases intention \n by VEHICLE AGE'
textos = ['\n old car \n  ({} %)', '\n almost new car \n ({} %)', '\n new car \n ({} %)']
categories = ['old', 'almost_new', 'new']
createCategoryPlots(15, 7, df4, 'vehicle_age', title1, title2, categories, textos)

### HIP 8. Clients whose vehicle has already been damaged tend to purchase car insurance.

TRUE - Clients whose vehicle has been already damaged tend to purchase a car insurance

In [None]:
# The titles used to be copied from the driving-license plot; they now describe
# the feature that is actually plotted (vehicle_damage).
title1 = 'Amount of interviewed customers whose car \n has or has not been DAMAGED'
title2 = 'Percentage of purchases intention \n by previous VEHICLE DAMAGE'
textos = ['\n has already been damaged \n  ({} %)', '\n has never been damaged \n ({} %)']
categories = [1, 0]
createCategoryPlots(15, 7, df4, 'vehicle_damage', title1, title2, categories, textos)

## 4.3. Multivariate Analysis

### 4.3.1. Numerical Attributes

In [None]:
# Pearson correlation (the default of `corr`) between the numerical features,
# drawn as an annotated heatmap
correlation = num_data.corr()
sns.heatmap( data=correlation, annot=True )

### 4.3.2. Categorical Attributes

In [None]:

def createHeatmapForCategoricalData( df ):
    """Build the pairwise Cramér's V table for the columns of ``df``.

    Every column of ``df`` is compared with every column (itself included)
    through ``cramer_v``, a helper defined elsewhere in this notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose columns are the categorical attributes to compare.
        Any number of columns is accepted.

    Returns
    -------
    pandas.DataFrame
        Square table whose rows and columns are both labelled with the
        column names of ``df``; cell (row r, column c) holds
        ``cramer_v( df[c], df[r] )``.
    """
    categories = list( df.columns )

    # one output column per attribute, each holding its association with
    # every attribute; labels come from `df` itself, so they cannot drift
    # from the data
    tab = pd.DataFrame(
        { col: [ cramer_v( df[col], df[other] ) for other in categories ]
          for col in categories },
        index=categories
    )

    return tab

# `tab` already holds the pairwise association values, so it is plotted as is;
# correlating its columns again would display a different quantity.
tab = createHeatmapForCategoricalData( cat_data )
plt.figure(figsize=(15,12))
sns.heatmap( tab, annot=True)

In [None]:
# Column names of the categorical dataframe, as a plain Python list.
cat_data.columns.tolist()

# 5.0. Data Preparation

In [None]:
# Work on an independent (deep) copy so the data preparation steps leave df4 untouched.
df5 = df4.copy( deep=True )

In [None]:
# Show the column names of num_data (the dataframe used for the numerical
# correlation heatmap in section 4.3.1).
num_data.columns

## 5.1. Rescaling

### Age

In [None]:
# Age: distribution before (top row) and after (bottom row) Min-Max scaling.
plt.figure(figsize=(15,5))

plt.subplot(2,2,1)
sns.histplot(df5['age'])

plt.subplot(2,2,2)
# The vector is passed as `x=` explicitly: a positional vector is read as `x`
# by seaborn 0.11 but as `data` by later releases, which changes the plot.
sns.boxplot(x=df5['age'])

mms = MinMaxScaler()

# the scaler expects a 2-D input, hence the double brackets
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split of section 6.1 - confirm this is intended.
df5['age'] = mms.fit_transform( df5[['age']].values )

plt.subplot(2,2,3)
sns.histplot(df5['age'])

plt.subplot(2,2,4)
sns.boxplot(x=df5['age'])

### Annual Premium

In the annual_premium feature there are a lot of outliers, so we are going to use Robust Scaler, <br>
because it's less sensitive to outliers than the Min-Max Scaler

In [None]:
# Annual premium: distribution before (top row) and after (bottom row)
# Robust scaling.
plt.figure(figsize=(15,5))

plt.subplot(2,2,1)
sns.histplot(df5['annual_premium'])

plt.subplot(2,2,2)
# The vector is passed as `x=` explicitly: a positional vector is read as `x`
# by seaborn 0.11 but as `data` by later releases, which changes the plot.
sns.boxplot(x=df5['annual_premium'])

rs = RobustScaler()

# the scaler expects a 2-D input, hence the double brackets
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split of section 6.1 - confirm this is intended.
df5['annual_premium'] = rs.fit_transform( df5[['annual_premium']].values )

plt.subplot(2,2,3)
sns.histplot(df5['annual_premium'])

plt.subplot(2,2,4)
sns.boxplot(x=df5['annual_premium'])


### Vintage

There are no outliers, so we are going to use Min Max Scaler

In [None]:
# Vintage: distribution before (top row) and after (bottom row) Min-Max scaling.
plt.figure(figsize=(15,5))

plt.subplot(2,2,1)
sns.histplot(df5['vintage'])

plt.subplot(2,2,2)
# The vector is passed as `x=` explicitly: a positional vector is read as `x`
# by seaborn 0.11 but as `data` by later releases, which changes the plot.
sns.boxplot(x=df5['vintage'])

# a fresh scaler: `mms` no longer refers to a previously fitted one after this line
mms = MinMaxScaler()

# the scaler expects a 2-D input, hence the double brackets
# NOTE(review): the scaler is fitted on the whole dataset, i.e. before the
# train/test split of section 6.1 - confirm this is intended.
df5['vintage'] = mms.fit_transform( df5[['vintage']].values )

plt.subplot(2,2,3)
sns.histplot(df5['vintage'])

plt.subplot(2,2,4)
sns.boxplot(x=df5['vintage'])

## 5.2. Transformation

### 5.2.1. Encoding

In [None]:
# Show the column names of cat_data (the dataframe used for the categorical
# association analysis in section 4.3.2) before encoding them.
cat_data.columns

#### Gender

In [None]:
# Label encoder - because there is no order relation 
le = LabelEncoder()
df5['gender'] = le.fit_transform( df5['gender'] )

#### Vehicle Age

In [None]:
# Ordinal encoder - because there is a order relation 
vehicle_age_dict = {'new': 1, 'almost_new': 2, 'old': 3}
df5['vehicle_age'] = df5['vehicle_age'].map( vehicle_age_dict )

#### Driving License

One Hot Encoding - because the variable represents a state

In [None]:
# One indicator column per driving_license value; with no explicit prefix,
# pandas prefixes the new columns with the source column name.
df5 = pd.get_dummies( df5, columns=['driving_license'] )

#### Region Code

Target Encoding - contains the percentage of purchase acceptance per region code

In [None]:
# Target encoding: each region code is replaced by its purchase-acceptance
# rate, i.e. the share of rows in that region whose response equals 1.
# Averaging the boolean flag gives 0.0 for a region without any acceptance;
# dividing two group sizes would leave such a region as NaN instead.
# NOTE(review): the rates are computed on the whole dataset, before the
# train/test split of section 6.1 - confirm this is intended.
te = df5['response'].eq( 1 ).groupby( df5['region_code'] ).mean()
df5['region_code'] = df5['region_code'].map( te )

#### Previously Insured

One Hot Encoding - it represents a state

In [None]:
# One indicator column per previously_insured value; with no explicit prefix,
# pandas prefixes the new columns with the source column name.
df5 = pd.get_dummies( df5, columns=['previously_insured'] )

#### Vehicle Damage

One Hot Encoding - it represents a state

In [None]:
# One indicator column per vehicle_damage value; with no explicit prefix,
# pandas prefixes the new columns with the source column name.
df5 = pd.get_dummies( df5, columns=['vehicle_damage'] )

#### Policy Sales Channel

Target Encoding

In [None]:
# Target encoding: each policy sales channel is replaced by its
# purchase-acceptance rate, i.e. the share of rows in that channel whose
# response equals 1. Averaging the boolean flag per channel yields the same
# ratio as (accepted rows / all rows) and gives 0.0 for channels without any
# acceptance.
# NOTE(review): the rates are computed on the whole dataset, before the
# train/test split of section 6.1 - confirm this is intended.
dict_policy_sales_channel = (
    df5['response'].eq( 1 )
                   .groupby( df5['policy_sales_channel'] )
                   .mean()
                   .to_dict()
)

df5['policy_sales_channel'] = df5['policy_sales_channel'].map( dict_policy_sales_channel )

# 6.0. Feature Selection

In [None]:
# Work on an independent (deep) copy so feature selection leaves df5 untouched.
df6 = df5.copy( deep=True )

In [None]:
# Display the prepared dataframe (rescaled and encoded) before selecting features.
df6

## 6.1. Splitting dataframe into training and test set 

In [None]:
# Shuffle the rows; the fixed seed keeps the train/test partition (and every
# score computed from it) reproducible between runs.
aux = df6.sample( frac=1, random_state=42 )

# position that separates the first ~80% of the shuffled rows from the rest
split_at = 8 * ( len(aux) // 10 )

# NOTE: both frames keep every column, including 'id' and 'response';
# the feature columns are picked later.
x_train = aux.iloc[ :split_at, : ]
y_train = x_train['response']

x_test = aux.iloc[ split_at:, : ]
y_test = x_test['response']

In [None]:
# Count the missing values per column of the shuffled dataframe.
aux.isna().sum()

In [None]:
# Print the dimensions of each split, in the order: train features,
# train target, test features, test target.
for part in ( x_train, y_train, x_test, y_test ):
    print( part.shape )

## 6.2. Boruta - Feature Selection

In [None]:
# Boruta receives plain numpy arrays: the identifier and the target are
# removed from the feature matrix.
x_train_vec = x_train.drop( columns=['id', 'response'] ).values
y_train_vec = y_train.values.ravel()

# random forest used by Boruta as the underlying estimator
rf = RandomForestClassifier( n_jobs=-1, max_depth=10 )

boruta = BorutaPy( rf, n_estimators='auto', verbose=1, max_iter=20, random_state=42 )
boruta = boruta.fit( x_train_vec, y_train_vec )

### 6.2.1 Best Features selected by Boruta

In [None]:
# Boolean flags (one per candidate feature) reported by the fitted Boruta object
cols_selected = boruta.support_.tolist()

# Same feature columns, in the same order, that were handed to Boruta
x_train_fs = x_train.drop( columns=['id', 'response'] )

# Names of the columns Boruta flagged as selected
cols_name_selected = x_train_fs.columns[ boruta.support_ ].tolist()

# Names of the remaining columns (np.setdiff1d returns them sorted)
cols_name_not_selected = list( np.setdiff1d( x_train_fs.columns, cols_name_selected ) )

cols_name_selected

In [None]:
# Display the names of the features that Boruta did not select.
cols_name_not_selected

## 6.3. Manual Feature Selection

Let's join the features selected by the Boruta Algorithm with the features we consider relevant <br> according to the Exploratory Data Analysis.

In [None]:
# Features kept for modeling.
best_features = [
    'age', 'region_code', 'vehicle_age', 'annual_premium', 'policy_sales_channel',
    'previously_insured_0', 'previously_insured_1',
    'vehicle_damage_0', 'vehicle_damage_1',
]

# Features left out of the model.
not_selected_features = [
    'driving_license_0', 'driving_license_1',
    'gender', 'vintage',
]

# 7.0. Machine Learning Modeling

In [None]:
# Model inputs: only the chosen features, never the target. Filtering out
# 'response' makes this cell safe to run more than once - best_features gains
# 'response' at the end of the cell, and x_train / x_test contain that column,
# so selecting with the unfiltered list on a second run would hand the target
# to the models.
feature_cols = [ col for col in best_features if col != 'response' ]

x_train_aft_feat_selec = x_train[ feature_cols ]
x_test_aft_feat_selec = x_test[ feature_cols ]

# columns to add (appended once, however many times the cell is executed)
if 'response' not in best_features:
    best_features.append( 'response' )

## 7.3. Random Forest

In [None]:
# Random forest: train on the selected training features
rf = RandomForestClassifier( random_state=41, n_jobs=-1, max_depth=10 )
rf_fit = rf.fit( x_train_aft_feat_selec, y_train )

# predicted responses for the test rows
y_hat_rf = rf_fit.predict( x_test_aft_feat_selec )

# score the predictions with the notebook's ml_error helper and display the result
performance = ml_error( 'Random forest', y_test, y_hat_rf )
performance

## 7.4. XGBoost

In [None]:
# XGBoost classifier with its default hyper-parameters, trained on the
# selected training features
model_xgb = xgb.XGBClassifier()
model_xgb_fit = model_xgb.fit( x_train_aft_feat_selec, y_train )

# predicted responses for the test rows
y_hat_xgb = model_xgb_fit.predict( x_test_aft_feat_selec )

# score the predictions with the notebook's ml_error helper and display the result
performance = ml_error( 'XGBoost', y_test, y_hat_xgb )
performance

## 7.5. Comparing Model Performance 

In [None]:
# Side-by-side comparison: one column per model, a single row of scores.
# (Stacking two one-column frames row-wise would give a 2x2 table half
# filled with NaN.)
# NOTE(review): precision_rf / precision_xgb must be defined before this cell
# runs; sections 7.3 and 7.4 store their scores in `performance` - confirm
# where these two names come from.
pd.DataFrame( { 'Random Forest': [precision_rf], 'XGBoost': [precision_xgb] } )

# 8.0. HyperParameter Fine Tuning

## 8.1. Random Search

In [None]:
# Candidate values for each random forest hyper-parameter explored by the
# random search.
param = dict(
    n_estimators=[100, 200, 500],
    max_depth=[8, 10, 15],
)

In [None]:
MAX_ITER = 5

# one record per iteration: the sampled hyper-parameters plus the score
search_records = []

for _ in range( MAX_ITER ):
    # choose values for parameters randomly
    hft = { k: random.choice( v ) for k, v in param.items() }
    print( hft )

    # model
    rf = RandomForestClassifier( n_estimators=hft['n_estimators'],
                                 max_depth=hft['max_depth'],
                                 n_jobs=-1,
                                 random_state=42 )

    # fit
    rf_fit = rf.fit( x_train_aft_feat_selec, y_train )

    # prediction
    y_hat = rf_fit.predict( x_test_aft_feat_selec )

    # performance: share of test rows whose prediction matches the true
    # response (this hit rate is the accuracy of the model)
    # NOTE(review): the candidates are scored on the test set itself; consider
    # a separate validation split for tuning.
    accuracy = float( np.mean( y_test.values == y_hat ) )

    search_records.append( { **hft, 'accuracy': accuracy } )

# one row per iteration; pd.concat cannot take a bare float, so the rows are
# collected first and the table is built once
final_result = pd.DataFrame( search_records )
