In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Important Libraries

In [None]:
import math
import datetime
import seaborn as sns
import xgboost as xgb
import pandas as pd
import numpy as np
import plotly.express as px
import pandas_profiling as pp
from xgboost import XGBRegressor
from sklearn.metrics import f1_score,classification_report
import plotly.figure_factory as ff
from matplotlib import pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split,GridSearchCV

print('Import Success')

# Loading Training Data

In [None]:
df_train = pd.read_csv("/kaggle/input/Tow-Mater-Labs/train.csv")
df_train.describe()

In [None]:
df_train.head(5)

In [None]:
#info of the training set.
df_train.info()

# Creating a Profile Report

In [None]:
#pp.ProfileReport(df_train)

After studying the profile report for various Features, the observations that can be drawn about the data are :
1. **About Data :**<br>
    - Data contains a total of 15 features (including the label 'accepted').<br>
    - Out of these features, 8 are categorical, 6 are continuous and 1 is the unique 'id' of each individual (datapoint).
    
2. **About Features :**<br>
  
      - **id :** Unique identification for a particular person(datapoint). Is of no use in predicting the label.
        
      - **blood_group,gender :** Balanced categorical feature - has all categories equally distributed.
        
      - **age,annual_premium,policy_sales_channel,vintage,mother_age,father_age :**<br>
           a. Continuous Features of the data.<br>
           b. annual_premium has very large values in comparison to other features,might need to normalize.
           
      - **driving_license :** highly skewed for class 1. i.e most of the datapoints have value 1 for this feature.**Might be helpful in predicting the labels.**
      
      - **Region_code :** not much of an informative feature. Though it is also partially imbalanced for **region 28.**
      
      - **Previously_insured,Vehicle_damage :** These are balanced features. **There might be a correlation between these features and the label, hence can be important.**
      
      - **Vehicle_Age :** It contains three categories, out of which two are balanced, while third category is quite less in frequency.(might have a correlation with the label)
      
      - **accepted :** This is the label that we need to predict. It is highly skewed (imbalanced for class 0).

In [None]:
# Class balance of the label 'accepted': bar chart first, then the raw counts.
label_fig, label_ax = plt.subplots(figsize=(16, 6))
sns.countplot(data=df_train, x='accepted', ax=label_ax)
label_ax.set_title('frequency of people accepting the insurance', fontweight='bold')
plt.show()

label_counts = df_train['accepted'].value_counts()
print('accepted Details :')
print(label_counts)

# Data Preprocessing

## - Checking and removal of NULL values

In [None]:
# Number of missing values per column, printed as "<column> \t: <count>".
null_counts = df_train.isnull().sum()
for column_name, n_missing in null_counts.items():
    print("{} \t: {}".format(column_name, n_missing))

## - Checking for categories of Categorical Features.

In [None]:
# Distinct values present in each categorical feature.
columns = [
    'Gender',
    'Region_Code',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Previously_Insured',
]
for column in columns:
    categories = np.unique(df_train[column])
    print(f"{column} : {categories}")

## Custom Encoding of columns ~ 'Previously_Insured'

### - Encoding "Region_Code"

In [None]:
#frequency of categories of 'Region_Code'
print(df_train['Region_Code'].value_counts(ascending = False))

In [None]:
# The frequency of the categories of 'Region_Code' is clearly not uniform, so creating
# an indicator column for every region would not be fruitful.
# As a generalization, only regions occurring in more than MIN_REGION_FREQUENCY rows
# get their own 0/1 column, named "Region_Code_<value>" (e.g. "Region_Code_28.0").

#encoding "Region_Code"

MIN_REGION_FREQUENCY = 2600

col_to_create = np.unique(df_train['Region_Code'])
vals = list(col_to_create)

encoding_column = ['Region_Code']
# One list of created column names per encoded feature; used by later cells.
unique_sets = []

for column in encoding_column:

    unique_set = []

    for val in vals:

        # Vectorised comparison: one boolean per row, instead of a Python loop over
        # every row for every region value.
        is_region = df_train[column] == val

        #filtering under frequency generalization.
        if is_region.sum() > MIN_REGION_FREQUENCY:

            indicator_name = column + "_" + str(val)
            unique_set.append(indicator_name)
            df_train[indicator_name] = is_region.astype('int64')

    unique_sets.append(unique_set)

In [None]:
#Region_codes with frequency > 2600.
print(unique_sets)

In [None]:
df_train.head(2)

### - Encoding "Vehicle_Age"

In [None]:
#One-Hot kind of encoding for this feature might lead to different correlation of this feature with the label.
#Hence, this feature has been encoded on the basis of the average age that the category describes:
#e.g : '< 1 Year' == 0.5 , '1-2 Year' == 1.5 and '> 2 Years' == 2.5
#Custom Encoding

vehicle_age_years = {'< 1 Year': 0.5, '1-2 Year': 1.5}

# Encode only while the column still holds the raw text labels. Without this guard,
# re-running the cell on the already-encoded numbers would match neither label and
# overwrite every row with 2.5.
if not pd.api.types.is_numeric_dtype(df_train['Vehicle_Age']):
    # Every value other than the two labels above (i.e. '> 2 Years') becomes 2.5.
    df_train['Vehicle_Age'] = (
        df_train['Vehicle_Age'].map(vehicle_age_years).fillna(2.5).astype(float)
    )
df_train.head(2)

### - Encoding "Vehicle_Damage"

In [None]:
#This Feature contains two classes : "Yes","No"
#encoding has been done as : "Yes" : 1, everything else ("No") : 0

# Encode only while the column still holds the raw text labels. Without this guard,
# re-running the cell on the already-encoded 0/1 values would set every row to 0.
if not pd.api.types.is_numeric_dtype(df_train['Vehicle_Damage']):
    df_train['Vehicle_Damage'] = (df_train['Vehicle_Damage'] == 'Yes').astype('int64')
df_train.head(2)

### - Encoding "Gender"

In [None]:
#This feature contains two classes : "Male","Female"
#encoding has been done as : "Male" : 1, everything else ("Female") : 0

# Encode only while the column still holds the raw text labels. Without this guard,
# re-running the cell on the already-encoded 0/1 values would set every row to 0.
if not pd.api.types.is_numeric_dtype(df_train['Gender']):
    df_train['Gender'] = (df_train['Gender'] == "Male").astype('int64')
df_train.head(2)

### - Why not encode the "blood_group" feature?

- **There is no visible variation in insurance acceptance across the various blood groups, so the "blood_group" feature was not encoded.**

In [None]:
# Count of each blood group in the data, split by the 'accepted' label.
blood_fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(
    data=df_train,
    x='blood_group',
    hue='accepted',
    palette=['#432371', "#FAAE7B"],
    ax=ax,
)
ax.set_title('frequency of blood Groups w.r.t them accepting to insure', fontweight='bold')
plt.show()

## - Vehicle_Damage w.r.t acceptance

In [None]:
# Count of rows per (encoded) Vehicle_Damage value, split by the 'accepted' label.
damage_fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(data=df_train, x='Vehicle_Damage', hue='accepted', ax=ax)
ax.set_title(
    'frequency of those having damaged vehicle w.r.t those who accept to insure',
    fontweight='bold',
)
plt.show()

## Conclusion:
### - **Those whose vehicle is damaged are most likely to say 'NO' to accepting the insurance.**

In [None]:
# Count of rows per Previously_Insured value, split by the 'accepted' label.
insured_fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(
    data=df_train,
    x='Previously_Insured',
    hue='accepted',
    palette=['#FFC300', "#FF5733"],
    ax=ax,
)
ax.set_title(
    'frequency of people who had gotten their vehicle previously insured w.r.t them accepting to insure the vechile',
    fontweight='bold',
)
plt.show()

## Conclusion :
- <h3>The above plot shows that if a person's vehicle was previously insured, they won't accept the insurance.</h3>

## Encoded_Dataset

In [None]:
df_train

In [None]:
print(df_train.columns)

# Data Visualization

## - Correlation matrix for dataset ~ "Region Code"

In [None]:
height = 600
title = '<b>Correlation Matrix for the dataset:</b>'
colors = 'Viridis'

#-------------------------------------------------------------------------#
# Columns whose pairwise correlation is plotted.
classes = ['Age', 'Driving_License','Previously_Insured', 'Annual_Premium',
           'Policy_Sales_Channel', 'Vintage', 'Vehicle_Age', 'Vehicle_Damage',
           'Gender','accepted']
#-------------------------------------------------------------------------#

# Compute the correlation table once and derive both displayed versions from it.
correlation = df_train[classes].corr()
raw_correlation = correlation.to_numpy()

# 4 decimals for the heatmap values / hover text, 2 decimals for the cell annotations.
# np.round is used rather than floor division: `//` floors (pushing negative
# correlations away from zero) and suffers from float artefacts such as
# 0.29 // 0.01 == 28.0. Adding 0.0 turns a rounded -0.0 into 0.0.
correlation_mat = np.round(raw_correlation, 4) + 0.0
correlation_mat_norm = np.round(raw_correlation, 2) + 0.0

fig = ff.create_annotated_heatmap(correlation_mat, x=classes, y=classes,
                                  annotation_text=correlation_mat_norm,
                                  colorscale=colors,text = correlation_mat,
                                  hovertemplate='Column: %{x}<br>'+
                                                'Row: %{y}<br>'+
                                                'Correlation: %{text}<extra></extra>')


# Width follows the golden ratio of the height; the y-axis is reversed so that the
# first row is drawn at the top.
fig.update_layout(title_text= title,width = int(height*1.618),height = height,
                  xaxis = {'title':'Columns'},
                  yaxis = {'title':'Rows','autorange':'reversed'})
fig.update_traces(showscale = True)
fig.show()

### **From the above Correlation Matrix, we can observe that , features like -** <br>
#### - Vehicle_Age,Vehicle_Damage show relatively high +ve correlation with 'accepted'(final label).
#### - Age also shows +ve correlation with 'accepted'.
#### - Previously_Insured shows relatively high -ve correlation with 'accepted'.
#### - Policy_sales_Channel also shows -ve correlation with 'accepted'.

## - Correlation matrix for "Region Code"

In [None]:
height = 800
title = '<b>Correlation Matrix for the dataset:</b>'
colors = 'Viridis'
#-------------------------------------------------------------------------#

# The Region_Code indicator columns created during encoding, plus the label.
classes = unique_sets[0]+['accepted']

#-------------------------------------------------------------------------#
# Compute the correlation table once and derive both displayed versions from it.
correlation = df_train[classes].corr()
raw_correlation = correlation.to_numpy()

# 4 decimals for the heatmap values / hover text, 2 decimals for the cell annotations.
# np.round is used rather than floor division: `//` floors (pushing negative
# correlations away from zero) and suffers from float artefacts such as
# 0.29 // 0.01 == 28.0. Adding 0.0 turns a rounded -0.0 into 0.0.
correlation_mat = np.round(raw_correlation, 4) + 0.0
correlation_mat_norm = np.round(raw_correlation, 2) + 0.0

fig = ff.create_annotated_heatmap(correlation_mat, x=classes, y=classes,
                                  annotation_text=correlation_mat_norm,
                                  colorscale=colors,text = correlation_mat,
                                  hovertemplate='Column: %{x}<br>'+
                                                'Row: %{y}<br>'+
                                                'Correlation: %{text}<extra></extra>')


# Width follows the golden ratio of the height; the y-axis is reversed so that the
# first row is drawn at the top.
fig.update_layout(title_text= title,width = int(height*1.618),height = height,
                  xaxis = {'title':'Columns'},
                  yaxis = {'title':'Rows','autorange':'reversed'})
fig.update_traces(showscale = True)
fig.show()

## Conclusion :
### - **As can be clearly seen in the above correlation plot(heatmap),only Region_code_28.0 shows a relatively high +ve correlation with 'accepted' label ; hence, it is an important feature to consider for classification.**

# - Useful Features

In [None]:
# Features judged useful from the plots above, followed by the label 'accepted'.
useful_columns = [
    'Age',
    'Driving_License',
    'Previously_Insured',
    'Policy_Sales_Channel',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Region_Code_28.0',
    'accepted',
]

n_useful = len(useful_columns)
print(f"Total number of useful Features : {n_useful} \n {useful_columns}")

# - Preparing Training & Testing Sets

## 1. Training set

In [None]:
# Model inputs: every useful column except the label. Target: the label, kept as a one-column frame.
label_column = 'accepted'
input_columns = [name for name in useful_columns if name != label_column]

df_train_x = df_train[input_columns]
df_train_y = df_train[[label_column]]

In [None]:
print(df_train_x.info())
print()
print(df_train_y.info())

## 2. Train test Split
<pre>
<b>This data has been split in two parts with size of the training part being 0.9 of the total size of the dataset,and remaining is the test dataset.</b>
</pre>

In [None]:
x_train,x_test,y_train,y_test = train_test_split(df_train_x, df_train_y,test_size=0.10,random_state=42)

## 3. Model Selection
<pre>
1. Model declaration.
2. Fitting data to the Model.
3. Making Predictions.
</pre>

In [None]:
# An ensemble learning model - the XGBoost classifier, left at its default settings - is used for classification.
clf = xgb.XGBClassifier()

In [None]:
#fitting the data in the model
clf.fit(x_train,y_train)

In [None]:
# Hard class predictions on the held-out split, compared with the true labels by count.
prediction = clf.predict(x_test)

n_samples = len(prediction)
ones_predicted = sum(prediction)
ones_actual = sum(np.array(y_test))[0]

print("1's in y_pred : {}\n1's in y_test : {}".format(ones_predicted, ones_actual))
print("\n0's in y_pred : {}\n0's in y_test : {}".format(n_samples - ones_predicted, n_samples - ones_actual))

In [None]:
prediction_prob = clf.predict_proba(x_test)
print(prediction_prob)

print("\nResult Log : ")
print("predict_proba(x_test) returns a 2-D array of probabilities of the label being classified as [0,1], for each datapoint.")

In [None]:
# Since p(0) = 1 - p(1), keeping both probability columns is redundant.
# Only the class-0 probability is kept (the class-1 column is dropped) and the true
# labels are placed next to it, so the model's confidence can be inspected per true
# label with a violin plot.

prediction_prob = np.array(prediction_prob)
class0_probability = prediction_prob[:, :1]
prediction_prob = np.hstack((class0_probability, y_test))

print("prediction_prob : \n\n",prediction_prob)

### - So, prediction_prob now has two columns: the first holds the probability with which the model predicts the datapoint as 0, and the second holds the actual label of that datapoint.

In [None]:
fig = px.violin(y = prediction_prob[:,0],x = prediction_prob[:,1])
fig.show()

## - Classification Report

In [None]:
print(classification_report(y_test,prediction))

## Conclusion : 
### we observe that : 
### For label - 0 : our model has mostly been successful in predicting it right(as the distribution of the violin plot is more dense near 1).
### For label - 1 : our model has not been as successful in predicting it right (as the distribution is more dense near 0.7). So, we can conclude that although we may get a pretty good accuracy for this classification, we still might not get a good "average F1-score".

## Why so?
### It happens because; Our model is predicting class-0 with much more confidence than that with which it predicts the class-1. So precision and recall for our model predicting 1 is pretty "Low", which sets the overall F1-score average to decrease. 

## 4. Designing a scorer for calculating F1-Score

In [None]:
#designing a F1-scorer

def Custom_f1_score(y_true,y_pred):
    """
    Per-class and averaged F1-score for binary (0/1) labels.

    y_true : true labels of the datapoints; any array-like of 0/1 values
             (single-column DataFrame, Series, numpy array or list). It is flattened.
    y_pred : predicted labels; array-like of 0/1 values with as many elements as y_true.

    return (x,y,z) : tuple of three floats.

    x = f1_score for first class.(in binary class = 0).
    y = f1_score for second class.(in binary class = 1).
    z = average of the two.(z = (x+y)/2).

    Confusion_matrix (rows = true label, columns = predicted label):

         0   1
      0| A | B
         -   -
      1| C | D

    x = (2*A)/(2*A+B+C)

    y = (2*D)/(2*D+B+C)

    A class whose denominator is 0 scores 0.0 rather than nan, so the scores stay
    comparable (e.g. with max()) when a class is entirely absent.

    Raises ValueError when y_true and y_pred differ in number of elements.
    """

    true_labels = np.asarray(y_true).ravel().astype(int)
    pred_labels = np.asarray(y_pred).ravel().astype(int)

    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got {} and {}".format(
                true_labels.size, pred_labels.size))

    # Accumulate the 2x2 confusion matrix in one vectorised step;
    # np.add.at counts repeated (true, predicted) pairs correctly.
    confusion = np.zeros((2,2))
    np.add.at(confusion,(true_labels,pred_labels),1)

    # B + C appears in the denominator of both classes.
    misclassified = confusion[0][1]+confusion[1][0]
    denom_0 = 2*confusion[0][0]+misclassified
    denom_1 = 2*confusion[1][1]+misclassified

    f1_class_0 = float(2*confusion[0][0]/denom_0) if denom_0 else 0.0
    f1_class_1 = float(2*confusion[1][1]/denom_1) if denom_1 else 0.0

    return (f1_class_0,f1_class_1,(f1_class_0+f1_class_1)/2)

### - Correctness of our F1-scorer :)

In [None]:
# Cross-check: the average of sklearn's per-class F1 scores should equal the custom scorer's average.
cvar = Custom_f1_score(y_test,prediction)
builtin_average_f1 = sum(f1_score(y_test,prediction,average = None))/2
print(f"using in-built metrics : {builtin_average_f1}\nusing Custom_f1_score : {cvar[2]}")

# Setting Threshold
### <li>Setting a custom threshold over the probabilities that our model has predicted for each datapoint, and then choosing the threshold that gives the best F1-score.</li>

In [None]:
# Collects (threshold, (f1_class_0, f1_class_1, average_f1)) for every threshold tried.
f1_scores = []

class0_probabilities = prediction_prob[:,0]

#threshold is incremented with 0.01 on each iteration.
for threshold in np.arange(0,1.01,0.01):

    # Predict 0 only when p(class 0) is strictly above the threshold, otherwise 1.
    predictions = [0 if probability>threshold else 1 for probability in class0_probabilities]

    score = Custom_f1_score(y_test,predictions)
    f1_scores.append((threshold,score))

In [None]:
# Average F1-score for each threshold value, in sweep order.
max_list = [scores[2] for _threshold, scores in f1_scores]

for (threshold_value, _scores), average_f1 in zip(f1_scores, max_list):
    print("Threshold value : {}\naverage f1-score : {}".format(threshold_value, average_f1))
    print()

In [None]:
# Read the best threshold directly from the sweep results instead of rebuilding it as
# index/100, which is only correct while the sweep starts at 0 with a step of 0.01.
# max() returns the first entry with the highest average F1, as list.index() did.
best_threshold, _best_scores = max(f1_scores, key=lambda entry: entry[1][2])
# round() only strips floating-point noise left by np.arange (e.g. 0.5700000000000001).
print("Optimal Threshold : {}".format(round(float(best_threshold), 10)))

# Testing Data Preparation For Submission

In [None]:
df_test = pd.read_csv('/kaggle/input/Tow-Mater-Labs/test.csv')
print('Read Successful')

In [None]:
df_test.info()

### - Various Encodings to perform on Test Data
<pre>
- Region_Code = 28.0
- Vehicle_Age
- Vehicle_Damage
- Gender
</pre>

In [None]:
# Indicator column for Region_Code 28: 1 where the region code equals 28.0, else 0.
in_region_28 = df_test['Region_Code'] == 28.0
df_test['Region_Code_28.0'] = in_region_28.astype('int64')

In [None]:
#encode Vehicle_Age
# '< 1 Year' -> 0.5, '1-2 Year' -> 1.5, anything else -> 2.5
vehicle_age_years = {'< 1 Year': 0.5, '1-2 Year': 1.5}

# Encode only while the column still holds the raw text labels. Without this guard,
# re-running the cell on the already-encoded numbers would overwrite every row with 2.5.
if not pd.api.types.is_numeric_dtype(df_test['Vehicle_Age']):
    df_test['Vehicle_Age'] = (
        df_test['Vehicle_Age'].map(vehicle_age_years).fillna(2.5).astype(float)
    )

In [None]:
#encode Vehicle_Damage
# 'Yes' -> 1, anything else -> 0

# Encode only while the column still holds the raw text labels. Without this guard,
# re-running the cell on the already-encoded 0/1 values would set every row to 0.
if not pd.api.types.is_numeric_dtype(df_test['Vehicle_Damage']):
    df_test['Vehicle_Damage'] = (df_test['Vehicle_Damage'] == 'Yes').astype('int64')

In [None]:
#encode Gender
# 'Male' -> 1, anything else -> 0

# Encode only while the column still holds the raw text labels. Without this guard,
# re-running the cell on the already-encoded 0/1 values would set every row to 0.
if not pd.api.types.is_numeric_dtype(df_test['Gender']):
    df_test['Gender'] = (df_test['Gender'] == 'Male').astype('int64')

In [None]:
df_test.head(5)

In [None]:
# Class probabilities for the submission data, using the same input columns the model was trained on.
submission_inputs = [name for name in useful_columns if name != 'accepted']
final_predictions = clf.predict_proba(df_test[submission_inputs])
print(f"Final Probabilities : \n\n{final_predictions}")

## - Predictions[] will store the final Predictions done by our model on the Dataset.

In [None]:
# A row is labelled 0 only when its class-0 probability is strictly above 0.73, otherwise 1.
# NOTE(review): 0.73 is presumably the optimal threshold found by the sweep earlier in the notebook - confirm.
predictions = [0 if probability>0.73 else 1 for probability in final_predictions[:,0]]

n_ones = sum(predictions)
n_total = len(predictions)
print("number of 1's predicted : {}\nTotal number of datapoints : {}".format(n_ones,n_total))

In [None]:
# Attach the thresholded predictions to the test frame, then write only the 'id' and
# 'accepted' columns (without the index) to submission.csv in the working directory.
df_test['accepted'] = predictions
df_test[['id','accepted']].to_csv('submission.csv',index=False)

In [None]:
alpha = pd.read_csv('./submission.csv')
print(alpha['accepted'].value_counts())