# IMPORT LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from geopy.geocoders import Nominatim
import time
from pprint import pprint
from sklearn import metrics
import tensorflow as tf

# CONFIGURATION OF LIBRARIES

In [None]:
# Default size for every figure created from here on. A bare
# plt.figure(figsize=...) call only opens a single empty figure in this cell;
# figures created later would fall back to the default size.
plt.rcParams["figure.figsize"] = (14, 8)

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# READING DATA FROM SOURCE

Both datasets are stored as CSV files in the same directory as the script.

In [None]:
# Load both semicolon-separated CSV files and key each frame by its GUID column.
dfPolicyData = pd.read_csv("PolicyData.csv", sep=";", encoding="latin-1").set_index("policy_guid")

dfInvoiceData = pd.read_csv("InvoiceData.csv", sep=";").set_index("invoice_guid")


The Premium and amount_premium fields are not numeric. They are converted from object to float.

In [None]:
# The premium columns use a decimal comma; swap it for a dot and parse as float.
# Casting to str first lets the vectorised replace handle missing values
# ("nan" parses back to NaN) and values that were already read as numbers,
# whereas a per-element x.replace(...) raises AttributeError on those.
dfInvoiceData["amount_premium"] = (
    dfInvoiceData["amount_premium"].astype(str).str.replace(",", ".", regex=False).astype(float)
)
dfPolicyData["Premium"] = (
    dfPolicyData["Premium"].astype(str).str.replace(",", ".", regex=False).astype(float)
)

In [None]:
# Preview the first rows of the policy data.
dfPolicyData.head()

In [None]:
# Preview the first rows of the invoice data.
dfInvoiceData.head()

# DATA PREPROCESSING

dfModelData holds all the candidate attributes for the input features and the target feature.

In [None]:
# Work on a copy so that dfPolicyData itself is not modified by the preprocessing.
dfModelData = dfPolicyData.copy()

# MISSING DATA IMPUTATION

In [None]:
def ReplaceMissingDataWithClassifier(dfModelData, aCategoricalFeatures, aContinuousFeatures, sTargetFeature):
    """Fill the "Missing" entries of a categorical column with decision-tree predictions.

    A DecisionTreeClassifier is trained on the rows whose target value is
    known, its scores on a 30% hold-out split are printed, and every row whose
    target equals the string "Missing" is overwritten in place with the
    predicted label.

    Parameters
    ----------
    dfModelData : pandas.DataFrame
        Frame holding the feature columns and the target column. Modified in place.
    aCategoricalFeatures : list of str
        Columns that are one-hot encoded before being used as model input.
    aContinuousFeatures : list of str
        Columns that are passed to the model unchanged.
    sTargetFeature : str
        Column whose "Missing" entries are replaced.

    Returns
    -------
    None
    """
    # Prefix every dummy column with its source feature so that equal category
    # names in different features (e.g. "OTHER") can never collide.
    aFeatureFrames = [
        pd.get_dummies(dfModelData[sCategoricalFeature], prefix=sCategoricalFeature)
        for sCategoricalFeature in aCategoricalFeatures
    ]
    aFeatureFrames.append(dfModelData[list(aContinuousFeatures)])
    dfFeatures = pd.concat(aFeatureFrames, axis=1)

    dfMaskMissing = dfModelData[sTargetFeature] == "Missing"
    if not dfMaskMissing.any():
        # Nothing to impute; predicting on zero rows would raise.
        return

    # Rows with a NaN target carry no label and are left out of training.
    dfMaskKnown = ~dfMaskMissing & dfModelData[sTargetFeature].notna()

    dfX = dfFeatures[dfMaskKnown]
    # Train on the labels themselves (one multi-class output). A one-hot
    # multi-output target lets the tree predict an all-zero row, which idxmax
    # would silently map to the first class.
    dfY = dfModelData.loc[dfMaskKnown, sTargetFeature]

    X_train, X_test, y_train, y_test = train_test_split(dfX, dfY, test_size=0.3, random_state=1)

    oDecTreeModel = DecisionTreeClassifier()
    oDecTreeModel.fit(X_train, y_train)

    y_pred = oDecTreeModel.predict(X_test)

    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))

    print('Recall: ', metrics.recall_score(y_test, y_pred, zero_division=0, average='micro'))

    print('Precision: ', metrics.precision_score(y_test, y_pred, zero_division=0, average='micro'))

    print('F1-Score: ', metrics.f1_score(y_test, y_pred, zero_division=0, average='micro'))

    # Overwrite the "Missing" entries of the caller's frame with the predictions.
    dfModelData.loc[dfMaskMissing, sTargetFeature] = oDecTreeModel.predict(dfFeatures[dfMaskMissing])

Columns that are NaN in policy dataset

In [None]:
# Names of the policy columns that contain at least one NaN.
[sColumn for sColumn, bHasNaN in dfPolicyData.isna().any().items() if bHasNaN]

Columns that are 'Missing' in policy dataset

In [None]:
# Flag every cell whose string form is exactly "Missing", then list the
# policy columns that contain such a cell.
dfMissing = dfPolicyData.astype(str).eq("Missing")
[sColumn for sColumn, bHasMissing in dfMissing.any().items() if bHasMissing]

Columns that are NaN in invoice dataset

In [None]:
# Names of the invoice columns that contain at least one NaN.
[sColumn for sColumn, bHasNaN in dfInvoiceData.isna().any().items() if bHasNaN]

Columns that are 'Missing' in invoice dataset

In [None]:
# Flag every cell whose string form is exactly "Missing", then list the
# invoice columns that contain such a cell.
dfMissing = dfInvoiceData.astype(str).eq("Missing")
[sColumn for sColumn, bHasMissing in dfMissing.any().items() if bHasMissing]

## Deductible_general

There are only 2 rows where Deductible_general is missing. 

Since this is a relatively small number of rows for this dataset, these rows are deleted.

In [None]:
# Keep the rows without a general deductible aside, then remove them from the model data.
dfMissingDecutibleGeneral = dfModelData.loc[dfModelData["Deductible_general"].isna()]
dfModelData.drop(index=dfMissingDecutibleGeneral.index, inplace=True)

## ClientBirthday

There are only 43 rows where ClientBirthday is missing.

Since they are relatively small amount of rows for this dataset, these rows are deleted.

In [None]:
# Keep the rows without a client birthday aside, then remove them from the model data.
dfMissingClientBirthday = dfModelData.loc[dfModelData["ClientBirthday"].isna()]
dfModelData.drop(index=dfMissingClientBirthday.index, inplace=True)

## BMClassMOD

There are only 14 rows where BMClassMOD is missing.

Since they are relatively small amount of rows for this dataset, these rows are deleted.

In [None]:
# Keep the rows without a BMClassMOD value aside, then remove them from the model data.
dfMissingBmClassMod = dfModelData.loc[dfModelData["BMClassMOD"].isna()]
dfModelData.drop(index=dfMissingBmClassMod.index, inplace=True)

## avgFuelConsumption

There are 27927 rows where avgFuelConsumption is missing.

Since this is a large number of rows, the correlation between avgFuelConsumption and the other fields is calculated on the non-missing data.

It is found out that avgFuelConsumption has fair linear correlation with the attributes of Power, Weight,  VehicleFirstRegistrationYear and Premium.

Missing data is filled based on random forest classifier model since avgFuelConsumption field contains cardinal-categorical data.

In [None]:
# Split the model data into rows with and without an avgFuelConsumption value.
dfMissingAvgFuelConsumption = dfModelData.loc[dfModelData["avgFuelConsumption"].isna()]
dfNonMissingAvgFuelConsumption = dfModelData.drop(index=dfMissingAvgFuelConsumption.index)

In [None]:
# One-hot encode the observed consumption values (one indicator column per value).
dfAvgFuelConsumptionClassified = pd.get_dummies(dfNonMissingAvgFuelConsumption["avgFuelConsumption"])

# Take the class labels from the dummy frame itself. Recent pandas versions
# assign a DataFrame to a list of columns position by position, so labels in
# order of appearance (Series.unique) would be attached to the sorted
# get_dummies columns and mislabel the indicators.
aUniqueAvgFuelConsumption = dfAvgFuelConsumptionClassified.columns.to_numpy()
dfNonMissingAvgFuelConsumption[aUniqueAvgFuelConsumption] = dfAvgFuelConsumptionClassified

# Pairwise correlation between consumption and the candidate predictor columns.
dfCorr = dfNonMissingAvgFuelConsumption[["avgFuelConsumption", "Power", "Weight", "VehicleFirstRegistrationYear", "Premium"]].corr()

sns.heatmap(dfCorr.abs(), vmin=0, vmax=1, annot = True, cmap="Greens")

In [None]:
# Predictor columns shared by training and imputation.
aAvgFuelFeatureColumns = ["Power", "Weight", "VehicleFirstRegistrationYear", "Premium"]

dfX = dfNonMissingAvgFuelConsumption[aAvgFuelFeatureColumns]
dfY = dfNonMissingAvgFuelConsumption[aUniqueAvgFuelConsumption]

X_train, X_test, y_train, y_test = train_test_split(dfX, dfY, test_size=0.3, random_state=1)

# Standardise the features; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

oRandForModel = RandomForestClassifier()
oRandForModel.fit(X_train, y_train)

y_pred = oRandForModel.predict(X_test)

print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))

print('Recall: ', metrics.recall_score(y_test, y_pred,zero_division=0,  average = 'micro'))

print('Precision: ', metrics.precision_score(y_test, y_pred,zero_division=0,  average = 'micro'))

print('F1-Score: ', metrics.f1_score(y_test, y_pred,zero_division=0,  average = 'micro'))

# The forest was fitted on standardised features, so the rows to impute must
# go through the same fitted scaler before prediction.
X_missing = scaler.transform(dfMissingAvgFuelConsumption[aAvgFuelFeatureColumns])
aPredictedClassesForMissing = oRandForModel.predict(X_missing)

dfPredictedClassesForMissing = pd.DataFrame(data = aPredictedClassesForMissing ,  columns = aUniqueAvgFuelConsumption, index = dfMissingAvgFuelConsumption.index )

# Column with the highest indicator per row.
# NOTE(review): a row predicted as all zeros resolves to the first column.
sPredictedLabels = dfPredictedClassesForMissing.idxmax(axis=1)

# Assign the filled column back to the frame; fillna(..., inplace=True) on a
# selected column can act on a temporary object and leave dfModelData unchanged.
dfModelData["avgFuelConsumption"] = dfModelData["avgFuelConsumption"].fillna(sPredictedLabels)

## Region

There are 6727 rows (approx. 9% of the whole dataset) where Region is missing.

According to given attributes, it is difficult to build a pattern that can help to predict region.

Regional data may give information about a person's financial situation. That is why this attribute is kept in the model data, but the rows where it is missing have been removed from the dataset.

In [None]:
# Keep the rows whose Region is the string "Missing" aside, then remove them from the model data.
dfPolicyDataWithMissingRegion = dfModelData.loc[dfModelData["Region"].eq("Missing")]
dfModelData.drop(index=dfPolicyDataWithMissingRegion.index, inplace=True)

## FuelType

For missing fuel types, we can use policy dataset as a "vehicle" dataset where we can build a classification model to identify fuel type. Logically, fuel type is related with vehicle attributes related features such as 'VehicleType', 'VehicleUsage', 'Power', 'Weight','VehicleFirstRegistrationYear', 'Mark', 'Model'

In [None]:
# Impute the "Missing" FuelType entries from the vehicle attributes.
ReplaceMissingDataWithClassifier(
    dfModelData=dfModelData,
    aCategoricalFeatures=['VehicleType', 'VehicleUsage', 'Mark', 'Model'],
    aContinuousFeatures=['Power', 'Weight', 'VehicleFirstRegistrationYear'],
    sTargetFeature='FuelType',
)

## DriveTrain

There are 22558 rows that have missing information for this field. Since this is around 36% of the whole data and 'DriveTrain' depends on vehicle attributes, we can replace the missing data using a classifier.

In [None]:
# Impute the "Missing" DriveTrain entries from the vehicle attributes.
ReplaceMissingDataWithClassifier(
    dfModelData=dfModelData,
    aCategoricalFeatures=['VehicleType', 'VehicleUsage', 'Mark', 'Model'],
    aContinuousFeatures=['Power', 'Weight', 'VehicleFirstRegistrationYear'],
    sTargetFeature='DriveTrain',
)

# FEATURE ENCODING

"One-Hot Encoding" is applied when:

The categorical feature is not ordinal.
The number of categorical features is less so one-hot encoding can be effectively applied

"Label Encoding" is applied when:

The categorical feature is ordinal.
The number of categories is quite large as one-hot encoding can lead to high memory consumption

source: 

https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

https://datascience.stackexchange.com/questions/9443/when-to-use-one-hot-encoding-vs-labelencoder-vs-dictvectorizor

In [None]:
def aCreateOneHotEncoding(dfModelData, sCategoricalFeature):
    """One-hot encode a categorical column.

    The value "OTHER" is first renamed to "Other_<feature>" so that the
    resulting indicator column does not clash with the "OTHER" category of
    another feature. The indicator columns are appended to the frame and the
    original column is removed.

    Parameters
    ----------
    dfModelData : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    sCategoricalFeature : str
        Name of the column to encode.

    Returns
    -------
    tuple
        (distinct values of the column after renaming, in order of appearance;
        new frame with the indicator columns in place of the original column).
    """
    # Work on a copy: writing into a filtered slice of the caller's frame is
    # chained assignment and would also change the caller's data.
    dfEncoded = dfModelData.copy()

    dfMaskOther = dfEncoded[sCategoricalFeature] == "OTHER"
    dfEncoded.loc[dfMaskOther, sCategoricalFeature] = "Other_" + str(sCategoricalFeature)

    aUniqueValues = dfEncoded[sCategoricalFeature].unique()

    dfFeatureClassified = pd.get_dummies(dfEncoded[sCategoricalFeature])

    # Index-on-index join: works whatever the index is called, and still raises
    # if an indicator column would overwrite an existing column.
    dfEncoded = dfEncoded.drop(columns=[sCategoricalFeature]).join(dfFeatureClassified)

    return aUniqueValues, dfEncoded
    

In [None]:
def aCreateLabelEncoding(dfModelData, sCategoricalFeature):
    """Replace a categorical column with integer codes from a LabelEncoder.

    The column is overwritten in the given frame. Returns the encoder's
    classes (position i is the value encoded as i) and that same frame.
    """
    oLabelEncoder = LabelEncoder()
    aEncodedValues = oLabelEncoder.fit_transform(dfModelData[sCategoricalFeature])
    dfModelData[sCategoricalFeature] = aEncodedValues
    return oLabelEncoder.classes_, dfModelData
    

## TARGET_LABEL

Target labels that represent late payment of invoices of corresponding policy is generated based on following criteria:

1: Paid late at least once

0: Paid always on time

In [None]:
# Flag each invoice: 1 when it was paid after its due date, otherwise 0
# (rows where neither comparison holds, e.g. a missing date, fall back to 0).
# NOTE(review): the two columns are compared as loaded; if they are plain
# strings the comparison is lexicographic -- confirm the date format sorts
# chronologically or parse the columns first.
aConditions = [
    (dfInvoiceData["due_date"] < dfInvoiceData["paid_date"]),
    (dfInvoiceData["due_date"] >= dfInvoiceData["paid_date"])
]

dfInvoiceData["isInvoiceLatePaid"] = np.select(aConditions, [1, 0])

# Number of late-paid invoices per policy.
oInvocieGroupByPolicy = dfInvoiceData[["policy_guid", "isInvoiceLatePaid"]].groupby(["policy_guid"])

dfInvoiceIssueStatistics = oInvocieGroupByPolicy.sum()

dfInvoiceIssueStatistics.columns = ["number_of_late_payments"]

# A policy is labelled 1 as soon as one of its invoices was paid late.
aTargetLabels = [1, 0]
aConditions = [
    (dfInvoiceIssueStatistics["number_of_late_payments"] >= 1),
    (dfInvoiceIssueStatistics["number_of_late_payments"] == 0)
]

dfTargetLabels = pd.DataFrame(data = np.select(aConditions, aTargetLabels), index = dfInvoiceIssueStatistics.index, columns = ["TARGET_LABEL"])

# Inner join on the policy_guid index of both frames: a left join would keep
# invoiced policies that were removed from dfModelData during cleaning as
# all-NaN rows, which the encoding steps that follow cannot handle.
dfModelData = dfTargetLabels.join(dfModelData, how="inner")

In [None]:
# Preview the first rows of the labelled model data.
dfModelData.head()

## Country

"Country" field is same in whole dataset. It is not considered in model dataset.

In [None]:
# List the distinct values of the Country column.
dfModelData["Country"].unique()

In [None]:
# Remove the Country column from the model data.
dfModelData.drop(columns=["Country"], inplace=True)

## VehicleType

By logical judgement, "VehicleType" can be explained by "Weight" and maybe "Mark" and "Model" of vehicle. That's why, this variable is excluded.

In [None]:
# Remove the VehicleType column from the model data.
dfModelData.drop(columns="VehicleType", inplace=True)

## VehicleUsage

In [None]:
# One-hot encode VehicleUsage; aVehicleUsages keeps the distinct category values.
aVehicleUsages, dfModelData = aCreateOneHotEncoding(dfModelData, "VehicleUsage")

## Mark

Since there is a large number of marks, label encoding is used to avoid massively increasing the size of the dataset.

In [None]:
# Label-encode Mark; aMarks keeps the encoder classes (position = integer code).
aMarks, dfModelData = aCreateLabelEncoding(dfModelData, "Mark")

## Model

Since there is a large number of models, label encoding is used to avoid massively increasing the size of the dataset.

In [None]:
# Label-encode Model; aModels keeps the encoder classes (position = integer code).
aModels, dfModelData = aCreateLabelEncoding(dfModelData, "Model")

## Region

Region information is converted into geographical coordinates.

In [None]:
# One row per distinct region; the coordinates are filled in by the lookup below.
aUniqueRegions = dfModelData["Region"].unique()
dfRegionsWithCoordinates = pd.DataFrame(index = aUniqueRegions, columns = ["latitude", "longitude"])

# instantiate a new Nominatim client
oGeolocator = Nominatim(user_agent="tutorial")

aUnresolvedRegions = []

for sRegion in aUniqueRegions:
    oLocation = oGeolocator.geocode(sRegion)

    if oLocation is None:
        # geocode() returns None when nothing matches; keep NaN coordinates
        # for this region instead of failing on the attribute access.
        aUnresolvedRegions.append(sRegion)
    else:
        dfRegionsWithCoordinates.loc[sRegion, "latitude"] = oLocation.latitude
        dfRegionsWithCoordinates.loc[sRegion, "longitude"] = oLocation.longitude

    # The public Nominatim service allows at most one request per second.
    time.sleep(1)

if aUnresolvedRegions:
    print('Regions without coordinates: ', aUnresolvedRegions)

# Turn the region index into a regular "Region" column.
dfRegionsWithCoordinates.index.name = "Region"
dfRegionsWithCoordinates.reset_index(level=0, inplace=True)

In [None]:
# Display the region-to-coordinates lookup table.
dfRegionsWithCoordinates

In [None]:
# Coordinates keyed by region name, as floats.
dfRegionCoordinates = dfRegionsWithCoordinates.set_index("Region")[["latitude", "longitude"]].astype(float)

# Look the coordinates up per row. Joining on the Region column keeps the
# index, row order and column dtypes of dfModelData, whereas rebuilding the
# frame from merge(...).values depends on the merge returning the same number
# of rows in the same order and turns every column into object dtype.
dfModelData = dfModelData.join(dfRegionCoordinates, on="Region")

dfModelData.drop("Region", axis = 1 , inplace = True)

## Gender

In [None]:
# One-hot encode Gender; aGenders keeps the distinct category values.
aGenders, dfModelData = aCreateOneHotEncoding(dfModelData, "Gender")

## ClientBirthDay

Client age on policy start date is more relevant than client birthday since we are handling historical data.

In [None]:
# Client age in whole years on the policy start date. The duration is divided
# by the mean Gregorian year (365.2425 days = 31556952 s, the length numpy
# uses for its 'Y' unit) and floored. Casting the timedelta column with
# astype('<m8[Y]') raises on pandas 2.x, which only supports s/ms/us/ns.
sClientAgeDuration = pd.to_datetime(dfModelData["PolicyStartDate"]) - pd.to_datetime(dfModelData["ClientBirthday"])
dfModelData["ClientAgeOnPolicyStart"] = np.floor(sClientAgeDuration / pd.Timedelta(seconds=31556952))
dfModelData.drop("ClientBirthday", axis = 1 , inplace = True)

## BMClassMOD

BM Class Mod looks like ordinal categorical data.

In [None]:
# Label-encode BMClassMOD; aBMClassMODs keeps the encoder classes (position = integer code).
aBMClassMODs, dfModelData =  aCreateLabelEncoding(dfModelData, "BMClassMOD")

## PolicyIssueDate

By logical judgement, "PolicyIssueDate" is not related to whether a person pays late or not.

In [None]:
# Remove the PolicyIssueDate column from the model data.
dfModelData.drop(columns="PolicyIssueDate", inplace=True)

## PolicyStartDate

"PolicyStartDate" could be a reason of payment date. Especially day part of the date could be a reason. Maybe persons prefer to pay after their salary. 

This field is converted to following fields: 

1. year, month and day.

2. In addition, expected duration of policies are calculated based on difference between start and end date.

3. Vehicle age on PolicyStartDate

Since this field is a datetime field, it is removed from the model dataset.

In [None]:
# Parse the start dates once and split them into year, month and day columns.
oPolicyStartDates = pd.DatetimeIndex(dfModelData["PolicyStartDate"])

dfModelData["PolicyStartYear"] = oPolicyStartDates.year
dfModelData["PolicyStartMonth"] = oPolicyStartDates.month
dfModelData["PolicyStartDay"] = oPolicyStartDates.day

In [None]:
# Planned policy duration in whole months. The duration is divided by the mean
# Gregorian month (365.2425 / 12 days = 2629746 s, the length numpy uses for
# its 'M' unit) and floored. Casting the timedelta column with
# astype('<m8[M]') raises on pandas 2.x, which only supports s/ms/us/ns.
sPolicyDuration = pd.to_datetime(dfModelData["PolicyEndDate"]) - pd.to_datetime(dfModelData["PolicyStartDate"])
dfModelData["PolicyDurationInMonths"] = np.floor(sPolicyDuration / pd.Timedelta(seconds=2629746))

In [None]:
# Vehicle age at policy start: policy start year minus the year of first registration.
dfModelData["VehicleAgeOnPolicyStart"] = dfModelData["PolicyStartYear"]-dfModelData["VehicleFirstRegistrationYear"]


In [None]:
# Remove the PolicyStartDate column from the model data.
dfModelData.drop(columns="PolicyStartDate", inplace=True)

## PolicyEndDate

Newly created "PolicyDurationInMonths" attribute would cover "PolicyEndDate" attribute. That's why, it is removed from model dataset

In [None]:
# Remove the PolicyEndDate column from the model data.
dfModelData.drop(columns="PolicyEndDate", inplace=True)

## PolicyActualEndDate

The "PolicyActualEndDate" attribute has no meaning here, since the prediction algorithm will run at a time when the policy is still active. That is why "PolicyActualEndDate" is removed from the model dataset.

In [None]:
# Remove the PolicyActualEndDate column from the model data.
dfModelData.drop(columns="PolicyActualEndDate", inplace=True)

## Channel

Channel may be related with the payment operation as well. It could be so that electronic channels may have more stable payment routine. It is observed that direct and unknown payments have highest late payment rates.

In [None]:
# Per channel: sum of TARGET_LABEL (policies paid late) and row count (policies).
dfChannelAggregates = dfModelData.groupby("Channel")["TARGET_LABEL"].agg(["sum", "count"])

dfChannelStats = pd.DataFrame(
    data=dfChannelAggregates.values,
    index=dfChannelAggregates.index,
    columns=["policies paid late", "number of policies"],
).reset_index()

sPoliciesPaidLate = dfChannelStats["policies paid late"]
sNumberOfPolicies = dfChannelStats["number of policies"]

dfChannelStats["policies paid on time"] = sNumberOfPolicies - sPoliciesPaidLate

# Shares of on-time and late policies per channel, in percent.
dfChannelStats["policies paid on time (%)"] = (dfChannelStats["policies paid on time"] / sNumberOfPolicies) * 100

dfChannelStats["policies paid late (%)"] = (sPoliciesPaidLate / sNumberOfPolicies) * 100

dfChannelStats

In [None]:
# One-hot encode Channel; aChannels keeps the distinct category values.
aChannels, dfModelData = aCreateOneHotEncoding(dfModelData, "Channel")

## FuelType

In [None]:
# One-hot encode FuelType; aFuelTypes keeps the distinct category values.
aFuelTypes, dfModelData = aCreateOneHotEncoding(dfModelData, "FuelType")

## DriveTrain

In [None]:
# One-hot encode DriveTrain; aDriveTrains keeps the distinct category values.
aDriveTrains, dfModelData = aCreateOneHotEncoding(dfModelData, "DriveTrain")

## sales_type

In [None]:
# One-hot encode sales_type; aSalesTypes keeps the distinct category values.
aSalesTypes, dfModelData = aCreateOneHotEncoding(dfModelData, "sales_type")

Finally all data is in numerical format. 

In [None]:
# Cast every column to float; this raises if a non-numeric column is left.
dfModelData = dfModelData.astype(float)

dfModelData.head()

## Variance Inflation Factor

A rule of thumb for interpreting the variance inflation factor:

1 = not correlated.
Between 1 and 5 = moderately correlated.
Greater than 5 = highly correlated.

Source:

https://www.investopedia.com/terms/v/variance-inflation-factor.asp

https://www.statisticshowto.com/variance-inflation-factor/

## Principal Component Analysis

# PREDICTIVE MODEL

In [None]:
# The target is TARGET_LABEL; every other column is an input feature.
aFeaturesY = ["TARGET_LABEL"]
aFeaturesX = dfModelData.columns.drop(aFeaturesY)

dfX, dfY = dfModelData[aFeaturesX], dfModelData[aFeaturesY]

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(dfX, dfY, test_size=0.3, random_state=1)

# Standardise the features; the scaler is fitted on the training split only.
oScaler = StandardScaler().fit(X_train)
X_train = oScaler.transform(X_train)
X_test = oScaler.transform(X_test)

## Decision Tree Classifier

In [None]:
# Fit a decision tree on the training split and score it on the hold-out split.
oDecTreeModel =  DecisionTreeClassifier()
oDecTreeModel.fit(X_train, y_train)

y_pred = oDecTreeModel.predict(X_test)

print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))

# Score the positive class (1 = paid late at least once). With a binary target,
# average='micro' makes recall, precision and F1 all equal to the accuracy.
print('Recall: ', metrics.recall_score(y_test, y_pred, zero_division=0, average='binary'))

print('Precision: ', metrics.precision_score(y_test, y_pred, zero_division=0, average='binary'))

print('F1-Score: ', metrics.f1_score(y_test, y_pred, zero_division=0, average='binary'))

## Multi Layer Perceptron

In [None]:
# Two ReLU hidden layers (50 and 20 units) followed by one sigmoid output unit.
oMlpModel = tf.keras.Sequential([
    tf.keras.layers.Dense(50, activation='relu', kernel_initializer='he_normal', input_shape=aFeaturesX.shape),
    tf.keras.layers.Dense(20, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

oMlpModel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

oMlpModel.fit(X_train, y_train, epochs=50, batch_size=128, verbose=1)

In [None]:
# The network has a single sigmoid output, so predict() returns one probability
# per row. argmax over that single column is always 0; threshold at 0.5 instead.
aPredictedProbabilities = oMlpModel.predict(X_test)
y_pred = (aPredictedProbabilities > 0.5).astype(int).ravel()

print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))

# Score the positive class (1 = paid late at least once). With a binary target,
# average='micro' makes recall, precision and F1 all equal to the accuracy.
print('Recall: ', metrics.recall_score(y_test, y_pred, zero_division=0, average='binary'))

print('Precision: ', metrics.precision_score(y_test, y_pred, zero_division=0, average='binary'))

print('F1-Score: ', metrics.f1_score(y_test, y_pred, zero_division=0, average='binary'))