# IMPORT LIBRARIES

In [1466]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

# CONFIGURATION OF LIBRARIES

In [1500]:
pd.set_option("display.max_columns", None)

InteractiveShell.ast_node_interactivity = "all"

plt.figure(figsize=(14,8))

pd.options.display.max_rows = 100

pd.options.mode.chained_assignment = None  # default='warn'

<Figure size 1008x576 with 0 Axes>

<Figure size 1008x576 with 0 Axes>

# DATA READING DATA FROM SOURCE

dfPolicyData = pd.read_csv("PolicyData.csv", delimiter = ";", encoding='latin-1')
dfPolicyData.set_index("policy_guid", inplace = True)

dfInvoiceData = pd.read_csv("InvoiceData.csv", delimiter = ";")
dfInvoiceData.set_index("invoice_guid", inplace = True)


In [1469]:
# They are converted from object to float
dfInvoiceData["amount_premium"] = dfInvoiceData["amount_premium"].apply(lambda x: x.replace(',','.'), ).astype(float, errors = 'raise')
dfPolicyData["Premium"] = dfPolicyData["Premium"].apply(lambda x: x.replace(',','.'), ).astype(float, errors = 'raise')

In [1470]:
# 1: Invoice is paid late
# 0: Invoice is not paid late
aLabels = [1, 0]
aConditions = [
    (dfInvoiceData["due_date"] < dfInvoiceData["paid_date"]),
    (dfInvoiceData["due_date"] >= dfInvoiceData["paid_date"])
]

dfInvoiceData["is_it_paid_late"] = np.select(aConditions, aLabels)

In [1471]:
# In the dataset not all policies are ended. It s a snapshot probably around June 2018.
# There could be 3 status of policies: "Ended on time", "Terminated" and "Ongoing"
# We can build model not based on policy status, but based on number of issued invoices.

oInvocieGroupByPolicy = dfInvoiceData[["policy_guid", "is_it_paid_late"]].groupby(["policy_guid"])

dfInvoiceIssueStatistics = oInvocieGroupByPolicy.agg(["count", "sum"])

dfInvoiceIssueStatistics = dfInvoiceIssueStatistics["is_it_paid_late"] 

dfInvoiceIssueStatistics.columns = ["number_of_invoices", "number_of_late_payments"]

dfInvoiceIssueStatistics["late_payment_ratio"] = dfInvoiceIssueStatistics["number_of_late_payments"]/dfInvoiceIssueStatistics["number_of_invoices"]

dfModelData = dfPolicyData.join(dfInvoiceIssueStatistics)

In [1472]:
# 1: Policy is paid late at least once
# 0: Policy is never paid late
aLabels = [1, 0]
aConditions = [
    (dfModelData["number_of_late_payments"] >= 1),
    (dfModelData["number_of_late_payments"] == 0)
]

dfModelData["is_it_paid_late"] = np.select(aConditions, aLabels)

# MISSING DATA INPUTATION

In [1474]:
# Columns that are NaN or 'Missing' in policy dataset

dfPolicyData.columns[dfPolicyData.isna().any()].tolist()

dfMissing = dfPolicyData.astype(str) == "Missing"
dfPolicyData.columns[dfMissing.any()].tolist()


['Deductible_general', 'ClientBirthday', 'BMClassMOD', 'avgFuelConsumption']

['Region', 'FuelType', 'DriveTrain']

In [1475]:
# Columns that are NaN or 'Missing' in invoice dataset.
# There is no missing data invoice dataset. 

dfInvoiceData.columns[dfInvoiceData.isna().any()].tolist()

dfMissing = dfInvoiceData.astype(str) == "Missing"
dfInvoiceData.columns[dfMissing.any()].tolist()

[]

[]

In [1476]:
# There are policies where number of invoices are greater than number of issued invoices.
dfToQuestion = dfModelData[dfModelData["number_of_invoices"] > dfModelData["Nb_of_payments"]].loc[:, ["number_of_invoices", "Nb_of_payments"]]

## Deductible_general

There are only 2 rows where Deductible_general is missing. 

Since they are relatively small amount of rows for this dataset, these rows are deleted.

In [1477]:
dfMissingDecutibleGeneral = dfModelData[dfModelData["Deductible_general"].isna()]
dfModelData.drop(dfMissingDecutibleGeneral.index, inplace = True)

## ClientBirthday

There are only 43 rows where ClientBirthday is missing.

Since they are relatively small amount of rows for this dataset, these rows are deleted.

In [1478]:
dfMissingClientBirthday = dfModelData[dfModelData["ClientBirthday"].isna()]
dfModelData.drop(dfMissingClientBirthday.index, inplace = True)

BMClassMOD:

There are only 14 rows where BMClassMOD is missing.

Since they are relatively small amount of rows for this dataset, these rows are deleted.

In [1479]:
dfMissingBmClassMod = dfModelData[dfModelData["BMClassMOD"].isna()]
dfModelData.drop(dfMissingBmClassMod.index, inplace = True)

avgFuelConsumption:

There are 27927 rows where avgFuelConsumption is missing.

Since it s a big amount of rows, correlation between avgFuelConsumption and other fields are calculated for non-missing data.

It is found out that avgFuelConsumption has fair linear correlation with the attributes of Power, Weight,  VehicleFirstRegistrationYear and Premium.

Missing data is filled based on random forest classifier model since avgFuelConsumption field contains cardinal-categorical data.

In [1480]:
# dfMissingAvgFuelConsumption = dfModelData[dfModelData["avgFuelConsumption"].isna()]
# dfNonMissingAvgFuelConsumption = dfModelData.drop(dfMissingAvgFuelConsumption.index, inplace = False)

# aUniqueAvgFuelConsumption = dfNonMissingAvgFuelConsumption["avgFuelConsumption"].unique()


# dfAvgFuelConsumptionClassified = pd.get_dummies(dfNonMissingAvgFuelConsumption["avgFuelConsumption"])
# dfNonMissingAvgFuelConsumption[aUniqueAvgFuelConsumption] = dfAvgFuelConsumptionClassified

# dfCorr = dfNonMissingAvgFuelConsumption[["avgFuelConsumption", "Power", "Weight", "VehicleFirstRegistrationYear", "Premium"]].corr()

# sns.heatmap(dfCorr.abs(), vmin=0, vmax=1, annot = True, cmap="Greens")

# dfX = dfNonMissingAvgFuelConsumption[["Power", "Weight", "VehicleFirstRegistrationYear", "Premium"]]
# dfY = dfNonMissingAvgFuelConsumption[aUniqueAvgFuelConsumption]

# X_train, X_test, y_train, y_test = train_test_split(dfX, dfY, test_size=0.3, random_state=1)


# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# oRandForModel = RandomForestClassifier()
# oRandForModel.fit(X_train, y_train)

# y_pred = oRandForModel.predict(X_test)

# print('RMSE: ', metrics.mean_squared_error(y_test, y_pred, squared = False))

# print('R2: ', metrics.r2_score(y_test, y_pred))

# print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))

# print('Recall: ', metrics.recall_score(y_test, y_pred,zero_division=0,  average = 'micro'))

# print('Precision: ', metrics.precision_score(y_test, y_pred,zero_division=0,  average = 'micro'))

# print('F1-Score: ', metrics.f1_score(y_test, y_pred,zero_division=0,  average = 'micro'))

# aPredictedClassesForMissing = oRandForModel.predict(dfMissingAvgFuelConsumption[["Power", "Weight", "VehicleFirstRegistrationYear", "Premium"]])

# dfPredictedClassesForMissing = pd.DataFrame(data = aPredictedClassesForMissing ,  columns = aUniqueAvgFuelConsumption, index = dfMissingAvgFuelConsumption.index )

# sPredictedLabels = dfPredictedClassesForMissing.idxmax(axis=1)

# dfModelData["avgFuelConsumption"].fillna(sPredictedLabels, inplace = True)

In [1481]:
# dfModelData["FuelType"] = dfModelData["FuelType"].astype(str).apply(
#     lambda x: x.replace('Missing','Missing_FuelType'), )

# aUniqueFuelTypes = dfModelData["FuelType"].unique()

# dfFuelTypesClassified = pd.get_dummies(dfModelData["FuelType"])
# dfModelData = dfModelData.join(dfFuelTypesClassified, on= "policy_guid")


In [1482]:
# dfModelData["DriveTrain"] = dfModelData["DriveTrain"].astype(str).apply(
#     lambda x: x.replace('Missing','Missing_DriveTrain'), )

# aUniqueDriveTrains = dfModelData["DriveTrain"].unique()

# dfDriveTrainsClassified = pd.get_dummies(dfModelData["DriveTrain"])
# dfModelData = dfModelData.join(dfDriveTrainsClassified, on= "policy_guid")


Region: 

This field is converted to latitude and longitude form.

Exploratory analysis is performed to identify relationships.

It is observed that there is no strong linear relationship between coordinates and other attributes of policy dataset.

Since there is no relationship identified for non-missing data. And since there are 6727 rows (aprx. 9% of whole dataset), rows that are missing have been removed from dataset.

In [1483]:
# from geopy.geocoders import Nominatim
# import time
# from pprint import pprint

# # instantiate a new Nominatim client
# oGeolocator = Nominatim(user_agent="tutorial")

# dfRegionsWithCoordinates = pd.DataFrame(index = aUniqueRegions, columns = ["latitude", "longitude"])

# for i in range(len(aUniqueRegions)):
#     sRegion = aUniqueRegions[i]
    
#     if sRegion != "Missing_Region":
#         oLocation = oGeolocator.geocode(sRegion)
#         fLatitue = oLocation.latitude
#         fLongitude = oLocation.longitude
#     else:
#         fLatitue = 0
#         fLongitude = 0

#     dfRegionsWithCoordinates.loc[sRegion, "latitude"] = fLatitue
#     dfRegionsWithCoordinates.loc[sRegion, "longitude"] = fLongitude

# dfRegionsWithCoordinates.index.name = "Region"
# dfRegionsWithCoordinates.reset_index(level=0, inplace=True)




In [1484]:
# dfModelData = pd.merge(dfModelData,dfRegionsWithCoordinates,on='Region')

# dfModelData[["latitude", "longitude"]] = dfModelData[["latitude", "longitude"]].astype(float, errors = 'raise')

# dfModelNonMissingRegions = dfModelData[dfModelData["Region"] != "Missing_Region"]

# aColumnsToAnalyze = np.concatenate((dfPolicyData.columns, ["latitude", "longitude"]))

# dfCorr = dfModelNonMissingRegions[aColumnsToAnalyze].corr()

# sns.heatmap(dfCorr.abs(), vmin=0, vmax=1, cmap="Greens")

In [1485]:
# dfModelNonMissingRegions.shape
# dfModelData.shape

In [1486]:
dfPolicyDataWithMissingRegion = dfModelData[dfModelData["Region"] == "Missing"]
dfModelData.drop(dfPolicyDataWithMissingRegion.index, inplace = True)


Fuel Type:

For missing fuel types, we can use policy dataset as a "vehicle" dataset where we can build a classification model to identify fuel type. Logically, fuel type is related with vehicle attributes related features such as 'VehicleType', 'VehicleUsage', 'Power', 'Weight','VehicleFirstRegistrationYear', 'Mark', 'Model'

In [1522]:
def ReplaceMissingDataWithClassifier(dfModelData, aCategoricalFeatures, aContinuousFeatures, sTargetFeature):

    dfModelDataCopy = dfModelData.copy()
    
    aFeaturesX = []
    
    for i in range(len(aCategoricalFeatures)):
        sCategoricalFeature = aCategoricalFeatures[i]

        # to avoid "other" value for multiple attributes
        dfTemp = dfModelDataCopy[dfModelDataCopy[sCategoricalFeature] == "OTHER"]
        dfTemp[sCategoricalFeature] = "Other_" + str(sCategoricalFeature)
        
        dfModelDataCopy[sCategoricalFeature] = dfTemp[sCategoricalFeature]
        
        dfCategoricalFeatureClassified = pd.get_dummies(dfModelDataCopy[sCategoricalFeature])
        
        dfModelDataCopy = dfModelDataCopy.join(dfCategoricalFeatureClassified, on= "policy_guid")
        
        aFeaturesX = np.concatenate([aFeaturesX, dfCategoricalFeatureClassified.columns])
 
        
    aFeaturesX = np.concatenate([aFeaturesX, aContinuousFeatures])

    dfTargetFeatureClassified= pd.get_dummies(dfModelDataCopy[sTargetFeature])
    
    dfModelDataCopy = dfModelDataCopy.join(dfTargetFeatureClassified, on= "policy_guid")

    aFeaturesY = np.delete(dfTargetFeatureClassified.columns, np.where(dfTargetFeatureClassified.columns == "Missing") )

    dfMissingData = dfModelDataCopy[dfModelData[sTargetFeature] == "Missing"]
    dfNonMissingData = dfModelDataCopy.drop(dfMissingData.index, inplace = False)
    
    dfX = dfNonMissingData[aFeaturesX]
    dfY = dfNonMissingData[aFeaturesY]

    X_train, X_test, y_train, y_test = train_test_split(dfX, dfY, test_size=0.3, random_state=1)

    oDecTreeModel =  DecisionTreeClassifier()
    oDecTreeModel.fit(X_train, y_train)

    y_pred = oDecTreeModel.predict(X_test)

    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))

    print('Recall: ', metrics.recall_score(y_test, y_pred,zero_division=0,  average = 'micro'))

    print('Precision: ', metrics.precision_score(y_test, y_pred,zero_division=0,  average = 'micro'))

    print('F1-Score: ', metrics.f1_score(y_test, y_pred,zero_division=0,  average = 'micro'))
    
    dfX_Missing = dfMissingData[aFeaturesX]
    aPredictionsForMissing = oDecTreeModel.predict(dfX_Missing)
    dfPredictionsForMissing = pd.DataFrame(data = aPredictionsForMissing, columns = aFeaturesY, index = dfMissingData.index)
    
    aPredictedLabels = dfPredictionsForMissing.idxmax(axis=1)

    dfModelData.loc[dfMissingData.index,sTargetFeature] = aPredictedLabels

In [1514]:
ReplaceMissingDataWithClassifier(dfModelData, 
                                ['VehicleType', 'VehicleUsage', 'Mark', 'Model'], 
                                ['Power', 'Weight','VehicleFirstRegistrationYear'], 
                                'FuelType')

Accuracy:  0.9784807281717047
Recall:  0.9784807281717047
Precision:  0.9804639117216529
F1-Score:  0.9794713160854893


DriveTrain:

There are 22558 rows that have missing information for this field. Since it s a around 36% of whole data and 'DriveTrain' depends on vehicle attributes, we can replace missing data with a classifier.

In [1523]:
ReplaceMissingDataWithClassifier(dfModelData, 
                                ['VehicleType', 'VehicleUsage', 'Mark', 'Model'], 
                                ['Power', 'Weight','VehicleFirstRegistrationYear'], 
                                'DriveTrain')

Accuracy:  0.9669183136822461
Recall:  0.9669183136822461
Precision:  0.9727769243898277
F1-Score:  0.9698387714297869
