In [None]:
import pandas as pd
import numpy as np
import io
import gc
import time
import PIL.Image as Image
import matplotlib.pylab as plt
from datetime import date

import tensorflow as tf
import tensorflow_hub as hub

# settings
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation notices emitted by
# the libraries used in later cells — consider narrowing it.
warnings.filterwarnings("ignore")
# Make sure automatic garbage collection is switched on.
gc.enable()

In [None]:
# connect to google drive
# (google.colab is only importable inside a Google Colab runtime)
from google.colab import drive
# Mount the user's Drive under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder of the insurance-claim dataset on the mounted Drive
gDrivePath = '/content/drive/MyDrive/Datasets/Hackerearth_vehicle_insurance_claim/dataset/'
# The image folders sit directly under the dataset root
gDriveTrainImages = gDrivePath + 'trainImages/'
gDriveTestImages = gDrivePath + 'testImages/'

In [None]:
# Load the training table from the dataset folder and preview its first rows
df_train = pd.read_csv(f'{gDrivePath}train.csv')
df_train.head()

Unnamed: 0,Image_path,Insurance_company,Cost_of_vehicle,Min_coverage,Expiry_date,Max_coverage,Condition,Amount
0,img_4513976.jpg,BQ,41500.0,1037.5,2026-12-03,36142.68,0,0.0
1,img_7764995.jpg,BQ,50700.0,1267.5,2025-07-10,12753.0,1,6194.0
2,img_451308.jpg,A,49500.0,1237.5,2022-08-11,43102.68,0,0.0
3,img_7768372.jpg,A,33500.0,837.5,2022-08-02,8453.0,1,7699.0
4,img_7765274.jpg,AC,27600.0,690.0,2026-05-01,6978.0,1,8849.0


In [None]:
# Load the test table from the dataset folder and preview its first rows
df_test = pd.read_csv(f'{gDrivePath}test.csv')
df_test.head()

Unnamed: 0,Image_path,Insurance_company,Cost_of_vehicle,Min_coverage,Expiry_date,Max_coverage
0,img_4538519.jpg,B,23600,590.0,2025-04-12,5978.0
1,img_7766002.jpg,C,28300,707.5,2028-08-24,7153.0
2,img_4637390.jpg,AC,43700,1092.5,2023-11-28,11003.0
3,img_4516108.jpg,BB,46100,1152.5,2028-02-04,11603.0
4,img_4517008.jpg,BB,40700,1017.5,2022-01-03,10253.0


In [None]:
# Inspect the dtype pandas inferred for each training column
# (in the saved output Expiry_date is still a plain object column here).
df_train.dtypes

Image_path            object
Insurance_company     object
Cost_of_vehicle      float64
Min_coverage         float64
Expiry_date           object
Max_coverage         float64
Condition              int64
Amount               float64
dtype: object

In [None]:
# Per-column flag: True where the training frame has at least one missing value
df_train.isnull().any()

Image_path           False
Insurance_company    False
Cost_of_vehicle       True
Min_coverage          True
Expiry_date          False
Max_coverage          True
Condition            False
Amount                True
dtype: bool

In [None]:
# Preprocess Date field: parse the Expiry_date strings of both frames into datetimes
df_train['Expiry_date'] = df_train['Expiry_date'].pipe(pd.to_datetime)
df_test['Expiry_date'] = df_test['Expiry_date'].pipe(pd.to_datetime)

# Rows stay in their original file order: sorting by Expiry_date and resetting
# the index is not applied here.
df_train.head()

Unnamed: 0,Image_path,Insurance_company,Cost_of_vehicle,Min_coverage,Expiry_date,Max_coverage,Condition,Amount
0,img_4513976.jpg,BQ,41500.0,1037.5,2026-12-03,36142.68,0,0.0
1,img_7764995.jpg,BQ,50700.0,1267.5,2025-07-10,12753.0,1,6194.0
2,img_451308.jpg,A,49500.0,1237.5,2022-08-11,43102.68,0,0.0
3,img_7768372.jpg,A,33500.0,837.5,2022-08-02,8453.0,1,7699.0
4,img_7765274.jpg,AC,27600.0,690.0,2026-05-01,6978.0,1,8849.0


In [None]:
# date_feature_extraction extracts features from date type
def date_feature_extraction(df, field, current_year=None):
  """Add calendar features derived from a datetime column to `df` in place.

  New columns: year, month, day, dayofweek, week, weekofyear, dayofyear,
  days_in_month and currentYear-year (reference year minus the date's year).

  Args:
    df: DataFrame to extend; it is modified in place.
    field: Name of a datetime column in `df` (the `.dt` accessor is used).
    current_year: Reference year for the `currentYear-year` column. Defaults
      to today's year; pass a fixed value to get reproducible features.

  Returns:
    None.
  """
  df['year'] = df[field].dt.year
  df['month'] = df[field].dt.month
  df['day'] = df[field].dt.day
  df['dayofweek'] = df[field].dt.dayofweek

  # `.dt.week` / `.dt.weekofyear` are deprecated and removed in pandas 2.0;
  # `isocalendar().week` is the supported way to get the ISO week number.
  iso_week = df[field].dt.isocalendar().week
  # isocalendar() yields a nullable UInt32 column; cast to a plain numpy dtype
  # like the other features (float when missing dates produce NA values).
  iso_week = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
  # Both names are kept because existing consumers may expect either column.
  df['week'] = iso_week
  df['weekofyear'] = iso_week

  df['dayofyear'] = df[field].dt.dayofyear
  df['days_in_month'] = df[field].dt.days_in_month

  # Years until current_year (negative when the date lies in the future)
  if current_year is None:
    current_year = date.today().year
  df['currentYear-year'] = current_year - df['year']

In [None]:
# Derive the calendar feature columns from Expiry_date for both frames
date_feature_extraction(df_train, field='Expiry_date')
date_feature_extraction(df_test, field='Expiry_date')
df_train.head()

Unnamed: 0,Image_path,Insurance_company,Cost_of_vehicle,Min_coverage,Expiry_date,Max_coverage,Condition,Amount,year,month,day,dayofweek,week,weekofyear,dayofyear,days_in_month,currentYear-year
0,img_4513976.jpg,BQ,41500.0,1037.5,2026-12-03,36142.68,0,0.0,2026,12,3,3,49,49,337,31,-5
1,img_7764995.jpg,BQ,50700.0,1267.5,2025-07-10,12753.0,1,6194.0,2025,7,10,3,28,28,191,31,-4
2,img_451308.jpg,A,49500.0,1237.5,2022-08-11,43102.68,0,0.0,2022,8,11,3,32,32,223,31,-1
3,img_7768372.jpg,A,33500.0,837.5,2022-08-02,8453.0,1,7699.0,2022,8,2,1,31,31,214,31,-1
4,img_7765274.jpg,AC,27600.0,690.0,2026-05-01,6978.0,1,8849.0,2026,5,1,4,18,18,121,31,-5


In [None]:
# Number of distinct Insurance_company values in the training frame
df_train['Insurance_company'].nunique()

11

### Encoding Categorical Variables

In [None]:
from sklearn import preprocessing

categorical_columns_list = ['Insurance_company']

# Fitted encoder per column, kept so integer codes can be mapped back to labels
label_object = {}
categorical_columns = categorical_columns_list
for col in categorical_columns:
    train_labels = df_train[col].astype(str)
    test_labels = df_test[col].astype(str)
    labelencoder = preprocessing.LabelEncoder()
    # Fit on the labels of BOTH frames: an encoder fitted on train alone raises
    # ValueError in transform() for any test label it has not seen. When every
    # test label also occurs in train the resulting codes are unchanged.
    labelencoder.fit(pd.concat([train_labels, test_labels], ignore_index=True))
    df_train[col] = labelencoder.transform(train_labels)
    df_test[col] = labelencoder.transform(test_labels)
    label_object[col] = labelencoder

# NOTE(review): this cell overwrites the columns it reads, so running it twice
# would re-encode the integer codes as strings — run it once per fresh load.

# Sample inverse_transform
label_object['Insurance_company'].inverse_transform(df_train['Insurance_company'][:5])

array(['BQ', 'BQ', 'A', 'A', 'AC'], dtype=object)

In [None]:
# Preview the training frame after Insurance_company was replaced by integer codes
df_train.head()

Unnamed: 0,Image_path,Insurance_company,Cost_of_vehicle,Min_coverage,Expiry_date,Max_coverage,Condition,Amount,year,month,day,dayofweek,week,weekofyear,dayofyear,days_in_month,currentYear-year
0,img_4513976.jpg,6,41500.0,1037.5,2026-12-03,36142.68,0,0.0,2026,12,3,3,49,49,337,31,-5
1,img_7764995.jpg,6,50700.0,1267.5,2025-07-10,12753.0,1,6194.0,2025,7,10,3,28,28,191,31,-4
2,img_451308.jpg,0,49500.0,1237.5,2022-08-11,43102.68,0,0.0,2022,8,11,3,32,32,223,31,-1
3,img_7768372.jpg,0,33500.0,837.5,2022-08-02,8453.0,1,7699.0,2022,8,2,1,31,31,214,31,-1
4,img_7765274.jpg,2,27600.0,690.0,2026-05-01,6978.0,1,8849.0,2026,5,1,4,18,18,121,31,-5


In [None]:
# Persist the preprocessed frames (row index omitted) ...
df_train.to_csv('train_preprocessed.csv', index=False)
df_test.to_csv('test_preprocessed.csv', index=False)

# ... and reload them from those files, train first, then test.
# NOTE(review): read_csv is called without parse_dates, so Expiry_date is
# presumably reloaded as plain strings — confirm later cells don't need datetimes.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_preprocessed.csv', 'test_preprocessed.csv')
)

In [None]:
# Preview the training frame as reloaded from train_preprocessed.csv
df_train.head()

Unnamed: 0,Image_path,Insurance_company,Cost_of_vehicle,Min_coverage,Expiry_date,Max_coverage,Condition,Amount,year,month,day,dayofweek,week,weekofyear,dayofyear,days_in_month,currentYear-year
0,img_4513976.jpg,6,41500.0,1037.5,2026-12-03,36142.68,0,0.0,2026,12,3,3,49,49,337,31,-5
1,img_7764995.jpg,6,50700.0,1267.5,2025-07-10,12753.0,1,6194.0,2025,7,10,3,28,28,191,31,-4
2,img_451308.jpg,0,49500.0,1237.5,2022-08-11,43102.68,0,0.0,2022,8,11,3,32,32,223,31,-1
3,img_7768372.jpg,0,33500.0,837.5,2022-08-02,8453.0,1,7699.0,2022,8,2,1,31,31,214,31,-1
4,img_7765274.jpg,2,27600.0,690.0,2026-05-01,6978.0,1,8849.0,2026,5,1,4,18,18,121,31,-5


In [None]:
# Preview the test frame as reloaded from test_preprocessed.csv
df_test.head()

Unnamed: 0,Image_path,Insurance_company,Cost_of_vehicle,Min_coverage,Expiry_date,Max_coverage,year,month,day,dayofweek,week,weekofyear,dayofyear,days_in_month,currentYear-year
0,img_4538519.jpg,3,23600,590.0,2025-04-12,5978.0,2025,4,12,5,15,15,102,30,-4
1,img_7766002.jpg,7,28300,707.5,2028-08-24,7153.0,2028,8,24,3,34,34,237,31,-7
2,img_4637390.jpg,2,43700,1092.5,2023-11-28,11003.0,2023,11,28,1,48,48,332,30,-2
3,img_4516108.jpg,4,46100,1152.5,2028-02-04,11603.0,2028,2,4,4,5,5,35,29,-7
4,img_4517008.jpg,4,40700,1017.5,2022-01-03,10253.0,2022,1,3,0,1,1,3,31,-1
