In [37]:
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [38]:
# Load the insurance policies CSV into a DataFrame and preview the first rows.
# The location is built from the current user's home directory rather than a
# hard-coded /Users/<name>/... string, so the notebook is not tied to a single
# machine. Point DATA_PATH elsewhere if the file is not kept in ~/Downloads.
DATA_PATH = Path.home() / "Downloads" / "Insurance_Policies.csv"
insurance = pd.read_csv(DATA_PATH)
insurance.head()

Unnamed: 0,ID,birthdate,marital_status,car_use,gender,kids_driving,parent,education,car_make,car_model,car_color,car_year,claim_freq,coverage_zone,claim_amt,household_income
0,62-2999778,8/9/1962,Single,Private,Male,2,Yes,High School,Acura,TSX,Green,2010,1,Highly Urban,$73759.88,$220436.66
1,70-2426103,4/21/1988,Married,Private,Female,0,No,Bachelors,Corbin,Sparrow,Turquoise,2004,1,Urban,$78975.41,$66491.43
2,08-3808219,3/8/1999,Divorced,Private,Male,0,No,Bachelors,Nissan,Pathfinder,Orange,1993,0,Rural,$30904.01,$56122.70
3,38-0306843,5/10/1959,Single,Private,Female,0,No,Bachelors,Ford,Econoline E350,Pink,2000,1,Highly Urban,$30257.82,$175182.61
4,47-5163637,1/15/1992,Single,Commercial,Male,0,No,Masters,Nissan,350Z,Green,2006,3,Rural,$50434.02,$137110.23


In [39]:
# Dimensions of the loaded frame as a (rows, columns) tuple
insurance.shape

(37542, 16)

In [40]:
# List the column labels available in the dataset
insurance.columns

Index(['ID', 'birthdate', 'marital_status', 'car_use', 'gender',
       'kids_driving', 'parent', 'education', 'car_make', 'car_model',
       'car_color', 'car_year', 'claim_freq', 'coverage_zone', 'claim_amt',
       'household_income'],
      dtype='object')

In [34]:
# Per-column non-null counts and dtypes of the raw frame.
# The output shows claim_amt and household_income loaded as object (string)
# columns, because their values carry a leading "$".
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37542 entries, 0 to 37541
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                37542 non-null  object
 1   birthdate         37542 non-null  object
 2   marital_status    37542 non-null  object
 3   car_use           37542 non-null  object
 4   gender            37542 non-null  object
 5   kids_driving      37542 non-null  int64 
 6   parent            37542 non-null  object
 7   education         37542 non-null  object
 8   car_make          37542 non-null  object
 9   car_model         37542 non-null  object
 10  car_color         37542 non-null  object
 11  car_year          37542 non-null  int64 
 12  claim_freq        37542 non-null  int64 
 13  coverage_zone     37542 non-null  object
 14  claim_amt         37542 non-null  object
 15  household_income  37542 non-null  object
dtypes: int64(3), object(13)
memory usage: 4.6+ MB


In [26]:
# Count missing values in each column
insurance.isna().sum()

ID                  0
birthdate           0
marital_status      0
car_use             0
gender              0
kids_driving        0
parent              0
education           0
car_make            0
car_model           0
car_color           0
car_year            0
claim_freq          0
coverage_zone       0
claim_amt           0
household_income    0
dtype: int64

In [7]:
# Count rows that exactly repeat an earlier row (all columns compared).
# NOTE(review): this does not check key uniqueness. The
# describe(include='object') output further down reports 37541 unique IDs for
# 37542 rows, so one ID appears twice -- confirm whether that is expected.
insurance.duplicated().sum()

0

In [8]:
# Share of missing entries per column, expressed as a percentage:
# the mean of the boolean NA mask is the missing fraction.
missing_percentage = insurance.isna().mean().mul(100)
print(missing_percentage)

ID                  0.0
birthdate           0.0
marital_status      0.0
car_use             0.0
gender              0.0
kids_driving        0.0
parent              0.0
education           0.0
car_make            0.0
car_model           0.0
car_color           0.0
car_year            0.0
claim_freq          0.0
coverage_zone       0.0
claim_amt           0.0
household_income    0.0
dtype: float64


## Exploratory Data Analysis (EDA)

In [41]:
# Summary statistics for the numeric columns, transposed so each column is a row.
# NOTE(review): the output shows car_year with a minimum of 1909 -- possibly an
# outlier or data-entry error; confirm before modelling.
insurance.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
kids_driving,37542.0,0.418278,0.736958,0.0,0.0,0.0,1.0,3.0
car_year,37542.0,2000.293005,9.045441,1909.0,1995.0,2002.0,2007.0,2013.0
claim_freq,37542.0,0.510308,1.01505,0.0,0.0,0.0,1.0,4.0


In [42]:
# Summary (count / unique / top / freq) of the object-typed columns, one row per column.
# The ID row in the output shows one fewer unique value than rows, i.e. one ID repeats.
insurance.describe(include='object').T

Unnamed: 0,count,unique,top,freq
ID,37542,37541,56-5402470,2
birthdate,37542,16525,10/23/1981,9
marital_status,37542,4,Single,15525
car_use,37542,2,Private,30060
gender,37542,2,Female,18806
parent,37542,2,No,20932
education,37542,4,Bachelors,18701
car_make,37542,78,Ford,3302
car_model,37542,1011,Grand Prix,250
car_color,37542,19,Turquoise,2078


In [45]:
# Strip the "$" sign from the two currency columns so they can be parsed as numbers.
# regex=False makes "$" a literal character: as a regular expression "$" is the
# end-of-string anchor, and the default value of `regex` differs between pandas
# versions, so it is stated explicitly.
# Columns that are already numeric are skipped, so this cell can be re-run after
# the float conversion without the `.str` accessor raising.
for currency_col in ('claim_amt', 'household_income'):
    if not pd.api.types.is_numeric_dtype(insurance[currency_col]):
        insurance[currency_col] = insurance[currency_col].str.replace('$', '', regex=False)
print(insurance['claim_amt'].head(4))
print(insurance['household_income'].head(4))

0    73759.88
1    78975.41
2    30904.01
3    30257.82
Name: claim_amt, dtype: object
0    220436.66
1     66491.43
2     56122.70
3    175182.61
Name: household_income, dtype: object


In [46]:
# Cast the cleaned currency columns from object (string) dtype to float
for amount_col in ('claim_amt', 'household_income'):
    insurance[amount_col] = insurance[amount_col].astype(float)


In [47]:
# Re-inspect the dtypes to confirm claim_amt and household_income are now float64
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37542 entries, 0 to 37541
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                37542 non-null  object 
 1   birthdate         37542 non-null  object 
 2   marital_status    37542 non-null  object 
 3   car_use           37542 non-null  object 
 4   gender            37542 non-null  object 
 5   kids_driving      37542 non-null  int64  
 6   parent            37542 non-null  object 
 7   education         37542 non-null  object 
 8   car_make          37542 non-null  object 
 9   car_model         37542 non-null  object 
 10  car_color         37542 non-null  object 
 11  car_year          37542 non-null  int64  
 12  claim_freq        37542 non-null  int64  
 13  coverage_zone     37542 non-null  object 
 14  claim_amt         37542 non-null  float64
 15  household_income  37542 non-null  float64
dtypes: float64(2), int64(3), object(11)
memo

In [48]:
# Preview the first five rows after the currency columns were converted to numbers
insurance.head(5)

Unnamed: 0,ID,birthdate,marital_status,car_use,gender,kids_driving,parent,education,car_make,car_model,car_color,car_year,claim_freq,coverage_zone,claim_amt,household_income
0,62-2999778,8/9/1962,Single,Private,Male,2,Yes,High School,Acura,TSX,Green,2010,1,Highly Urban,73759.88,220436.66
1,70-2426103,4/21/1988,Married,Private,Female,0,No,Bachelors,Corbin,Sparrow,Turquoise,2004,1,Urban,78975.41,66491.43
2,08-3808219,3/8/1999,Divorced,Private,Male,0,No,Bachelors,Nissan,Pathfinder,Orange,1993,0,Rural,30904.01,56122.7
3,38-0306843,5/10/1959,Single,Private,Female,0,No,Bachelors,Ford,Econoline E350,Pink,2000,1,Highly Urban,30257.82,175182.61
4,47-5163637,1/15/1992,Single,Commercial,Male,0,No,Masters,Nissan,350Z,Green,2006,3,Rural,50434.02,137110.23
