## EDA

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


In [2]:
# Read the preprocessed CSV into the notebook-wide frame and preview it.
DATA_PATH = 'preprocessed_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,CUSTOMER_ID,SHOPPING_PT,RECORD_TYPE,DAY,TIME,STATE,LOCATION,GROUP_SIZE,HOMEOWNER,CAR_AGE,...,C_PREVIOUS,DURATION_PREVIOUS,A,B,C,D,E,F,G,COST
0,10000000,1,0.0,0.0,08:35:00,IN,10001.0,2.0,0.0,2.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,2.0,633
1,10000000,2,0.0,0.0,08:38:00,IN,10001.0,2.0,0.0,2.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,630
2,10000000,3,0.0,0.0,08:38:00,IN,10001.0,2.0,0.0,2.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,630
3,10000000,4,0.0,0.0,08:39:00,IN,10001.0,2.0,0.0,2.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,630
4,10000000,5,0.0,0.0,11:55:00,IN,10001.0,2.0,0.0,2.0,...,1.0,2.0,1.0,0.0,2.0,2.0,1.0,2.0,1.0,630


In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['CUSTOMER_ID', 'SHOPPING_PT', 'RECORD_TYPE', 'DAY', 'TIME', 'STATE',
       'LOCATION', 'GROUP_SIZE', 'HOMEOWNER', 'CAR_AGE', 'CAR_VALUE',
       'RISK_FACTOR', 'AGE_OLDEST', 'AGE_YOUNGEST', 'MARRIED_COUPLE',
       'C_PREVIOUS', 'DURATION_PREVIOUS', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'COST'],
      dtype='object')

DISTRIBUTION OF FEATURES

In [None]:
# Distribution of COST for each RECORD_TYPE value (seaborn box plot).
plt.figure(figsize=(10, 6))
sns.boxplot(x='RECORD_TYPE', y='COST', data=df)
# The title names the variable actually on the x-axis (RECORD_TYPE); the
# previous "Risk Factor" wording was carried over from the RISK_FACTOR plot.
plt.title('Distribution of Cost by Record Type')
plt.show()

In [None]:
# Pairwise relationship grid (seaborn) for the age, car-age and cost columns.
pair_cols = ['AGE_OLDEST', 'AGE_YOUNGEST', 'CAR_AGE', 'COST']
sns.pairplot(df[pair_cols])
plt.show()

In [None]:
# Interactive Plotly scatter: COST against AGE_OLDEST, coloured by RISK_FACTOR.
fig = px.scatter(
    df,
    x='AGE_OLDEST',
    y='COST',
    color='RISK_FACTOR',
    title='Cost vs Age of Oldest Customer by Risk Factor',
)
fig.show()

In [None]:
# Box plot of COST within each RISK_FACTOR level, drawn on an explicit Axes.
risk_fig, risk_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='RISK_FACTOR', y='COST', data=df, ax=risk_ax)
risk_ax.set_title('Distribution of Cost by Risk Factor')
plt.show()

In [None]:
# Interactive distribution of COST within each CAR_VALUE category.
# A box plot is used because px.bar on un-aggregated rows stacks one segment
# per row, which shows the summed cost per category rather than the
# distribution the title describes.
fig = px.box(df, x='CAR_VALUE', y='COST',
             title='Cost Distribution by Car Value')
fig.show()

In [None]:
# Count of rows per value of coverage option A (seaborn count plot).
plt.figure(figsize=(12, 6))
# Passing `palette` without `hue` is deprecated in recent seaborn releases, so
# the x variable is also assigned to `hue`; dodge=False keeps one full-width
# bar per category.
option_ax = sns.countplot(x='A', hue='A', data=df, palette='viridis', dodge=False)
# The hue legend only repeats the x-axis labels, so drop it when one is drawn.
option_legend = option_ax.get_legend()
if option_legend is not None:
    option_legend.remove()
plt.title('Count of Coverage Option A')
plt.show()

In [None]:
# Interactive bar plot of the mean COST per STATE.
# px.bar does not aggregate: given raw rows it stacks every row's COST, so the
# bar heights would be sums. Compute the per-state mean first so the figure
# matches its "Average Cost by State" title.
state_avg_cost = (
    df.groupby('STATE', as_index=False)['COST']
      .mean()
      .sort_values('COST', ascending=False)  # highest average first
)
fig = px.bar(state_avg_cost, x='STATE', y='COST', title='Average Cost by State')
fig.show()

In [None]:
# Histogram of the target column (COST), one statement per line.
cost_values = df['COST']
plt.hist(cost_values)
plt.title('Cost of Insurance Policy')
plt.xlabel('Cost')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Correlation heatmap over every column except the ones listed below.
# NOTE(review): TIME and STATE appear as text in the preview; CAR_VALUE is
# presumably text as well — confirm.
plt.figure(figsize=(20, 12))
excluded_cols = ['TIME', 'STATE', 'CAR_VALUE']
corr_matrix = df.drop(excluded_cols, axis=1).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Univariate analysis: count plots for discrete and categorical columns.
# Numerically encoded columns that take a small set of discrete values.
disc_cols = ['RECORD_TYPE', 'DAY','GROUP_SIZE', 'HOMEOWNER','RISK_FACTOR','MARRIED_COUPLE','C_PREVIOUS', 'DURATION_PREVIOUS', 'A', 'B', 'C', 'D', 'E', 'F', 'G']
# pd.read_csv never produces 'category' dtype on its own, so selecting only
# 'category' columns returned an empty list and the text columns were never
# plotted. Select every non-numeric column instead, skipping TIME (a
# clock-time string whose many distinct values make a count plot unreadable)
# and anything already listed in disc_cols.
cat_cols = [
    col for col in df.select_dtypes(exclude='number').columns
    if col != 'TIME' and col not in disc_cols
]
merged_cat_cols = disc_cols + cat_cols

for col in merged_cat_cols:
    plt.figure(figsize=(8, 4))
    sns.countplot(data=df, x=col)
    plt.title(f'Count of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)  # tilt labels so longer category names stay legible
    plt.show()

In [None]:
# Univariate analysis: one histogram with a KDE overlay per numeric column.
num_cols = ['COST','AGE_YOUNGEST','AGE_OLDEST','CAR_AGE','LOCATION']
for feature in num_cols:
    plt.figure(figsize=(8, 4))
    hist_ax = sns.histplot(df[feature], kde=True)
    hist_ax.set_title(f'Distribution of {feature}')
    hist_ax.set_xlabel(feature)
    hist_ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Bivariate analysis: COST box plot for every column in merged_cat_cols.
for cat_feature in merged_cat_cols:
    plt.figure(figsize=(8, 4))
    box_ax = sns.boxplot(data=df, x=cat_feature, y='COST')
    box_ax.set_title(f'{cat_feature} vs. Cost')
    box_ax.set_xlabel(cat_feature)
    box_ax.set_ylabel('COST')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Bivariate analysis: scatter of each numeric column (except COST) against COST.
predictor_cols = [name for name in num_cols if name != 'COST']
for predictor in predictor_cols:
    plt.figure(figsize=(8, 4))
    scatter_ax = sns.scatterplot(data=df, x=predictor, y='COST')
    scatter_ax.set_title(f'{predictor} vs. COST')
    scatter_ax.set_xlabel(predictor)
    scatter_ax.set_ylabel('COST')
    plt.show()
