# 🧹 Auto Insurance Fraud Detection - Data Cleaning & Preprocessing

## 📋 Objectives:
1. **Data Cleaning** - Remove redundant data, handle duplicates
2. **Missing Value Analysis** - Identify and handle missing values
3. **Outlier Detection** - Identify and treat outliers
4. **Data Transformation** - Apply necessary transformations
5. **Feature Engineering** - Create 5 new meaningful variables
6. **Ordinal Encoding** - Handle categorical variables with proper ordering

---

## 📦 Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer
import warnings
warnings.filterwarnings('ignore')

# Plot appearance. Order matters here: the stylesheet is applied first so that
# the colour palette and default figure size set afterwards are not overwritten.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Confirmation message once imports and plot configuration have run without error
print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


## 📊 Load Data

In [2]:
# Load the two raw training extracts and stack them row-wise into `df`.
# File paths are relative to the notebook's working directory.
print("📥 Loading datasets...")
train1 = pd.read_csv('dataset/Auto Insurance Fraud Claims (1).csv')
train2 = pd.read_csv('dataset/Auto Insurance Fraud Claims 02.csv')

# Schema guard: pd.concat aligns columns with an outer join by default, so a
# column present in only one file would silently become a NaN-filled column in
# the combined frame. Fail loudly instead of letting that leak into the
# missing-value analysis further down.
schema_diff = train1.columns.symmetric_difference(train2.columns)
if len(schema_diff) > 0:
    raise ValueError(
        f"Training files do not share the same columns; mismatched: {list(schema_diff)}"
    )

# Combine training data; ignore_index=True replaces the per-file row labels
# with one fresh 0..n-1 index so row labels are unique in the combined frame.
df = pd.concat([train1, train2], ignore_index=True)

print(f"✅ Dataset 1 shape: {train1.shape}")
print(f"✅ Dataset 2 shape: {train2.shape}")
print(f"✅ Combined dataset shape: {df.shape}")
print(f"\n📋 Columns: {list(df.columns)}")

📥 Loading datasets...
✅ Dataset 1 shape: (40000, 53)
✅ Dataset 2 shape: (20000, 53)
✅ Combined dataset shape: (60000, 53)

📋 Columns: ['Claim_ID', 'Bind_Date1', 'Customer_Life_Value1', 'Age_Insured', 'Policy_Num', 'Policy_State', 'Policy_Start_Date', 'Policy_Expiry_Date', 'Policy_BI', 'Policy_Ded', 'Policy_Premium', 'Umbrella_Limit', 'Insured_Zip', 'Gender', 'Education', 'Occupation', 'Hobbies', 'Insured_Relationship', 'Capital_Gains', 'Capital_Loss', 'Garage_Location', 'Accident_Date', 'Accident_Type', 'Collision_Type', 'Accident_Severity', 'authorities_contacted', 'Acccident_State', 'Acccident_City', 'Accident_Location', 'Accident_Hour', 'Num_of_Vehicles_Involved', 'Property_Damage', 'Bodily_Injuries', 'Witnesses', 'Police_Report', 'DL_Expiry_Date', 'Claims_Date', 'Auto_Make', 'Auto_Model', 'Auto_Year', 'Vehicle_Color', 'Vehicle_Cost', 'Annual_Mileage', 'DiffIN_Mileage', 'Low_Mileage_Discount', 'Fraud_Ind', 'Commute_Discount', 'Total_Claim', 'Injury_Claim', 'Property_Claim', 'Vehicle

## 🔍 Initial Data Exploration

In [3]:
# Size summary of the combined frame: row/column counts and memory footprint
n_rows, n_cols = df.shape
# deep=True also measures the contents of object columns; convert bytes to 1024**2 units
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print("📊 DATASET OVERVIEW")
print("=" * 50)
print(f"Total rows: {n_rows:,}")
print(f"Total columns: {n_cols}")
print(f"Memory usage: {memory_mb:.2f} MB")

# Preview of the leading records; kept as the bare last expression so the
# notebook renders it as a rich table instead of plain text
print("\n🔍 First 5 rows:")
df.head()

📊 DATASET OVERVIEW
Total rows: 60,000
Total columns: 53
Memory usage: 107.18 MB

🔍 First 5 rows:


Unnamed: 0,Claim_ID,Bind_Date1,Customer_Life_Value1,Age_Insured,Policy_Num,Policy_State,Policy_Start_Date,Policy_Expiry_Date,Policy_BI,Policy_Ded,...,DiffIN_Mileage,Low_Mileage_Discount,Fraud_Ind,Commute_Discount,Total_Claim,Injury_Claim,Property_Claim,Vehicle_Claim,Vehicle_Registration,Check_Point
0,AA00000001,1/1/2023,12,28,123790687,OH,10/13/2023,4/13/2024,500/1000,1000,...,2499,0,N,0,6162.56,714.94,5370.74,76.88,IF47V1395,No
1,AA00000002,1/1/2023,12,31,129044473,IL,10/21/2023,4/21/2024,250/500,500,...,4242,1,N,0,20402.38,7669.31,5708.22,7024.85,EI51L7783,No
2,AA00000003,7/1/2022,18,50,146863149,OH,11/26/2023,5/26/2024,500/1000,500,...,2346,0,N,0,10839.12,3646.11,3468.94,3724.07,MU37B8905,No
3,AA00000004,1/1/2023,12,37,163100869,IL,8/8/2023,2/8/2024,500/1000,1000,...,2425,0,Y,0,17423.88,5585.62,1863.46,9974.8,RI52Q2108,No
4,AA00000005,3/1/2022,22,28,185582958,OH,11/12/2023,5/12/2024,250/500,1000,...,3890,1,N,0,24527.38,7224.79,3074.12,14228.47,UX39O9355,No


In [4]:
# How many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()

print("📈 DATA TYPES & BASIC STATS")
print("=" * 50)
print("\n🏷️ Data Types:")
print(dtype_counts)

# Summary statistics via describe() with default arguments; left as the bare
# last expression so the notebook renders the result as a table
print("\n📊 Basic Statistics:")
df.describe()

📈 DATA TYPES & BASIC STATS

🏷️ Data Types:
object     30
int64      17
float64     6
Name: count, dtype: int64

📊 Basic Statistics:


Unnamed: 0,Customer_Life_Value1,Age_Insured,Policy_Num,Policy_Ded,Policy_Premium,Umbrella_Limit,Insured_Zip,Capital_Gains,Capital_Loss,Accident_Hour,...,Auto_Year,Vehicle_Cost,Annual_Mileage,DiffIN_Mileage,Low_Mileage_Discount,Commute_Discount,Total_Claim,Injury_Claim,Property_Claim,Vehicle_Claim
count,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,...,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0,60000.0
mean,17.993433,38.89985,154926900.0,1154.2,1253.486558,1133133.0,500566.05585,24991.821667,-26471.113333,11.708683,...,2019.500267,13680.161366,11490.384167,4996.3825,0.192467,0.0194,13952.779228,4633.471512,4667.969712,4651.338005
std,3.741153,9.14645,25913340.0,616.261868,245.439288,2319475.0,71379.853902,27779.612687,27910.88518,6.895567,...,2.845728,6996.715399,3751.286248,1736.607541,0.394241,0.137927,11765.455235,5782.412729,5820.557692,5912.908683
min,12.0,19.0,111178900.0,500.0,433.33,0.0,430104.0,0.0,-111100.0,0.0,...,2015.0,4114.85,5000.0,2000.0,0.0,0.0,500.12,0.0,0.11,0.05
25%,15.0,32.0,131697600.0,500.0,1083.72,0.0,448436.0,0.0,-51200.0,6.0,...,2017.0,7867.1725,8246.0,3482.75,0.0,0.0,6689.12,1058.675,1057.9225,1062.1575
50%,18.0,38.0,154616800.0,1000.0,1259.02,0.0,466289.0,0.0,-22300.0,12.0,...,2020.0,12526.53,11483.0,4996.0,0.0,0.0,12969.67,2925.735,2944.9,2926.235
75%,21.0,44.0,177575400.0,2000.0,1412.31,0.0,602942.0,51000.0,0.0,17.0,...,2022.0,17650.7225,14724.0,6499.0,0.0,0.0,19287.4425,6447.66,6481.845,6416.61
max,24.0,64.0,199947500.0,2000.0,2047.59,10000000.0,620962.0,100500.0,0.0,23.0,...,2024.0,53428.96,18000.0,8000.0,1.0,1.0,124800.15,110881.29,112952.95,123443.92
