# Data Preprocessing and Modeling

This notebook demonstrates the data preprocessing and modeling steps for the Insurance Premium Prediction project.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Make the parent of the current working directory importable so the local
# package can be imported by later cells. The membership check keeps this cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

## 1. Load the Data

We'll load the insurance premium data from the Excel file `../data/premiums.xlsx`.

In [2]:
# Read the raw premium records from the Excel workbook
data_path = '../data/premiums.xlsx'
df = pd.read_excel(data_path)

# Swap the spaces in the two space-separated headers for underscores.
# DataFrame.rename ignores mapping keys that are not present in the frame,
# so a single call covers the cases where either column is missing.
df = df.rename(columns={
    'Number Of Dependants': 'Number_Of_Dependants',
    'Medical History': 'Medical_History',
})

# Summarise what was loaded
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")