In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import KFold
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_log_error

# Load both competition splits and discard the 'id' column right away.
train = pd.read_csv("train.csv").drop(columns=['id'])
test = pd.read_csv("test.csv").drop(columns=['id'])

# NOTE: DataFrame.info() writes its report to stdout itself and returns None,
# so it is called as a statement instead of being interpolated into an
# f-string (which would print "... Info: None" after the report).
print("Train Details\n")
print("Train Data Info:")
train.info()
print(f"N/A values: {train.isnull().sum()}")
print(f"Train data description: {train.describe()}")
# List the distinct values of every object-typed (string) column.
for column in train.select_dtypes(include=['object']).columns:
    unique_values = train[column].unique()
    print(f"\nUnique values in column '{column}': {unique_values}")
display(train.head())

print("Test Details\n\n")
print("Test Data Info:")
test.info()
print(f"N/A Values: {test.isnull().sum()}")
print(f"Test data description: {test.describe()}")
for column in test.select_dtypes(include=['object']).columns:
    unique_values = test[column].unique()
    print(f"\nUnique values in column '{column}': {unique_values}")
display(test.head())

In [None]:
# Histograms of raw data
# One figure per float64 column of `train`: histogram + KDE, with vertical
# markers for the summary statistics and a text box repeating the key numbers.
for col in train.select_dtypes(include=['float64']).columns:
    data = train[col].dropna()
    if data.empty:
        # Nothing to plot for a column that is entirely missing.
        continue

    # Descriptive statistics, computed once and reused for markers and text box
    mean_val = data.mean()
    median_val = data.median()
    std_val = data.std()
    min_val = data.min()
    max_val = data.max()
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)

    fig, ax = plt.subplots(figsize=(15, 8))
    sns.histplot(x=data, kde=True, bins=25, ax=ax)

    # Vertical lines: each marker gets its own colour, otherwise every line is
    # drawn in the default colour and the legend entries for median/min/max
    # (all solid lines) cannot be told apart.
    markers = [
        (mean_val, '--', 'red', f"Mean: {mean_val:.2f}"),
        (median_val, '-', 'green', f"Median: {median_val:.2f}"),
        (min_val, '-', 'gray', f"Min: {min_val:.2f}"),
        (q1, ':', 'orange', f"Q1: {q1:.2f}"),
        (q3, ':', 'purple', f"Q3: {q3:.2f}"),
        (max_val, '-', 'black', f"Max: {max_val:.2f}"),
    ]
    for value, line_style, colour, label in markers:
        ax.axvline(value, linestyle=line_style, linewidth=2, color=colour, label=label)

    # Stats text box
    stats_text = (
        f"Mean: {mean_val:.2f}\n"
        f"Median: {median_val:.2f}\n"
        f"Std Dev: {std_val:.2f}\n"
        f"Q1: {q1:.2f}\n"
        f"Q3: {q3:.2f}"
    )

    # Anchor the box to the top-right corner in axes coordinates.
    ax.text(
        0.98, 0.95, stats_text,
        transform=ax.transAxes,
        verticalalignment='top',
        horizontalalignment='right',
        bbox=dict(boxstyle='round', alpha=0.3)
    )

    ax.set_title(f"Distribution of {col}")
    ax.legend()
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)

In [None]:
def fill_na(df):
    """Impute missing values in the known feature columns of ``df``.

    Numeric columns are filled with their median or mean and categorical
    columns with their most frequent value, each statistic being computed
    from ``df`` itself.

    The columns are assigned back with ``df[col] = ...`` rather than
    ``df[col].fillna(..., inplace=True)``: the latter is chained assignment,
    which is deprecated in pandas 2.x and does not modify ``df`` at all once
    Copy-on-Write is enabled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed below; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    median_cols = [
        'Age', 'Annual Income', 'Number of Dependents',
        'Previous Claims', 'Vehicle Age', 'Insurance Duration',
    ]
    mean_cols = ['Health Score', 'Credit Score']
    mode_cols = ['Customer Feedback', 'Marital Status', 'Occupation']

    for col in median_cols:
        df[col] = df[col].fillna(df[col].median())
    for col in mean_cols:
        df[col] = df[col].fillna(df[col].mean())
    for col in mode_cols:
        modes = df[col].mode()
        # mode() is empty when the column has no observed value at all;
        # there is nothing to impute with in that case, so leave it untouched.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])
    return df
# Impute each split independently, using that split's own statistics.
train = fill_na(df=train)
test = fill_na(df=test)

In [None]:
# After filling NA values Histograms
# Same per-column view as for the raw data, so the effect of the imputation
# on each float64 distribution can be compared figure by figure.
for col in train.select_dtypes(include=['float64']).columns:
    data = train[col].dropna()
    if data.empty:
        # Nothing to plot for a column that is entirely missing.
        continue

    # Descriptive statistics, computed once and reused for markers and text box
    mean_val = data.mean()
    median_val = data.median()
    std_val = data.std()
    min_val = data.min()
    max_val = data.max()
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)

    fig, ax = plt.subplots(figsize=(15, 8))
    sns.histplot(x=data, kde=True, bins=25, ax=ax)

    # Vertical lines: each marker gets its own colour, otherwise every line is
    # drawn in the default colour and the legend entries for median/min/max
    # (all solid lines) cannot be told apart.
    markers = [
        (mean_val, '--', 'red', f"Mean: {mean_val:.2f}"),
        (median_val, '-', 'green', f"Median: {median_val:.2f}"),
        (min_val, '-', 'gray', f"Min: {min_val:.2f}"),
        (q1, ':', 'orange', f"Q1: {q1:.2f}"),
        (q3, ':', 'purple', f"Q3: {q3:.2f}"),
        (max_val, '-', 'black', f"Max: {max_val:.2f}"),
    ]
    for value, line_style, colour, label in markers:
        ax.axvline(value, linestyle=line_style, linewidth=2, color=colour, label=label)

    # Stats text box
    stats_text = (
        f"Mean: {mean_val:.2f}\n"
        f"Median: {median_val:.2f}\n"
        f"Std Dev: {std_val:.2f}\n"
        f"Q1: {q1:.2f}\n"
        f"Q3: {q3:.2f}"
    )

    # Anchor the box to the top-right corner in axes coordinates.
    ax.text(
        0.98, 0.95, stats_text,
        transform=ax.transAxes,
        verticalalignment='top',
        horizontalalignment='right',
        bbox=dict(boxstyle='round', alpha=0.3)
    )

    ax.set_title(f"Distribution of {col}")
    ax.legend()
    plt.show()
    # Release the figure so a long column list does not accumulate open figures.
    plt.close(fig)

In [None]:
def binary_encode(df):
    """Encode the two binary categorical columns of ``df`` as 0/1.

    ``Gender`` maps Male -> 0 / Female -> 1 and ``Smoking Status`` maps
    No -> 0 / Yes -> 1. Values that are already 0 or 1 are passed through
    unchanged, so re-running the cell on an encoded frame no longer turns the
    whole column into NaN. Any other value (including missing) still becomes
    NaN, as with a plain ``Series.map``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Gender' and 'Smoking Status' columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    encodings = {
        'Gender': {'Male': 0, 'Female': 1},
        'Smoking Status': {'No': 0, 'Yes': 1},
    }
    for col, mapping in encodings.items():
        # Add identity entries (0 -> 0, 1 -> 1) so the mapping is idempotent.
        lookup = {**mapping, **{code: code for code in mapping.values()}}
        df[col] = df[col].map(lookup)
    return df
# Apply the 0/1 encoding of 'Gender' and 'Smoking Status' to both splits.
train = binary_encode(df=train)
test = binary_encode(df=test)

In [None]:
def create_features(df):
    """Derive date, income, age, risk and frequency features from ``df``.

    Adds calendar parts of 'Policy Start Date', income transforms and ratios,
    an age bucket, a hand-built ``risk_score``, several threshold flags,
    pairwise interaction terms and a relative-frequency encoding of the
    categorical columns, then drops 'Policy Start Date' and
    'Customer Feedback'.

    Parameters
    ----------
    df : pandas.DataFrame
        Imputed frame. 'Smoking Status' is expected to be encoded as 0/1 at
        this point; the raw 'Yes' label and the string '1' are also accepted
        as "smoker".

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns; ``df`` itself is left
        untouched.
    """
    # Work on a copy so the caller's frame is not half-modified as a side effect.
    df = df.copy()

    # --- Calendar features (unparseable dates become NaT -> NaN parts) ---
    df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
    df['year'] = df['Policy Start Date'].dt.year
    df['month'] = df['Policy Start Date'].dt.month
    df['day'] = df['Policy Start Date'].dt.day
    df['dow'] = df['Policy Start Date'].dt.dayofweek
    df['is_weekend'] = (df['dow'] >= 5).astype(int)

    # --- Income features (+1 in the denominators avoids division by zero) ---
    df['log_income'] = np.log1p(df['Annual Income'])
    df['income_per_age'] = df['Annual Income'] / (df['Age'] + 1)
    df['income_per_dependent'] = df['Annual Income'] / (df['Number of Dependents'] + 1)
    df['high_income'] = (df['Annual Income'] > 100000).astype(int)

    # --- Age features ---
    # astype(int) requires every Age to fall inside (0, 100]; values outside
    # the bins produce NaN and make the cast fail.
    df['age_group'] = pd.cut(df['Age'], bins=[0, 30, 41, 53, 100], labels=[0,1,2,3]).astype(int)
    df['age_times_log_income'] = df['Age'] * df['log_income']
    df['age_times_dependents'] = df['Age'] * df['Number of Dependents']

    # --- Risk features ---
    # 'Smoking Status' is numeric after binary_encode; comparing it with the
    # string '1' (as before) was always False, which zeroed out `smoker` and
    # removed the smoking term from `risk_score`.
    df['smoker'] = df['Smoking Status'].isin([1, '1', 'Yes']).astype(int)
    df['sedentary'] = (df['Exercise Frequency'] == 'Rarely').astype(int)
    df['risk_score'] = df['smoker']*3 + df['sedentary']*2 + (df['Health Score'] < 20).astype(int)*2

    # --- Threshold flags ---
    # NOTE(review): the decile edges come from the frame being processed, so
    # train and test get their own bin boundaries when processed separately.
    df['creditworthiness'] = pd.qcut(df['Credit Score'], q=10, labels=False, duplicates='drop')
    df['new_car'] = (df['Vehicle Age'] <= 3).astype(int)
    df['long_duration'] = (df['Insurance Duration'] >= 5).astype(int)
    df['many_claims'] = (df['Previous Claims'] >= 3).astype(int)
    df['top_policy'] = (df['Policy Type'] == 'Premium').astype(int)

    # --- Interaction terms ---
    df['log_income_times_risk'] = df['log_income'] * df['risk_score']
    df['age_times_risk'] = df['Age'] * df['risk_score']
    df['credit_times_log_income'] = df['Credit Score'] * df['log_income']
    df['health_times_log_income'] = df['Health Score'] * df['log_income']

    # --- Frequency encoding: share of rows (within this frame) per category ---
    freq_cols = ['Gender','Marital Status','Education Level','Occupation','Location', 'Policy Type','Property Type','Smoking Status','Exercise Frequency']

    for col in freq_cols:
        df[f'{col}_freq'] = df[col].map(df[col].value_counts(normalize=True))

    df = df.drop(columns=['Policy Start Date','Customer Feedback'], errors='ignore')
    return df

# Build the engineered feature set for each split.
train = create_features(df=train)
test = create_features(df=test)

In [None]:
# Inspect the engineered frames.
# DataFrame.info() prints its report directly and returns None, so it is
# called on its own; wrapping it in display() would render a stray "None".
display(train.head(10))
train.info()
display(test.head(10))
test.info()

In [None]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
# Ordinal Encoding
# Each ordered categorical column is listed with its levels from lowest to
# highest; the encoder turns a level into its position in that list.
ordinal_levels = {
    'Education Level': ['High School', "Bachelor's", "Master's", 'PhD'],
    'Exercise Frequency': ['Rarely', 'Monthly', 'Weekly', 'Daily'],
    'Policy Type': ['Basic', 'Comprehensive', 'Premium'],
}
ordinal_cols = list(ordinal_levels.keys())
tiers = list(ordinal_levels.values())
order = OrdinalEncoder(categories=tiers)
order.fit(train[ordinal_cols])
train[ordinal_cols] = order.transform(train[ordinal_cols])
test[ordinal_cols] = order.transform(test[ordinal_cols])

# Nominal Encoding
# Unordered categoricals get integer labels learned from the training split;
# the fitted encoders are kept by column name in `nominal_encoders`.
nominal_cols = ['Marital Status', 'Occupation', 'Location', 'Property Type']
nominal_encoders = {}
for col in nominal_cols:
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])
    nominal_encoders[col] = le
    # Raises ValueError if the test split holds a label unseen in training.
    test[col] = le.transform(test[col])

In [None]:
# K-fold cross-validation of a HistGradientBoostingRegressor trained on
# log1p(target). Produces out-of-fold predictions for the training rows
# (`oof_pred`) and fold-averaged predictions for the test rows (`test_pred`),
# both back on the original target scale.
assert isinstance(train, pd.DataFrame), "train must be a pandas DataFrame"
assert isinstance(test,  pd.DataFrame), "test must be a pandas DataFrame"
assert 'Premium Amount' in train.columns, "Target column 'Premium Amount' not found in train"

y = train['Premium Amount'].copy()
X = train.drop(columns=['Premium Amount'])

assert X.shape[0] == y.shape[0]

# Predict on exactly the training features, in the training column order, so
# a stray or reordered column in `test` cannot break or skew the predictions.
missing_cols = [c for c in X.columns if c not in test.columns]
assert not missing_cols, f"test is missing feature columns: {missing_cols}"
X_test = test[X.columns]

n_train = X.shape[0]
n_test  = X_test.shape[0]

oof_pred  = np.zeros(n_train, dtype=np.float64)
test_pred = np.zeros(n_test,  dtype=np.float64)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The fold count in the messages is taken from `kf` so it cannot drift from
# the actual configuration.
print(f"{kf.n_splits}-fold CV with Histogram Gradient Boosting\n")
start_total = time.time()

for fold, (ti, vi) in enumerate(kf.split(X)):
    fold_start = time.time()
    print(f"Fold {fold+1}/{kf.n_splits} training... ", end="", flush=True)

    X_tr, X_va = X.iloc[ti], X.iloc[vi]
    # Train in log space: squared error on log1p(y) matches the RMSLE metric.
    y_tr = np.log1p(y.iloc[ti])

    # Early stopping monitors an internal 10% hold-out of the training fold
    # (validation_fraction), not the CV validation fold.
    model = HistGradientBoostingRegressor(
        loss="squared_error",
        learning_rate=0.05,
        max_depth=7,
        max_iter=800,
        max_leaf_nodes=31,
        l2_regularization=2.0,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=50,
        random_state=42
    )

    model.fit(X_tr, y_tr)

    # Undo the log1p transform before storing predictions.
    oof_pred[vi] = np.expm1(model.predict(X_va))

    # Accumulate the average of the per-fold test predictions.
    test_pred += np.expm1(model.predict(X_test)) / kf.n_splits

    rmsle = mean_squared_log_error(y.iloc[vi], oof_pred[vi]) ** 0.5
    fold_time = time.time() - fold_start
    folds_left = kf.n_splits - fold - 1
    # Rough ETA: assume the remaining folds take as long as this one.
    est_remaining = folds_left * fold_time

    print(
        f"Done | RMSLE: {rmsle:.6f} | "
        f"Time: {fold_time/60:.1f}min | "
        f"{folds_left} left ≈ {est_remaining/60:.1f}min"
    )

total_time = time.time() - start_total
cv_score = mean_squared_log_error(y, oof_pred) ** 0.5

print(f"\nAll {kf.n_splits} folds done in {total_time/60:.1f} minutes")
print(f"FINAL {kf.n_splits}-fold CV RMSLE: {cv_score:.6f}")

In [None]:
# Fill the sample submission template with the fold-averaged test predictions
# and write it out without the index column.
sub = pd.read_csv('sample_submission.csv').assign(**{'Premium Amount': test_pred})
sub.to_csv('submission.csv', index=False)