In [1]:
import datetime
from datetime import datetime  # NOTE: rebinds the name `datetime` from the module to the class
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [53]:
# Get current date in YYYY-MM-DD format
current_date = datetime.now().strftime('%Y-%m-%d')


In [2]:
# Load the raw train / test tables; paths are relative to the notebook's working directory.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [5]:
df_train.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [6]:
df_test.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [23]:
df_train.isnull().sum()

id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64

In [24]:
# Frequency table for each non-numeric column of the training frame
# (the raw 'Policy Start Date' timestamp column is included in the list).
categorical_columns = [
    'Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
    'Policy Type', 'Policy Start Date', 'Customer Feedback', 'Smoking Status',
    'Exercise Frequency', 'Property Type',
]
for column_name in categorical_columns:
    counts = df_train[column_name].value_counts()
    print(counts, "------------------------------------------------------", sep="\n")

Gender
Male      602571
Female    597429
Name: count, dtype: int64
------------------------------------------------------
Marital Status
Single      395391
Married     394316
Divorced    391764
Name: count, dtype: int64
------------------------------------------------------
Education Level
Master's       303818
PhD            303507
Bachelor's     303234
High School    289441
Name: count, dtype: int64
------------------------------------------------------
Occupation
Employed         282750
Self-Employed    282645
Unemployed       276530
Name: count, dtype: int64
------------------------------------------------------
Location
Suburban    401542
Rural       400947
Urban       397511
Name: count, dtype: int64
------------------------------------------------------
Policy Type
Premium          401846
Comprehensive    399600
Basic            398554
Name: count, dtype: int64
------------------------------------------------------
Policy Start Date
2020-02-08 15:21:39.134960    142
2023-08-13 1

In [27]:
# Shared preprocessing objects used by the process_* / scale_features helpers,
# which call fit_transform on them for training data and transform otherwise.
num_imputer = SimpleImputer(strategy='median')         # numeric gaps -> column median
cat_imputer = SimpleImputer(strategy='most_frequent')  # categorical gaps -> most frequent value
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [16]:
def process_date(df):
    """Expand 'Policy Start Date' into year, month and day-of-month columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Policy Start Date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Policy Start Date' and with 'Policy_Year',
        'Policy_Month' and 'Policy_Day' appended (in that order).
        The frame passed in is not modified.
    """
    # Parse into a local Series instead of writing back into the caller's frame.
    start = pd.to_datetime(df['Policy Start Date'])
    return df.drop('Policy Start Date', axis=1).assign(
        Policy_Year=start.dt.year,
        Policy_Month=start.dt.month,
        Policy_Day=start.dt.day,
    )

In [28]:
def process_categorical(df, is_training):
    """Impute and one-hot encode the categorical feature columns.

    Uses the notebook-level ``cat_imputer`` and ``encoder`` objects: both are
    fitted (``fit_transform``) when ``is_training`` is true and only applied
    (``transform``) otherwise. The imputed values are written back into ``df``
    before encoding.

    Returns a new frame in which the original categorical columns are replaced
    by their encoded columns, appended after the remaining columns.
    """
    cat_cols = [
        'Gender', 'Marital Status', 'Education Level', 'Occupation',
        'Location', 'Policy Type', 'Customer Feedback',
        'Smoking Status', 'Exercise Frequency', 'Property Type',
    ]

    # Pick the fitting or the apply-only variant of each transformer.
    if is_training:
        impute, encode = cat_imputer.fit_transform, encoder.fit_transform
    else:
        impute, encode = cat_imputer.transform, encoder.transform

    df[cat_cols] = impute(df[cat_cols])

    # Encoded block keeps the frame's index so the concat below lines up row by row.
    onehot = pd.DataFrame(
        encode(df[cat_cols]),
        columns=encoder.get_feature_names_out(cat_cols),
        index=df.index,
    )
    return pd.concat([df.drop(cat_cols, axis=1), onehot], axis=1)

In [29]:
def process_numerical(df, is_training):
    """Fill missing values in the numeric feature columns with ``num_imputer``.

    The notebook-level imputer is fitted when ``is_training`` is true and only
    applied otherwise. The columns are overwritten in ``df``, which is also the
    object returned.
    """
    num_cols = [
        'Age', 'Annual Income', 'Number of Dependents',
        'Health Score', 'Previous Claims', 'Vehicle Age',
        'Credit Score', 'Insurance Duration',
    ]
    fill = num_imputer.fit_transform if is_training else num_imputer.transform
    df[num_cols] = fill(df[num_cols])
    return df


In [30]:
def scale_features(df, is_training):
    """Rescale the numeric feature columns with the notebook-level ``scaler``.

    The scaler is fitted when ``is_training`` is true and only applied
    otherwise. The columns are overwritten in ``df``, which is also the object
    returned.
    """
    num_cols = [
        'Age', 'Annual Income', 'Number of Dependents',
        'Health Score', 'Previous Claims', 'Vehicle Age',
        'Credit Score', 'Insurance Duration',
    ]
    rescale = scaler.fit_transform if is_training else scaler.transform
    df[num_cols] = rescale(df[num_cols])
    return df

In [31]:
def prepare_data(df, is_training=True):
    """Run the full preprocessing chain on ``df`` and return the result.

    Order: date expansion -> numeric imputation -> numeric scaling ->
    categorical imputation and encoding. ``is_training`` is forwarded to every
    step that takes it.
    """
    dated = process_date(df)
    imputed = process_numerical(dated, is_training)
    scaled = scale_features(imputed, is_training)
    return process_categorical(scaled, is_training)

In [32]:
# Training pass fits the transformers (is_training=True); the test pass reuses them.
# Copies are passed so the raw frames loaded from CSV stay unchanged.
df_train_processed = prepare_data(df_train.copy(), is_training=True)
df_test_processed = prepare_data(df_test.copy(), is_training=False)

In [33]:
df_train_processed.isnull().sum()

id                             0
Age                            0
Annual Income                  0
Number of Dependents           0
Health Score                   0
Previous Claims                0
Vehicle Age                    0
Credit Score                   0
Insurance Duration             0
Premium Amount                 0
Policy_Year                    0
Policy_Month                   0
Policy_Day                     0
Gender_Female                  0
Gender_Male                    0
Marital Status_Divorced        0
Marital Status_Married         0
Marital Status_Single          0
Education Level_Bachelor's     0
Education Level_High School    0
Education Level_Master's       0
Education Level_PhD            0
Occupation_Employed            0
Occupation_Self-Employed       0
Occupation_Unemployed          0
Location_Rural                 0
Location_Suburban              0
Location_Urban                 0
Policy Type_Basic              0
Policy Type_Comprehensive      0
Policy Typ

In [34]:
df_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 43 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   id                           1200000 non-null  int64  
 1   Age                          1200000 non-null  float64
 2   Annual Income                1200000 non-null  float64
 3   Number of Dependents         1200000 non-null  float64
 4   Health Score                 1200000 non-null  float64
 5   Previous Claims              1200000 non-null  float64
 6   Vehicle Age                  1200000 non-null  float64
 7   Credit Score                 1200000 non-null  float64
 8   Insurance Duration           1200000 non-null  float64
 9   Premium Amount               1200000 non-null  float64
 10  Policy_Year                  1200000 non-null  int32  
 11  Policy_Month                 1200000 non-null  int32  
 12  Policy_Day                   1200000 non-n

In [35]:
df_test_processed.isnull().sum()

id                             0
Age                            0
Annual Income                  0
Number of Dependents           0
Health Score                   0
Previous Claims                0
Vehicle Age                    0
Credit Score                   0
Insurance Duration             0
Policy_Year                    0
Policy_Month                   0
Policy_Day                     0
Gender_Female                  0
Gender_Male                    0
Marital Status_Divorced        0
Marital Status_Married         0
Marital Status_Single          0
Education Level_Bachelor's     0
Education Level_High School    0
Education Level_Master's       0
Education Level_PhD            0
Occupation_Employed            0
Occupation_Self-Employed       0
Occupation_Unemployed          0
Location_Rural                 0
Location_Suburban              0
Location_Urban                 0
Policy Type_Basic              0
Policy Type_Comprehensive      0
Policy Type_Premium            0
Customer F

In [36]:
df_test_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           800000 non-null  int64  
 1   Age                          800000 non-null  float64
 2   Annual Income                800000 non-null  float64
 3   Number of Dependents         800000 non-null  float64
 4   Health Score                 800000 non-null  float64
 5   Previous Claims              800000 non-null  float64
 6   Vehicle Age                  800000 non-null  float64
 7   Credit Score                 800000 non-null  float64
 8   Insurance Duration           800000 non-null  float64
 9   Policy_Year                  800000 non-null  int32  
 10  Policy_Month                 800000 non-null  int32  
 11  Policy_Day                   800000 non-null  int32  
 12  Gender_Female                800000 non-null  float64
 13 

In [37]:
correlation_matrix_train = df_train_processed.corr()
print(correlation_matrix_train)

                                   id       Age  Annual Income  \
id                           1.000000 -0.000134      -0.000864   
Age                         -0.000134  1.000000       0.000061   
Annual Income               -0.000864  0.000061       1.000000   
Number of Dependents         0.000717  0.001399       0.001920   
Health Score                 0.001340  0.000813       0.023301   
Previous Claims             -0.000135  0.001544       0.034777   
Vehicle Age                 -0.001461 -0.002436      -0.000438   
Credit Score                 0.000909  0.002661      -0.181180   
Insurance Duration          -0.000350 -0.000062       0.000383   
Premium Amount              -0.000292 -0.002410      -0.009989   
Policy_Year                 -0.000912 -0.003151      -0.009716   
Policy_Month                 0.000621  0.001196       0.008825   
Policy_Day                  -0.000692 -0.000596       0.000346   
Gender_Female                0.001454 -0.000215       0.000983   
Gender_Mal

In [38]:
print(correlation_matrix_train["Premium Amount"])

id                            -0.000292
Age                           -0.002410
Annual Income                 -0.009989
Number of Dependents          -0.000947
Health Score                   0.013976
Previous Claims                0.039394
Vehicle Age                    0.000391
Credit Score                  -0.024471
Insurance Duration            -0.000028
Premium Amount                 1.000000
Policy_Year                   -0.011084
Policy_Month                   0.006702
Policy_Day                     0.000291
Gender_Female                 -0.000161
Gender_Male                    0.000161
Marital Status_Divorced       -0.001545
Marital Status_Married        -0.002184
Marital Status_Single          0.003682
Education Level_Bachelor's     0.000103
Education Level_High School    0.001462
Education Level_Master's      -0.000290
Education Level_PhD           -0.001252
Occupation_Employed           -0.003727
Occupation_Self-Employed       0.002608
Occupation_Unemployed          0.001787


In [39]:
# Drop the row identifier, then separate the target from the features.
train_data = df_train_processed.drop(columns='id')
y = np.log1p(train_data['Premium Amount'])  # model is trained on log1p(target)
X = train_data.drop(columns='Premium Amount')

In [40]:
# Keep the test ids aside for the submission file; the feature frame goes without them.
id_test = df_test_processed['id']
test_data = df_test_processed.drop(columns='id')

In [None]:
### Train

In [42]:
# Cell 10: Train XGBoost model
import xgboost as xgb

In [43]:
# Define best parameters (unpacked into xgb.XGBRegressor when the model is built)
hyperparameters = dict(
    colsample_bytree=0.9,
    learning_rate=0.01,
    max_depth=7,
    n_estimators=500,
    subsample=0.8,
)


In [44]:
# Create and train the model
# Fits on the full training set; y is log1p(Premium Amount), so the model predicts in log space.
model = xgb.XGBRegressor(**hyperparameters)
model.fit(X, y)

In [54]:
# #You can also add early stopping and validation:
# # With early stopping
# from sklearn.model_selection import train_test_split

# # Split training data into train and validation
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# model = xgb.XGBRegressor(**hyperparameters)
# model.fit(
#     X_train, y_train,
#     eval_set=[(X_val, y_val)],
#     early_stopping_rounds=50,
#     verbose=100
# )   

NameError: name 'best_params' is not defined

In [None]:
# Cell 11: Make predictions
# Predict on the test features, then undo the log1p target transform with expm1
# so the values are back on the original premium scale.
test_predictions = np.expm1(model.predict(test_data))


In [46]:
# Create submission DataFrame
# One row per test id, paired with its predicted premium.
submission = pd.DataFrame({
    'id': id_test,
    'Premium Amount': test_predictions
})

In [None]:
# Save processed train / test data
# Create the output folder first: to_csv raises OSError when the target
# directory does not exist (e.g. on a fresh checkout).
Path('data-out').mkdir(parents=True, exist_ok=True)
train_data.to_csv(f'data-out/train-{current_date}.csv', index=False)
test_data.to_csv(f'data-out/test-{current_date}.csv', index=False)


In [52]:
# Write the submission file. The output folder is created if missing, because
# to_csv raises OSError when the target directory does not exist.
Path('data-out').mkdir(parents=True, exist_ok=True)
submission.to_csv(f'data-out/predictions-{current_date}.csv', index=False)
print(f"Saved submission to: data-out/predictions-{current_date}.csv")

Saved submission to: data-out/predictions-2024-12-16.csv


In [48]:
# Cell 12: Print shapes and sample predictions
for label, frame in (("Training set shape:", X), ("Test set shape:", test_data)):
    print(label, frame.shape)
print("\nSample predictions:", submission.head(), sep="\n")

Training set shape: (1200000, 41)
Test set shape: (800000, 41)

Sample predictions:
        id  Premium Amount
0  1200000      687.480042
1  1200001      800.656982
2  1200002      799.783997
3  1200003      793.468872
4  1200004      760.409180


In [49]:
# Optional: Feature importance
# Pair each training column with the fitted model's importance score,
# then order the table from most to least important.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_,
})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\nTop 10 most important features:")
print(feature_importance.head(10))


Top 10 most important features:
                      feature  importance
4             Previous Claims    0.193798
1               Annual Income    0.134083
6                Credit Score    0.119557
8                 Policy_Year    0.092602
3                Health Score    0.087527
29  Customer Feedback_Average    0.077417
15      Marital Status_Single    0.016435
30     Customer Feedback_Good    0.013206
31     Customer Feedback_Poor    0.013115
9                Policy_Month    0.010022


In [50]:
# Cell 13: Cross-validation (optional)
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation. The scorer returns negated MSE, so flip the sign
# before taking the square root to obtain per-fold RMSE (on the log1p target).
cv_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
rmse_scores = np.sqrt(np.negative(cv_scores))

print("\nCross-validation RMSE scores:", rmse_scores)
print("Mean RMSE: {:.4f} (+/- {:.4f})".format(rmse_scores.mean(), 2 * rmse_scores.std()))


Cross-validation RMSE scores: [1.05317318 1.05482599 1.05062587 1.05495628 1.04843616]
Mean RMSE: 1.0524 (+/- 0.0050)


In [51]:
# Print detailed cross-validation results
# cv_scores holds negated MSE values from cross_val_score. The RMSE is stored
# under its own name rather than written back into cv_scores, so this cell can
# be re-run safely (overwriting cv_scores made a second run take the square
# root of negative numbers and print NaN).
fold_rmse = np.sqrt(-cv_scores)
mean_rmse = fold_rmse.mean()
std_rmse = fold_rmse.std()

print("\nCross-validation Results:")
print("Individual fold RMSE scores:")
for i, score in enumerate(fold_rmse, 1):
    print(f"Fold {i}: {score:.4f}")

print(f"\nMean RMSE: {mean_rmse:.4f}")
print(f"Standard deviation: {std_rmse:.4f}")
# NOTE(review): the interval below is mean ± 2 × the standard deviation of the
# fold scores, a rough spread rather than a formal confidence interval of the mean.
print(f"95% Confidence Interval: {mean_rmse:.4f} ± {std_rmse*2:.4f}")

# Calculate relative error percentage (2 × std as a share of the mean)
relative_error = (std_rmse * 2 / mean_rmse) * 100
print(f"Relative error: {relative_error:.2f}%")


Cross-validation Results:
Individual fold RMSE scores:
Fold 1: 1.0532
Fold 2: 1.0548
Fold 3: 1.0506
Fold 4: 1.0550
Fold 5: 1.0484

Mean RMSE: 1.0524
Standard deviation: 0.0025
95% Confidence Interval: 1.0524 ± 0.0050
Relative error: 0.48%


In [None]:
# Your results show:

# Very consistent RMSE across all 5 folds (all around 1.05)
# Small standard deviation (0.0025) indicates stable model performance
# Mean RMSE of 1.0524 with 95% confidence interval of ±0.0050
# The model's performance varies by less than 0.5% between folds
# This suggests:

# Your model is very stable (low variance between folds)
# The performance is consistent across different subsets of data
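# Similar scores across folds show stability, not the absence of overfitting; to check for overfitting, compare training error against validation error
# Remember: Since we used log1p transformation on the target variable, these RMSE scores are in log scale (i.e. they are RMSLE values). To get errors in the actual currency scale, transform the predictions and targets back using expm1 and recompute the RMSE; applying expm1 to the RMSE itself does not give a currency-scale error.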

In [None]:
# # Cell 14: Save model (optional)
# import joblib

# # Save model
# joblib.dump(model, 'data-out/xgboost_model.joblib')

# # Save feature names (useful for future predictions)
# feature_names = {
#     'feature_names': list(X.columns)
# }
# joblib.dump(feature_names, 'data-out/feature_names.joblib')