<a href="https://colab.research.google.com/github/fvangool/datasharing/blob/master/Copy_of_gluon_insurance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install autogluon.tabular[all]



In [None]:
!pip install dask[dataframe]



In [None]:
from google.colab import drive
import pandas as pd

# Where Google Drive is attached inside the Colab VM, and the CSV inside it.
drive_mount_point = '/content/drive'
train_csv_path = '/content/drive/My Drive/train.csv'

# Attach Google Drive so its files are readable from this runtime.
drive.mount(drive_mount_point)

# Load the raw training table into memory.
data = pd.read_csv(train_csv_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np

# Drop the 'id' column and move the target to log1p space.
# Guarded on the presence of 'id' so the cell is idempotent: running it a second
# time neither raises KeyError on the missing column nor applies log1p to the
# target twice.
if "id" in data.columns:
    data = data.drop(columns="id")
    data['Premium Amount'] = np.log1p(data['Premium Amount'])

In [None]:
# Hold out 20% of the rows: draw a reproducible 80% sample for training and
# keep every row whose index label was not sampled as the test set.
train_data = data.sample(frac=0.8, random_state=42)
held_out_rows = ~data.index.isin(train_data.index)
test_data = data.loc[held_out_rows]

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures,  StandardScaler

###### predicting nans

class ImputePreviousClaims(BaseEstimator, TransformerMixin):
    """Fill missing 'Previous Claims' values with random-forest predictions.

    The forest is trained on the rows where 'Previous Claims' is known, using
    the columns listed in ``self.predictors`` (mean-imputed) as inputs.
    """

    def __init__(self):
        self.model = None
        self.imputer = None
        self.predictors = ['Annual Income', 'Health Score', 'Credit Score']

    def fit(self, X, y=None):
        """Fit the predictor imputer and the regressor on rows with a known target."""
        frame = X.copy()

        # Only rows that actually carry a 'Previous Claims' value can be used for training.
        known_rows = frame.loc[frame['Previous Claims'].notna()]

        # Mean-impute the predictor columns (fitted on the known rows only).
        self.imputer = SimpleImputer(strategy='mean')
        train_inputs = self.imputer.fit_transform(known_rows[self.predictors])

        # Learn 'Previous Claims' from the imputed predictors.
        self.model = RandomForestRegressor(random_state=42)
        self.model.fit(train_inputs, known_rows['Previous Claims'])

        return self

    def transform(self, X):
        """Return a copy of ``X`` with predictors imputed and missing claims predicted."""
        frame = X.copy()

        # The predictor columns are overwritten with their mean-imputed values for every row.
        frame.loc[:, self.predictors] = self.imputer.transform(frame[self.predictors])

        # Nothing to predict when 'Previous Claims' has no gaps.
        missing = frame['Previous Claims'].isna()
        if not missing.any():
            return frame

        frame.loc[missing, 'Previous Claims'] = self.model.predict(
            frame.loc[missing, self.predictors]
        )
        return frame

###### start feature creation

class InsuranceFeatureTransformer(BaseEstimator, TransformerMixin):
    """Derive hand-crafted risk features from the raw insurance columns.

    ``fit`` learns the two data-dependent statistics (the income quintile edges
    and the largest 'Previous Claims' value) so that ``transform`` encodes every
    later frame on the same scale as the data it was fitted on. When
    ``transform`` is called on an instance that was never fitted, both
    statistics fall back to the frame being transformed.

    ``transform`` leaves its input untouched and returns a copy with the new
    feature columns appended.
    """

    _INCOME_LABELS = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

    def __init__(self):
        self.predictors = None  # not read anywhere in this class

    @staticmethod
    def _income_quintile_edges(income):
        """Return the five-quantile bin edges of ``income`` with open outer bounds.

        Raises whatever ``pd.qcut`` raises (e.g. ValueError on duplicate edges).
        """
        _, edges = pd.qcut(income, q=5, retbins=True)
        edges = np.array(edges, dtype=float)
        # Open the outer bins so incomes outside the fitted range still get a bracket.
        edges[0], edges[-1] = -np.inf, np.inf
        return edges

    @staticmethod
    def _age_risk(age):
        """Map an age to a risk label; ages outside 18-65 (or NaN) yield None."""
        if 18 <= age <= 25:
            return 'High_Risk'
        elif 25 < age <= 35:
            return 'Medium_Risk'
        elif 35 < age <= 50:
            return 'Low_Risk'
        elif 50 < age <= 65:
            return 'Medium_Risk'
        return None

    def fit(self, X, y=None):
        """Learn the income bracket edges and the maximum 'Previous Claims' value."""
        self.income_bin_edges_ = self._income_quintile_edges(X['Annual Income'])
        self.max_previous_claims_ = X['Previous Claims'].max()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered feature columns added."""
        X_copy = X.copy()

        # Use the statistics learned in fit(); fall back to this frame when unfitted.
        income_edges = getattr(self, 'income_bin_edges_', None)
        if income_edges is None:
            income_edges = self._income_quintile_edges(X_copy['Annual Income'])
        max_claims = getattr(self, 'max_previous_claims_', None)
        if max_claims is None:
            max_claims = X_copy['Previous Claims'].max()

        # 1. Risk Score Components
        # Normalize credit score (300-850 scale)
        X_copy['normalized_credit_score'] = (X_copy['Credit Score'] - 300) / (850 - 300)

        # Health risk indicator: inverted so higher is riskier, with a 1.5x penalty for smokers
        X_copy['health_risk_score'] = (60 - X_copy['Health Score']) / 60
        X_copy['health_risk_score'] = np.where(X_copy['Smoking Status'] == 'Yes',
                                               X_copy['health_risk_score'] * 1.5,
                                               X_copy['health_risk_score'])

        # 2. Financial Stability Indicators
        # Income brackets, cut on the quintile edges learned during fit
        X_copy['income_bracket'] = pd.cut(
            X_copy['Annual Income'],
            bins=income_edges,
            labels=self._INCOME_LABELS
        )

        # Disposable income proxy (considering dependents)
        X_copy['income_per_dependent'] = X_copy['Annual Income'] / (X_copy['Number of Dependents'] + 1)

        # 3. Claims Risk Features
        # Claims frequency (claims per year of insurance)
        X_copy['claims_per_year'] = X_copy['Previous Claims'] / X_copy['Insurance Duration']

        # Binary high-risk indicator
        X_copy['high_risk_customer'] = np.where(
            (X_copy['Previous Claims'] > 2) & (X_copy['Insurance Duration'] < 3), 1, 0
        )

        # Claims history score, relative to the maximum learned during fit
        X_copy['claims_history_score'] = 1 - (X_copy['Previous Claims'] / max_claims)

        # 4. Vehicle Risk Features
        # Vehicle age risk brackets; include_lowest keeps age 0 in the 'New' bracket
        # instead of turning it into NaN.
        X_copy['vehicle_age_risk'] = pd.cut(
            X_copy['Vehicle Age'],
            bins=[0, 3, 7, 12, float('inf')],
            labels=['New', 'Low_Risk', 'Medium_Risk', 'High_Risk'],
            include_lowest=True
        )

        # 5. Lifestyle Risk Score (mean of three mapped components)
        X_copy['lifestyle_risk'] = 0

        # Exercise frequency impact
        exercise_risk = {
            'Daily': 0,
            'Weekly': 0.25,
            'Monthly': 0.75,
            'Rarely': 1
        }
        X_copy['lifestyle_risk'] += X_copy['Exercise Frequency'].map(exercise_risk)

        # Property type risk
        property_risk = {
            'House': 1,
            'Condo': 0.75,
            'Apartment': 0.5
        }
        X_copy['lifestyle_risk'] += X_copy['Property Type'].map(property_risk)

        # Location risk
        location_risk = {
            'Urban': 1,
            'Suburban': 0.5,
            'Rural': 0.75  # Higher due to emergency response times
        }
        X_copy['lifestyle_risk'] += X_copy['Location'].map(location_risk)
        X_copy['lifestyle_risk'] /= 3  # Normalize to 0-1

        # 6. Customer Profile Features
        # Policy type risk level
        policy_risk = {
            'Premium': 3,
            'Comprehensive': 2,
            'Basic': 1
        }
        X_copy['policy_risk_level'] = X_copy['Policy Type'].map(policy_risk)

        # Customer stability score
        X_copy['customer_stability'] = (
            X_copy['Insurance Duration'] *
            X_copy['normalized_credit_score'] *
            (1 - X_copy['claims_per_year'].clip(0, 1))
        )

        # 7. Demographic Risk Score
        # Age risk brackets
        X_copy['age_risk'] = X_copy['Age'].apply(self._age_risk)

        # Combined demographic risk
        marital_risk = {
            'Single': 0.8,
            'Married': 0.4,
            'Divorced': 0.6
        }
        X_copy['demographic_risk'] = (
            X_copy['Marital Status'].map(marital_risk) *
            (1 + (X_copy['Number of Dependents'] / 4))  # Normalize by max dependents
        )

        # 8. Education Impact
        education_level = {
            'PhD': 0.2,
            "Master's": 0.3,
            "Bachelor's": 0.4,
            'High School': 0.5
        }
        X_copy['education_risk_factor'] = X_copy['Education Level'].map(education_level)

        return X_copy

# end feature creation

# Custom Transformer for dropping columns
class DropColumns(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the configured columns from a frame."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame without ``self.columns``; the input is left untouched."""
        trimmed = X.copy().drop(columns=self.columns)
        return trimmed

# Custom Transformer for logging specific columns
class LogTransform(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``np.log1p`` to the configured columns."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which each configured column is replaced by its log1p."""
        logged = X.copy()
        for name in self.columns:
            source = logged[name]
            logged[name] = np.log1p(source)
        return logged

# Custom Transformer for calculating Policy Duration since Start
class PolicyDurationTransform(BaseEstimator, TransformerMixin):
    """Replace a policy start-date column with the number of years since it started.

    Parameters
    ----------
    column : str
        Name of the column holding the policy start date (anything
        ``pd.to_datetime`` can parse).
    reference_year : int, default 2024
        Year the duration is measured against. The default reproduces the
        previously hard-coded value.
    """

    def __init__(self, column, reference_year=2024):
        self.column = column
        self.reference_year = reference_year

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with ``column`` dropped and 'Policyduration_ss' appended."""
        start_year = pd.to_datetime(X[self.column]).dt.year
        # drop() returns a new frame, so the caller's data is not modified.
        result = X.drop(columns=[self.column])
        result['Policyduration_ss'] = self.reference_year - start_year
        return result

# Custom Transformer for handling NaNs and creating binary indicators
class NanHandlingTransform(BaseEstimator, TransformerMixin):
    """Flag and fill missing values in the configured columns.

    For every configured column a companion ``"<column> Present"`` indicator is
    added (1 when the value is present, 0 when it is missing), and the gaps are
    filled with a sentinel matching the column's type: ``-1`` for numeric
    columns, the string ``'Missing'`` otherwise.
    """

    _MISSING_LABEL = 'Missing'

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    @classmethod
    def _fill_missing(cls, values):
        """Fill NaNs in ``values`` with a sentinel of the column's own type."""
        if pd.api.types.is_numeric_dtype(values):
            return values.fillna(-1)
        # A categorical column must know the sentinel before it can be filled with it.
        if isinstance(values.dtype, pd.CategoricalDtype):
            if cls._MISSING_LABEL not in values.cat.categories:
                values = values.cat.add_categories([cls._MISSING_LABEL])
        # A string sentinel keeps text columns single-typed; filling them with the
        # integer -1 produces a mixed int/str column.
        # NOTE(review): the logged AutoGluon run listed 'Occupation' as an unused
        # feature, presumably because of that mixed typing -- confirm after re-running.
        return values.fillna(cls._MISSING_LABEL)

    def transform(self, X):
        """Return a copy of ``X`` with indicator columns added and gaps filled."""
        X = X.copy()
        for column in self.columns:
            indicator_column = f"{column} Present"
            # 1 = value present, 0 = value missing (matches the column name).
            X[indicator_column] = X[column].notna().astype(int)
            X[column] = self._fill_missing(X[column])
        return X
# Custom transformer that expands selected features into scaled degree-2 polynomial features

class CustomFeatureTransformer(BaseEstimator, TransformerMixin):
    """Expand selected columns into standardized degree-2 polynomial features.

    The selected columns are mean-imputed, expanded with ``PolynomialFeatures``
    (degree 2, no bias term) and standardized. In the output the generated
    columns come first, followed by all columns that were not selected.
    """

    def __init__(self, selected_features):
        self.selected_features = selected_features
        self.imputer = SimpleImputer(strategy='mean')
        self.poly = PolynomialFeatures(degree=2, include_bias=False)
        self.scaler = StandardScaler()
        self.poly_feature_names = None

    def fit(self, X, y=None):
        """Fit imputer, polynomial expansion and scaler on the selected columns."""
        subset = X[self.selected_features]

        # Imputer first, so the expansion never sees NaNs.
        self.imputer.fit(subset)
        expanded = self.poly.fit_transform(self.imputer.transform(subset))

        # Remember the generated column names for building the output frame.
        self.poly_feature_names = self.poly.get_feature_names_out(self.selected_features)

        self.scaler.fit(expanded)
        return self

    def transform(self, X):
        """Return the engineered columns followed by the untouched remaining columns."""
        # Run the selected columns through imputer -> polynomial expansion -> scaler.
        values = X[self.selected_features]
        for step in (self.imputer, self.poly, self.scaler):
            values = step.transform(values)

        engineered = pd.DataFrame(
            values,
            columns=self.poly_feature_names,
            index=X.index
        )

        # Every column that was not selected passes through unchanged.
        untouched = X.loc[:, ~X.columns.isin(self.selected_features)]

        return pd.concat([engineered, untouched], axis=1)



# Define the preprocessing pipeline. Steps run in the order listed:
#   feature_creation    -> adds the engineered risk columns (reads the raw, un-logged 'Annual Income')
#   log_transform       -> log1p of 'Annual Income'
#   policy_duration     -> replaces 'Policy Start Date' with 'Policyduration_ss'
#   nan_handling        -> indicator column + fill for 'Occupation'
#   feature_transformer -> scaled degree-2 polynomial expansion of four numeric columns;
#                          'normalized_credit_score' is produced by feature_creation above
# Lines commented out inside the list are disabled alternative steps.
pipeline = Pipeline([
    #('drop_columns', DropColumns(columns=['id'])),
    #('impute_claims', ImputePreviousClaims()),
    ("feature_creation", InsuranceFeatureTransformer()),
    ('log_transform', LogTransform(columns=['Annual Income'])),
    #('log_transform', LogTransform(columns=['Premium Amount', 'Annual Income'])),
    ('policy_duration', PolicyDurationTransform(column='Policy Start Date')),
    #('nan_handling', NanHandlingTransform(columns=['Previous Claims', 'Occupation']))
    ('nan_handling', NanHandlingTransform(columns=['Occupation'])),
    ('feature_transformer', CustomFeatureTransformer(selected_features=['Annual Income', 'Health Score','Credit Score', 'normalized_credit_score'])),
    #('polynomial_features', PolynomialFeatures(degree=2, include_bias=False)),#, ['Annual Income', 'Health Score','Credit Score', 'normalized_credit_score'])
    #('nan_handling', SimpleImputer(strategy='mean'))
])



In [None]:
from datetime import datetime

# Wall-clock stamp before fitting.
current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

# Fit the preprocessing pipeline on the training split and transform it.
transformed_train = pipeline.fit_transform(train_data)

current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

# Apply the already-fitted pipeline to the hold-out split.
transformed_test = pipeline.transform(test_data)

current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

Current Timestamp: 2024-12-30 11:26:13.615734
Current Timestamp: 2024-12-30 11:26:26.309927
Current Timestamp: 2024-12-30 11:26:28.599706


In [None]:
from autogluon.tabular import TabularPredictor
from autogluon.core.metrics import make_scorer
from sklearn.metrics import root_mean_squared_log_error

import pandas as pd

## create custom scorer metric

def root_mean_squared_log_error_cust(y_true, y_pred):
    """
    Compute the root mean squared log error (RMSLE) of a set of predictions.

    Thin wrapper that delegates to scikit-learn's
    ``root_mean_squared_log_error`` without altering its inputs.

    Parameters:
        y_true (array-like): True target values.
        y_pred (array-like): Predicted target values.

    Returns:
        float: RMSLE value.
    """
    rmsle = root_mean_squared_log_error(y_true, y_pred)
    return rmsle

#  AutoGluon Scorer
import numpy as np


def _rmse_in_log_space(y_true, y_pred):
    """Return the RMSE between two arrays of log1p-transformed values.

    The label column was log1p-transformed before training, so the RMSE of
    those log-space values is exactly the RMSLE of the original premiums.
    Feeding log-space values to an RMSLE function would apply the logarithm a
    second time.

    Parameters:
        y_true (array-like): True target values, in log1p space.
        y_pred (array-like): Predicted target values, in log1p space.

    Returns:
        float: RMSLE with respect to the original (un-logged) scale.
    """
    residual = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    return float(np.sqrt(np.mean(residual ** 2)))


ag_rmsle_scorer = make_scorer(name='root_mean_squared_log_error',
                              score_func=_rmse_in_log_space,
                              optimum=0,  # The best value of RMSLE is 0
                              greater_is_better=False)  # Lower is better for RMSLE

In [None]:
from datetime import datetime

from autogluon.tabular import TabularPredictor

current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

# Training configuration
save_path = '/kaggle/working/'  # directory the fitted models are written to
target = 'Premium Amount'       # label column
time_limit = 1800               # seconds; 10800 was the earlier setting

# Build the predictor, then train it on the transformed training frame.
predictor = TabularPredictor(
    label=target,
    path=save_path,
    eval_metric=ag_rmsle_scorer,
    problem_type='regression',
)
predictor.fit(
    transformed_train,
    time_limit=time_limit,
    presets='medium_quality',  # time * 16 for best quality
)

# Leaderboard of the trained models, including the extra info columns.
leaderboard = predictor.leaderboard(data=None, extra_info=True)
print(leaderboard)

# Score the fitted predictor on the hold-out frame.
performance = predictor.evaluate(transformed_test)
print("test performance",performance)

current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       7.35 GB / 12.67 GB (58.0%)
Disk Space Avail:   73.14 GB / 107.72 GB (67.9%)
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 1800s
AutoGluon will save models to "/kaggle/working"
Train Data Rows:    960000
Train Data Columns: 44
Label Column:       Premium Amount
Problem Type:       regression
Preprocessing data ...


Current Timestamp: 2024-12-30 11:26:28.678278


Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    8376.16 MB
	Train Data (Original)  Memory Usage: 855.82 MB (10.2% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
		Fitting DatetimeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Unused Original Features (Count: 1): ['Occupation']
		These features were not used to generate any of the output features. Add a 

In [None]:
from datetime import datetime

import numpy as np

current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

# Predictions and AutoGluon's own metrics on the hold-out frame.
y_pred = predictor.predict(transformed_test)
perf = predictor.evaluate_predictions(y_true=transformed_test[target], y_pred=y_pred, auxiliary_metrics=True)

current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

# Same for the training frame, to compare against the hold-out score.
y_pred_train = predictor.predict(transformed_train)
perf_train = predictor.evaluate_predictions(y_true=transformed_train[target], y_pred=y_pred_train, auxiliary_metrics=True)

#set to rmsle
# No longer used below; the import is left in place in case other cells rely on it.
from sklearn.metrics import mean_squared_log_error

print("ypred", y_pred.shape)
print("test_data[target]", test_data[target].shape)
print("ypred train", y_pred_train.shape)
print("train_data[target]", train_data[target].shape)

# The target (and therefore every prediction) is already in log1p space, so the
# mean squared log error on the original scale is simply the mean squared
# difference of these values. Calling mean_squared_log_error here would apply
# the logarithm a second time. Values are compared positionally.
test_msle_pred = float(np.mean((y_pred.to_numpy() - test_data[target].to_numpy()) ** 2))
train_msle_pred = float(np.mean((y_pred_train.to_numpy() - train_data[target].to_numpy()) ** 2))
test_rmsle_pred = np.sqrt(test_msle_pred)
train_rmsle_pred = np.sqrt(train_msle_pred)
print("rmsle prediction on test" , test_rmsle_pred)
print("rmsle prediction on train", train_rmsle_pred)


In [None]:
from datetime import datetime

# Load the competition test set, keep its ids aside and run the remaining
# columns through the already-fitted preprocessing pipeline.
testos = pd.read_csv("/kaggle/input/playground-series-s4e12/test.csv")
test_ids = testos['id']  # named test_ids so the builtin id() is not shadowed
testos = testos.drop(columns="id")
testy = pipeline.transform(testos)

current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

# Predict in log1p space, then map back to the original premium scale.
y_testset_pred = predictor.predict(testy)
y_testset_pred = np.expm1(y_testset_pred.values)
premium = pd.DataFrame(y_testset_pred, columns=['Premium Amount'])
premium_round = premium['Premium Amount'].round(3)

# Both pieces carry a default 0..n-1 index, so they line up row by row.
submission = pd.concat([test_ids, premium_round], axis=1)
print(submission.columns)
print(submission.head())
# NOTE(review): both to_csv calls are commented out, so no file is written even
# though the message below says the submission file was updated.
#submission.to_csv('submission.csv', index=False, sep=',')
#submission.to_csv('/kaggle/working/submission.csv', index=False, sep=',')
print("submission file updated")

current_timestamp = datetime.now()
print(f"Current Timestamp: {current_timestamp}")

In [None]:
from autogluon.tabular import TabularPredictor

# Restore the predictor from the directory it was saved to during training.
predictor_dir = "/kaggle/working/"
loaded_predictor = TabularPredictor.load(predictor_dir)

In [None]:
# Get feature importance
feature_importances = predictor.feature_importance(data=transformed_test)
print(feature_importances)

# Visualize feature importance.
# NOTE(review): TabularPredictor does not appear to offer a
# plot_feature_importance method, so the importance table (indexed by feature
# name, with an 'importance' column) is charted directly with pandas.
importance_axes = feature_importances['importance'].sort_values().plot.barh(
    figsize=(8, max(4, 0.3 * len(feature_importances))),
    title='Feature importance on the hold-out set',
)
importance_axes.set_xlabel('importance');

In [None]:
# Print the model leaderboard again; no evaluation data is passed to leaderboard() here.
leaderboard = predictor.leaderboard()
print(leaderboard)

In [None]:

# The return value of model_names() is neither stored nor printed; because it is
# not the last expression of the cell, the notebook will not display it either.
predictor.model_names()
# Column listings of the transformed training frame and of the raw hold-out frame.
print(transformed_train.columns)
print(test_data.columns)