<a href="https://colab.research.google.com/github/fabriziobasso/Colab_backup/blob/main/s04e12_insurance_regression_simple_dl_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Phase 1: Loading Data and Preprocessing

In [None]:
# Importing necessary libraries for data manipulation and machine learning
import pandas as pd
import numpy as np

# Importing preprocessing and pipeline tools from scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Importing deep learning tools from TensorFlow/Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Suppressing warnings to keep the output clean
import warnings
warnings.filterwarnings('ignore')  # Disable warnings during execution


In [None]:
class Constant_Var:
    """Namespace of notebook-wide constants: dataset file paths and the target column name.

    The class is never instantiated in this notebook; its attributes are read
    directly (e.g. ``Constant_Var.train_path``).
    """

    # Absolute locations of the competition CSV files.
    # NOTE(review): these point under /kaggle/input; confirm the files are
    # available at the same paths when the notebook is opened elsewhere (e.g. Colab).
    train_path = "/kaggle/input/playground-series-s4e12/train.csv"
    test_path = "/kaggle/input/playground-series-s4e12/test.csv"
    # Not referenced by any other cell visible in this notebook.
    sample_sub_path = "/kaggle/input/playground-series-s4e12/sample_submission.csv"

    # Name of the column the model is trained to predict
    target = 'Premium Amount'

In [None]:
def get_data():
    """Load the train/test CSVs and return preprocessed, model-ready features.

    Steps:
      1. Read both CSVs with the ``id`` column as index and drop
         ``'Policy Start Date'``.
      2. Split the training frame into features and the target column named by
         ``Constant_Var.target``.
      3. Fit a ``ColumnTransformer`` on the training features only
         (numeric: mean imputation + standard scaling; object-typed:
         most-frequent imputation + one-hot encoding) and apply it to both sets.

    Returns
    -------
    tuple
        ``(X_preprocessed_df, y, X_test_preprocessed_df)``.
        The two feature frames are dense, share the same columns (the
        transformer's output feature names) and carry a fresh positional
        ``0..n-1`` index -- the original ``id`` values are NOT preserved on
        them. ``y`` is the raw target Series and keeps the ``id`` index.
    """
    # Read the train and test data, using the 'id' column as the index
    train = pd.read_csv(Constant_Var.train_path, index_col='id')
    test = pd.read_csv(Constant_Var.test_path, index_col='id')

    # 'Policy Start Date' is not used as a feature. Rebinding (rather than
    # inplace=True) keeps every statement free of hidden mutation.
    train = train.drop(columns='Policy Start Date')
    test = test.drop(columns='Policy Start Date')

    # Separate features from the target
    X = train.drop(columns=Constant_Var.target)
    y = train[Constant_Var.target]
    X_test = test.copy()

    # Column groups are taken from the feature frame, so the target can never
    # leak into either list regardless of its dtype.
    categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
    numerical_columns = X.select_dtypes(exclude=['object']).columns.tolist()

    # Preprocessing for categorical features: Imputation + OneHotEncoding
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Preprocessing for numerical features: Imputation + Scaling
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Combine both transformations
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_columns),
            ('cat', categorical_transformer, categorical_columns)
        ],
        remainder='drop',  # Drop any columns not specified in transformers
        # Always return a dense array. With the default threshold (0.3) the
        # stacked output silently becomes a scipy sparse matrix whenever the
        # one-hot block makes it sparse enough, and the plain DataFrame
        # constructor below cannot wrap that.
        sparse_threshold=0,
        verbose_feature_names_out=True
    )

    # Fit on the training features only, then reuse the fitted transformer
    X_preprocessed = preprocessor.fit_transform(X)
    feature_names = preprocessor.get_feature_names_out()

    # Wrap the arrays in DataFrames. No index is passed on purpose: downstream
    # code in this notebook relies on the positional 0..n-1 index.
    X_preprocessed_df = pd.DataFrame(data=X_preprocessed, columns=feature_names)

    X_test_preprocessed = preprocessor.transform(X_test)
    X_test_preprocessed_df = pd.DataFrame(data=X_test_preprocessed, columns=feature_names)

    return X_preprocessed_df, y, X_test_preprocessed_df

# Phase 2: Building and Optimizing the Neural Network for Regression

In [None]:
# Define the RMSLE loss function
def rmsle_1(y_true, y_pred):
    """Root mean squared logarithmic error, usable as a Keras loss.

    Computes ``sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2))`` over all
    elements.

    The ``+ 1`` is part of the RMSLE definition (it maps 0 to 0), not a tiny
    epsilon: on its own it only protects against inputs down to -1. A model
    with an unbounded (linear) output can emit values at or below -1, for which
    the logarithm is NaN/-inf, and a single NaN loss corrupts every weight.
    Both inputs are therefore clamped at 0 before taking the logarithm.

    Args:
        y_true: Ground-truth values (tensor or array-like).
        y_pred: Predicted values (tensor or array-like of a floating dtype).

    Returns:
        A scalar tensor holding the RMSLE.
    """
    # Align dtypes so mixed float32/float64 inputs do not fail on subtraction
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)

    # log1p(x) == log(1 + x), evaluated accurately for small x
    y_true_log = tf.math.log1p(tf.maximum(y_true, 0.0))
    y_pred_log = tf.math.log1p(tf.maximum(y_pred, 0.0))

    # Compute the RMSLE
    return tf.sqrt(tf.reduce_mean(tf.square(y_true_log - y_pred_log)))

# Define the RMSLE loss function
def rmsle_2(y_true, y_pred):
    """Alias of ``rmsle_1``, kept so existing references to ``rmsle_2`` still work.

    Delegating instead of repeating the formula guarantees both names always
    compute exactly the same loss.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        Whatever ``rmsle_1(y_true, y_pred)`` returns (a scalar tensor).
    """
    return rmsle_1(y_true, y_pred)


# Build the model
def build_model(input_dim, learning_rate=0.001):
    """Build and compile the fully connected regression network.

    Architecture: ``input_dim`` inputs -> Dense(64, relu) -> Dense(32, relu)
    -> Dense(16, relu) -> Dense(1) with no activation.

    Args:
        input_dim: Number of input features per sample.
        learning_rate: Step size for the Adam optimizer (default 0.001).

    Returns:
        A compiled Keras ``Sequential`` model that uses ``rmsle_1`` as its loss.
    """
    model = Sequential([
        # Declare the input explicitly instead of passing the legacy
        # `input_dim=` keyword to the first Dense layer, which newer Keras
        # releases flag with a deprecation warning.
        tf.keras.Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(1)  # Output layer for regression (no activation function)
    ])

    # Compile the model with RMSLE as the loss function
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=rmsle_1)

    return model

# Prepare and train the model
def train_model(X_train, y_train):
    """Build a fresh network sized to ``X_train`` and fit it.

    Args:
        X_train: 2-D feature table; its second dimension (``shape[1]``) sets
            the network's input width.
        y_train: Target values aligned row-for-row with ``X_train``.

    Returns:
        The fitted model returned by ``build_model``.
    """
    # One input unit per feature column
    net = build_model(X_train.shape[1])

    # Stop once 'val_loss' has not improved for 10 epochs and roll back to the
    # best weights seen, which limits overfitting.
    stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    fit_options = dict(
        epochs=100,
        batch_size=512,
        validation_split=0.2,  # fraction of the training data held out for validation
        callbacks=[stopper],
        verbose=1,
    )
    net.fit(X_train, y_train, **fit_options)

    return net

# Phase 3: Predictions on the Test Data

In [None]:
# Step 1: Generate predictions from the trained model
def make_predictions(model, X_test):
    """Run ``model.predict`` on ``X_test`` and hand back its result unchanged.

    Args:
        model: Any object exposing a ``predict(data)`` method.
        X_test: The data to pass to ``predict``.

    Returns:
        Exactly what ``model.predict(X_test)`` returns.
    """
    return model.predict(X_test)

# Step 2: Prepare the final submission DataFrame
def create_submission(test, predictions, id_offset=1200000, output_path='submission.csv'):
    """Write the predictions to a two-column submission CSV.

    The ``id`` column is rebuilt as ``test.index + id_offset``. The default
    offset exists because the feature frames produced by ``get_data`` carry a
    positional ``0..n-1`` index rather than the original ids.
    NOTE(review): 1200000 is presumably the first id in test.csv -- verify
    against the raw file. Pass ``id_offset=0`` when ``test.index`` already
    holds the real ids.

    Args:
        test: DataFrame whose index identifies the test rows.
        predictions: Model output, one value per test row. Any array-like
            shape (e.g. ``(n, 1)``) is flattened to 1-D.
        id_offset: Integer added to ``test.index`` to obtain the submitted ids.
        output_path: Destination of the CSV file.

    Returns:
        None. The file is written as a side effect and a confirmation is printed.

    Raises:
        ValueError: If the number of predictions differs from ``len(test)``
            (raised by pandas while assembling the frame).
    """
    submission_df = pd.DataFrame({
        'id': test.index + id_offset,
        # Accept lists/tensors as well as ndarrays and collapse to 1-D
        'Premium Amount': np.asarray(predictions).ravel()
    })

    # Save the predictions to a CSV file for submission
    submission_df.to_csv(output_path, index=False)

    # Notify the user that the submission file has been successfully created
    print(f"Submission file '{output_path}' created successfully.")

In [None]:
# Load the CSVs and build the preprocessed training features, the training
# target, and the preprocessed test features (all three come from get_data)
X_train, y_train, X_test = get_data()

In [None]:
# Train the model on the preprocessed training set
model = train_model(X_train, y_train)

# Run the trained model on the preprocessed test features
predictions = make_predictions(model, X_test)

In [None]:
# Create and save the submission file.
# X_test is passed only so create_submission can read its index to build the ids.
create_submission(X_test, predictions)