# Risk Scoring & Underwriting
Risk Scoring and Underwriting are critical components in the financial and insurance industries, particularly in the evaluation and management of risk associated with lending, insurance policies, or other financial products.
## 1. Data Integration & Exploration

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import logging
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
class ARDPreprocessor:
    """
    Automated Risk Data (ARD) Preprocessing Module
    Handles missing values, encodes categorical variables, and scales numerical features.

    Numeric columns are imputed and then scaled; categorical columns are
    imputed and then encoded. Both branches are combined in a single
    ``ColumnTransformer`` that is built and fitted by :meth:`fit`.

    Args:
        num_strategy: ``SimpleImputer`` strategy for numeric columns.
        cat_strategy: ``SimpleImputer`` strategy for categorical columns.
        scaler: Final step of the numeric pipeline. When omitted, a fresh
            ``StandardScaler()`` is created for this instance. Any other
            value (including ``None``) is placed in the pipeline as given.
        encoder: Final step of the categorical pipeline. When omitted, a
            fresh ``OneHotEncoder(handle_unknown='ignore')`` is created for
            this instance. Any other value is placed in the pipeline as given.
    """

    # Sentinel for "argument not supplied". Using a sentinel instead of an
    # estimator instance as the default avoids one scaler/encoder object being
    # shared by every ARDPreprocessor, while still letting callers pass None.
    _DEFAULT = object()

    def __init__(self, num_strategy='median', cat_strategy='most_frequent', scaler=_DEFAULT, encoder=_DEFAULT):
        self.num_imputer = SimpleImputer(strategy=num_strategy)
        self.cat_imputer = SimpleImputer(strategy=cat_strategy)
        self.scaler = StandardScaler() if scaler is self._DEFAULT else scaler
        self.encoder = (
            OneHotEncoder(handle_unknown='ignore') if encoder is self._DEFAULT else encoder
        )
        # Stays None until fit() completes successfully.
        self.preprocessor = None

        # Configure logging to display messages at the INFO level and above.
        # NOTE: basicConfig touches the root logger and is a no-op if the root
        # logger already has handlers.
        logging.basicConfig(level=logging.INFO)

    def fit(self, df, numeric_features, categorical_features):
        """Build the preprocessing pipeline and fit it to ``df``.

        Args:
            df: Input data containing the listed columns.
            numeric_features: Columns routed through imputer -> scaler.
            categorical_features: Columns routed through imputer -> encoder.

        Returns:
            This instance, so calls can be chained.

        Raises:
            Exception: Any error raised while building or fitting the
                pipeline is logged and re-raised unchanged. In that case
                ``self.preprocessor`` keeps its previous value.
        """
        try:
            numeric_pipeline = Pipeline([
                ('imputer', self.num_imputer),
                ('scaler', self.scaler),
            ])
            categorical_pipeline = Pipeline([
                ('imputer', self.cat_imputer),
                ('encoder', self.encoder),
            ])
            preprocessor = ColumnTransformer([
                ('num', numeric_pipeline, numeric_features),
                ('cat', categorical_pipeline, categorical_features),
            ])
            preprocessor.fit(df)
        except Exception as e:
            logging.error("Error during fitting: %s", e)
            raise

        # Publish only a successfully fitted transformer, so a failed fit never
        # leaves a half-initialised object behind for transform() to trip over.
        self.preprocessor = preprocessor
        logging.info("Preprocessor fitted successfully.")
        return self

    def transform(self, df):
        """Apply the fitted pipeline to ``df`` and return a DataFrame.

        Args:
            df: Input data with the same columns that were used in fit().

        Returns:
            A dense ``pandas.DataFrame`` whose columns are the transformer's
            output feature names. When ``df`` is a DataFrame its index is
            carried over so rows stay aligned with the input.

        Raises:
            sklearn.exceptions.NotFittedError: If called before fit(). This
                type subclasses both ``ValueError`` and ``AttributeError``.
            Exception: Any other transformation error is logged and re-raised.
        """
        try:
            if self.preprocessor is None:
                # Imported lazily: only needed on this error path.
                from sklearn.exceptions import NotFittedError
                raise NotFittedError(
                    "ARDPreprocessor is not fitted yet; call fit() or fit_transform() first."
                )

            transformed_data = self.preprocessor.transform(df)
            feature_names = self.preprocessor.get_feature_names_out()

            # ColumnTransformer may return a SciPy sparse matrix (e.g. with a
            # sparse one-hot encoder); pd.DataFrame needs a dense 2-D array.
            if hasattr(transformed_data, "toarray"):
                transformed_data = transformed_data.toarray()

            row_index = df.index if isinstance(df, pd.DataFrame) else None
            return pd.DataFrame(transformed_data, columns=feature_names, index=row_index)
        except Exception as e:
            logging.error("Error during transformation: %s", e)
            raise

    def fit_transform(self, df, numeric_features, categorical_features):
        """Fit the pipeline on ``df`` and return the transformed ``df``.

        Equivalent to calling fit() followed by transform() on the same data.
        """
        self.fit(df, numeric_features, categorical_features)
        return self.transform(df)


The `ARDPreprocessor` class is a modular and reusable preprocessing module.

- It handles missing values, scales numeric features, and encodes categorical features.

- It uses `Pipeline` and `ColumnTransformer` for a clean and consistent workflow.

- It includes error handling and logging for robustness.

- The `fit_transform` method provides a convenient way to fit and transform data in one step.

Here's a breakdown of the steps:

1. Fitting:

- Each `SimpleImputer` learns its fill value: by default the median of every numeric feature and the most frequent category of every categorical feature (other strategies can be specified).

- The `StandardScaler` computes the mean and standard deviation of the numeric features.

2. Transformation:

- The `SimpleImputer` replaces missing values with the learned statistic (the median by default for numeric features).

- The `StandardScaler` standardizes the features using the computed mean and standard deviation.