# Wide and Deep Networks for Credit Score Classification

By: Joe, Sellett, Haiyan Cai, and Cole Wagner

In [20]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import (
    Activation,
    Concatenate,
    Dense,
    Embedding,
    Flatten,
    Input,
)
from tensorflow.keras.utils import FeatureSpace


In [21]:
credit_df = pd.read_csv("credit_score_cleaned.csv")

In [22]:
credit_df["payment_behaviour"].value_counts()

payment_behaviour
Low_spent_Small_value_payments      26503
High_spent_Medium_value_payments    18431
Low_spent_Medium_value_payments     14516
High_spent_Large_value_payments     14438
High_spent_Small_value_payments     11850
Low_spent_Large_value_payments      10958
Name: count, dtype: int64

## Data Preparation

### Drop Unnecessary Columns

Before proceeding with the modeling phase of this project, we will remove the following variables: customer_id, name, ssn, and type_of_loan. The customer_id field is being excluded because we already have a more robust unique identifier, id, which will serve as our primary reference for credit score reports. Similarly, name and ssn offer no predictive value and are being dropped to maintain data privacy and reduce dimensionality. Each of these variables contains approximately 8,000–12,000 unique values, whereas id contains over 96,000. Lastly, type_of_loan is being excluded for its high number of categories (50+), which would introduce unnecessary complexity. Instead, we will rely on the credit_mix variable, which summarizes loan diversity in a more manageable form as a category with only 3 unique values (standard, good, and bad).

In [23]:
credit_df = credit_df.drop(
    columns=["customer_id", "name", "ssn", "type_of_loan"]
)

### Create Feature Space for Preprocessing

In [24]:
def create_dataset_from_dataframe(
    x_input: pd.DataFrame, y_input: pd.Series, batch_size: int
) -> tf.data.Dataset:
    """Convert a pandas dataframe to a TensorFlow Dataset.

    Parameters
    ----------
    x_input : pd.DataFrame
        The input pandas dataframe containing the features.
    y_input : pd.Series
        The input pandas series containing the labels.
    batch_size : int
        The number of rows per batch in the TensorFlow Dataset.

    Returns
    -------
    tf.data.Dataset
        A TensorFlow Dataset object created from the input dataframe.

    """
    df_dict = {
        key: value.to_numpy()[:, np.newaxis]
        for key, value in x_input.items()
    }

    tf_ds = tf.data.Dataset.from_tensor_slices((dict(df_dict), y_input))
    tf_ds = tf_ds.batch(batch_size)
    return tf_ds.prefetch(batch_size)


In [25]:
# Sample schema based on the dataframe info
categorical_features = [
    "month",
    "occupation",
    "credit_mix",
    "payment_of_min_amount",
    "payment_behaviour",
]
numeric_features = [
    "age",
    "annual_income",
    "monthly_inhand_salary",
    "credit_history_age",
    "total_emi_per_month",
    "num_bank_accounts",
    "num_credit_card",
    "interest_rate",
    "num_of_loan",
    "delay_from_due_date",
    "num_of_delayed_payment",
    "changed_credit_limit",
    "num_credit_inquiries",
    "outstanding_debt",
    "credit_utilization_ratio",
    "amount_invested_monthly",
    "monthly_balance",
]

# Define feature configs
feature_space = FeatureSpace(
    features={
        **{
            name: FeatureSpace.string_categorical(num_oov_indices=0)
            for name in categorical_features
        },
        **{
            name: FeatureSpace.float_normalized()
            for name in numeric_features
        },
    },
    crosses=[
        FeatureSpace.cross(
            feature_names=("occupation", "credit_mix"),
            crossing_dim=15 * 3,
        ),
        FeatureSpace.cross(
            feature_names=("payment_of_min_amount", "payment_behaviour"),
            crossing_dim=6 * 2,
        ),
    ],
    output_mode="concat",
)

### Cross-Product Feature Justification

First, we created a cross-product feature between `occupation` and `credit_mix`. This combination allows us to capture differences in credit behavior across various professional backgrounds. For example, a neurosurgeon with a bad credit mix may exhibit very different financial behavior compared to an unemployed individual with the same credit mix. While each variable on its own may offer limited insight, their combination provides a more nuanced understanding of how occupation and credit diversity interact.

Another cross-product feature we created combines `payment_of_min_amount` and `payment_behavior`. The `payment_of_min_amount` variable is a binary indicator showing whether an individual made only the minimum payment on their debt for that month. In contrast, `payment_behavior` provides a broader description of a person’s spending and repayment patterns, such as “low spent, high payments” or “high spent, medium payments.” Since these two variables are closely related, their combination may help the model better capture nuanced repayment behaviors and improve its ability to distinguish between risk profiles.

### Performance Metric Justification

Given the nature of our project, it’s important to evaluate our model using multiple metrics rather than relying solely on accuracy. In credit risk classification, false predictions carry different levels of business risk. For example, if a high-risk individual is incorrectly classified as low-risk, the company may absorb the financial loss from a bad loan. This makes recall especially important, as it tells us how well the model identifies actual high-risk cases and helps minimize false negatives. At the same time, precision matters because it reflects how accurate our high-risk predictions are, which ensures we don’t wrongly classify low-risk individuals as high-risk. A high recall means we’re catching most of the truly risky borrowers, while a high precision score means we’re correctly labeling them. Since both metrics are critical and often trade off against each other, we focus on the F1 score, which represents the harmonic mean of precision and recall. The F1 score gives us a more balanced and realistic measure of performance, especially in a setting where both catching risky borrowers and avoiding false alarms are essential to the business.

### Data Splitting

We have chosen to use a standard 80/20 train-test split for dividing our dataset. Given the size of our data (approximately 100,000 observations) we believe this approach is justified and will provide a reliable estimate of model performance. If our dataset were significantly smaller (around 1,000 observations), we might opt for 10-fold cross-validation to obtain a more stable and generalized result. Additionally, the 80/20 split offers a clear advantage in terms of computational efficiency. While 10-fold cross-validation could yield a marginal improvement in performance estimates, it would come at a considerable computational cost that is unnecessary given the scale of our data.

In [26]:
x_train, x_test, y_train, y_test = train_test_split(
    credit_df.drop(columns=["credit_score", "id"]),
    credit_df["credit_score"],
    test_size=0.2,
    random_state=7324,
    stratify=credit_df["credit_score"],
)

In [27]:
# Convert data to TensorFlow Datasets
train_ds = create_dataset_from_dataframe(x_train, y_train, batch_size=32)
test_ds = create_dataset_from_dataframe(x_test, y_test, batch_size=32)

In [28]:
# Apply feature space to datasets
train_ds_no_labels = train_ds.map(lambda x, _: x)
feature_space.adapt(train_ds_no_labels)
processed_train_ds = train_ds.map(
    lambda x, y: (feature_space(x), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)
processed_train_ds = processed_train_ds.prefetch(tf.data.AUTOTUNE)

test_ds_no_labels = test_ds.map(lambda x, _: x)
processed_test_ds = test_ds.map(
    lambda x, y: (feature_space(x), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)
processed_test_ds = processed_test_ds.prefetch(tf.data.AUTOTUNE)


2025-04-12 16:23:51.462825: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


# Modeling

In [29]:
from tensorflow.keras.layers import Embedding, Flatten


def setup_embedding_from_categorical(feature_space, col_name):
    # what the maximum integer value for this variable?
    # which is the same as the number of categories
    N = len(feature_space.preprocessors[col_name].get_vocabulary())

    # get the output from the feature space, which is input to embedding
    x = feature_space.preprocessors[col_name].output

    # now use an embedding to deal with integers from feature space
    x = Embedding(
        input_dim=N,
        output_dim=int(np.sqrt(N)),
        input_length=1,
        name=col_name + "_embed",
    )(x)

    return Flatten()(x)


In [30]:
def setup_embedding_from_crossing(feature_space, col_name):
    # what the maximum integer value for this variable?

    # get the size of the feature
    N = feature_space.crossers[col_name].num_bins
    x = feature_space.crossers[col_name].output

    # now use an embedding to deal with integers as if they were one hot encoded
    x = Embedding(
        input_dim=N,
        output_dim=int(np.sqrt(N)),
        input_length=1,
        name=col_name + "_embed",
    )(x)

    return Flatten()(x)


In [31]:
def setup_embedding_from_encoding(encoded_features, col_name):
    # what the maximum integer value for this variable?

    # get the size of the feature
    x = encoded_features[col_name]
    N = x.shape[1]

    # now use an embedding to deal with integers as if they were one hot encoded
    x = Embedding(
        input_dim=N,
        output_dim=int(np.sqrt(N)),
        input_length=1,
        name=col_name + "_embed",
    )(x)

    x = Flatten()(
        x
    )  # get rid of that pesky extra dimension (for time of embedding)

    return x

In [None]:
def build_wide_branches(dict_inputs, crossed_columns):
    encoded_features = feature_space.get_encoded_features()

    # we need to create separate lists for each branch
    crossed_outputs = []

    # for each crossed variable, make an embedding
    for col in feature_space.crossers:
        x = setup_embedding_from_encoding(encoded_features, col)

        # save these outputs in list to concatenate later
        crossed_outputs.append(x)

    # now concatenate the outputs and add a fully connected layer
    return Concatenate(name="wide_concat")(crossed_outputs)


In [33]:
def build_deep_branches(encoded_features, hidden_units=[64, 32]):
    # reset this input branch
    all_deep_branch_outputs = []

    # for each numeric variable, just add it in after embedding
    for col in numeric_features:
        x = encoded_features[col]
        # x = tf.cast(x,float) # cast an integer as a float here
        all_deep_branch_outputs.append(x)

    # for each categorical variable
    for col in categorical_features:
        # get the output tensor from ebedding layer
        x = setup_embedding_from_encoding(encoded_features, col)

        # save these outputs in list to concatenate later
        all_deep_branch_outputs.append(x)
    deep_branch = Concatenate(name="embed_concat")(all_deep_branch_outputs)
    i = 0
    for layers in hidden_units:
        name = "deep" + i
        deep_branch = Dense(unit=layers, activation="relu", name=name)(
            deep_branch
        )

    return deep_branch

In [34]:
from tensorflow.keras.utils import plot_model


def build_wide_deep_networks(crossed_columns, hidden_units=[64, 32]):
    dict_inputs = feature_space.get_inputs()  # need to use unprocessed features here, to gain access to each output
    encoded_features = (
        feature_space.get_encoded_features()
    )  # these features have been encoded

    wide_branch = build_wide_branches(dict_inputs, crossed_columns)
    deep_branch = build_deep_branches(encoded_features, hidden_units)

    # merge the deep and wide branch
    final_branch = Concatenate(name="concat_deep_wide")(
        [deep_branch, wide_branch]
    )
    final_branch = Dense(units=1, activation="sigmoid", name="combined")(
        final_branch
    )

    # encoded features input, fast
    training_model = keras.Model(
        inputs=encoded_features, outputs=final_branch
    )
    training_model.compile(
        optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
    )

    training_model.summary()

    # non-encoded, perform redundant operations
    inference_model = keras.Model(inputs=dict_inputs, outputs=final_branch)
    inference_model.compile(
        loss="binary_crossentropy", metrics=["accuracy"]
    )

    plot_model(
        training_model,
        to_file="model.png",
        show_shapes=True,
        show_layer_names=True,
        rankdir="LR",
        expand_nested=False,
        dpi=96,
    )
    return training_model

In [35]:
feature_space.get_encoded_features()

<KerasTensor shape=(None, 108), dtype=float32, sparse=False, ragged=False, name=keras_tensor_63>

In [36]:
crossed_columns = [
    ("occupation", "credit_mix"),
    ("payment_of_min_amount", "payment_behaviour"),
]
hidden_units = [64, 32]
training_model = build_wide_deep_networks(crossed_columns, hidden_units)


history = training_model.fit(
    processed_train_ds,
    epochs=10,
    validation_data=processed_test_ds,
    verbose=2,
)

AttributeError: 'FeatureSpace' object has no attribute 'get_crossed_encoded_features'