# Q1

In [4]:
!pip install tensorflow
!pip install scikit-learn
!pip install pyarrow
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m121.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m142.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cramjam, 

In [32]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import shuffle

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')
# Show all columns when a DataFrame is displayed (no "..." column truncation).
pd.set_option('display.max_columns', None)

# Data Statistics

## Helper Functions

In [7]:
def display_data(train):
    """Show a quick overview of a DataFrame: head, shape, describe() and info().

    Parameters
    ----------
    train : pd.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        Everything is emitted through ``display`` and stdout.
    """
    # First rows of the frame
    display(train.head())

    # (rows, columns)
    print(f'training dimensions: {train.shape}')

    # describe() summary statistics
    display(train.describe())

    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in display() -- doing so only adds a stray "None" output.
    train.info()


def summarize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarizes key statistics for each column in the DataFrame.

    For numeric (non-boolean) columns the summary holds min, max, range, mean,
    median, variance and the number of +inf / -inf entries. For every other
    column those statistics are NaN, the inf counts are 0, and the column's
    unique values are printed instead. Null / non-null / unique counts and the
    percentage of nulls are reported for all columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarize. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per input column. The output columns are always present, even
        when ``df`` has no columns.
    """
    summary_columns = [
        'column', 'dtype', 'min', 'max', 'range', 'mean', 'median', 'variance',
        '+inf count', '-inf count', 'null count', 'non-null count', '% null',
        'unique values',
    ]
    summary = []
    n_rows = len(df)

    for col in df.columns:
        col_data = df[col]
        col_dtype = col_data.dtype

        # Booleans count as "numeric" for pandas, but arithmetic such as
        # max - min is not supported on numpy booleans, so they are summarized
        # like categorical columns instead.
        is_arithmetic = (
            pd.api.types.is_numeric_dtype(col_data)
            and not pd.api.types.is_bool_dtype(col_data)
        )

        if is_arithmetic:
            col_min = col_data.min(skipna=True)
            col_max = col_data.max(skipna=True)
            col_range = col_max - col_min
            col_mean = col_data.mean(skipna=True)
            col_median = col_data.median(skipna=True)
            col_var = col_data.var(skipna=True)
            col_pos_inf = np.isposinf(col_data).sum()
            col_neg_inf = np.isneginf(col_data).sum()
        else:
            col_min = col_max = col_range = col_mean = col_median = col_var = np.nan
            col_pos_inf = col_neg_inf = 0
            print(f'col: {col} unique values: {list(df[col].unique())}')

        null_count = col_data.isnull().sum()
        notnull_count = col_data.notnull().sum()
        unique_count = col_data.nunique(dropna=True)
        # A frame with zero rows has no meaningful null percentage.
        percent_null = (null_count / n_rows) * 100 if n_rows else np.nan

        summary.append({
            'column': col,
            'dtype': col_dtype,
            'min': col_min,
            'max': col_max,
            'range': col_range,
            'mean': col_mean,
            'median': col_median,
            'variance': col_var,
            '+inf count': col_pos_inf,
            '-inf count': col_neg_inf,
            'null count': null_count,
            'non-null count': notnull_count,
            '% null': round(percent_null, 2),
            'unique values': unique_count
        })

    # Passing the column list keeps the schema stable for an empty input.
    summary_df = pd.DataFrame(summary, columns=summary_columns)
    return summary_df

## Summary Display

In [12]:
# Load the training data from the parquet file next to the notebook,
# read with the fastparquet engine.
train = pd.read_parquet('./training_data_ps2_1.parquet', engine="fastparquet")

In [13]:
# Overview of the training frame: head, shape, describe() and info().
display_data(train)

training dimensions: (41248, 26)
<class 'pandas.core.frame.DataFrame'>
Index: 41248 entries, 0 to 49999
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   age                       41248 non-null  float64 
 1   gender                    41248 non-null  object  
 2   marital_status            41248 non-null  object  
 3   income                    40643 non-null  float64 
 4   education_level           41248 non-null  object  
 5   region                    41248 non-null  object  
 6   credit_score              41248 non-null  float64 
 7   credit_history_length     40632 non-null  float64 
 8   num_credit_accounts       41248 non-null  int64   
 9   debt_to_income_ratio      41248 non-null  float64 
 10  loan_amount               41248 non-null  float64 
 11  employment_length         40018 non-null  float64 
 12  home_ownership            41248 non-null  object  
 13  application_month 

Unnamed: 0,age,gender,marital_status,income,education_level,region,credit_score,credit_history_length,num_credit_accounts,debt_to_income_ratio,loan_amount,employment_length,home_ownership,application_month,application_year,birth_month,monthly_income,annual_debt_payment,credit_utilization,income_rank_in_age_group,favorite_color,zodiac_sign,lucky_number,customer_id,default,generation
0,64.799025,Male,Married,37405.404519,Master,Southwest,850.0,7.255521,6,0.546044,15726.324184,3.061381,Rent,7,2023,9,3117.117043,17394.862328,0.057438,0.462016,Black,Sagittarius,64,419769,1,Boomer
2,35.623183,Female,Married,222415.04358,PhD,Northeast,502.188849,1.243093,9,0.343542,73758.06621,7.476903,Rent,11,2020,2,18534.586965,87486.541698,0.0,0.934655,White,Aquarius,6,905728,1,Gen X
3,18.0,Female,Single,32537.931697,Master,Midwest,767.821597,0.0,8,0.951991,5172.67384,,Rent,8,2023,9,2711.494308,33534.513186,0.41924,0.687334,Orange,Libra,84,406798,1,Gen Z/Millennial
4,35.538915,Male,Married,94658.179973,High School,Midwest,726.25298,8.036533,7,0.739499,7470.415345,3.294087,Mortgage,5,2020,9,7888.181664,70483.702287,0.248995,0.707165,Green,Aries,74,963211,1,Gen X
5,40.308966,Female,Married,66223.027532,Master,Southeast,711.698776,10.966017,2,0.491438,23962.294531,1.091051,Mortgage,7,2020,9,5518.585628,29653.238547,0.367042,0.530887,Red,Aries,54,921143,1,Gen X


Unnamed: 0,age,income,credit_score,credit_history_length,num_credit_accounts,debt_to_income_ratio,loan_amount,employment_length,application_month,application_year,birth_month,monthly_income,annual_debt_payment,credit_utilization,income_rank_in_age_group,lucky_number,customer_id,default
count,41248.0,40643.0,41248.0,40632.0,41248.0,41248.0,41248.0,40018.0,41248.0,41248.0,41248.0,41248.0,41248.0,41248.0,41248.0,41248.0,41248.0,41248.0
mean,40.498122,75481.42,683.759887,6.849559,5.696228,0.555785,25714.422733,5.070533,6.514861,2021.704519,6.475999,6285.11103,35335.87,0.200335,0.499783,50.656395,550413.844453,0.72869
std,14.110044,114532.8,116.22052,6.511978,3.193586,0.245613,50141.170129,5.462642,3.436356,1.069104,3.464564,9515.336738,54250.94,0.136541,0.288669,28.881365,260305.339308,0.444641
min,18.0,12150.0,300.0,0.0,0.0,0.002108,1000.0,0.000139,1.0,2020.0,1.0,1012.5,345.7101,0.0,0.000912,1.0,100021.0,0.0
25%,29.787651,22926.48,604.169439,1.868885,3.0,0.374491,5557.377301,1.532434,4.0,2021.0,3.0,1910.00807,13111.63,0.093165,0.249451,26.0,325127.25,0.0
50%,39.969856,44441.38,690.086237,5.177902,5.0,0.54478,11935.858221,2.781472,7.0,2022.0,6.0,3698.151313,21307.64,0.192906,0.499725,51.0,551646.5,1.0
75%,50.209026,84469.21,776.970249,9.926599,8.0,0.722255,26588.75222,6.94583,9.0,2023.0,10.0,7039.282138,37389.85,0.293005,0.749578,76.0,775476.0,1.0
max,85.0,3125000.0,850.0,50.0,23.0,1.483851,1000000.0,50.0,12.0,2023.0,12.0,260416.666667,1970641.0,0.786949,1.0,100.0,999998.0,1.0


None

In [14]:
# Per-column summary statistics (range, inf / null counts, cardinality);
# unique values of the non-numeric columns are printed above the table.
display(summarize_dataframe(train))

col: gender unique values: ['Male', 'Female', 'Other']
col: marital_status unique values: ['Married', 'Single', 'Divorced', 'Widowed']
col: education_level unique values: ['Master', 'PhD', 'High School', 'Bachelor', 'Some College']
col: region unique values: ['Southwest', 'Northeast', 'Midwest', 'Southeast', 'West', 'Northwest']
col: home_ownership unique values: ['Rent', 'Mortgage', 'Own', 'Other']
col: favorite_color unique values: ['Black', 'White', 'Orange', 'Green', 'Red', 'Yellow', 'Blue', 'Purple']
col: zodiac_sign unique values: ['Sagittarius', 'Aquarius', 'Libra', 'Aries', 'Scorpio', 'Cancer', 'Capricorn', 'Virgo', 'Taurus', 'Leo', 'Gemini', 'Pisces']
col: generation unique values: ['Boomer', 'Gen X', 'Gen Z/Millennial', 'Silent']


Unnamed: 0,column,dtype,min,max,range,mean,median,variance,+inf count,-inf count,null count,non-null count,% null,unique values
0,age,float64,18.0,85.0,67.0,40.498122,39.969856,199.0933,0,0,0,41248,0.0,38361
1,gender,object,,,,,,,0,0,0,41248,0.0,3
2,marital_status,object,,,,,,,0,0,0,41248,0.0,4
3,income,float64,12150.0,3125000.0,3112850.0,75481.420234,44441.382941,13117770000.0,0,0,605,40643,1.47,34209
4,education_level,object,,,,,,,0,0,0,41248,0.0,5
5,region,object,,,,,,,0,0,0,41248,0.0,6
6,credit_score,float64,300.0,850.0,550.0,683.759887,690.086237,13507.21,0,0,0,41248,0.0,37046
7,credit_history_length,float64,0.0,50.0,50.0,6.849559,5.177902,42.40586,0,0,616,40632,1.49,37922
8,num_credit_accounts,int64,0.0,23.0,23.0,5.696228,5.0,10.19899,0,0,0,41248,0.0,24
9,debt_to_income_ratio,float64,0.002108,1.483851,1.481743,0.555785,0.54478,0.06032582,0,0,0,41248,0.0,41248


# Model Training

## Helper Functions

In [39]:
def get_preprocessor(df: pd.DataFrame):
    """Build an (unfitted) ColumnTransformer for the columns of ``df``.

    Numeric columns are median-imputed and standardized; categorical columns
    are imputed with the most frequent value and one-hot encoded. The columns
    'customer_id', 'favorite_color', 'zodiac_sign' and 'lucky_number' and the
    target 'default' are never used as features.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column names and dtypes define the feature lists. It is
        not modified, and nothing is fitted here.

    Returns
    -------
    tuple
        ``(preprocessor, numerical_features, categorical_features)``.
    """
    # Drop irrelevant or high-cardinality columns that don't contribute meaningful signal to the model.
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    print("dropping features: 'customer_id', 'favorite_color', 'zodiac_sign', 'lucky_number'")
    df = df.drop(columns=['customer_id', 'favorite_color', 'zodiac_sign', 'lucky_number'], errors='ignore')

    # Identify categorical columns (i.e., those with dtype 'object' or 'category')
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Identify numerical columns. 'number' covers every numeric width (int32,
    # float32, ...), not only int64/float64, so such columns are not silently
    # left out of the model; boolean columns are not included.
    numerical_features = df.select_dtypes(include='number').columns.tolist()

    # Remove the target variable from the feature lists (to avoid preprocessing it),
    # whichever dtype it happens to have.
    if 'default' in numerical_features:
        print("removing 'default' from numerical_features list")
        numerical_features.remove('default')
    if 'default' in categorical_features:
        print("removing 'default' from categorical_features list")
        categorical_features.remove('default')

    # Create a pipeline for numerical columns:
    # Step 1: Impute missing values using the median
    # Step 2: Standardize the values (zero mean, unit variance)
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Create a pipeline for categorical columns:
    # Step 1: Impute missing values using the most frequent category
    # Step 2: Apply one-hot encoding to convert categories into binary indicator columns
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Ignore unseen categories at inference time
    ])

    # Combine the numerical and categorical pipelines using ColumnTransformer:
    # - Apply num_pipeline to numerical_features
    # - Apply cat_pipeline to categorical_features
    # sparse_threshold=0 forces a dense array; with the default threshold the
    # output type (dense vs. scipy sparse) depends on the density of the data.
    preprocessor = ColumnTransformer(
        [
            ('num', num_pipeline, numerical_features),
            ('cat', cat_pipeline, categorical_features)
        ],
        sparse_threshold=0
    )

    # Return the preprocessor together with both feature lists
    print('number of numerical + categorical features: ', len(numerical_features) + len(categorical_features))
    return preprocessor, numerical_features, categorical_features


# ---------------------- Model Builder ---------------------- #
def build_model(input_dim, neurons1=128, neurons2=64, dropout_rate=0.3, learning_rate=0.001):
    """Build and compile a two-hidden-layer binary classifier.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    neurons1, neurons2 : int
        Units of the first and second hidden Dense layers (ReLU).
    dropout_rate : float
        Dropout rate applied after each hidden layer.
    learning_rate : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    Sequential
        Model with a single sigmoid output, compiled with binary cross-entropy
        loss and the accuracy metric.
    """
    model = Sequential([
        # Declare the input shape with an explicit Input layer instead of the
        # legacy `input_dim=` keyword on the first Dense layer.
        Input(shape=(input_dim,)),
        Dense(neurons1, activation='relu'),
        Dropout(dropout_rate),
        Dense(neurons2, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# Training Function (dual mode)
def train_model(X_tr, y_tr, X_val=None, y_val=None, epochs=30, batch_size=64,
                *, patience=3, verbose=1, **model_params):
    """Build a model with ``build_model`` and fit it with early stopping.

    Parameters
    ----------
    X_tr, y_tr
        Training features (2-D, ``X_tr.shape[1]`` is the input width) and labels.
    X_val, y_val : optional
        Validation features and labels. Validation is used only when both are
        given.
    epochs : int
        Maximum number of epochs.
    batch_size : int
        Mini-batch size.
    patience : int, keyword-only
        Epochs without improvement before training stops.
    verbose : int, keyword-only
        Verbosity passed to ``model.fit``.
    **model_params
        Forwarded to ``build_model`` (neurons1, neurons2, dropout_rate,
        learning_rate).

    Returns
    -------
    The fitted model, with the weights of the best monitored epoch restored
    when early stopping triggered.
    """
    # Build model with flexible parameters
    model = build_model(X_tr.shape[1], **model_params)

    has_validation = X_val is not None and y_val is not None

    # EarlyStopping monitors 'val_loss' by default, which does not exist when
    # no validation data is passed -- the callback would then never trigger.
    # Fall back to the training loss in that case.
    early_stop = EarlyStopping(
        monitor='val_loss' if has_validation else 'loss',
        patience=patience,
        restore_best_weights=True,
    )

    fit_kwargs = dict(epochs=epochs, batch_size=batch_size,
                      callbacks=[early_stop], verbose=verbose)
    if has_validation:
        fit_kwargs['validation_data'] = (X_val, y_val)

    model.fit(X_tr, y_tr, **fit_kwargs)

    return model


# Cross Validation Pipeline
def cross_validate_pipeline(train_df, n_splits=5, epochs=30, batch_size=64, **model_params):
    """Estimate accuracy with stratified k-fold cross-validation.

    For every fold a fresh preprocessor is fitted on the fold's training part
    only, a new model is trained, and accuracy at a 0.5 threshold is measured
    on the held-out part.

    Caveat: the held-out part of each fold is also handed to ``train_model``
    as validation data for early stopping, so the reported accuracies are
    slightly optimistic.

    Parameters
    ----------
    train_df : pd.DataFrame
        Features plus the binary target column 'default'. Not modified.
    n_splits : int
        Number of folds.
    epochs, batch_size : int
        Passed to ``train_model``.
    **model_params
        Passed through ``train_model`` to ``build_model``.

    Returns
    -------
    list of float
        Accuracy of each fold, in fold order.
    """
    features = train_df.copy()
    target = features.pop('default')

    # Stratified folds keep the class ratio of 'default' identical in every
    # fold. shuffle=True already randomizes the row order, so no separate
    # shuffle of the frame is needed.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_accuracies = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(features, target), 1):
        print(f"Fold {fold}")

        # Split data
        df_train, df_val = features.iloc[train_idx], features.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        # Preprocess inside the fold (fit on the training part only)
        preprocessor, num_features, cat_features = get_preprocessor(df_train)
        X_train = preprocessor.fit_transform(df_train)
        X_val = preprocessor.transform(df_val)

        # Train model for current fold
        model = train_model(X_train, y_train, X_val, y_val,
                            epochs=epochs, batch_size=batch_size, **model_params)

        # Predict and evaluate (verbose=0 keeps progress bars out of the output)
        preds = (model.predict(X_val, verbose=0) > 0.5).astype(int).ravel()
        acc = accuracy_score(y_val, preds)
        fold_accuracies.append(acc)
        print(f"Fold {fold} Accuracy: {acc:.4f}")

    print(f"\nCross-validation results ({n_splits} folds):")
    for i, acc in enumerate(fold_accuracies, 1):
        print(f" - Fold {i}: {acc:.4f}")
    print(f"Average Accuracy: {np.mean(fold_accuracies):.4f}")

    return fold_accuracies


## Hyperparameters

In [44]:
# Model hyperparameters, forwarded to build_model() as keyword arguments:
# hidden layer sizes, dropout rate and Adam learning rate.
model_params = {'neurons1': 128, 'neurons2': 64, 'dropout_rate': 0.3, 'learning_rate': 0.001}
# Training settings shared by cross-validation and the final fit:
# maximum number of epochs (early stopping may end training sooner) ...
epochs = 30
# ... and the mini-batch size.
batch_size = 64

## Cross Validation

In [45]:
# Run 5-fold cross-validation with the hyperparameters defined above.
# The per-fold accuracies are printed by the function; the returned list is
# not kept here.
cross_validate_pipeline(train, n_splits=5, epochs=epochs, batch_size=batch_size, **model_params)
print('done')

Fold 1
dropping features: 'customer_id', 'favorite_color', 'zodiac_sign', 'lucky_number'
number of numerical + categorical features:  21
Epoch 1/30
[1m  1/516[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7:45[0m 904ms/step - accuracy: 0.4375 - loss: 0.7918[1m 41/516[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.6100 - loss: 0.6489    [1m 82/516[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.6561 - loss: 0.6081[1m124/516[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.6759 - loss: 0.5875[1m166/516[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.6883 - loss: 0.5745

In [46]:
# Get total number of rows in the training DataFrame
dim = train.shape[0]

# Separate features and target
X_full = train.drop(columns=['default'])  # All input features
y_full = train['default']  # Target labels

# Compute the split index (first 95% for training, last 5% for validation)
# NOTE(review): this is a positional split without shuffling or stratification;
# confirm the rows of the parquet file are not ordered in a meaningful way.
split_idx = int(dim * 0.95)

# Split features and labels accordingly
X_tr = X_full.iloc[:split_idx]  # First 95% for training
X_val = X_full.iloc[split_idx:]  # Last 5% for validation
y_tr = y_full.iloc[:split_idx]
y_val = y_full.iloc[split_idx:]

# Build and fit preprocessor on the training data only (important to avoid data leakage)
preprocessor, num_features, cat_features = get_preprocessor(X_tr)
X_tr_transformed = preprocessor.fit_transform(X_tr)
X_val_transformed = preprocessor.transform(X_val)

# Train the model using early stopping with the validation set
model = train_model(X_tr_transformed, y_tr, X_val_transformed, y_val,
                    epochs=epochs,
                    batch_size=batch_size,
                    **model_params
                    )

# Bundle the objects into a dictionary
bundle = {
    'model': model,
    'preprocessor': preprocessor
}

# Save the bundle and report the path that was actually written
# (the message previously named a file that is never created).
bundle_path = 'model.pkl'
joblib.dump(bundle, bundle_path)
print(f"Model and preprocessor saved to '{bundle_path}'")

dropping features: 'customer_id', 'favorite_color', 'zodiac_sign', 'lucky_number'
number of numerical + categorical features:  21
Epoch 1/30
[1m  1/613[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m7:40[0m 753ms/step - accuracy: 0.6562 - loss: 0.6992[1m 42/613[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.7242 - loss: 0.5894    [1m 84/613[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.7295 - loss: 0.5660[1m126/613[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.7319 - loss: 0.5531[1m168/613[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.7337 - loss: 0.5452

# Inference Test

In [57]:
# Load the saved bundle and unpack the model and the fitted preprocessor.
# joblib.load unpickles the file, so only load bundles from a trusted source.
pkl = joblib.load('./model.pkl')
model = pkl['model']
preprocessor = pkl['preprocessor']

# Load the inference input and drop the target column if it is present.
# NOTE(review): this reads the same parquet file that was used for training,
# so the cell is a smoke test of the saved pipeline, not a held-out evaluation.
test = pd.read_parquet('./training_data_ps2_1.parquet', engine='fastparquet').drop(columns=['default'], errors='ignore')

# Apply the already-fitted preprocessor (transform only, no re-fitting)
X_test = preprocessor.transform(test)

# Model outputs, treated as probabilities below
probs = model.predict(X_test)

# Convert probabilities to 0/1 using 0.5 threshold
predictions = (probs > 0.5).astype(int)

# Create a single-column DataFrame for submission
submission_df = pd.DataFrame({'default': predictions.flatten()})

# Save to CSV without the index column
submission_df.to_csv('sample_submission.csv', index=False)

print("Predictions saved to sample_submission.csv")
display(submission_df)

[1m   1/1289[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m54s[0m 42ms/step[1m  55/1289[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 935us/step[1m 119/1289[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 857us/step[1m 181/1289[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 841us/step[1m 246/1289[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 823us/step[1m 310/1289[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 815us/step[1m 376/1289[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 806us/step[1m 440/1289[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 803us/step[1m 500/1289[0m [32m━━━━━━━[0

Unnamed: 0,default
0,1
1,1
2,1
3,1
4,1
...,...
41243,1
41244,1
41245,1
41246,1
