In [1]:
from src.preprocessing.extraction import load_data, preprocess_data, classify_loans
from src.preprocessing.data_cleaning import define_features

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder, CountEncoder

from typing import List


class DataEncoder:
    def __init__(self, num_feats: List[str], target_feats: List[str], freq_feats: List[str]):
        self.num_feats = num_feats
        self.target_feats = target_feats
        self.freq_feats = freq_feats
        self.target_encoder = TargetEncoder(cols=target_feats)
        self.freq_encoder = CountEncoder(cols=freq_feats)

    def fit_transform(self, train_df: pd.DataFrame, target: str):
        # Encode target features
        train_target_encoded = self.target_encoder.fit_transform(train_df[self.target_feats], train_df[target])
        train_target_encoded.columns = [f"{col}_target" for col in self.target_feats]
        
        # Encode frequency features
        train_freq_encoded = self.freq_encoder.fit_transform(train_df[self.freq_feats])
        train_freq_encoded.columns = [f"{col}_freq" for col in self.freq_feats]
        
        # Select numerical features
        train_num_feats = train_df[self.num_feats]
        
        # Combine all features
        train_encoded = pd.concat([train_target_encoded, train_freq_encoded, train_num_feats], axis=1)
        
        return train_encoded

    def transform(self, test_df: pd.DataFrame):
        # Encode target features
        test_target_encoded = self.target_encoder.transform(test_df[self.target_feats])
        test_target_encoded.columns = [f"{col}_target" for col in self.target_feats]
        
        # Encode frequency features
        test_freq_encoded = self.freq_encoder.transform(test_df[self.freq_feats])
        test_freq_encoded.columns = [f"{col}_freq" for col in self.freq_feats]
        
        # Select numerical features
        test_num_feats = test_df[self.num_feats]
        
        # Combine all features
        test_encoded = pd.concat([test_target_encoded, test_freq_encoded, test_num_feats], axis=1)
        
        return test_encoded

In [2]:
loan_path = "./data/raw/loan.csv"
payment_path = "./data/raw/payment.csv"
underwriting_path = "./data/raw/clarity_underwriting_variables.csv"

# Load data
loan_df, payment_df, underwriting_df = load_data(loan_path, payment_path, underwriting_path)

# Preprocess data
df = preprocess_data(loan_df, underwriting_df)

# Classify loans
df = classify_loans(df)

num_feats, freq_feats, target_feats, predictor = define_features()

selected_features = list(set(num_feats + freq_feats + target_feats + predictor))

# Split data into train and test sets
df = df[selected_features]
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_df = train_df[selected_features]

# Initialize the encoder
encoder = DataEncoder(num_feats, target_feats, freq_feats)

# Encode the training data
X_train = encoder.fit_transform(train_df, predictor)
y_train = train_df[predictor]

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

X_test = encoder.transform(test_df)
y_test = test_df[predictor]

  underwriting_df = pd.read_csv(underwriting_path)


Loan df shape :(577682, 19)
Payment df shape :(689364, 9)
Underwriting df shape :(49752, 54)


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [20]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import lightgbm as lgb
from sklearn.metrics import f1_score

# Create LightGBM datasets
dtrain = lgb.Dataset(X_train, label=y_train)
dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

# Hyperparameter tuning with hyperopt and lightgbm
def objective(params):
    params['objective'] = 'binary'
    params['metric'] = 'auc'
    params['verbosity'] = -1

    # Remove n_estimators from params, because we pass it as a function argument
    n_estimators = params['n_estimators']
    del params['n_estimators']
    params['num_leaves'] = int(params['num_leaves'])
    
    model = lgb.train(params, train_set=dtrain, valid_sets=[dval], num_boost_round=n_estimators,callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)])
    preds = model.predict(X_val, num_iteration=model.best_iteration)
    preds_binary = np.round(preds)  # Convert probabilities to binary
    f1 = f1_score(y_val, preds_binary)
    return {'loss': -f1, 'status': STATUS_OK}

space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'lambda_l2': hp.quniform('lambda_l2', 0, 10, 1),
    'num_leaves': hp.quniform('num_leaves', 30, 80, 5),
    'n_estimators': hp.choice('n_estimators', range(50, 500)),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
}

trials = Trials()
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=200, trials=trials, rstate=np.random.default_rng(42))

print("Best Parameters:", best_params)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:52<00:00,  3.81trial/s, best loss: -0.6863805339641771]
Best Parameters: {'colsample_bytree': np.float64(0.7017847706622431), 'lambda_l2': np.float64(3.0), 'learning_rate': np.float64(0.07576253355055539), 'n_estimators': np.int64(30), 'num_leaves': np.float64(35.0), 'subsample': np.float64(0.7514518773307409)}


In [21]:
best_params

{'colsample_bytree': np.float64(0.7017847706622431),
 'lambda_l2': np.float64(3.0),
 'learning_rate': np.float64(0.07576253355055539),
 'n_estimators': np.int64(30),
 'num_leaves': np.float64(35.0),
 'subsample': np.float64(0.7514518773307409)}

In [23]:
best_params['objective'] = 'binary'
best_params['metric'] = 'auc'
best_params['verbosity'] = -1
best_params["num_leaves"] = int(best_params['num_leaves'])

# Combine train and validation sets for final training
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])

dtrain_full = lgb.Dataset(X_train_full, label=y_train_full)

# Train final model on combined train and validation sets
final_model = lgb.train(best_params, train_set=dtrain_full, num_boost_round=best_params['n_estimators'])



In [25]:
import joblib
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report

# Save the final model and the encoder object
joblib.dump(final_model, 'lgb_model.joblib')
joblib.dump(encoder, 'encoder.joblib')

# Save train_df and test_df using joblib
joblib.dump(train_df, 'train_df.joblib')
joblib.dump(test_df, 'test_df.joblib')

# Load the final model and the encoder object
loaded_model = joblib.load('lgb_model.joblib')
loaded_encoder = joblib.load('encoder.joblib')

# Load the saved dataframes
saved_train_df = joblib.load('train_df.joblib')
saved_test_df = joblib.load('test_df.joblib')

# Inference on test data
X_test_encoded = loaded_encoder.transform(saved_test_df)
predictions = loaded_model.predict(X_test_encoded)
binary_predictions = np.round(predictions)

# Evaluate the model on the test set
test_f1_score = f1_score(y_test, binary_predictions)
test_precision = precision_score(y_test, binary_predictions)
test_recall = recall_score(y_test, binary_predictions)
test_accuracy = accuracy_score(y_test, binary_predictions)

# Evaluate the model on the training set
train_predictions = loaded_model.predict(X_train_full)
train_binary_predictions = np.round(train_predictions)

train_f1_score = f1_score(y_train_full, train_binary_predictions)
train_precision = precision_score(y_train_full, train_binary_predictions)
train_recall = recall_score(y_train_full, train_binary_predictions)
train_accuracy = accuracy_score(y_train_full, train_binary_predictions)

print("Final Training Scores:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1_score:.4f}")

print("\nTest Scores:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1_score:.4f}")

# Detailed classification report
print("\nClassification Report on Test Data:")
print(classification_report(y_test, binary_predictions))

Final Training Scores:
Accuracy: 0.7057
Precision: 0.7418
Recall: 0.6199
F1 Score: 0.6754

Test Scores:
Accuracy: 0.6903
Precision: 0.7235
Recall: 0.6093
F1 Score: 0.6615

Classification Report on Test Data:
              precision    recall  f1-score   support

         0.0       0.67      0.77      0.71      3924
         1.0       0.72      0.61      0.66      3873

    accuracy                           0.69      7797
   macro avg       0.69      0.69      0.69      7797
weighted avg       0.69      0.69      0.69      7797



  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [12]:
# from category_encoders import TargetEncoder, CountEncoder
# from typing import List, Tuple

# # Function to encode categorical features using Target Encoder
# def encode_target_features(train_df: pd.DataFrame, test_df: pd.DataFrame, target_feats: List[str], target: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
#     encoder = TargetEncoder(cols=target_feats)
#     train_encoded = encoder.fit_transform(train_df[target_feats], train_df[target])
#     test_encoded = encoder.transform(test_df[target_feats])
    
#     train_encoded.columns = [f"{col}_target" for col in target_feats]
#     test_encoded.columns = [f"{col}_target" for col in target_feats]
    
#     return train_encoded, test_encoded

# # Function to encode frequency features using Count Encoder
# def encode_frequency_features(train_df: pd.DataFrame, test_df: pd.DataFrame, freq_feats: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
#     encoder = CountEncoder(cols=freq_feats)
#     train_encoded = encoder.fit_transform(train_df[freq_feats])
#     test_encoded = encoder.transform(test_df[freq_feats])
    
#     train_encoded.columns = [f"{col}_freq" for col in freq_feats]
#     test_encoded.columns = [f"{col}_freq" for col in freq_feats]
    
#     return train_encoded, test_encoded

# # Function to select numerical (floating point) features
# def select_numerical_features(train_df: pd.DataFrame, test_df: pd.DataFrame, num_feats: List[str]):
#     return train_df[num_feats], test_df[num_feats]


In [13]:
# # Encode target and frequency features
# train_target_encoded, test_target_encoded = encode_target_features(train_df, test_df, target_feats, predictor)
# train_freq_encoded, test_freq_encoded = encode_frequency_features(train_df, test_df, freq_feats)
# train_num_feats, test_num_feats = select_numerical_features(train_df, test_df, num_feats)

# y_train, y_test = train_df[predictor], test_df[predictor]

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [34]:
# # Combine the encoded features with the numerical features
# train_df = pd.concat([train_num_feats, train_target_encoded, train_freq_encoded, y_train], axis=1).reset_index(drop=True)
# test_df = pd.concat([test_num_feats, test_target_encoded, test_freq_encoded, y_test], axis=1).reset_index(drop=True)


In [15]:
train_df

Unnamed: 0,apr,loanAmount,originallyScheduledPaymentAmount,leadCost,app_processing_hours,clearfraudscore,payFrequency_target,nPaidOff_target,state_target,fpStatus_target,payFrequency_freq,nPaidOff_freq,state_freq,fpStatus_freq,target
0,359.0,800.0,1539.40,25,0.726467,728.0,0.493952,0.545654,0.403541,0.419888,18188,24609,1073,26388,0.0
1,360.0,500.0,1045.21,6,25.905145,805.0,0.493952,0.545654,0.391882,0.419888,18188,24609,4139,26388,1.0
2,490.0,300.0,764.79,3,13.565917,446.0,0.493952,0.545654,0.566138,0.419888,18188,24609,1701,26388,1.0
3,360.0,700.0,1550.82,25,0.272742,781.0,0.528601,0.545654,0.391882,0.419888,8199,24609,4139,26388,0.0
4,590.0,500.0,1579.57,3,1.240307,643.0,0.439845,0.545654,0.523664,0.945854,2319,24609,1310,4414,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31179,360.0,500.0,1077.13,3,12.080581,591.0,0.493952,0.545654,0.391882,0.945854,18188,24609,4139,4414,1.0
31180,490.0,600.0,1710.65,0,48.634175,665.0,0.449343,0.545654,0.566138,0.419888,2132,24609,1701,26388,1.0
31181,680.0,500.0,1803.12,3,1.863360,526.0,0.528601,0.545654,0.570486,0.419888,8199,24609,2426,26388,0.0
31182,390.0,300.0,625.98,3,3.560413,,0.493952,0.545654,0.566138,0.419888,18188,24609,1701,26388,1.0


In [16]:
test_df

Unnamed: 0,apr,loanAmount,originallyScheduledPaymentAmount,leadCost,app_processing_hours,clearfraudscore,payFrequency_target,nPaidOff_target,state_target,fpStatus_target,payFrequency_freq,nPaidOff_freq,state_freq,fpStatus_freq,target
0,390.0,300.0,589.40,0,3.386637,,0.439845,0.349638,0.570486,0.576271,2319,4007,2426,177,0.0
1,645.0,1500.0,4606.13,25,0.199506,698.0,0.493952,0.545654,0.443205,0.419888,18188,24609,986,26388,1.0
2,645.0,650.0,2089.89,25,0.180332,818.0,0.493952,0.545654,0.443205,0.419888,18188,24609,986,26388,0.0
3,601.0,1500.0,4497.87,3,78.048435,701.0,0.493952,0.545654,0.461601,0.419888,18188,24609,1237,26388,0.0
4,680.0,500.0,1704.04,3,6.493419,515.0,0.493952,0.545654,0.570486,0.419888,18188,24609,2426,26388,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7792,199.0,3000.0,6683.94,0,26.600239,843.0,0.493952,0.545654,0.523664,0.419888,18188,24609,1310,26388,0.0
7793,590.0,1000.0,2778.44,0,0.070065,,0.449343,0.253762,0.533058,0.419888,2132,1462,1452,26388,1.0
7794,590.0,300.0,893.30,3,30.880908,637.0,0.493952,0.545654,0.559878,0.419888,18188,24609,5244,26388,1.0
7795,601.0,600.0,1657.85,6,10.985854,507.0,0.493952,0.545654,0.461601,0.419888,18188,24609,1237,26388,1.0


In [49]:
train_df, val_df = split_data(train_df)

In [18]:
train_df

Unnamed: 0,apr,loanAmount,originallyScheduledPaymentAmount,leadCost,app_processing_hours,clearfraudscore,payFrequency_target,nPaidOff_target,state_target,fpStatus_target,payFrequency_freq,nPaidOff_freq,state_freq,fpStatus_freq,target
964,625.00,500.0,1448.15,3,0.106504,849.0,0.493952,0.545654,0.559878,0.419888,18188,24609,5244,26388,1.0
13938,449.99,300.0,728.07,6,17.153786,,0.493952,0.545654,0.446644,0.945854,18188,24609,2071,4414,1.0
18253,390.00,300.0,625.98,3,0.850988,,0.493952,0.545654,0.570486,0.419888,18188,24609,2426,26388,1.0
26025,705.59,200.0,642.84,10,14.750792,,0.493952,0.545654,0.559878,0.419888,18188,24609,5244,26388,0.0
17335,590.00,300.0,753.41,10,7.774151,,0.439845,0.545654,0.570486,0.945854,2319,24609,2426,4414,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,585.00,437.0,1310.56,0,6.301507,626.0,0.493952,0.349638,0.602662,0.945854,18188,4007,1052,4414,1.0
5390,645.00,400.0,1344.01,3,0.270455,592.0,0.493952,0.545654,0.443205,0.419888,18188,24609,986,26388,0.0
860,650.00,400.0,1252.48,40,2.318055,836.0,0.528601,0.545654,0.468795,0.419888,8199,24609,1378,26388,0.0
15795,680.00,350.0,1201.91,6,22.476288,524.0,0.439845,0.545654,0.570486,0.945854,2319,24609,2426,4414,1.0


In [19]:
train_df.columns

Index(['apr', 'loanAmount', 'originallyScheduledPaymentAmount', 'leadCost',
       'app_processing_hours', 'clearfraudscore', 'payFrequency_target',
       'nPaidOff_target', 'state_target', 'fpStatus_target',
       'payFrequency_freq', 'nPaidOff_freq', 'state_freq', 'fpStatus_freq',
       'target'],
      dtype='object')

In [20]:
predictor

['target']

In [21]:
train_df[predictor]

Unnamed: 0,target
964,1.0
13938,1.0
18253,1.0
26025,0.0
17335,1.0
...,...
29802,1.0
5390,0.0
860,0.0
15795,1.0


In [23]:
# Separate features and target
X_train = train_df.drop(columns=predictor)
y_train = train_df[predictor]
X_val = val_df.drop(columns=predictor)
y_val = val_df[predictor]

# Separate features and target
X_train = test_df.drop(columns=predictor)
y_train = test_df[predictor]

In [24]:
X_train

Unnamed: 0,apr,loanAmount,originallyScheduledPaymentAmount,leadCost,app_processing_hours,clearfraudscore,payFrequency_target,nPaidOff_target,state_target,fpStatus_target,payFrequency_freq,nPaidOff_freq,state_freq,fpStatus_freq
964,625.00,500.0,1448.15,3,0.106504,849.0,0.493952,0.545654,0.559878,0.419888,18188,24609,5244,26388
13938,449.99,300.0,728.07,6,17.153786,,0.493952,0.545654,0.446644,0.945854,18188,24609,2071,4414
18253,390.00,300.0,625.98,3,0.850988,,0.493952,0.545654,0.570486,0.419888,18188,24609,2426,26388
26025,705.59,200.0,642.84,10,14.750792,,0.493952,0.545654,0.559878,0.419888,18188,24609,5244,26388
17335,590.00,300.0,753.41,10,7.774151,,0.439845,0.545654,0.570486,0.945854,2319,24609,2426,4414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29802,585.00,437.0,1310.56,0,6.301507,626.0,0.493952,0.349638,0.602662,0.945854,18188,4007,1052,4414
5390,645.00,400.0,1344.01,3,0.270455,592.0,0.493952,0.545654,0.443205,0.419888,18188,24609,986,26388
860,650.00,400.0,1252.48,40,2.318055,836.0,0.528601,0.545654,0.468795,0.419888,8199,24609,1378,26388
15795,680.00,350.0,1201.91,6,22.476288,524.0,0.439845,0.545654,0.570486,0.945854,2319,24609,2426,4414


In [25]:
y_train

Unnamed: 0,target
964,1.0
13938,1.0
18253,1.0
26025,0.0
17335,1.0
...,...
29802,1.0
5390,0.0
860,0.0
15795,1.0


In [51]:
train_df.head()

Unnamed: 0,app_processing_hours,apr,state,nPaidOff,target,fpStatus,leadCost,clearfraudscore,loanAmount,payFrequency,originallyScheduledPaymentAmount
3918,0.106504,625.0,OH,0.0,1.0,Checked,3,849.0,500.0,B,1448.15
2271,17.153786,449.99,WI,0.0,1.0,Rejected,6,,300.0,B,728.07
1902,0.850988,390.0,TX,0.0,1.0,Checked,3,,300.0,B,625.98
178,14.750792,705.59,OH,0.0,0.0,Checked,10,,200.0,B,642.84
1942,7.774151,590.0,TX,0.0,1.0,Rejected,10,,300.0,M,753.41


In [52]:
target_feats

['payFrequency', 'nPaidOff', 'state', 'fpStatus']

Index(['app_processing_hours', 'apr', 'state', 'nPaidOff', 'target',
       'fpStatus', 'leadCost', 'clearfraudscore', 'loanAmount', 'payFrequency',
       'originallyScheduledPaymentAmount'],
      dtype='object')
      payFrequency  nPaidOff state  fpStatus
3918             B       0.0    OH   Checked
2271             B       0.0    WI  Rejected
1902             B       0.0    TX   Checked
178              B       0.0    OH   Checked
1942             M       0.0    TX  Rejected
...            ...       ...   ...       ...
13507            B       1.0    TN  Rejected
27552            B       0.0    NJ   Checked
8821             W       0.0    IN   Checked
28301            M       0.0    TX  Rejected
17095            W       1.0    OH   Checked

[24947 rows x 4 columns]
       target
3918      1.0
2271      1.0
1902      1.0
178       0.0
1942      1.0
...       ...
13507     1.0
27552     0.0
8821      0.0
28301     1.0
17095     0.0

[24947 rows x 1 columns]


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [57]:
X_val

Unnamed: 0,app_processing_hours,apr,state,nPaidOff,fpStatus,leadCost,clearfraudscore,loanAmount,payFrequency,originallyScheduledPaymentAmount
23658,2.346357,645.000,NJ,0.0,Checked,3,625.0,500.0,S,1479.25
3866,2.928984,360.000,IL,0.0,Checked,0,840.0,900.0,B,1825.53
2951,0.068638,525.000,OH,0.0,Checked,3,693.0,350.0,B,927.67
5055,1.707431,449.990,WI,0.0,Checked,0,657.0,500.0,W,709.53
9972,4.860859,404.991,WI,1.0,Checked,0,459.0,400.0,W,565.89
...,...,...,...,...,...,...,...,...,...,...
15364,86.588943,404.100,WI,1.0,Checked,0,,900.0,B,1732.44
28620,0.425196,645.000,FL,0.0,Checked,25,560.0,400.0,B,1228.50
1038,473.514961,525.000,OH,0.0,Checked,3,,300.0,B,844.07
9145,1.809180,590.000,OH,0.0,Checked,10,838.0,600.0,I,1618.99


100%|██████████| 200/200 [00:36<00:00,  5.46trial/s, best loss: -0.6721380471380471]
Best Parameters: {'colsample_bytree': np.float64(0.6361219270472528), 'learning_rate': np.float64(0.19253804925604365), 'n_estimators': np.int64(174), 'subsample': np.float64(0.5240436338424228)}


{'colsample_bytree': np.float64(0.8258578389447909),
 'learning_rate': np.float64(0.14877007763863245),
 'n_estimators': np.int64(175),
 'subsample': np.float64(0.7012949446556633)}