In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, confusion_matrix, classification_report
import os

In [2]:
# Directory and file setup
data_dir = '/Users/dr/Documents/GitHub/MBS_RiskManagement/'
extracted_data_dir = os.path.join(data_dir, 'extracted_data')
input_file = os.path.join(extracted_data_dir, 'regression_data.csv')

In [3]:
# Load the data
df = pd.read_csv(input_file)

In [6]:
df.head()

Unnamed: 0,Loan Sequence Number,Credit Score,Occupancy Status,Original Combined Loan-to-Value (CLTV),Original Debt-to-Income (DTI) Ratio,Original UPB,Original Loan-to-Value (LTV),Original Interest Rate,Property State,Original Loan Term,Number of Borrowers,Current Actual UPB,Current Loan Delinquency Status,Loan Age,Remaining Months to Legal Maturity,Current Interest Rate,Current Deferred UPB,Estimated Loan-to-Value (ELTV),Default
0,,629.0,P,77.0,45.0,324000.0,71.0,3.875,KY,180.0,2.0,0.0,0.0,74.0,106.0,3.875,0.0,50.0,0
1,F14Q10000001,770.0,P,89.0,30.0,65000.0,89.0,3.375,NY,180.0,2.0,0.0,0.0,40.0,140.0,3.375,0.0,999.0,0
2,F14Q10000002,674.0,P,89.0,999.0,182000.0,76.0,3.375,MI,180.0,1.0,0.0,0.0,75.0,105.0,3.375,0.0,999.0,0
3,F14Q10000003,717.0,I,77.0,41.0,107000.0,77.0,5.25,RI,360.0,2.0,84852.01,0.0,132.0,228.0,5.25,0.0,21.0,0
4,F14Q10000004,813.0,P,95.0,32.0,165000.0,95.0,4.125,IA,360.0,1.0,0.0,3.0,47.0,313.0,4.125,0.0,999.0,1


In [7]:
# Define features and target
target = 'Default'
features = [col for col in df.columns if col not in [target, 'Loan Sequence Number']]
X = df[features]
y = df[target]

In [9]:
# Initialize lists to store chunks
chunks = []

# Set chunksize (adjust based on memory availability, e.g., 100,000)
chunksize = 100000

# First pass: Compute mean and mode for imputation
first_chunk = True
numeric_means = {}
categorical_modes = {}

for chunk in pd.read_csv(input_file, usecols=features + [target, 'Loan Sequence Number'], chunksize=chunksize):
    if first_chunk:
        # Compute means for numeric columns
        numeric_cols = chunk.select_dtypes(include=['int64', 'float64']).columns
        numeric_means = chunk[numeric_cols].mean().to_dict()
        
        # Compute modes for categorical columns
        categorical_cols = chunk.select_dtypes(include=['object']).columns
        categorical_modes = {col: chunk[col].mode()[0] for col in categorical_cols}
        first_chunk = False
    
    # Impute missing values in the chunk
    for col in numeric_cols:
        chunk[col] = chunk[col].fillna(numeric_means[col])
    for col in categorical_cols:
        chunk[col] = chunk[col].fillna(categorical_modes[col])
    
    chunks.append(chunk)

# Concatenate all chunks
df = pd.concat(chunks, ignore_index=True)

In [10]:
# Define features and target after imputation
X = df[features]
y = df[target]

In [12]:
# Encode categorical variables with .loc to avoid SettingWithCopyWarning
categorical_cols = X.select_dtypes(include=['object']).columns
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X.loc[:, col] = le.fit_transform(X[col])  # Use .loc for assignment
    encoders[col] = le

In [14]:
# Scale numeric features with .loc to avoid SettingWithCopyWarning
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
X.loc[:, numeric_cols] = scaler.fit_transform(X[numeric_cols])  # Use .loc for assignment

In [15]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
# Fit logistic regression with class weighting for imbalance
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

In [17]:
# Predict and evaluate
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

In [18]:
# Metrics
roc_auc = roc_auc_score(y_test, y_pred_prob)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

ROC-AUC Score: 1.0000
Precision: 1.0000
Recall: 1.0000
Confusion Matrix:
[[4022549       0]
 [      1   42075]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   4022549
           1       1.00      1.00      1.00     42076

    accuracy                           1.00   4064625
   macro avg       1.00      1.00      1.00   4064625
weighted avg       1.00      1.00      1.00   4064625

