In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix)
from sklearn.impute import SimpleImputer

# Load dataset
data = pd.read_csv('/content/Delinquency_prediction_dataset (1).csv')

# Select relevant features from your dataset
features = [
    'Age',
    'Income',
    'Credit_Score',
    'Credit_Utilization',
    'Missed_Payments',
    'Loan_Balance',
    'Debt_to_Income_Ratio',
    'Account_Tenure'
]

X = data[features]
y = data['Delinquent_Account']  # Target variable

# Split data into train/test sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: Impute missing values and scale features
imputer = SimpleImputer(strategy='median')  # Replace missing values with median
scaler = StandardScaler()  # Standardize features

X_train_imputed = imputer.fit_transform(X_train)
X_train_scaled = scaler.fit_transform(X_train_imputed)

X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

# Train logistic regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predict probabilities and classes
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]  # Probability of delinquency
y_pred = model.predict(X_test_scaled)  # Binary prediction (0/1)

# Evaluate model performance
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_pred_proba):.4f}")

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature Importance (coefficients)
print("\nFeature Importance (Logistic Regression Coefficients):")
for feature, coef in zip(features, model.coef_[0]):
    print(f"{feature}: {coef:.4f}")

# Fairness Check (example: compare performance across age groups)
data_test = data.loc[y_test.index]  # Get test set data for fairness checks

# Check for Age bias (split into two groups: <=35 and >35)
age_groups = np.where(data_test['Age'] <= 35, '<=35', '>35')
for group in np.unique(age_groups):
    group_mask = (age_groups == group)
    print(f"\nPerformance for Age {group}:")
    print(f"  Precision: {precision_score(y_test[group_mask], y_pred[group_mask]):.4f}")
    print(f"  Recall: {recall_score(y_test[group_mask], y_pred[group_mask]):.4f}")

Model Evaluation Metrics:
Accuracy: 0.8600
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
AUC-ROC: 0.4792

Confusion Matrix:
[[86  0]
 [14  0]]

Feature Importance (Logistic Regression Coefficients):
Age: 0.0568
Income: 0.1094
Credit_Score: 0.1511
Credit_Utilization: 0.1508
Missed_Payments: -0.0711
Loan_Balance: -0.0303
Debt_to_Income_Ratio: 0.1638
Account_Tenure: -0.0512

Performance for Age <=35:
  Precision: 0.0000
  Recall: 0.0000

Performance for Age >35:
  Precision: 0.0000
  Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 1.Add Other Fairness Checks:

In [3]:
# Example: Location-based fairness
for location in data_test['Location'].unique():
    loc_mask = (data_test['Location'] == location)
    print(f"\nPerformance for Location = {location}:")
    print(f"  Precision: {precision_score(y_test[loc_mask], y_pred[loc_mask]):.4f}")
    print(f"  Recall: {recall_score(y_test[loc_mask], y_pred[loc_mask]):.4f}")


Performance for Location = Phoenix:
  Precision: 0.0000
  Recall: 0.0000

Performance for Location = Houston:
  Precision: 0.0000
  Recall: 0.0000

Performance for Location = New York:
  Precision: 0.0000
  Recall: 0.0000

Performance for Location = Chicago:
  Precision: 0.0000
  Recall: 0.0000

Performance for Location = Los Angeles:
  Precision: 0.0000
  Recall: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


2.Handle Categorical Features (e.g., 'Employment_Status', 'Credit_Card_Type')

In [5]:
from sklearn.preprocessing import OneHotEncoder

# Example for 'Employment_Status'
encoder = OneHotEncoder(drop='first', sparse_output=False)
employment_encoded = encoder.fit_transform(data[['Employment_Status']])
employment_df = pd.DataFrame(employment_encoded, columns=encoder.get_feature_names_out(['Employment_Status']))
X = pd.concat([X, employment_df], axis=1)