In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
from fastapi import FastAPI
from pydantic import BaseModel

In [2]:
# Load dataset
df = pd.read_csv("banking_risk_dataset.csv")

In [3]:
# Define features and target
features = ["Age", "Gender", "Region", "Income", "Existing_Loans", "Loan_Amount", "Credit_Card_Debt", "Savings", "Investments", "Credit_Score", "Employment_Status"]
target = "Risk_Score"

In [4]:
# Split dataset
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Preprocessing pipeline
numeric_features = ["Age", "Income", "Loan_Amount", "Credit_Card_Debt", "Savings", "Investments", "Credit_Score"]
categorical_features = ["Gender", "Region", "Existing_Loans", "Employment_Status"]

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)


In [6]:
# Base models
rf = RandomForestRegressor(n_estimators=100, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
xgb = XGBRegressor(n_estimators=100, random_state=42)

In [7]:
# Stacking ensemble
stacking_model = StackingRegressor(
    estimators=[("rf", rf), ("gb", gb), ("xgb", xgb)],
    final_estimator=RandomForestRegressor(n_estimators=50, random_state=42)
)

In [8]:
# Hyperparameter tuning for RandomForest
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20],
    "min_samples_split": [2, 5]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3, scoring='r2')
grid_search.fit(X_train.select_dtypes(include=[np.number]), y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [10, 20], 'min_samples_split': [2, 5],
                         'n_estimators': [100, 200]},
             scoring='r2')

In [9]:
# Best model
best_rf = grid_search.best_estimator_

In [10]:
# Final model pipeline
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", stacking_model)
])

In [11]:
# Train model
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age', 'Income',
                                                   'Loan_Amount',
                                                   'Credit_Card_Debt',
                                                   'Savings', 'Investments',
                                                   'Credit_Score']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Gender', 'Region',
                                                   'Existing_Loans',
                                                   'Employment_Status'])])),
                ('regressor',
                 StackingRegressor(estimators=[('rf',
                                                RandomForestRegr...
                         

In [12]:
# Evaluate model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Absolute Error: 2.54
R-squared: 0.99


In [13]:
# Save model
joblib.dump(model, "risk_score_model_v3.pkl")
print("Model saved as risk_score_model_v3.pkl")

Model saved as risk_score_model_v3.pkl


In [14]:
# Load model for API
model = joblib.load("risk_score_model_v3.pkl")

In [15]:
# FastAPI setup
app = FastAPI()

In [16]:
# Define request schema
class CustomerData(BaseModel):
    Age: int
    Gender: str
    Region: str
    Income: float
    Existing_Loans: str
    Loan_Amount: float
    Credit_Card_Debt: float
    Savings: float
    Investments: float
    Credit_Score: float
    Employment_Status: str

In [17]:
# API endpoint to predict risk score
@app.post("/predict")
def predict_risk(data: CustomerData):
    input_data = pd.DataFrame([data.dict()])
    risk_score = model.predict(input_data)[0]
    return {"Risk_Score": round(risk_score, 2)}

In [18]:
# Run API using: uvicorn filename:app --reload