# Import Libraries & Generate Synthetic Datasets


In [None]:
import pandas as pd
import numpy as np

n = 250  # default number of rows in each generated dataset


def generate_credit_data(seed, n_samples=None):
    """Build a reproducible synthetic credit-risk dataset.

    A private ``np.random.RandomState(seed)`` is used instead of calling
    ``np.random.seed(seed)``. It produces the same values as the global-seed
    approach for a given seed, but leaves NumPy's process-wide random state
    untouched, so calling this function no longer silently reseeds any other
    code that relies on ``np.random``.

    Parameters
    ----------
    seed : int
        Seed for the private random generator; the same seed always yields
        the same frame.
    n_samples : int, optional
        Number of rows to generate. Defaults to the module-level ``n``.

    Returns
    -------
    pandas.DataFrame
        Columns ``monthly_income``, ``credit_score``, ``existing_emis``,
        ``employment_type``, ``past_defaults`` and ``risk_label``.
        ``risk_label`` is 1 when the credit score is below 600, or there is a
        past default, or there are more than 3 existing EMIs; otherwise 0.
    """
    if n_samples is None:
        n_samples = n

    rng = np.random.RandomState(seed)

    # The draw order below determines the generated values; keep it stable.
    income = rng.randint(20000, 120000, n_samples)
    credit = rng.randint(350, 900, n_samples)
    emis = rng.randint(0, 6, n_samples)
    defaults = rng.choice([0, 1], n_samples, p=[0.8, 0.2])
    employment = rng.choice(["salaried", "self-employed"], n_samples)

    # The label is a deterministic rule over three of the generated features.
    risk = ((credit < 600) | (defaults == 1) | (emis > 3)).astype(int)

    return pd.DataFrame({
        "monthly_income": income,
        "credit_score": credit,
        "existing_emis": emis,
        "employment_type": employment,
        "past_defaults": defaults,
        "risk_label": risk
    })

# One CSV per group; each group uses its own seed so the files differ.
group_files = {
    1: "credit_risk_groupA.csv",
    2: "credit_risk_groupB.csv",
    3: "credit_risk_groupC.csv",
    4: "credit_risk_groupD.csv",
}

for group_seed, csv_name in group_files.items():
    generate_credit_data(group_seed).to_csv(csv_name, index=False)

print("✅ All 4 datasets generated")


✅ All 4 datasets generated


# Load Dataset

In [None]:
import pandas as pd

# Load one of the generated group CSVs and preview its first rows.
# NOTE(review): the original note here read "Organizer trial – use Group A",
# but the file actually read below is the Group B CSV — confirm which group
# is intended.
df = pd.read_csv("credit_risk_groupB.csv")
df.head()


Unnamed: 0,monthly_income,credit_score,existing_emis,employment_type,past_defaults,risk_label
0,109256,583,5,salaried,1,1
1,92173,413,2,salaried,0,1
2,115816,454,2,self-employed,0,1
3,64566,866,3,salaried,0,0
4,51019,857,2,salaried,1,1


# Preprocessing

In [None]:
# Encode employment_type as 0/1 so it can be fed to the models.
# The guard keeps this cell safe to re-run: mapping an already-encoded column
# through the string dictionary would match no key and turn every value into
# NaN.
employment_codes = {
    "salaried": 0,
    "self-employed": 1
}
if not pd.api.types.is_numeric_dtype(df["employment_type"]):
    df["employment_type"] = df["employment_type"].map(employment_codes)

# Features are every column except the target; the target is risk_label.
X = df.drop("risk_label", axis=1)
y = df["risk_label"]


# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# Scaling & Model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
# Note: X_train and X_test are overwritten with their scaled versions, so
# re-running this cell without re-running the split scales them a second time.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic regression with default settings, fitted on the scaled features.
model = LogisticRegression()
model.fit(X_train, y_train)


# Evaluation



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out split, then report accuracy, the confusion matrix
# and the per-class report.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nReport:\n", test_report)


Accuracy: 0.94

Confusion Matrix:
 [[12  3]
 [ 0 35]]

Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89        15
           1       0.92      1.00      0.96        35

    accuracy                           0.94        50
   macro avg       0.96      0.90      0.92        50
weighted avg       0.94      0.94      0.94        50



# Feature Impact

In [None]:
# Pair each feature name with its fitted coefficient.
coefficient_table = pd.DataFrame({
    "feature": X.columns,
    "impact": model.coef_[0]
})

# Sorted by the signed coefficient, so strongly negative features end up at
# the bottom of the table rather than next to the strongly positive ones.
importance = coefficient_table.sort_values(by="impact", ascending=False)

importance


Unnamed: 0,feature,impact
2,existing_emis,1.744914
4,past_defaults,1.59439
3,employment_type,0.019411
0,monthly_income,0.01301
1,credit_score,-2.627276



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



### Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build the Random Forest with a fixed seed, then fit it on the same training
# data the previous model used.
forest_options = {"random_state": 42}
rf_model = RandomForestClassifier(**forest_options)
rf_model.fit(X_train, y_train)

### Random Forest Evaluation

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict on the held-out split with the Random Forest.
y_pred_rf = rf_model.predict(X_test)

# Compute each metric once, then print them in the same order as before.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest Accuracy:", rf_accuracy)
print("\nRandom Forest Confusion Matrix:\n", rf_confusion)
print("\nRandom Forest Report:\n", rf_report)

### Random Forest Feature Importance

In [None]:
# Pair each feature name with the forest's importance score, largest first.
rf_scores = pd.DataFrame({
    "feature": X.columns,
    "impact": rf_model.feature_importances_
})
rf_importance = rf_scores.sort_values(by="impact", ascending=False)

print("Random Forest Feature Importances:")
print(rf_importance)

In [None]:
import kagglehub
# Authenticate with Kaggle. You may need to re-run this cell after logging in.
kagglehub.login()
# Download the competition files; `path` holds whatever the call returns
# (presumably the local download location — confirm against kagglehub docs).
# NOTE(review): `path` is not used anywhere in the visible notebook, and this
# competition is not referenced by the credit-risk cells above — confirm this
# cell belongs here.
path = kagglehub.competition_download('jane-street-market-prediction')