In [8]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("../data/df_adjusted.csv")

In [3]:
df.head()

Unnamed: 0,Month,Occupation,Type_of_Loan,Credit_Mix,Credit_History_Age,Payment_of_Min_Amount,Payment_Behaviour,Credit_Score,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Utilization_Ratio,Total_EMI_per_month,Changed_Credit_Limit,Amount_invested_monthly,Monthly_Balance,Num_of_Delayed_Payment,Outstanding_Debt,Annual_Income,Num_of_Loan,Age
0,3,12,128,1.067013,265,0.0,3.0,2,1824.843333,3,...,26.82262,49.574949,11.27,80.4153,312.49408,7.0,809.98,19114.12,4.0,23.0
1,2,12,128,2.0,0,0.0,2.0,2,4194.17085,3,...,31.94496,49.574949,11.27,118.28022,284.62915,30.923342,809.98,19114.12,4.0,23.0
2,6,12,128,2.0,267,0.0,1.0,2,4194.17085,3,...,28.609352,49.574949,10.388834,81.699524,331.20987,7.0,809.98,19114.12,4.0,-500.0
3,0,12,128,2.0,268,0.0,3.278465,2,4194.17085,3,...,31.377862,49.574949,6.27,199.45807,223.45131,4.0,809.98,19114.12,4.0,23.0
4,7,12,128,2.0,269,0.0,4.0,2,1824.843333,3,...,24.797347,49.574949,11.27,41.420155,341.48923,30.923342,809.98,19114.12,4.0,23.0


In [4]:
df['Credit_Score'].value_counts()

1    53174
0    28998
2    17828
Name: Credit_Score, dtype: int64

# Data division

In [4]:
X = df.drop('Credit_Score', axis=1)
Y = df['Credit_Score']

In [6]:
# # Scaler data
# from sklearn.preprocessing import StandardScaler
# scaler= StandardScaler()
# scaler.fit(X)
# X = scaler.transform(X)

## Train e test

In [5]:
X = df.drop('Credit_Score', axis=1)
Y = df['Credit_Score']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Model

In [9]:
rf = RandomForestClassifier()

In [10]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [11]:
y_pred = rf.predict(X_test)

In [12]:
rf.score(X_train, y_train)

0.9999857142857143

In [13]:
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)))
print("Recall: ", recall_score(y_test, y_pred, average='weighted'))
print("F1", f1_score(y_test, y_pred, average='weighted'))

Accuracy:  0.7841666666666667
Precision:  0.7838332415708575
Recall:  0.7841666666666667
F1 0.7839427435311733


In [14]:
print(confusion_matrix(y_test, y_pred))

[[ 6969  1651   185]
 [ 1914 12785  1174]
 [   52  1499  3771]]


In [15]:
print(classification_report(y_test, y_pred, labels=np.unique(y_pred), zero_division=0))

              precision    recall  f1-score   support

           0       0.78      0.79      0.79      8805
           1       0.80      0.81      0.80     15873
           2       0.74      0.71      0.72      5322

    accuracy                           0.78     30000
   macro avg       0.77      0.77      0.77     30000
weighted avg       0.78      0.78      0.78     30000



# Tuning and Logging

In [18]:
import mlflow 
from sklearn.model_selection import GridSearchCV

In [21]:
rf = RandomForestClassifier()


params_grid = { 
    'n_estimators': [100, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,6,8],
    'criterion' :['gini', 'entropy']
}


grid_search = GridSearchCV(estimator=rf, 
                           param_grid=params_grid, 
                           n_jobs=-1, 
                           cv=5, 
                           scoring='f1_macro')

mlflow.set_experiment('rf-tuning')
mlflow.sklearn.autolog()

with mlflow.start_run() as run:
    grid_search.fit(X_train, y_train)

2022/08/05 14:13:38 INFO mlflow.sklearn.utils: Logging the 5 best runs, 31 runs will be omitted.


In [None]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=8, max_features='log2', n_estimators=300)