In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Cleaning the data

In [None]:
# Load the credit-scoring CSV and tidy it in one chained pass:
#   1. rename the target column 'SeriousDlqin2yrs' to 'label'
#   2. discard every row that contains at least one missing value
#   3. drop the 'Unnamed: 0' column (presumably a saved row index -- verify)
df = (
    pd.read_csv('cs-training.csv')
    .rename(columns={'SeriousDlqin2yrs': 'label'})
    .dropna()
    .drop(columns=['Unnamed: 0'])
)

# Sanity check: number of rows that survived cleaning, plus the first rows.
print(len(df))
print(df.head(5))

### Random forest

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer, confusion_matrix

# --- Features / target -------------------------------------------------------
# Every column except `label` is used as a feature.
features_count = len(df.columns) - 1
X = df.drop(columns=['label']).to_numpy()
y = df['label']

# Single seed shared by both splits and both forests so a fresh
# "Restart & Run All" reproduces the same numbers.
RANDOM_STATE = 1

# --- 70 / 15 / 15 train / validation / test split ----------------------------
# Both splits are stratified on the label so that each subset keeps the same
# class proportions as the full data. The class weighting and F1 scoring used
# below indicate the positive class is the minority; without stratification
# the validation/test positive rate can drift from the training rate.
X_train, X_valtest, y_train, y_valtest = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)
X_val, X_test, y_val, y_test = train_test_split(
    X_valtest, y_valtest, test_size=0.50, stratify=y_valtest, random_state=RANDOM_STATE
)

# --- Hyper-parameter search ---------------------------------------------------
# Parameters listed in `param_grid` override the matching values of `base_rf`
# for each candidate; everything else (n_estimators, max_features, seed) is
# taken from `base_rf` unchanged.
base_rf = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=300,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=2,
    random_state=RANDOM_STATE,
)
# 6 class weightings x 3 depths x 3 leaf sizes = 54 candidates, each fitted on
# 3 CV folds (162 fits) before the best candidate is refitted on X_train.
param_grid = {
    'class_weight': ['balanced', {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 5}, {0: 1, 1: 8}, {0: 1, 1: 10}],
    'max_depth': [10, 20, None],
    'min_samples_leaf': [1, 2, 4],
}
# Candidates are ranked by F1 of the positive class (label == 1).
f1_scorer = make_scorer(f1_score)
grid = GridSearchCV(
    estimator=base_rf,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("\nBest parameters found:")
print(grid.best_params_)
print(f"Best mean F1 score (CV): {grid.best_score_:.4f}")

# --- Validation-set check of the tuned model -----------------------------------
# `best_estimator_` is the winning configuration refitted on all of X_train.
best_rf = grid.best_estimator_
y_val_pred = best_rf.predict(X_val)

print(f"\nAccuracy score for validation set: {accuracy_score(y_val, y_val_pred):.2f}")
print(f"Precision score for validation set: {precision_score(y_val, y_val_pred):.2f}")
print(f"Recall score for validation set: {recall_score(y_val, y_val_pred):.2f}")
print(f"F1 score for validation set: {f1_score(y_val, y_val_pred):.2f}")
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# --- Final model -----------------------------------------------------------------
# Larger forest (800 trees) trained on train + validation with the tuned
# hyper-parameters. Unpacking `best_params_` keeps this constructor in sync
# with whatever keys `param_grid` searches over, instead of copying each key
# by hand.
final_rf = RandomForestClassifier(
    n_estimators=800,
    max_features='sqrt',
    random_state=RANDOM_STATE,
    **grid.best_params_,
)
X_trainval = np.concatenate((X_train, X_val))
y_ftrainval = np.concatenate((y_train, y_val))
final_rf.fit(X_trainval, y_ftrainval)

# Predictions on the untouched test split; scored in the next cell.
y_test_pred = final_rf.predict(X_test)


In [None]:
# Score the test-set predictions: compute each metric once, then report it
# rounded to two decimals, followed by the raw confusion matrix.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)

print(f"Accuracy score for test set: {test_accuracy:.2f}")
print(f"Precision score for test set: {test_precision:.2f}")
print(f"Recall score for test set: {test_recall:.2f}")
print(f"F1 score for test set: {test_f1:.2f}")
print("Confusion Matrix:\n", test_confusion)

### Visualize the data

### Logistic regression