In [4]:
#Goal: predict if a company goes bankrupt or not
%pip install imbalanced-learn
%pip install xgboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
# Load the dataset from a path relative to the notebook's working directory.
# The 'Bankrupt?' column of this frame is used as the prediction target below.
df = pd.read_csv('datasets/data.csv')

In [6]:
# Bankruptcy classifier: SMOTE -> RFE -> Random Forest, tuned for recall with
# a randomized hyper-parameter search, then evaluated on a held-out test set.

SEED = 42  # one seed for every stochastic component so the run is repeatable

# --- Prepare data -----------------------------------------------------------
assert 'Bankrupt?' in df.columns, "expected target column 'Bankrupt?' in df"
y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

# Hold out 20% for the final evaluation. Stratifying on y keeps the share of
# positive cases the same in the train and test sets, which matters because
# the classes are imbalanced (hence the SMOTE step below).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SEED, stratify=y
)

# --- Search space -----------------------------------------------------------
# Keys are '<pipeline step>__<parameter>'. The 'rf__*' entries tune only the
# final classifier; the forest used inside RFE keeps its own settings.
param_grid = {
    'rfe__n_features_to_select': [5, 10, 15, 20],  # Number of features to select
    'rf__n_estimators': [100, 300, 500],  # Number of trees
    'rf__min_samples_split': [2, 5, 10],  # Minimum samples required to split a node
    'rf__min_samples_leaf': [1, 2, 4],    # Minimum samples required at leaf node
    'rf__max_depth': [None, 10, 30],      # Maximum depth of trees
    # Class weights. scoring='recall' measures recall on label 1, so the
    # explicit weightings favour label 1 rather than label 0.
    # NOTE(review): assumes 1 == bankrupt in 'Bankrupt?' -- confirm.
    'rf__class_weight': ['balanced', {0: 1, 1: 10}, {0: 3, 1: 5}],
}

# --- Pipeline ---------------------------------------------------------------
# Separate estimator instances for feature ranking and for the final model so
# it is explicit that the two are configured independently.
rfe_ranker = RandomForestClassifier(random_state=SEED)
base_rf = RandomForestClassifier(random_state=SEED)

# imblearn's Pipeline applies SMOTE only while fitting, so within each CV fold
# the synthetic samples are generated from that fold's training part only.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=SEED)),
    # step=0.1 drops about a tenth of the features per elimination round.
    # The default (step=1) refits a whole forest for every single feature
    # removed, which made the 90-fit search too slow to finish.
    ('rfe', RFE(estimator=rfe_ranker, step=0.1)),
    ('rf', base_rf),  # Random Forest as the final model
])

# --- Randomized search ------------------------------------------------------
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=30,  # Number of random combinations to test
    cv=3,  # Number of cross-validation splits
    n_jobs=-1,  # Use all available CPU cores
    scoring='recall',  # Optimize for recall
    verbose=1,
    random_state=SEED,
)

# Fit the model using RandomizedSearchCV
random_search.fit(x_train, y_train)

# Print the best parameters found by RandomizedSearchCV
print(f'Best parameters: {random_search.best_params_}')

# Best pipeline, refitted on the full training split by the search
best_model = random_search.best_estimator_

# --- Evaluation on the held-out test set ------------------------------------
y_pred = best_model.predict(x_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 30 candidates, totalling 90 fits


KeyboardInterrupt: 