In [3]:
# Taiwan Bankruptcy Dataset - Exploration Notebook

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
import joblib
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-whitegrid')
sns.set_palette("Set2")

# 1. Download and Load the Dataset
# If you're using Kaggle API
# !kaggle datasets download -d fedesoriano/company-bankruptcy-prediction
# If you're using KaggleHub (as in your screenshot)
# import kagglehub
# path = kagglehub.dataset_download("fedesoriano/company-bankruptcy-prediction")
# print("Path to dataset files:", path)

# Load the dataset
# Replace 'path/to/data.csv' with your actual file path
df = pd.read_csv('data/bankruptcy_data.csv')

# 2. Basic Dataset Exploration
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# Check data types and missing values
print("\nData Information:")
print(df.info())

# Summary statistics
print("\nSummary Statistics:")
print(df.describe())

# Check class distribution
print("\nClass Distribution:")
print(df['Bankrupt?'].value_counts())
print(df['Bankrupt?'].value_counts(normalize=True).map(lambda x: f"{x:.2%}"))

# 3. Visualize Class Imbalance
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='Bankrupt?', data=df)
plt.title('Distribution of Bankruptcy Cases', fontsize=16)
plt.xlabel('Bankrupt (1) vs Non-Bankrupt (0)', fontsize=14)
plt.ylabel('Count', fontsize=14)

# Add count labels
for p in ax.patches:
    ax.annotate(f'{p.get_height()}', 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha = 'center', va = 'bottom', fontsize=12)

plt.tight_layout()
plt.show()

# 4. Check for Missing Values
missing_values = df.isnull().sum()
print("\nMissing Values:")
print(missing_values[missing_values > 0] if any(missing_values > 0) else "No missing values found.")

# 5. Feature Analysis
# Calculate correlation with target variable
correlation_with_target = df.corrwith(df['Bankrupt?']).sort_values(ascending=False)

# Get top 10 features (excluding the target itself)
top_10_positive = correlation_with_target[1:11]
top_10_negative = correlation_with_target[-10:].iloc[::-1]

# Print top correlated features
print("\nTop 10 Positively Correlated Features:")
print(top_10_positive)
print("\nTop 10 Negatively Correlated Features:")
print(top_10_negative)

# Visualize top correlations
plt.figure(figsize=(12, 10))
plt.subplot(2, 1, 1)
top_10_positive.plot(kind='bar')
plt.title('Top 10 Positively Correlated Features with Bankruptcy', fontsize=16)
plt.ylabel('Correlation Coefficient', fontsize=14)
plt.xticks(rotation=90)

plt.subplot(2, 1, 2)
top_10_negative.plot(kind='bar')
plt.title('Top 10 Negatively Correlated Features with Bankruptcy', fontsize=16)
plt.ylabel('Correlation Coefficient', fontsize=14)
plt.xticks(rotation=90)

plt.tight_layout()
plt.show()

# 6. Distribution of Top Features by Bankruptcy Status
# Select top features (both positive and negative correlations)
top_features = list(top_10_positive.index) + list(top_10_negative.index)
top_features = top_features[:10]  # Select first 10 for visualization

# Create box plots for each feature by bankruptcy status
plt.figure(figsize=(15, 20))
for i, feature in enumerate(top_features, 1):
    plt.subplot(5, 2, i)
    sns.boxplot(x='Bankrupt?', y=feature, data=df)
    plt.title(f'Distribution of {feature}', fontsize=12)
    plt.xlabel('Bankruptcy Status (1=Bankrupt, 0=Non-Bankrupt)', fontsize=10)
    plt.ylabel(feature, fontsize=10)

plt.tight_layout()
plt.show()

# 7. Correlation Matrix Heatmap
# Select top 15 features correlated with bankruptcy
correlation_abs = correlation_with_target.abs().sort_values(ascending=False)
top_features = list(correlation_abs[1:16].index)
top_features.append('Bankrupt?')  # Add the target variable

# Create correlation matrix for top features
correlation_matrix = df[top_features].corr()

# Create a heatmap
plt.figure(figsize=(14, 12))
mask = np.triu(correlation_matrix)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, mask=mask)
plt.title('Correlation Matrix of Top 15 Features', fontsize=16)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# 8. Feature Distributions and Outliers
plt.figure(figsize=(20, 15))
for i, feature in enumerate(top_features[:9], 1):  # First 9 features
    plt.subplot(3, 3, i)
    sns.histplot(data=df, x=feature, hue='Bankrupt?', kde=True, bins=30)
    plt.title(f'Distribution of {feature}', fontsize=12)
plt.tight_layout()
plt.show()

# Check for outliers in top features
plt.figure(figsize=(15, 10))
sns.boxplot(data=df[top_features])
plt.title('Boxplot of Top Features', fontsize=16)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# 9. Data Preparation for Modeling
# Split data into features and target
X = df.drop('Bankrupt?', axis=1)
y = df['Bankrupt?']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)
print("Class distribution in training set:")
print(y_train.value_counts(normalize=True).map(lambda x: f"{x:.2%}"))

# 10. Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 11. Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Resampled training set shape:", X_train_resampled.shape, y_train_resampled.shape)
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts(normalize=True).map(lambda x: f"{x:.2%}"))

# 12. Model Training and Evaluation
# This section provides a basic framework for model training

# Function to evaluate model performance
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    
    print(f"--- {model_name} Performance ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC: {auc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix - {model_name}', fontsize=14)
    plt.ylabel('True Label', fontsize=12)
    plt.xlabel('Predicted Label', fontsize=12)
    plt.tight_layout()
    plt.show()
    
    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'ROC Curve - {model_name}', fontsize=14)
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    return model, accuracy, auc

# Train Logistic Regression
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
lr_model, lr_acc, lr_auc = evaluate_model(lr_model, X_train_resampled, y_train_resampled, 
                                        X_test_scaled, y_test, "Logistic Regression")

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_model, rf_acc, rf_auc = evaluate_model(rf_model, X_train_resampled, y_train_resampled, 
                                        X_test_scaled, y_test, "Random Forest")

# Train XGBoost
xgb_model = xgb.XGBClassifier(
    scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
    learning_rate=0.1, 
    n_estimators=100, 
    max_depth=4,
    random_state=42
)
xgb_model, xgb_acc, xgb_auc = evaluate_model(xgb_model, X_train_resampled, y_train_resampled, 
                                          X_test_scaled, y_test, "XGBoost")

# 13. Compare Model Performance
models = ['Logistic Regression', 'Random Forest', 'XGBoost']
accuracies = [lr_acc, rf_acc, xgb_acc]
aucs = [lr_auc, rf_auc, xgb_auc]

plt.figure(figsize=(12, 6))
x = np.arange(len(models))
width = 0.35

plt.bar(x - width/2, accuracies, width, label='Accuracy')
plt.bar(x + width/2, aucs, width, label='AUC')

plt.ylabel('Score', fontsize=14)
plt.title('Model Performance Comparison', fontsize=16)
plt.xticks(x, models, fontsize=12)
plt.ylim(0, 1.0)
plt.legend()
plt.tight_layout()
plt.show()

# 14. Feature Importance Analysis
# For Random Forest
plt.figure(figsize=(12, 8))
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

sns.barplot(x='importance', y='feature', data=feature_importance.head(15))
plt.title('Random Forest Feature Importance', fontsize=16)
plt.tight_layout()
plt.show()

# For XGBoost
plt.figure(figsize=(12, 8))
xgb_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

sns.barplot(x='importance', y='feature', data=xgb_importance.head(15))
plt.title('XGBoost Feature Importance', fontsize=16)
plt.tight_layout()
plt.show()

# 15. Save the Best Model (assuming XGBoost performed best)
best_model = xgb_model

# Save model and scaler
joblib.dump(scaler, 'models/scaler.pkl')

# Save the feature names for later use
import json
feature_names = X.columns.tolist()
with open('models/feature_names.json', 'w') as f:
    json.dump(feature_names, f)

print("Model and preprocessing components saved successfully.")



OSError: 'seaborn-whitegrid' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)