In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ----- Read the raw CSV into a DataFrame -----
df = pd.read_csv('bankruptcy_data.csv')

# Target label, and the feature matrix with the target and the two flag
# columns removed. The flag names are written with a leading space here;
# presumably that matches the CSV headers exactly -- confirm against the file.
Y = df['Bankrupt?']
X = df.drop(columns=['Bankrupt?', ' Net Income Flag', ' Liability-Assets Flag'])

# ----- Numeric columns of the full frame (nothing is dropped from df itself) -----
numeric_df = df.select_dtypes(include=[np.number])

# ========================================
# 🔹 1. Filtered Correlation Report
# ========================================
def find_high_correlations(data, threshold=0.8):
    """Return the column pairs whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to correlate; passed unchanged to ``DataFrame.corr()``.
    threshold : float, default 0.8
        Strict lower bound on the absolute correlation of a reported pair.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``feature_1``, ``feature_2`` and
        ``correlation`` (absolute value), sorted by ``correlation`` in
        descending order. Empty when no pair exceeds ``threshold``.
    """
    corr_matrix = data.corr().abs()

    # Keep only the strict upper triangle (k=1 skips the diagonal) so each
    # pair is listed once and self-correlations are excluded.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Name both axes explicitly before stacking. Relying on the automatic
    # 'level_0' / 'level_1' / 0 labels from reset_index() breaks when the
    # input's column axis already carries a name.
    high_corr_pairs = (
        upper.rename_axis(index='feature_1', columns='feature_2')
        .stack()
        .reset_index(name='correlation')
    )

    return high_corr_pairs[high_corr_pairs['correlation'] > threshold].sort_values(by='correlation', ascending=False)

# Pairs in numeric_df whose absolute correlation is above 0.8.
high_corrs = find_high_correlations(numeric_df, threshold=0.8)

print("\n🔗 Highly Correlated Feature Pairs (|r| > 0.8):")
if high_corrs.empty:
    print("None found.")
else:
    print(high_corrs)





🔗 Highly Correlated Feature Pairs (|r| > 0.8):
                                              feature_1  \
4012                      Current Liabilities/Liability   
4070                         Current Liabilities/Equity   
2812                                       Debt ratio %   
454                              Operating Gross Margin   
1462                            Net Value Per Share (A)   
370                              Operating Gross Margin   
543                         Realized Sales Gross Margin   
1384                            Net Value Per Share (B)   
1385                            Net Value Per Share (B)   
1856                Operating Profit Per Share (Yuan ¥)   
2119                   After-tax Net Profit Growth Rate   
639                           Pre-tax net Interest Rate   
95     ROA(C) before interest and depreciation befor...   
637                           Pre-tax net Interest Rate   
725                         After-tax net Interest Rate   
4083    

In [None]:
# ========================================
# 🔹 2. IQR Outlier Detection (with % reporting)
# ========================================
def detect_outliers_iqr(data):
    """Flag values outside the 1.5 * IQR fences, column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are each checked independently.

    Returns
    -------
    outlier_flags : pandas.DataFrame
        Boolean frame on ``data``'s index with one column per input column,
        named ``"<column>_outlier_percentage"``; True marks a value below
        ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    outlier_percentages : pandas.Series
        Percentage of flagged rows per flag column, sorted in descending order.
    """
    flag_columns = {}

    for col in data.columns:
        values = data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Format the label instead of concatenating so non-string column
        # labels (e.g. integer positions) do not raise a TypeError.
        flag_columns[f"{col}_outlier_percentage"] = (values < lower_bound) | (values > upper_bound)

    # Build the frame in one step; inserting columns one at a time fragments
    # the frame when the input is wide.
    outlier_flags = pd.DataFrame(flag_columns, index=data.index)

    # The mean of a boolean column is the fraction of True values.
    outlier_percentages = outlier_flags.mean().sort_values(ascending=False) * 100
    return outlier_flags, outlier_percentages

# This cell adds an 'any_outlier' column to numeric_df further down. Exclude
# that column from the detector input so re-running the cell does not feed the
# flag back in as if it were a feature.
# NOTE(review): numeric_df is built from the full df, so it presumably still
# contains the target column 'Bankrupt?' -- confirm that is intended here.
iqr_input = numeric_df.drop(columns=['any_outlier'], errors='ignore')
outlier_flags, outlier_percentages = detect_outliers_iqr(iqr_input)

print("\n🚨 Percentage of Rows with Outliers (IQR Method):")
print(outlier_percentages[outlier_percentages > 0].round(2).astype(str) + " %")

# Optional: Flag rows where at least one column is an outlier
numeric_df['any_outlier'] = outlier_flags.any(axis=1)

# Optional: View rows with any outlier (taken from the full frame df)
outlier_rows = df[numeric_df['any_outlier']]
print(f"\nFound {len(outlier_rows)} rows ({(len(outlier_rows) / len(df)) * 100:.2f}%) with at least one outlier.")

In [11]:
# ========================================
# 🔹 3. Class Distribution
# ========================================
def display_class_distribution(data):
    """Print the share of each distinct value in ``data`` as a percentage.

    ``data`` is a pandas Series (e.g. the target column). Shares are rounded
    to two decimals and printed with a trailing " %"; nothing is returned.
    """
    shares = data.value_counts(normalize=True)
    percentages = (shares * 100).round(2)
    print("\n Class Distribution")
    print(percentages.astype(str) + " %")

# ----- Y is the target column df['Bankrupt?'] selected when the data was loaded -----
display_class_distribution(Y)


 Class Distribution for :
Bankrupt?
0    96.77 %
1     3.23 %
Name: proportion, dtype: object
