# <center>Group 5 - Final Project</center>
# <center>Generalization</center>

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

import joblib

import warnings
warnings.filterwarnings('ignore')

### Load test data

In [4]:
# Read the raw test set and discard the "Index" identifier column if the file
# has one (errors="ignore" makes the drop a no-op when it is absent).
X_test = (
    pd.read_csv("test_data.csv")
    .drop(columns=["Index"], errors="ignore")
)

In [5]:
# Preview the first rows to sanity-check the loaded columns and values.
X_test.head()

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0.414323,0.481029,0.46828,0.609514,0.609514,0.998889,0.797159,0.809132,0.30329,0.781361,...,0.761704,0.001404,0.623973,0.609512,0.838286,0.27545,0.026749,0.56495,1,0.136203
1,0.497441,0.560892,0.546603,0.61066,0.61066,0.999108,0.797545,0.809431,0.303506,0.781691,...,0.815244,0.004466,0.623724,0.610658,0.842427,0.285886,0.026965,0.56587,1,0.018871
2,0.501584,0.548899,0.556721,0.606134,0.606134,0.999034,0.797427,0.80937,0.303453,0.781657,...,0.806318,0.000684,0.625387,0.606132,0.840598,0.275816,0.026793,0.565165,1,0.095511
3,0.574465,0.637375,0.61968,0.600376,0.600376,0.99903,0.797528,0.809426,0.30364,0.781691,...,0.852655,0.001718,0.624151,0.600375,0.844727,0.279977,0.026795,0.565178,1,0.028513
4,0.39336,0.456444,0.440334,0.600009,0.600009,0.9988,0.797025,0.809,0.30324,0.781206,...,0.741604,0.002545,0.623612,0.600009,0.835578,0.279901,0.026623,0.564204,1,0.028779


### Apply the same correlation filter as training

In [6]:
# Column names saved by the training notebook, with surrounding whitespace
# removed so they compare equal to the (also stripped) test-set headers.
# NOTE(review): the [1:] slice skips the first saved entry — presumably the
# target column; confirm against the training notebook.
selected_columns = [
    name.strip() for name in joblib.load("selected_columns.pkl")[1:]
]
X_test.columns = X_test.columns.str.strip()

# Keep the saved order, but only for columns the test set actually contains.
# NOTE(review): saved columns missing from the test set are dropped silently,
# which changes the feature count seen by the later steps.
aligned_columns = [name for name in selected_columns if name in X_test.columns]

# Restrict the test frame to the aligned columns.
X_test = X_test[aligned_columns]


### Cap outliers using IQR method

In [7]:
def cap_outliers(data, iqr_multiplier=1.5):
    """Cap every column of ``data`` to its IQR fences and return a new frame.

    For each column the fences are ``Q1 - iqr_multiplier * IQR`` and
    ``Q3 + iqr_multiplier * IQR``, where Q1/Q3 are the column's own 25th and
    75th percentiles. Values outside the fences are replaced by the nearest
    fence; values inside (and NaN) are left as they are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns all support ``quantile`` (i.e. numeric columns).
        It is not modified.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the inter-quartile range. The default
        reproduces the classic 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with each column capped.

    Raises
    ------
    ValueError
        If ``iqr_multiplier`` is negative (the lower fence would exceed the
        upper one).
    """
    if iqr_multiplier < 0:
        raise ValueError("iqr_multiplier must be non-negative")

    data_capped = data.copy()
    for column in data_capped.columns:
        values = data_capped[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = iqr_multiplier * (q3 - q1)
        lower_bound = q1 - spread
        upper_bound = q3 + spread
        # lower_bound <= upper_bound always holds here, so one nested
        # selection is equivalent to capping below and then capping above.
        data_capped[column] = np.where(
            values < lower_bound,
            lower_bound,
            np.where(values > upper_bound, upper_bound, values),
        )
    return data_capped

# Cap every column of the test set. cap_outliers derives the bounds from the
# frame it is given, so the caps here come from the test set's own quartiles.
# NOTE(review): confirm this matches how capping was applied during training.
X_test = cap_outliers(X_test)


### Standardize

In [8]:
# Standardize each feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the test set itself, so the mean and
# variance are the test set's, not the training set's. The downstream models
# are loaded from training artefacts; if a fitted training scaler was saved,
# it should probably be loaded and used here instead — TODO confirm.
scaler = StandardScaler().fit(X_test)
X_scaled = scaler.transform(X_test)

### PCA with 26 components (intended to preserve about 90% of the variance)

In [9]:
# Project the standardized features onto 26 principal components.
# NOTE(review): the PCA is fitted on the test set, so the component axes are
# derived from test data. The subgroup and cluster models loaded later were
# saved from training; confirm they do not require the training-time PCA.
pca = PCA(n_components=26)
X_test_pca = pca.fit_transform(X_scaled)

# Label the components PC1..PCn and wrap the array in a DataFrame so later
# steps can select features by name.
n_components_out = X_test_pca.shape[1]
pca_column_names = ["PC" + str(k) for k in range(1, n_components_out + 1)]
X_test_pca_df = pd.DataFrame(data=X_test_pca, columns=pca_column_names)

### Predict Subgroup (Cluster ID)

In [10]:
# Load the saved subgroup classifier and the feature list stored next to it.
# Assumes the feature list holds the column names the classifier was fitted
# on — TODO confirm against the training notebook.
subgroup_model = joblib.load("subgroup_classifier.pkl")
subgroup_features = joblib.load("subgroup_classifier_features.pkl")

# Select exactly the saved features, in the saved order, and predict from that
# frame. Predicting from X_test_pca_df directly ignores the saved feature list
# and fails when this cell is re-run, because by then the frame also contains
# the "cluster_id" column added below.
X_cluster_input = X_test_pca_df[subgroup_features]
cluster_preds = subgroup_model.predict(X_cluster_input)

# Attach the predicted subgroup to each row and preview the result.
X_test_pca_df["cluster_id"] = cluster_preds
X_test_pca_df.head()


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,cluster_id
0,-1.141336,-7.796876,4.017354,-0.823197,-0.639566,0.792681,0.351867,1.711497,-0.068079,1.154519,...,1.05248,-0.004427,1.567212,-0.928891,-1.749083,-0.361985,0.578492,-0.078964,0.037702,4
1,3.986363,1.408782,-5.776441,0.331195,-1.122943,3.064133,-0.093763,-0.337274,-0.143221,3.189611,...,-1.343528,-0.763911,0.099394,0.342037,0.831051,0.223473,-0.791837,-0.421348,0.619303,2
2,-4.632465,-3.982551,0.989534,-0.204711,3.200185,-0.745136,0.353099,-1.266053,0.955736,0.220723,...,0.827948,-1.335605,0.5746,0.526581,0.655698,0.813451,-0.313577,-0.122229,0.102399,4
3,-2.485136,4.479459,0.167203,0.818765,-2.136441,0.515956,-2.542878,-2.724835,1.309375,2.497525,...,0.317295,0.424798,-1.279999,-0.203705,-0.442551,0.886522,-0.728212,-0.419694,-1.654393,3
4,6.886089,-4.877716,-0.095957,2.519361,-0.536971,-1.432253,-0.145129,-0.834501,-1.301573,-0.302833,...,0.273995,0.092943,0.181608,-0.622184,-1.11774,-1.019547,-1.024492,0.614701,-1.822893,5


In [11]:
# Number of test rows assigned to each cluster, ordered by cluster id and
# laid out as a two-column table (cluster_id, count).
cluster_counts = (
    X_test_pca_df["cluster_id"]
    .value_counts()
    .sort_index()
    .rename_axis("cluster_id")
    .reset_index(name="count")
)
print(cluster_counts)

   cluster_id  count
0           1    189
1           2    288
2           3    185
3           4    197
4           5    153


In [12]:
import joblib
import numpy as np
import pandas as pd

# Clusters for which a model, selector and feature-name list are loaded.
MODELLED_CLUSTERS = (1, 2, 3, 4)
# Cluster that has no model and is always predicted as 0 (not bankrupt).
NO_BANKRUPTCY_CLUSTER = 5


def _load_cluster_artifacts(cluster_num):
    """Load the (model, selector, feature names) triple saved for one cluster."""
    prefix = f'cluster{cluster_num}'
    return (
        joblib.load(f'{prefix}_model.pkl'),
        joblib.load(f'{prefix}_selector.pkl'),
        joblib.load(f'{prefix}_feature_names.pkl'),
    )


# Step 1-2: Map each modelled cluster to its (model, selector, feature names).
cluster_models = {
    cluster_num: _load_cluster_artifacts(cluster_num)
    for cluster_num in MODELLED_CLUSTERS
}

# Step 3: Prepare test features (everything except the cluster label).
X_test_features = X_test_pca_df.drop(columns=['cluster_id'])
cluster_ids = X_test_pca_df['cluster_id'].values

# Fail loudly on cluster ids that have neither a model nor the explicit
# "predict 0" rule; otherwise those rows would silently keep the 0 they were
# initialised with.
unexpected_ids = sorted(
    set(np.unique(cluster_ids).tolist())
    - set(MODELLED_CLUSTERS)
    - {NO_BANKRUPTCY_CLUSTER}
)
if unexpected_ids:
    raise ValueError(f"No prediction rule for cluster ids: {unexpected_ids}")

# Step 4: Create the array that will hold one prediction per test row.
final_predictions = np.zeros(len(X_test_pca_df), dtype=int)

# Step 5: Predict each modelled cluster with its own model.
for cluster_num, (model, selector, full_features) in cluster_models.items():
    idx = np.flatnonzero(cluster_ids == cluster_num)
    if idx.size == 0:
        continue

    # Subset the rows of this cluster and order the columns as saved.
    X_cluster = X_test_features.iloc[idx][full_features]

    # Apply the saved selector, then predict bankruptcy for these rows.
    X_cluster_selected = selector.transform(X_cluster)
    final_predictions[idx] = model.predict(X_cluster_selected)

# Step 6: Cluster 5 has no model; its rows are predicted as 0.
idx_cluster5 = np.flatnonzero(cluster_ids == NO_BANKRUPTCY_CLUSTER)
final_predictions[idx_cluster5] = 0

# Step 7: Build the submission table with columns 'Index' and 'Bankrupt?'.
# NOTE(review): 'Index' is regenerated as 1..N by row position. This assumes
# the test rows are in submission order and that the file's own Index (dropped
# when the data was loaded) also runs 1..N — TODO confirm.
submission_df = pd.DataFrame({
    'Index': np.arange(1, len(X_test_pca_df) + 1),
    'Bankrupt?': final_predictions
})

# Step 8: Save to CSV
submission_df.to_csv('5_Generalization.csv', index=False)

print("✅ Final predictions completed and saved to '5_Generalization.csv' in correct submission format.")


✅ Final predictions completed and saved to '5_Generalization.csv' in correct submission format.


In [13]:
import pandas as pd

# Re-read the saved submission file so the summary reflects what is on disk.
df_predictions = pd.read_csv("5_Generalization.csv")  # Adjust path if needed
bankrupt_flags = df_predictions['Bankrupt?']

# Count predicted bankrupt (1) and non-bankrupt (0) companies and the share
# of bankrupt predictions among all rows.
total_companies = len(df_predictions)
num_bankruptcy_companies = bankrupt_flags.eq(1).sum()
num_non_bankruptcy_companies = bankrupt_flags.eq(0).sum()
bankruptcy_percentage = (num_bankruptcy_companies / total_companies) * 100

# Report the four figures, one per line.
summary_lines = [
    f"Number of Bankruptcy companies = {num_bankruptcy_companies}",
    f"Number of Non Bankruptcy companies = {num_non_bankruptcy_companies}",
    f"Percentage of Bankruptcy Companies = {bankruptcy_percentage:.2f}%",
    f"Total number of Companies = {total_companies}",
]
print(*summary_lines, sep="\n")


Number of Bankruptcy companies = 165
Number of Non Bankruptcy companies = 847
Percentage of Bankruptcy Companies = 16.30%
Total number of Companies = 1012
