In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Locations used by the rest of the notebook: the input CSV and the folder
# that receives the result tables (both relative to the notebook's working dir).
DATA_PATH = "../data/processed/financial_ratios_final_clean.csv"
OUT_DIR = "../outputs/tables"

# Create the output folder up front so the later to_csv calls never hit a
# missing directory; exist_ok makes re-running this cell harmless.
os.makedirs(name=OUT_DIR, exist_ok=True)

# "utf-8-sig" transparently skips a leading byte-order mark if the file has one.
df = pd.read_csv(filepath_or_buffer=DATA_PATH, encoding="utf-8-sig")
print("Loaded:", df.shape)

Loaded: (1469, 20)


In [3]:
# Ratio columns grouped by theme. Each list is used as a column selection on
# the data frame, and PCA is run once per group; the insertion order of the
# groups is the order in which they are processed.
RATIO_GROUPS = dict(
    Liquidity=["Current_Ratio", "Quick_Ratio", "Cash_Ratio"],
    Leverage=["Debt_Equity", "Net_Leverage"],
    Efficiency=["Asset_Turnover", "Fixed_Asset_Turnover"],
    Profitability=["ROA", "ROE", "ROS"],
)

In [4]:
def run_pca_block(df_block, ratio_cols, explained_threshold=0.8, min_obs=5):
    """
    Run a standardized PCA on one block of ratio columns.

    Parameters
    ----------
    df_block : pandas.DataFrame
        Frame that contains at least the columns listed in ``ratio_cols``.
    ratio_cols : list of str
        Columns fed into the PCA.
    explained_threshold : float, default 0.8
        Cumulative explained-variance ratio that the retained components
        must reach.
    min_obs : int, default 5
        Minimum number of usable rows (no NaN, no +/-inf in ``ratio_cols``).
        Blocks with fewer rows are skipped.

    Returns
    -------
    dict or None
        ``None`` when fewer than ``min_obs`` usable rows remain. Otherwise a
        dict with:

        - ``"n_obs"``: number of rows used in the fit,
        - ``"explained_variance"``: explained-variance ratio per component,
        - ``"cum_explained"``: cumulative sum of those ratios,
        - ``"n_components_80pct"``: number of leading components needed to
          reach ``explained_threshold`` (the key name is kept for backward
          compatibility even when a threshold other than 0.8 is passed); if
          the threshold is never reached, all components are counted,
        - ``"loadings"``: DataFrame of component weights, one row per ratio
          and one ``PC<k>`` column per fitted component.
    """

    # +/-inf survives dropna() but makes StandardScaler raise, so treat it
    # as missing and drop those rows together with the NaN rows.
    X = df_block[ratio_cols].replace([np.inf, -np.inf], np.nan).dropna()
    if X.shape[0] < min_obs:
        return None  # too few observations -> skip this block

    X_scaled = StandardScaler().fit_transform(X)

    pca = PCA()
    pca.fit(X_scaled)

    explained = pca.explained_variance_ratio_
    cum_explained = np.cumsum(explained)

    # np.argmax on an all-False mask returns 0, which would wrongly report a
    # single component when the threshold is never reached (float round-off
    # just below the threshold, NaN ratios, or a threshold above 1). Look up
    # the first hit explicitly and fall back to "all components".
    reached = np.flatnonzero(cum_explained >= explained_threshold)
    if reached.size:
        n_components = int(reached[0]) + 1
    else:
        n_components = int(len(cum_explained))

    # PCA keeps min(n_samples, n_features) components, so name the columns
    # from the fitted component matrix rather than from len(ratio_cols).
    n_fitted = pca.components_.shape[0]
    loadings = pd.DataFrame(
        pca.components_.T,
        index=ratio_cols,
        columns=[f"PC{i+1}" for i in range(n_fitted)]
    )

    return {
        "n_obs": X.shape[0],
        "explained_variance": explained,
        "cum_explained": cum_explained,
        "n_components_80pct": n_components,
        "loadings": loadings
    }

In [5]:
# Two parallel result tables, one row per (industry, year, ratio group) for
# which the PCA could be run:
#   records                   -> structure summary (observations, PCs needed, PC1 share)
#   representative_indicators -> the ratio with the largest absolute PC1 loading
records = []
representative_indicators = []

for (industry, year), df_sub in df.groupby(["Ngành ICB - cấp 1", "Năm"]):
    for group_name, cols in RATIO_GROUPS.items():
        pca_out = run_pca_block(df_sub, cols)

        # run_pca_block returns None when the block has too few observations.
        if pca_out is not None:
            # Representative indicator = ratio carrying the largest absolute
            # weight on the first principal component.
            top_ratio = pca_out["loadings"]["PC1"].abs().idxmax()

            # Identifying fields shared by both output rows.
            row_key = {
                "Ngành ICB - cấp 1": industry,
                "Năm": year,
                "Nhóm chỉ số": group_name,
            }

            records.append({
                **row_key,
                "Số quan sát": pca_out["n_obs"],
                "Số PC đạt 80% thông tin": pca_out["n_components_80pct"],
                "Tỷ lệ phương sai PC1": round(pca_out["explained_variance"][0], 3),
            })

            representative_indicators.append({
                **row_key,
                "Chỉ số đại diện (theo PCA)": top_ratio,
            })

In [6]:
# Spell out the columns so the frame has its full schema even when `records`
# is empty (no group had enough observations). Without this, sort_values
# raises KeyError on a column-less frame.
summary_columns = [
    "Ngành ICB - cấp 1",
    "Năm",
    "Nhóm chỉ số",
    "Số quan sát",
    "Số PC đạt 80% thông tin",
    "Tỷ lệ phương sai PC1",
]

# Build and sort in one expression instead of mutating in place, so the cell
# gives the same result every time it is re-run.
pca_summary_df = pd.DataFrame(records, columns=summary_columns).sort_values(
    ["Ngành ICB - cấp 1", "Năm", "Nhóm chỉ số"]
)

pca_summary_path = os.path.join(OUT_DIR, "pca_structure_by_industry_year.csv")
# utf-8-sig writes a BOM ahead of the non-ASCII (Vietnamese) column headers.
pca_summary_df.to_csv(pca_summary_path, index=False, encoding="utf-8-sig")

print("Saved:", pca_summary_path)

Saved: ../outputs/tables/pca_structure_by_industry_year.csv


In [7]:
# Spell out the columns so an empty `representative_indicators` list still
# yields a CSV with a proper header row instead of a header-less file.
rep_columns = [
    "Ngành ICB - cấp 1",
    "Năm",
    "Nhóm chỉ số",
    "Chỉ số đại diện (theo PCA)",
]
rep_indicator_df = pd.DataFrame(representative_indicators, columns=rep_columns)

rep_path = os.path.join(
    OUT_DIR,
    "representative_indicators_by_industry_year.csv"
)
# utf-8-sig writes a BOM ahead of the non-ASCII (Vietnamese) column headers.
rep_indicator_df.to_csv(rep_path, index=False, encoding="utf-8-sig")

print("Saved:", rep_path)

Saved: ../outputs/tables/representative_indicators_by_industry_year.csv
