In [1]:
import os
import numpy as np
import pandas as pd

DATA_PATH = "../data/processed/financial_ratios_final_clean.csv"
REP_PATH  = "../outputs/tables/representative_indicators_by_industry_year.csv"
OUT_DIR   = "../outputs/tables"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(DATA_PATH, encoding="utf-8-sig")
rep = pd.read_csv(REP_PATH, encoding="utf-8-sig")

# Pivot rep -> wide
REP_COL = "Chỉ số đại diện (theo PCA)"
rep_wide = (
    rep.pivot_table(
        index=["Ngành ICB - cấp 1", "Năm"],
        columns="Nhóm chỉ số",
        values=REP_COL,
        aggfunc="first"
    )
    .reset_index()
)

expected_groups = ["Liquidity", "Leverage", "Efficiency", "Profitability"]
missing_groups = [g for g in expected_groups if g not in rep_wide.columns]
if missing_groups:
    raise ValueError(f"Thiếu nhóm trong rep_wide: {missing_groups}. Kiểm tra file 03.")

df_base = df.merge(rep_wide, on=["Ngành ICB - cấp 1", "Năm"], how="inner")
print("df_base:", df_base.shape)

base_path = os.path.join(OUT_DIR, "05A_base_with_representatives.csv")
df_base.to_csv(base_path, index=False, encoding="utf-8-sig")
print("✅ Saved:", base_path)

df_base: (1457, 24)
✅ Saved: ../outputs/tables/05A_base_with_representatives.csv


In [2]:
import os
import numpy as np
import pandas as pd

IN_PATH = "../outputs/tables/05A_base_with_representatives.csv"
OUT_DIR = "../outputs/tables"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(IN_PATH, encoding="utf-8-sig")

expected_groups = ["Liquidity", "Leverage", "Efficiency", "Profitability"]
MIN_GROUP_N = 10

bench_rows = []

for (industry, year), g in df.groupby(["Ngành ICB - cấp 1", "Năm"]):
    if g.shape[0] < MIN_GROUP_N:
        continue

    for group in expected_groups:
        ind_name = g[group].iloc[0]
        if pd.isna(ind_name) or ind_name not in df.columns:
            continue

        vals = pd.to_numeric(g[ind_name], errors="coerce")
        bench_rows.append({
            "Ngành ICB - cấp 1": industry,
            "Năm": year,
            "Nhóm chỉ số": group,
            "Indicator_Name": ind_name,
            "Benchmark_Mean": vals.mean(skipna=True),
            "n_obs": int(vals.notna().sum())
        })

bench_df = pd.DataFrame(bench_rows)
print("bench_df:", bench_df.shape)

bench_path = os.path.join(OUT_DIR, "05B_industry_year_benchmarks.csv")
bench_df.to_csv(bench_path, index=False, encoding="utf-8-sig")
print("✅ Saved:", bench_path)

bench_df: (120, 6)
✅ Saved: ../outputs/tables/05B_industry_year_benchmarks.csv


In [3]:
import os
import numpy as np
import pandas as pd

BASE_PATH  = "../outputs/tables/05A_base_with_representatives.csv"
BENCH_PATH = "../outputs/tables/05B_industry_year_benchmarks.csv"
OUT_DIR    = "../outputs/tables"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(BASE_PATH, encoding="utf-8-sig")
bench = pd.read_csv(BENCH_PATH, encoding="utf-8-sig")

expected_groups = ["Liquidity", "Leverage", "Efficiency", "Profitability"]

# Leverage: càng thấp càng tốt
GROUP_DIRECTION = {
    "Liquidity": "higher_better",
    "Leverage": "lower_better",
    "Efficiency": "higher_better",
    "Profitability": "higher_better"
}

# tạo dict benchmark để tra nhanh
bench_key = {}
for _, r in bench.iterrows():
    bench_key[(r["Ngành ICB - cấp 1"], r["Năm"], r["Nhóm chỉ số"])] = (r["Indicator_Name"], r["Benchmark_Mean"])

META_COLS = [
    "Mã", "Tên công ty", "Sàn",
    "Ngành ICB - cấp 1", "Ngành ICB - cấp 2", "Ngành ICB - cấp 3", "Ngành ICB - cấp 4",
    "Năm"
]

rows = []
for _, row in df.iterrows():
    out = {c: row.get(c) for c in META_COLS}
    n_app, n_pass = 0, 0

    for group in expected_groups:
        k = (row["Ngành ICB - cấp 1"], row["Năm"], group)

        if k not in bench_key:
            out[f"{group}_RepIndicator"] = np.nan
            out[f"{group}_BenchmarkMean"] = np.nan
            out[f"{group}_Value"] = np.nan
            out[f"{group}_Pass"] = np.nan
            continue

        ind_name, mean_val = bench_key[k]
        val = pd.to_numeric(row.get(ind_name, np.nan), errors="coerce")

        out[f"{group}_RepIndicator"] = ind_name
        out[f"{group}_BenchmarkMean"] = mean_val
        out[f"{group}_Value"] = val

        if pd.isna(val) or pd.isna(mean_val):
            out[f"{group}_Pass"] = np.nan
            continue

        n_app += 1
        if GROUP_DIRECTION[group] == "higher_better":
            passed = int(val >= mean_val)
        else:
            passed = int(val <= mean_val)

        out[f"{group}_Pass"] = passed
        n_pass += passed

    out["Num_Applicable_Indicators"] = n_app
    out["Num_Pass_Indicators"] = n_pass
    out["Pass_Ratio"] = (n_pass / n_app) if n_app > 0 else np.nan

    rows.append(out)

df_pass = pd.DataFrame(rows)
print("df_pass:", df_pass.shape)

out_path = os.path.join(OUT_DIR, "05C_pass_matrix_full_dataset.csv")
df_pass.to_csv(out_path, index=False, encoding="utf-8-sig")
print("✅ Saved:", out_path)

df_pass: (1457, 27)
✅ Saved: ../outputs/tables/05C_pass_matrix_full_dataset.csv


In [4]:
import os
import numpy as np
import pandas as pd

IN_PATH = "../outputs/tables/05C_pass_matrix_full_dataset.csv"
OUT_DIR = "../outputs/tables"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(IN_PATH, encoding="utf-8-sig")

def classify_state(r):
    if pd.isna(r):
        return np.nan
    if r < 0.25:
        return "High_Risk"
    elif r < 0.50:
        return "At_Risk"
    elif r < 0.75:
        return "Stable"
    else:
        return "Healthy"

df["Financial_State_Rule"] = df["Pass_Ratio"].apply(classify_state)

out_path = os.path.join(OUT_DIR, "05D_financial_state_rule_labeled.csv")
df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("✅ Saved:", out_path)

dist = (
    df["Financial_State_Rule"]
    .value_counts(dropna=False)
    .rename_axis("Financial_State_Rule")
    .reset_index(name="Count")
)
dist_path = os.path.join(OUT_DIR, "05D_financial_state_rule_distribution.csv")
dist.to_csv(dist_path, index=False, encoding="utf-8-sig")
print("✅ Saved:", dist_path)

✅ Saved: ../outputs/tables/05D_financial_state_rule_labeled.csv
✅ Saved: ../outputs/tables/05D_financial_state_rule_distribution.csv


In [5]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score

IN_PATH = "../outputs/tables/05D_financial_state_rule_labeled.csv"
OUT_DIR = "../outputs/tables"
os.makedirs(OUT_DIR, exist_ok=True)

df = pd.read_csv(IN_PATH, encoding="utf-8-sig")

expected_groups = ["Liquidity", "Leverage", "Efficiency", "Profitability"]
feature_cols = [f"{g}_Value" for g in expected_groups]

MIN_GROUP_N = 10
RANDOM_STATE = 42

# lọc dataset đủ feature + đủ rule
df_ml = df.dropna(subset=feature_cols + ["Financial_State_Rule"]).copy()

# bỏ các ngành-năm quá ít
counts = df_ml.groupby(["Ngành ICB - cấp 1", "Năm"]).size().reset_index(name="n")
valid = counts[counts["n"] >= MIN_GROUP_N][["Ngành ICB - cấp 1", "Năm"]]
df_ml = df_ml.merge(valid, on=["Ngành ICB - cấp 1", "Năm"], how="inner")

print("ML dataset:", df_ml.shape)

X = df_ml[feature_cols].astype(float).values
X_scaled = StandardScaler().fit_transform(X)

def run_suite(Xs, k_list=(3,4)):
    rows = []
    for k in k_list:
        # KMeans
        km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=20)
        y = km.fit_predict(Xs)
        rows.append({"model":"kmeans","k":k,"silhouette":silhouette_score(Xs,y),
                     "davies_bouldin":davies_bouldin_score(Xs,y)})

        # GMM
        gmm = GaussianMixture(n_components=k, random_state=RANDOM_STATE)
        y = gmm.fit_predict(Xs)
        rows.append({"model":"gmm","k":k,"silhouette":silhouette_score(Xs,y),
                     "davies_bouldin":davies_bouldin_score(Xs,y)})

        # Agglomerative
        agg = AgglomerativeClustering(n_clusters=k)
        y = agg.fit_predict(Xs)
        rows.append({"model":"agglomerative","k":k,"silhouette":silhouette_score(Xs,y),
                     "davies_bouldin":davies_bouldin_score(Xs,y)})
    return pd.DataFrame(rows)

metrics = run_suite(X_scaled, k_list=(3,4))
metrics_path = os.path.join(OUT_DIR, "05E_ml_cluster_metrics.csv")
metrics.to_csv(metrics_path, index=False, encoding="utf-8-sig")
print("✅ Saved:", metrics_path)

# chọn best theo silhouette (tie -> DB thấp)
best = metrics.sort_values(["silhouette","davies_bouldin"], ascending=[False,True]).iloc[0]
best_model, best_k = best["model"], int(best["k"])
print("Best:", best_model, "k=", best_k)

# fit best
if best_model == "kmeans":
    model = KMeans(n_clusters=best_k, random_state=RANDOM_STATE, n_init=20)
    cluster = model.fit_predict(X_scaled)
elif best_model == "gmm":
    model = GaussianMixture(n_components=best_k, random_state=RANDOM_STATE)
    cluster = model.fit_predict(X_scaled)
else:
    model = AgglomerativeClustering(n_clusters=best_k)
    cluster = model.fit_predict(X_scaled)

df_ml_out = df_ml.copy()
df_ml_out["Cluster_Label"] = cluster

out_path = os.path.join(OUT_DIR, "05E_ml_cluster_labels.csv")
df_ml_out.to_csv(out_path, index=False, encoding="utf-8-sig")
print("✅ Saved:", out_path)

# crosstab rule vs cluster
ct = pd.crosstab(df_ml_out["Financial_State_Rule"], df_ml_out["Cluster_Label"], normalize="index").round(3)
ct_path = os.path.join(OUT_DIR, "05E_rule_vs_cluster_crosstab.csv")
ct.to_csv(ct_path, encoding="utf-8-sig")
print("✅ Saved:", ct_path)

# (tuỳ chọn) ARI tham khảo
rule_map = {"High_Risk":0,"At_Risk":1,"Stable":2,"Healthy":3}
y_rule = df_ml_out["Financial_State_Rule"].map(rule_map).astype(int).values
ari = adjusted_rand_score(y_rule, cluster)

ari_df = pd.DataFrame([{"best_model":best_model,"k":best_k,"ARI_rule_vs_cluster":ari}])
ari_path = os.path.join(OUT_DIR, "05E_rule_vs_cluster_ari.csv")
ari_df.to_csv(ari_path, index=False, encoding="utf-8-sig")
print("✅ Saved:", ari_path)

ML dataset: (1416, 28)
✅ Saved: ../outputs/tables/05E_ml_cluster_metrics.csv
Best: kmeans k= 3
✅ Saved: ../outputs/tables/05E_ml_cluster_labels.csv
✅ Saved: ../outputs/tables/05E_rule_vs_cluster_crosstab.csv
✅ Saved: ../outputs/tables/05E_rule_vs_cluster_ari.csv


In [6]:
import os
import pandas as pd
import numpy as np

In [7]:
OUT_DIR = "../outputs/tables"

metrics = pd.read_csv(
    os.path.join(OUT_DIR, "05E_ml_cluster_metrics.csv"),
    encoding="utf-8-sig"
)

clusters = pd.read_csv(
    os.path.join(OUT_DIR, "05E_ml_cluster_labels.csv"),
    encoding="utf-8-sig"
)

crosstab = pd.read_csv(
    os.path.join(OUT_DIR, "05E_rule_vs_cluster_crosstab.csv"),
    encoding="utf-8-sig",
    index_col=0
)

ari_df = pd.read_csv(
    os.path.join(OUT_DIR, "05E_rule_vs_cluster_ari.csv"),
    encoding="utf-8-sig"
)

print("Metrics:", metrics.shape)
print("Clusters:", clusters.shape)
print("Crosstab:", crosstab.shape)
print("ARI:", ari_df)

Metrics: (6, 4)
Clusters: (1416, 29)
Crosstab: (4, 3)
ARI:   best_model  k  ARI_rule_vs_cluster
0     kmeans  3            -0.000274


In [8]:
metrics_sorted = metrics.sort_values(
    ["silhouette", "davies_bouldin"],
    ascending=[False, True]
).reset_index(drop=True)

print("=== All model metrics (sorted) ===")
display(metrics_sorted)

best = metrics_sorted.iloc[0]
print("\n✅ Best selection:")
print(f"model={best['model']}, k={int(best['k'])}, silhouette={best['silhouette']:.4f}, dbi={best['davies_bouldin']:.4f}")

=== All model metrics (sorted) ===


Unnamed: 0,model,k,silhouette,davies_bouldin
0,kmeans,3,0.956664,0.026614
1,agglomerative,3,0.927972,0.428596
2,kmeans,4,0.780211,0.48567
3,agglomerative,4,0.731161,0.554876
4,gmm,4,0.483555,0.723231
5,gmm,3,0.418336,1.418165



✅ Best selection:
model=kmeans, k=3, silhouette=0.9567, dbi=0.0266


In [9]:
cluster_dist = (
    clusters["Cluster_Label"]
    .value_counts(normalize=True)
    .rename("Proportion")
    .reset_index()
)

cluster_dist.columns = ["Cluster", "Proportion"]

cluster_dist = cluster_dist.sort_values("Cluster")

print("=== ML Cluster Distribution ===")
display(cluster_dist)

=== ML Cluster Distribution ===


Unnamed: 0,Cluster,Proportion
0,0,0.998588
1,1,0.000706
2,2,0.000706


In [10]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# dùng đúng dataset đã label
df_check = clusters.dropna(subset=["Pass_Ratio"]).copy()

X = df_check[["Pass_Ratio"]].values
X_scaled = StandardScaler().fit_transform(X)

kmeans = KMeans(n_clusters=3, random_state=42, n_init=20)
df_check["Cluster_PR"] = kmeans.fit_predict(X_scaled)

# phân bố cluster mới
cluster_pr_dist = (
    df_check["Cluster_PR"]
    .value_counts(normalize=True)
    .rename("Proportion")
    .reset_index()
)

cluster_pr_dist.columns = ["Cluster_PR", "Proportion"]
cluster_pr_dist = cluster_pr_dist.sort_values("Cluster_PR")

print("=== Cluster distribution (Pass_Ratio only) ===")
display(cluster_pr_dist)

=== Cluster distribution (Pass_Ratio only) ===


Unnamed: 0,Cluster_PR,Proportion
0,0,0.491525
1,1,0.300141
2,2,0.208333


In [11]:
ct_pr = (
    pd.crosstab(
        df_check["Financial_State_Rule"],
        df_check["Cluster_PR"],
        normalize="index"
    )
    .round(3)
)

print("=== Crosstab: Financial_State_Rule vs Cluster_PR ===")
display(ct_pr)

=== Crosstab: Financial_State_Rule vs Cluster_PR ===


Cluster_PR,0,1,2
Financial_State_Rule,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
At_Risk,1.0,0.0,0.0
Healthy,0.0,0.0,1.0
High_Risk,1.0,0.0,0.0
Stable,0.0,1.0,0.0
