In [None]:
# Mount Google Drive at /content/drive/ (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive/')


Mounted at /content/drive/


In [None]:
def load_sms():
    """Download the UCI SMS Spam Collection archive and return it as a DataFrame.

    The archive is stored as ``smsspamcollection.zip`` in the working
    directory and is only downloaded when a valid zip file is not already
    there.

    Returns:
        pandas.DataFrame with two columns, ``label`` and ``text``, read from
        the tab-separated ``SMSSpamCollection`` member of the archive.
    """
    # Imported locally: this cell runs before the notebook's import cell, so
    # relying on module-level names would raise NameError on a fresh kernel.
    import os
    import urllib.request
    import zipfile

    import pandas as pd

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
    archive = "smsspamcollection.zip"
    # Re-download when the file is missing or is a truncated/corrupt archive.
    if not (os.path.exists(archive) and zipfile.is_zipfile(archive)):
        urllib.request.urlretrieve(url, archive)
    with zipfile.ZipFile(archive) as zf:
        with zf.open("SMSSpamCollection") as f:
            df = pd.read_csv(f, sep="\t", header=None, names=["label", "text"])
    return df

# Load the SMS dataset and preview its first rows.
df1=load_sms()
df1.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
import pandas as pd
import re
import zipfile
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
# Path of the BBC text CSV on the mounted Google Drive.
DATAFILE = '/content/drive/MyDrive/TRP/bbc-text.csv'

# Read the raw dataset.
df = pd.read_csv(DATAFILE)

# Preview the first rows.
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [None]:
def clean_text(s):
    """Normalize one raw document into a lowercase, whitespace-collapsed string.

    Steps, in order: lowercase; replace URLs (``http...`` / ``www....``) with
    the placeholder ``URL``; replace digit runs with ``NUM``; replace every
    character that is not a word character, whitespace or apostrophe with a
    space; collapse repeated whitespace and strip the ends.

    Args:
        s: Any value; it is converted with ``str``. ``None`` and float NaN are
            treated as missing.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    # Missing values would otherwise become the literal tokens "none"/"nan"
    # and end up in the vocabulary.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s).lower()
    s = re.sub(r"http\S+|www\.\S+", " URL ", s)
    s = re.sub(r"\d+", " NUM ", s)
    s = re.sub(r"[^\w\s']", " ", s)
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s

# ========== Load the dataset (BBC news dataset) ==========
# Re-read from disk so that re-running this cell starts from the raw text.
df = pd.read_csv(DATAFILE)

# Clean every document in place in the "text" column.
df["text"] = df["text"].apply(clean_text)
print("数据集大小:", df.shape)
print(df.head())

# ========== Train/test split ==========
# 80/20 split, stratified on the category column, fixed seed.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["category"])
print("Train size:", train_df.shape, "Test size:", test_df.shape)

# ========== Label encoding ==========
# The encoder is fitted on the training labels only and reused for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_df["category"])
y_test = le.transform(test_df["category"])

# ========== TF-IDF features ==========
# Unigrams and bigrams, English stop words removed, vocabulary capped at 2000 terms.
# The vectorizer is fitted on the training text only.
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,2), stop_words="english")
X_train = vectorizer.fit_transform(train_df["text"])
X_test = vectorizer.transform(test_df["text"])

print("X_train 维度:", X_train.shape)
print("X_test 维度:", X_test.shape)
print("标签类别:", le.classes_)

数据集大小: (2225, 2)
        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss left books alone former worldcom...
2          sport  tigers wary of farrell gamble leicester say th...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...
Train size: (1780, 2) Test size: (445, 2)
X_train 维度: (1780, 2000)
X_test 维度: (445, 2000)
标签类别: ['business' 'entertainment' 'politics' 'sport' 'tech']


In [None]:
# Week 4–5: Manual Model Tuning with GridSearchCV (LogReg / LinearSVC / RandomForest)
# It uses X_train, X_test, y_train, y_test from your previous preprocessing cell.

import time, os, json
import numpy as np
from pprint import pprint
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.dummy import DummyClassifier
import joblib

# --- safety check ---
# Fail early with a clear message if the preprocessing cell has not been run.
for var in ["X_train","X_test","y_train","y_test"]:
    if var not in globals():
        raise RuntimeError(f"Variable `{var}` not found. Please run the preprocessing cell first.")

# Output directory for the tuned models and their metrics.
os.makedirs("models_manual", exist_ok=True)

# Shared cross-validation splitter (5 shuffled stratified folds, fixed seed)
# and the metric used for model selection.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = "f1_macro"   # macro F1 is good for class imbalance

# --- baselines (always include a simple baseline) ---
# The stratified dummy draws random predictions, so it is seeded like the rest
# of the cell; otherwise its scores change on every run.
baselines = {
    "DummyMostFrequent": DummyClassifier(strategy="most_frequent"),
    "DummyStratified":   DummyClassifier(strategy="stratified", random_state=42),
}

print("=== Training baselines ===")
baseline_results = {}
for name, clf in baselines.items():
    t0 = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1m = f1_score(y_test, y_pred, average="macro")
    # time_sec covers fit + predict + scoring for this baseline.
    baseline_results[name] = {"accuracy": acc, "f1_macro": f1m, "time_sec": time.time()-t0}
pprint(baseline_results)
print()

# --- model grids ---
# Each entry maps a model name to the base estimator and the hyper-parameter
# grid that GridSearchCV will search.
model_grids = {
    "LogisticRegression": {
        # No n_jobs here: it has no effect with the liblinear solver (scikit-learn
        # warns on every fit), and GridSearchCV already parallelizes over
        # candidates and folds.
        # NOTE(review): recent scikit-learn releases also warn that liblinear is
        # deprecated for multiclass targets - confirm against the installed version.
        "estimator": LogisticRegression(max_iter=500, class_weight="balanced"),
        "param_grid": {
            "C": [0.1, 0.3, 1, 3, 10],
            "penalty": ["l2"],           # only l2 is searched; both solvers below accept it
            "solver": ["liblinear", "saga"]
        }
    },
    "LinearSVC": {
        "estimator": LinearSVC(class_weight="balanced"),
        "param_grid": {
            "C": [0.1, 0.3, 1, 3, 10],
            "loss": ["hinge", "squared_hinge"]
        }
    },
    "RandomForest": {
        "estimator": RandomForestClassifier(class_weight="balanced", n_jobs=-1, random_state=42),
        "param_grid": {
            "n_estimators": [200, 400],
            "max_depth": [None, 20, 40],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 2]
        }
    }
}

def run_grid(name, estimator, param_grid):
    """Grid-search one model, report its test metrics and persist the winner.

    Uses the notebook globals ``scoring``, ``cv``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Args:
        name: Label used in the printed output and in the artifact file names.
        estimator: Unfitted estimator handed to ``GridSearchCV``.
        param_grid: Hyper-parameter grid for the search.

    Returns:
        dict with the model name, best parameters, best CV score, test
        accuracy, test macro-F1, search duration in seconds and the scoring
        name. The same dict is written to ``models_manual/{name}_metrics.json``
        and the best estimator to ``models_manual/{name}_best.joblib``.
    """
    print(f"\n=== GridSearchCV: {name} ===")
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )

    # Time the full search (all candidates, all folds, final refit).
    started = time.time()
    search.fit(X_train, y_train)
    elapsed = time.time() - started

    print(f"Best CV {scoring}: {search.best_score_:.4f}")
    print(f"Best Params: {search.best_params_}")

    # Score the refitted best estimator on the held-out test split.
    best_model = search.best_estimator_
    predictions = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, predictions)
    test_f1 = f1_score(y_test, predictions, average="macro")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test Macro-F1: {test_f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

    # Persist the fitted model, then its metadata next to it.
    joblib.dump(best_model, f"models_manual/{name}_best.joblib")
    meta = {
        "model": name,
        "best_params": search.best_params_,
        "cv_best_score": float(search.best_score_),
        "test_accuracy": float(test_acc),
        "test_f1_macro": float(test_f1),
        "fit_time_sec": float(elapsed),
        "scoring": scoring,
    }
    with open(f"models_manual/{name}_metrics.json", "w") as fh:
        json.dump(meta, fh, indent=2)
    return meta

# Run the grid search for every configured model and collect the metadata.
all_results = {"baselines": baseline_results, "models": {}}
for name, spec in model_grids.items():
    meta = run_grid(name, spec["estimator"], spec["param_grid"])
    all_results["models"][name] = meta

print("\n=== Summary (Test set) ===")
# One row per model: name, CV score, best params, test accuracy, test macro-F1, time.
summary_rows = []
# baselines (no CV score or tuned params, hence the "-" placeholders)
for k,v in baseline_results.items():
    summary_rows.append([k, "-", "-", v["accuracy"], v["f1_macro"], v["time_sec"]])
# tuned models
for k,v in all_results["models"].items():
    summary_rows.append([k, v["cv_best_score"], v["best_params"], v["test_accuracy"], v["test_f1_macro"], v["fit_time_sec"]])

import pandas as pd
summary_df = pd.DataFrame(summary_rows, columns=["Model","CV best (macro-F1)","Best Params","Test Acc","Test Macro-F1","Fit Time (s)"])
# Displayed sorted by test score; summary_df itself keeps insertion order.
display(summary_df.sort_values(by=["Test Macro-F1","Test Acc"], ascending=False).reset_index(drop=True))

# Save a CSV summary for your report
summary_df.to_csv("models_manual/summary_manual_tuning.csv", index=False)
print("\nArtifacts saved in: models_manual/")


=== Training baselines ===
{'DummyMostFrequent': {'accuracy': 0.2292134831460674,
                       'f1_macro': 0.07458866544789763,
                       'time_sec': 0.008568525314331055},
 'DummyStratified': {'accuracy': 0.21348314606741572,
                     'f1_macro': 0.20375386852327906,
                     'time_sec': 0.0031282901763916016}}


=== GridSearchCV: LogisticRegression ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits




Best CV f1_macro: 0.9728
Best Params: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Test Accuracy: 0.9865
Test Macro-F1: 0.9861

Classification Report:
              precision    recall  f1-score   support

           0     0.9804    0.9804    0.9804       102
           1     0.9747    1.0000    0.9872        77
           2     0.9765    0.9881    0.9822        84
           3     1.0000    1.0000    1.0000       102
           4     1.0000    0.9625    0.9809        80

    accuracy                         0.9865       445
   macro avg     0.9863    0.9862    0.9861       445
weighted avg     0.9867    0.9865    0.9865       445

Confusion Matrix:
 [[100   0   2   0   0]
 [  0  77   0   0   0]
 [  0   1  83   0   0]
 [  0   0   0 102   0]
 [  2   1   0   0  77]]

=== GridSearchCV: LinearSVC ===
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best CV f1_macro: 0.9746
Best Params: {'C': 0.3, 'loss': 'squared_hinge'}
Test Accuracy: 0.9820
Test Macro-F1: 0.9813

Classif

Unnamed: 0,Model,CV best (macro-F1),Best Params,Test Acc,Test Macro-F1,Fit Time (s)
0,LogisticRegression,0.972776,"{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}",0.986517,0.986142,14.482339
1,LinearSVC,0.974574,"{'C': 0.3, 'loss': 'squared_hinge'}",0.982022,0.981348,5.878099
2,RandomForest,0.9552,"{'max_depth': 40, 'min_samples_leaf': 1, 'min_...",0.973034,0.973225,379.803567
3,DummyStratified,-,-,0.213483,0.203754,0.003128
4,DummyMostFrequent,-,-,0.229213,0.074589,0.008569



Artifacts saved in: models_manual/


In [None]:
# ===== Install & init H2O =====
!pip install -U h2o

import h2o, time, json, os
from h2o.automl import H2OAutoML
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Safety check
# Stop with a clear message when the preprocessing cell has not been run.
for var in ["X_train","X_test","y_train","y_test","vectorizer","le"]:
    if var not in globals():
        raise RuntimeError(
            "Need X_train, X_test, y_train, y_test, vectorizer, le. "
            "Please run your preprocessing cell (it defines vectorizer & label encoder)."
        )

# Connect to (or start) H2O with an 8G memory limit and create the output directory.
h2o.init(max_mem_size="8G")
os.makedirs("models_automl_h2o", exist_ok=True)

# Convert the sparse TF-IDF matrix to dense (H2O needs a DataFrame; still manageable
# for 2k~10k features - lower max_features if this runs out of memory).
def to_dense_df(X, y, prefix="f"):
    """Build a dense pandas DataFrame of features plus a ``label`` column.

    Args:
        X: Feature matrix. Objects exposing ``toarray`` are densified with it;
            anything else goes through ``np.asarray``.
        y: Labels assigned to the ``label`` column.
        prefix: Prefix of the generated feature column names
            (``{prefix}0``, ``{prefix}1``, ...).

    Returns:
        DataFrame with one column per feature followed by ``label``.
    """
    if hasattr(X, "toarray"):
        dense = X.toarray()
    else:
        dense = np.asarray(X)
    column_names = ["{}{}".format(prefix, j) for j in range(dense.shape[1])]
    frame = pd.DataFrame(dense, columns=column_names)
    frame["label"] = y
    return frame

# Dense pandas copies of the train/test features with their labels.
train_pd = to_dense_df(X_train, y_train)
test_pd  = to_dense_df(X_test, y_test)

# Convert to H2OFrame and mark the label as a factor (classification target).
train_h2o = h2o.H2OFrame(train_pd)
test_h2o  = h2o.H2OFrame(test_pd)
train_h2o["label"] = train_h2o["label"].asfactor()
test_h2o["label"]  = test_h2o["label"].asfactor()

# Predictors are every column except the label.
x = [c for c in train_h2o.columns if c != "label"]
y = "label"

# Train H2O AutoML (the time budget is adjustable).
aml = H2OAutoML(
    max_runtime_secs=900,          # total time budget
    max_models=25,
    seed=42,
    sort_metric="mean_per_class_error",  # better suited to class imbalance; "AUC" / "logloss" also possible
    balance_classes=True
)
t0 = time.time()
aml.train(x=x, y=y, training_frame=train_h2o)
fit_time = time.time() - t0

# Leaderboard. A bare expression in the middle of a cell is not rendered, so
# the top rows are displayed explicitly.
lb = aml.leaderboard
display(lb.head(rows=10))

# Evaluate on the test set
leader = aml.leader
pred = leader.predict(test_h2o).as_data_frame()["predict"].values.astype(int)   # predicted encoded labels
acc = accuracy_score(test_pd["label"], pred)
f1m = f1_score(test_pd["label"], pred, average="macro")
print(f"H2O AutoML Test Accuracy: {acc:.4f}")
print(f"H2O AutoML Test Macro-F1: {f1m:.4f}\n")
print("Classification Report:\n", classification_report(test_pd["label"], pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(test_pd["label"], pred))

# Save the leader model and its metrics
model_path = h2o.save_model(model=leader, path="models_automl_h2o", force=True)
with open("models_automl_h2o/h2o_metrics.json", "w") as f:
    json.dump({
        "test_accuracy": float(acc),
        "test_macro_f1": float(f1m),
        "fit_time_sec": float(fit_time),
    }, f, indent=2)

print("\nArtifacts saved in: models_automl_h2o/")
print("Saved leader model:", model_path)


Collecting h2o
  Downloading h2o-3.46.0.7-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading h2o-3.46.0.7-py2.py3-none-any.whl (265.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.9/265.9 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: h2o
Successfully installed h2o-3.46.0.7
Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.28" 2025-07-15; OpenJDK Runtime Environment (build 11.0.28+6-post-Ubuntu-1ubuntu122.04.1); OpenJDK 64-Bit Server VM (build 11.0.28+6-post-Ubuntu-1ubuntu122.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.12/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp94p1nuam
  JVM stdout: /tmp/tmp94p1nuam/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp94p1nuam/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connec

0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,4 months and 29 days
H2O_cluster_name:,H2O_from_python_unknownUser_w29uuz
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,8 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
H2O AutoML Test Accuracy: 0.9820
H2O AutoML Test Macro-F1: 0.9807

Classification Report:
               precision    recall  f1-score   support

           0     0.9902    0.9902    0.9902       102
           1     0.9506    1.0000    0.9747        77
           2     0.9875    0.9405    0.9634        84
           3     1.0000    1.0000    1.0000       102
           4     0.9750    0.9750    0.9750        80

    accuracy                         0.9820       445
   macro avg     0.9807    0.9811    0.9807       445
weighted avg     0.9824    0.9820    0.9820       445

Confusion Matrix:
 [[101   0   1   0   0




In [None]:
!pip install shap==0.46.0 lime==0.2.0.1
import numpy as np, pandas as pd, joblib, shap, matplotlib.pyplot as plt
from sklearn.metrics import classification_report


Collecting shap==0.46.0
  Downloading shap-0.46.0-cp312-cp312-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting lime==0.2.0.1
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading shap-0.46.0-cp312-cp312-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (543 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m543.9/543.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=967b0d42cd919c4b9929382ce1dda7f306c1f9acb69a73a881842804d1f06896
  Stored in directory: /root/.cache/pip/wheels/e7/5d/0e/4b4fff9a47468f

In [None]:
# Pick one of the saved best models to explain (Logistic Regression is the
# suggested starting point because it is the easiest to interpret).
# A) Models saved by the manual tuning cell (example: LogisticRegression)
logreg_path = "models_manual/LogisticRegression_best.joblib"  # change to your actual file name if it differs
svm_path    = "models_manual/LinearSVC_best.joblib"           # linear SVM (no probability output)
rf_path     = "models_manual/RandomForest_best.joblib"

# joblib files are pickles: only load artifacts produced by this notebook.
try:
    clf = joblib.load(logreg_path)
    model_name = "LogisticRegression"
except FileNotFoundError:
    # Fallback: no logistic regression artifact, so try the SVM. Only a missing
    # file triggers the fallback; any other failure is surfaced rather than hidden.
    # (Note: an SVM needs calibration before its scores are true probabilities
    # for LIME/SHAP.)
    clf = joblib.load(svm_path)
    model_name = "LinearSVC"

print("Loaded model:", model_name)


Loaded model: LogisticRegression


In [None]:
# TF-IDF feature names (words or n-grams), as an array so it can be fancy-indexed.
feature_names = np.array(vectorizer.get_feature_names_out())

def top_global_coefficients_linear(clf, k=25, positive=True, names=None):
    """Return the top-k most positive (or most negative) weights of a linear model.

    Handles both a single weight row (``coef_.shape[0] == 1``) and one row per
    class.

    Args:
        clf: Fitted model exposing ``coef_`` (e.g. LogReg / LinearSVC).
        k: Number of features to keep per weight row. ``0`` yields no rows;
            values larger than the number of features keep all of them.
        positive: ``True`` for the largest weights (sorted descending),
            ``False`` for the smallest (sorted ascending).
        names: Optional feature names aligned with the columns of ``coef_``.
            Defaults to the notebook-level ``feature_names``.

    Returns:
        DataFrame with columns ``feature`` and ``weight``; when the model has
        several weight rows a leading ``class`` column holds the row index and
        the per-class frames are concatenated with a fresh index.

    Raises:
        ValueError: If ``clf`` has no ``coef_`` attribute.
    """
    coefs = getattr(clf, "coef_", None)
    if coefs is None:
        raise ValueError("Model has no `coef_` (not linear?)")
    if names is None:
        names = feature_names
    names = np.asarray(names)
    k = max(int(k), 0)

    def _top_k(w):
        """Top-k frame for a single weight vector."""
        order = np.argsort(w)
        # Explicit start index: `order[-k:]` would return everything for k == 0.
        idx = order[max(len(order) - k, 0):] if positive else order[:k]
        return pd.DataFrame({
            "feature": names[idx],
            "weight": w[idx]
        }).sort_values("weight", ascending=not positive)

    if coefs.shape[0] == 1:
        return _top_k(coefs[0])

    frames = []
    for c, w in enumerate(coefs):
        frame = _top_k(w)
        frame.insert(0, "class", c)
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

# Example: show the global top features (works for both SVM and LogReg).
# Any failure is reported as a message instead of stopping the notebook.
try:
    display(top_global_coefficients_linear(clf, k=20, positive=True).head(20))
    display(top_global_coefficients_linear(clf, k=20, positive=False).head(20))
except Exception as e:
    print("Global coefficient inspection skipped:", e)


Unnamed: 0,class,feature,weight
0,0,bank,5.313039
1,0,shares,4.56278
2,0,firm,4.44755
3,0,economic,4.39853
4,0,num bn,3.917511
5,0,bn,3.917511
6,0,sales,3.758707
7,0,company,3.683781
8,0,market,3.599082
9,0,investment,3.594734


Unnamed: 0,class,feature,weight
0,0,film,-3.055366
1,0,music,-3.011099
2,0,britain,-2.759545
3,0,people,-2.586987
4,0,straw,-2.562815
5,0,game,-2.374936
6,0,digital,-2.273982
7,0,party,-2.151739
8,0,blair,-2.112385
9,0,brown,-2.100851


In [None]:
!pip install shap==0.46.0
import shap, numpy as np, matplotlib.pyplot as plt

# 1) Base function: raw texts -> class probabilities
def predict_proba_on_text(texts):
    """Vectorize raw texts and return class probabilities from the loaded model.

    Uses the notebook globals ``vectorizer`` and ``clf``. When the model has no
    ``predict_proba``, its ``decision_function`` scores are squashed into
    pseudo-probabilities: a sigmoid for 1-D scores, a row-wise softmax otherwise.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        2-D array with one row per text and one column per class.

    Raises:
        ValueError: If the model offers neither ``predict_proba`` nor
            ``decision_function``.
    """
    features = vectorizer.transform(texts)
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(features)
    if not hasattr(clf, "decision_function"):
        raise ValueError("Model needs predict_proba or decision_function.")

    scores = clf.decision_function(features)
    if scores.ndim == 1:
        # Binary case: sigmoid of the signed score.
        positive = 1 / (1 + np.exp(-scores))
        return np.column_stack([1 - positive, positive])
    # Multi-class case: softmax approximation; subtracting the row maximum
    # keeps the exponentials from overflowing.
    shifted = np.exp(scores - scores.max(axis=1, keepdims=True))
    return shifted / shifted.sum(axis=1, keepdims=True)

# 2) Prepare the raw test texts
assert 'test_df' in globals() and 'text' in test_df.columns
text_test = test_df['text'].tolist()
assert len(text_test) > 0

# 3) Choose a suitable link (note: it must be a callable)
# Class count comes from the label encoder when available, otherwise from a probe prediction.
n_classes = len(le.classes_) if 'le' in globals() else (
    predict_proba_on_text(text_test[:2]).shape[1]
)
link_fn = shap.links.logit if n_classes == 2 else shap.links.identity

# 4) Text masker + Explainer (the permutation algorithm is the more stable choice)
masker = shap.maskers.Text()
class_names = list(le.classes_) if 'le' in globals() else None

explainer = shap.Explainer(
    predict_proba_on_text,
    masker=masker,
    output_names=class_names,
    algorithm="permutation",
    link=link_fn,   # key change: pass a callable, not a string
)

# 5) Explain a small number of samples
N_SAMPLES = 8
expl_texts = text_test[:N_SAMPLES]
shap_values = explainer(expl_texts, max_evals=2000)

# 6) Visualize sample i
i = 0
print("Sample:", expl_texts[i])
shap.plots.text(shap_values[i])
plt.show()




PermutationExplainer explainer: 9it [00:15,  3.11s/it]


Sample: profits jump at china s top bank industrial and commercial bank icbc china s biggest lender has seen an NUM jump in profits during NUM the increase in earnings has allowed the firm to write off bad loans and pave the way for a state bailout and eventual stock market listing china is trying to clean up its banking system which is weighed down by billions of dollars of unpaid loans it has already pumped NUM bn NUM bn into two of its largest banks and has identified icbc as a recipient of aid icbc s profits were NUM NUM bn yuan NUM bn NUM NUM bn in NUM the bank said in a statement the percentage of non performing loans dropped to NUM NUM down about NUM percentage points icbc was founded in NUM and had total assets of NUM NUM trillion yuan at the end of NUM china committed to gradually opening up its banking sector when it joined the world trade organisation in NUM


In [None]:
# For a linear model, SHAP's linear explanation is approximately
# coefficient * feature value, so this "global top words" SHAP-style bar chart
# is really a visualization of the coefficient weights.
# One chart is drawn per weight row, so multi-class models get one chart per class.
try:
    coef = clf.coef_
    if coef.shape[0] == 1:
        panels = [(f"Global weights (approx. SHAP) – {model_name}", coef[0])]
    else:
        # Prefer the original category names when the label encoder is available.
        row_labels = list(le.classes_) if 'le' in globals() else list(range(coef.shape[0]))
        panels = [
            (f"Global weights (approx. SHAP) – {model_name} – class {label}", row)
            for label, row in zip(row_labels, coef)
        ]
    for title, w in panels:
        # 25 features with the largest absolute weight.
        idx = np.argsort(np.abs(w))[-25:]
        plt.figure(figsize=(7,6))
        plt.barh(feature_names[idx], w[idx])
        plt.title(title)
        plt.tight_layout()
        plt.show()
except Exception as e:
    print("Linear global plot skipped:", e)


In [None]:
from lime.lime_text import LimeTextExplainer

# Class names: recovered from the label encoder when the preprocessing cell
# defined one; otherwise a generic two-class fallback is used.
try:
    class_names = list(le.classes_)   # available when preprocessing created a LabelEncoder
except NameError:
    # Fallback: only the "le is not defined" case is handled here.
    class_names = ["class_0","class_1"]

explainer = LimeTextExplainer(class_names=class_names)

i = 0  # explain the i-th test sample
sample_text = text_test[i]
print("Sample:", sample_text)

# LIME needs a function mapping raw text -> probabilities.
# top_labels=1 explains the model's predicted class; LIME's default
# labels=(1,) would always explain class index 1 regardless of the prediction.
exp = explainer.explain_instance(
    sample_text,
    predict_proba_on_text,
    num_features=10,
    top_labels=1
)
exp.show_in_notebook(text=sample_text)
# Save as HTML:
# exp.save_to_file("lime_explanation_sample.html")


Sample: profits jump at china s top bank industrial and commercial bank icbc china s biggest lender has seen an NUM jump in profits during NUM the increase in earnings has allowed the firm to write off bad loans and pave the way for a state bailout and eventual stock market listing china is trying to clean up its banking system which is weighed down by billions of dollars of unpaid loans it has already pumped NUM bn NUM bn into two of its largest banks and has identified icbc as a recipient of aid icbc s profits were NUM NUM bn yuan NUM bn NUM NUM bn in NUM the bank said in a statement the percentage of non performing loans dropped to NUM NUM down about NUM percentage points icbc was founded in NUM and had total assets of NUM NUM trillion yuan at the end of NUM china committed to gradually opening up its banking sector when it joined the world trade organisation in NUM


In [None]:
import numpy as np, pandas as pd

# 1) Get probabilities and predictions in a uniform way
if hasattr(clf, "predict_proba"):
    proba = clf.predict_proba(X_test)
elif hasattr(clf, "decision_function"):
    df_score = clf.decision_function(X_test)
    if df_score.ndim == 1:  # binary
        p1 = 1 / (1 + np.exp(-df_score))
        proba = np.vstack([1 - p1, p1]).T
    else:  # multi-class softmax approximation
        ex = np.exp(df_score - df_score.max(axis=1, keepdims=True))
        proba = ex / ex.sum(axis=1, keepdims=True)
else:
    raise ValueError("Model needs predict_proba or decision_function.")

# Predicted class index and the probability assigned to it.
y_pred = proba.argmax(axis=1)
conf   = proba.max(axis=1)

# 2) Define the margin: |p-0.5| for binary, gap between the top two probabilities for multi-class
if proba.shape[1] == 2:
    margin = np.abs(proba[:,1] - 0.5)
else:
    top2 = np.sort(proba, axis=1)[:,-2:]
    margin = top2[:,1] - top2[:,0]

# 3) Collect everything into a DataFrame
vis_df = pd.DataFrame({
    "text": test_df["text"].values,
    "y_true": y_test,
    "y_pred": y_pred,
    "conf": conf,
    "margin": margin
})
# Flag correct vs. wrong predictions
vis_df["correct"] = (vis_df["y_true"] == vis_df["y_pred"])

# 4) Sample selection strategy
# A. High-confidence correct (shows what the model has learned)
high_conf_correct = vis_df[vis_df["correct"]].sort_values("conf", ascending=False).head(1)

# B. High-confidence wrong (shows where the model fails)
high_conf_wrong = vis_df[~vis_df["correct"]].sort_values("conf", ascending=False).head(1)

# C. Boundary sample (where the model is least decided)
boundary_cases = vis_df.sort_values("margin", ascending=True).head(1)


# high_conf_correct = vis_df[vis_df["correct"]].nlargest(2, "conf")
# boundary_cases    = vis_df.nsmallest(2, "margin")

# Merge the picks, dropping rows selected by more than one strategy.
selected = pd.concat([high_conf_correct, high_conf_wrong, boundary_cases]).drop_duplicates()
selected.reset_index(drop=True, inplace=True)
selected


Unnamed: 0,text,y_true,y_pred,conf,margin,correct
0,fox attacks blair s tory lies tony blair lied ...,2,2,0.992729,0.989752,True
1,saudi ministry to employ women women will be e...,0,2,0.733045,0.592967,False
2,pc ownership to double by NUM the number of pe...,4,0,0.491211,0.003794,False


In [None]:
!pip install shap==0.46.0 lime==0.2.0.1
import shap, matplotlib.pyplot as plt
from lime.lime_text import LimeTextExplainer

# Prepare the list of selected texts
sel_texts = selected["text"].tolist()

# -- SHAP: text explanation (Text masker + Explainer is the suggested setup) --
masker = shap.maskers.Text()
def predict_proba_on_text(texts):
    """Vectorize raw texts and return class probabilities from the loaded model.

    Uses the notebook globals ``vectorizer`` and ``clf``. Probabilities are
    always computed for the given ``texts``: SHAP and LIME call this function
    with perturbed inputs, so returning a precomputed test-set array would give
    wrong values and a wrong row count. Models without ``predict_proba`` fall
    back to squashed ``decision_function`` scores (sigmoid for 1-D scores,
    row-wise softmax otherwise).

    Args:
        texts: Iterable of raw text strings.

    Returns:
        2-D array with one row per text and one column per class.

    Raises:
        ValueError: If the model offers neither ``predict_proba`` nor
            ``decision_function``.
    """
    X = vectorizer.transform(texts)
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(X)
    if hasattr(clf, "decision_function"):
        scores = clf.decision_function(X)
        if scores.ndim == 1:  # binary
            p1 = 1 / (1 + np.exp(-scores))
            return np.vstack([1 - p1, p1]).T
        # multi-class softmax approximation (row max subtracted to avoid overflow)
        ex = np.exp(scores - scores.max(axis=1, keepdims=True))
        return ex / ex.sum(axis=1, keepdims=True)
    raise ValueError("Model needs predict_proba or decision_function.")

# Logit link for binary problems, identity for multi-class
n_classes = proba.shape[1]
link_fn = shap.links.logit if n_classes == 2 else shap.links.identity
explainer = shap.Explainer(predict_proba_on_text, masker=masker, link=link_fn)

shap_values = explainer(sel_texts, max_evals=2000)

# Show each SHAP text highlight inline and save it (one file per sample).
# The text plot is HTML, not a matplotlib figure, so plt.savefig would only
# write an empty image; the HTML markup is written to disk instead.
for i, txt in enumerate(sel_texts):
    shap.plots.text(shap_values[i], display=True)
    html = shap.plots.text(shap_values[i], display=False)
    with open(f"explain_shap_{i}.html", "w", encoding="utf-8") as fh:
        fh.write(html)

# -- LIME: local explanations (optional) --
try:
    class_names = list(le.classes_)
except NameError:
    # Only the "label encoder is not defined" case falls back to generic names.
    class_names = [f"class_{i}" for i in range(n_classes)]

lime_explainer = LimeTextExplainer(class_names=class_names)
for i, txt in enumerate(sel_texts):
    # top_labels=1 explains the predicted class of each sample; LIME's default
    # labels=(1,) would always explain class index 1.
    exp = lime_explainer.explain_instance(txt, predict_proba_on_text, num_features=10, top_labels=1)
    exp.save_to_file(f"explain_lime_{i}.html")  # .as_list(label=...) can be used to print the entries instead


