In [1]:
# If you haven't installed these already, uncomment and run the installs.
# !pip install rank_bm25 sentence-transformers scikit-learn pandas

import json
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_fscore_support,
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Your existing project imports; adjust if module paths differ
from retrieval import build_retrievers
from utils import load_topics, load_cleaned_documents
from text_normalizer import normalize_medical_text


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# === Configuration (replace paths as needed) ===
DATA_ROOT = Path("data")  # base folder
TOP_K = 5
NORMALIZE = True  # set to False to skip medical text normalization
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # sentence-transformers model
CLASSIFIER_TYPE = "logreg"  # "logreg" or "rf"
RANDOM_STATE = 42

In [27]:
def get_topic_name_map(topic2id: dict) -> dict:
    return {v: k for k, v in topic2id.items()}

def build_statement_level_examples(
    retriever,
    statements_dir: Path,
    answers_dir: Path,
    topic_name_map: dict,
    normalize: bool,
    top_k: int,
) -> List[Dict[str, Any]]:
    examples = []
    for stmt_path in sorted(statements_dir.glob("statement_*.txt")):
        base = stmt_path.stem
        answer_path = answers_dir / f"{base}.json"
        if not answer_path.exists():
            continue
        statement_text = stmt_path.read_text(encoding="utf-8")
        with answer_path.open("r", encoding="utf-8") as f:
            answer = json.load(f)
        true_topic = answer.get("statement_topic")
        if true_topic is None:
            continue
        if normalize:
            statement_text = normalize_medical_text(statement_text)

        retrieved = retriever.get_relevant_documents(statement_text)[:top_k]
        retrieved_snippets = []
        snippet_labels = []
        for i, doc in enumerate(retrieved):
            topic_id = doc.metadata.get("topic_id", -1)
            snippet = {
                "rank": i + 1,
                "topic_id": topic_id,
                "topic_name": topic_name_map.get(topic_id, ""),
                "source": doc.metadata.get("source", ""),
                "section_header": doc.metadata.get("section_header", ""),
                "subsection_index": doc.metadata.get("subsection_index", None),
                "total_subsections": doc.metadata.get("total_subsections", None),
                "chunk_text": doc.page_content,
            }
            retrieved_snippets.append(snippet)
            snippet_labels.append(topic_id == true_topic)

        example = {
            "statement_id": base,
            "statement_text": statement_text,
            "is_true": bool(answer.get("statement_is_true")),
            "true_topic_id": true_topic,
            "true_topic_name": topic_name_map.get(true_topic, ""),
            "retrieved_snippets": retrieved_snippets,
            "snippet_labels": snippet_labels,
        }
        examples.append(example)
    return examples

def build_flattened_snippet_examples(
    statement_level_examples: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    flat = []
    for ex in statement_level_examples:
        for i, snip in enumerate(ex["retrieved_snippets"]):
            flat_example = {
                "statement_id": ex["statement_id"],
                "statement_text": ex["statement_text"],
                "is_true": ex["is_true"],
                "true_topic_id": ex["true_topic_id"],
                "true_topic_name": ex["true_topic_name"],
                "snippet_text": snip["chunk_text"],
                "snippet_topic_id": snip["topic_id"],
                "snippet_topic_name": snip["topic_name"],
                "snippet_relevant": ex["snippet_labels"][i],
                "is_top1": snip["rank"] == 1,
                "rank": snip["rank"],
            }
            flat.append(flat_example)
    return flat


def make_dataset(retriever, dir, topic_name_map, normalize, top_k):
    statements_dir = dir / "statements"
    answers_dir = dir / "answers"
    statement_level = build_statement_level_examples(
        retriever, statements_dir, answers_dir, topic_name_map, normalize, top_k
    )
    flat = build_flattened_snippet_examples(statement_level)
    df = pd.DataFrame(flat)
    return df

# Load topics and documents
topics_json = DATA_ROOT / "topics.json"
cleaned_root = DATA_ROOT / "cleaned_topics"
train_dir = DATA_ROOT / "synthetic"
val_dir = DATA_ROOT / "train"

topic2id, _ = load_topics(topics_json)
topic_name_map = get_topic_name_map(topic2id)

documents = load_cleaned_documents(cleaned_root, topic2id, normalize=NORMALIZE)
retriever = build_retrievers(documents)

df_train = make_dataset(retriever, train_dir, topic_name_map, normalize=NORMALIZE, top_k=TOP_K)
print(f"Created {len(df_train)} training examples")
df_val = make_dataset(retriever, val_dir, topic_name_map, normalize=NORMALIZE, top_k=TOP_K)
print(f"Created {len(df_val)} validation examples")

Split 208 documents into 11383 section-based chunks
Created 7120 training examples
Created 1000 validation examples


In [15]:
def tokenize(text: str) -> List[str]:
    return text.lower().split()

def compute_features(df: pd.DataFrame, embedding_model: SentenceTransformer) -> pd.DataFrame:
    # Prepare TF-IDF globally
    all_texts = pd.concat([df["statement_text"], df["snippet_text"]]).unique().tolist()
    tfidf_vec = TfidfVectorizer().fit(all_texts)

    # Embedding cache
    unique_texts = pd.Series(df["statement_text"].tolist() + df["snippet_text"].tolist()).unique()
    embed_cache = {}
    for text in unique_texts:
        embed_cache[text] = embedding_model.encode(text, convert_to_numpy=True, normalize_embeddings=True)

    # BM25 per statement
    bm25_scores = []
    for stmt_id, grp in df.groupby("statement_id"):
        snippet_texts = grp["snippet_text"].tolist()
        tokenized_snips = [tokenize(t) for t in snippet_texts]
        bm25 = BM25Okapi(tokenized_snips)
        stmt = grp["statement_text"].iloc[0]
        tokenized_query = tokenize(stmt)
        scores = sorted(bm25.get_scores(tokenized_query), reverse=True)
        bm25_scores.extend(scores)
    df["bm25_score"] = bm25_scores

    # TF-IDF cosine
    stmt_tfidf = tfidf_vec.transform(df["statement_text"])
    snip_tfidf = tfidf_vec.transform(df["snippet_text"])
    tfidf_cosines = cosine_similarity(stmt_tfidf, snip_tfidf)
    df["tfidf_cosine"] = tfidf_cosines.diagonal()

    # Embedding cosine (normalized vectors -> dot product)
    emb_cos = []
    for stmt, snip in zip(df["statement_text"], df["snippet_text"]):
        v1 = embed_cache[stmt]
        v2 = embed_cache[snip]
        emb_cos.append(float(np.dot(v1, v2)))
    df["embed_cosine"] = emb_cos

    # Token overlap and Jaccard
    overlaps = []
    jaccards = []
    for stmt, snip in zip(df["statement_text"], df["snippet_text"]):
        set_stmt = set(tokenize(stmt))
        set_snip = set(tokenize(snip))
        inter = set_stmt & set_snip
        union = set_stmt | set_snip
        overlaps.append(len(inter))
        jaccards.append(len(inter) / len(union) if union else 0.0)
    df["token_overlap"] = overlaps
    df["jaccard"] = jaccards

    # Snippet length
    df["snippet_len"] = df["snippet_text"].str.split().apply(len)

    return df

In [None]:
# Load embedding model (this downloads if not cached)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Compute features in-place
df_train_features = compute_features(df_train.copy(), embedding_model)
df_val_features = compute_features(df_val.copy(), embedding_model)

# Prepare feature matrix and labels
feature_cols = [
    "bm25_score",
    "tfidf_cosine",
    "embed_cosine",
    "token_overlap",
    "jaccard",
    "snippet_len",
]
X_train = df_train_features[feature_cols].fillna(0).to_numpy()
y_train = df_train_features["snippet_relevant"].astype(int).to_numpy()
groups_train = df_train_features["statement_id"].to_numpy()

X_val = df_val_features[feature_cols].fillna(0).to_numpy()
y_val = df_val_features["snippet_relevant"].astype(int).to_numpy()
groups_val = df_val_features["statement_id"].to_numpy()

print("Feature matrix shape:", X_train.shape)
print("Positive snippets:", y_train.sum(), " / total:", len(y_train))

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Feature matrix shape: (7120, 8)
Positive snippets: 4563  / total: 7120


In [59]:
def aggregate_topk_snippets(
    df: pd.DataFrame,
    feature_cols: list[str],
    k: int = 3,
    rank_col: str = "rank",
    statement_id_col: str = "statement_id",
    fill_value=0,
) -> pd.DataFrame:
    """
    For each statement_id, take the top-k snippets (ordered by rank_col ascending)
    and flatten their features into one row: feat_1, feat_2, ..., feat_k.
    If a statement has fewer than k snippets, pads with fill_value.
    """
    # Ensure deterministic ordering per statement
    df_sorted = df.sort_values([statement_id_col, rank_col])

    def flatten(group: pd.DataFrame) -> pd.Series:
        topk = group.head(k)
        out = {statement_id_col: group.name}
        for i in range(k):
            if i < len(topk):
                row = topk.iloc[i]
                for feat in feature_cols:
                    out[f"{feat}_{i+1}"] = row.get(feat, fill_value)
            else:
                for feat in feature_cols:
                    out[f"{feat}_{i+1}"] = fill_value
        # Carry over label (assumes is_true is same for all snippets of a statement)
        out["is_true"] = group["is_true"].iloc[0]
        return pd.Series(out)

    agg_df = (
        df_sorted.groupby(statement_id_col, group_keys=False)
        .apply(flatten)
        .reset_index(drop=True)
    )
    return agg_df


k = 3
agg = aggregate_topk_snippets(df_train_features, feature_cols, k=k)

# Prepare X / y:
# Drop statement_id & label columns to form X.
feature_columns_flat = [col for col in agg.columns if col not in ("statement_id", "is_true")]
X_train = agg[feature_columns_flat].fillna(0).to_numpy()
y_train = agg["is_true"].astype(int).to_numpy()


agg = aggregate_topk_snippets(df_val_features, feature_cols, k=k)

# Prepare X / y:
# Drop statement_id & label columns to form X.
feature_columns_flat = [col for col in agg.columns if col not in ("statement_id", "is_true")]
X_val = agg[feature_columns_flat].fillna(0).to_numpy()
y_val = agg["is_true"].astype(int).to_numpy()

  .apply(flatten)
  .apply(flatten)


In [60]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Normalize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_pred)

print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.6150


In [61]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Define models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
}

# Train and evaluate
results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    acc = accuracy_score(y_val, y_pred)
    results.append((name, acc))

# Display results
results_df = pd.DataFrame(results, columns=["Model", "Validation Accuracy"])
results_df = results_df.sort_values("Validation Accuracy", ascending=False).reset_index(drop=True)
results_df

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 715, number of negative: 709
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3289
[LightGBM] [Info] Number of data points in the train set: 1424, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502107 -> initscore=0.008427
[LightGBM] [Info] Start training from score 0.008427




Unnamed: 0,Model,Validation Accuracy
0,LightGBM,0.65
1,Gradient Boosting,0.645
2,Random Forest,0.635
3,XGBoost,0.625


In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

# Define models and param grids
models_and_grids = {
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {
            "n_estimators": [100, 300],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5],
        },
    ),
    "Gradient Boosting": (
        GradientBoostingClassifier(random_state=42),
        {
            "n_estimators": [100, 300],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5],
        },
    ),
    "XGBoost": (
        xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42),
        {
            "n_estimators": [100, 300],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 6],
            "subsample": [0.8, 1.0],
        },
    ),
    "LightGBM": (
        lgb.LGBMClassifier(random_state=42),
        {
            "n_estimators": [100, 300],
            "learning_rate": [0.05, 0.1],
            "max_depth": [-1, 10],
            "num_leaves": [31, 64],
        },
    ),
}

# Perform grid search
from sklearn.metrics import accuracy_score

best_models = []
for name, (model, param_grid) in models_and_grids.items():
    print(f"Tuning {name}...")
    grid = GridSearchCV(
        model,
        param_grid,
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    grid.fit(X_train_scaled, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_val_scaled)
    acc = accuracy_score(y_val, y_pred)
    best_models.append((name, acc, grid.best_params_))

# Show results
import pandas as pd
results_df = pd.DataFrame(best_models, columns=["Model", "Validation Accuracy", "Best Params"])
results_df = results_df.sort_values("Validation Accuracy", ascending=False).reset_index(drop=True)
results_df

Tuning Random Forest...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Tuning Gradient Boosting...
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Tuning XGBoost...
Fitting 3 folds for each of 16 candidates, totalling 48 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Tuning LightGBM...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322
[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000464 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info



[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322




[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000863 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538
[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000511 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421
[LightGBM] [Info] Number of 




[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002485 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421
[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322




[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538
[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000755 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421








[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322

[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538





[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421





[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001474 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322
[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017065 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538








[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001410 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421

[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538
[LightGBM] [Info] Number of



[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000670 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322
[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000823 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538





[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421





[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322




[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000447 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538








[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006777 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421

[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info




[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421
[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000374 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322





[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000590 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538




[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421




[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322




[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538
[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421





[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322






[LightGBM] [Info] Number of positive: 477, number of negative: 472
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002599 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5446
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502634 -> initscore=0.010538
[LightGBM] [Info] Start training from score 0.010538










[LightGBM] [Info] Number of positive: 477, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000869 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5456
[LightGBM] [Info] Number of data points in the train set: 950, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502105 -> initscore=0.008421
[LightGBM] [Info] Start training from score 0.008421
[LightGBM] [Info] Number of positive: 476, number of negative: 473
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5447
[LightGBM] [Info] Number of data points in the train set: 949, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501581 -> initscore=0.006322
[LightGBM] [Info] Start training from score 0.006322
[LightGBM] [Info] Number of 























[LightGBM] [Info] Number of positive: 715, number of negative: 709
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5479
[LightGBM] [Info] Number of data points in the train set: 1424, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502107 -> initscore=0.008427
[LightGBM] [Info] Start training from score 0.008427




Unnamed: 0,Model,Validation Accuracy,Best Params
0,Gradient Boosting,0.645,"{'learning_rate': 0.05, 'max_depth': 3, 'n_est..."
1,LightGBM,0.645,"{'learning_rate': 0.1, 'max_depth': -1, 'n_est..."
2,XGBoost,0.64,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti..."
3,Random Forest,0.63,"{'max_depth': None, 'min_samples_split': 2, 'n..."


In [46]:
import numpy as np
import pandas as pd

# Get feature names (assumes you have this from previous step)
# feature_columns_flat = [col for col in agg.columns if col not in ("statement_id", "is_true")]

# Coefficients from logistic regression (1D array for binary classification)
coefs = clf.coef_.flatten()

# Create DataFrame for feature importance
importance_df = pd.DataFrame({
    "feature": feature_columns_flat,
    "coefficient": coefs,
    "abs_importance": np.abs(coefs)
})

# Sort by absolute importance
importance_df = importance_df.sort_values("abs_importance", ascending=False).reset_index(drop=True)

print(importance_df)


            feature  coefficient  abs_importance
0   token_overlap_2     2.414464        2.414464
1         jaccard_2    -2.002595        2.002595
2   token_overlap_3     1.847365        1.847365
3         jaccard_3    -1.450920        1.450920
4   token_overlap_1     1.219377        1.219377
5     snippet_len_2    -0.702668        0.702668
6         jaccard_1    -0.566192        0.566192
7     snippet_len_3    -0.513112        0.513112
8     snippet_len_1    -0.464565        0.464565
9      bm25_score_1     0.420321        0.420321
10   tfidf_cosine_3     0.328073        0.328073
11   tfidf_cosine_1    -0.228928        0.228928
12     bm25_score_3    -0.228226        0.228226
13   embed_cosine_3     0.193105        0.193105
14   embed_cosine_2     0.151785        0.151785
15     bm25_score_2     0.105541        0.105541
16   tfidf_cosine_2     0.094084        0.094084
17   embed_cosine_1     0.023971        0.023971


In [36]:
import joblib

# Save trained model
MODEL_PATH = Path("trained_snippet_classifier.pkl")
joblib.dump(model, MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")


Saved model to trained_snippet_classifier.pkl
