In [5]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.pipeline import Pipeline

In [3]:
def basic_clean(s: str) -> str:
    """Normalize a raw text string for vectorization.

    The text is lowercased, then a fixed sequence of regex substitutions is
    applied, and finally leading/trailing whitespace is stripped.

    Args:
        s: Text to clean.

    Returns:
        The cleaned text, with single spaces between tokens.
    """
    # Rules are applied top to bottom. Order matters: tags are removed before
    # the "<num>" placeholder is inserted (so the placeholder is not stripped
    # as a tag), and long digit runs are removed before the generic digit rule.
    rewrite_rules = (
        (r"<[^>]+>", " "),                # anything shaped like <...>
        (r"https?://\S+|www\.\S+", " "),  # http(s):// and www. links
        (r"\+?\d[\d\-\s]{6,}\d", " "),    # 8+ chars of digits/dashes/spaces, e.g. phone numbers
        (r"\d+", " <num> "),              # every remaining digit run becomes a placeholder
        (r"\s+", " "),                    # collapse whitespace runs to one space
    )

    text = s.lower()
    for pattern, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [4]:
def load_data(csv_path: str):
    """Read a labelled CSV and return its texts, labels and the filtered frame.

    Rows with a missing value in either the "Body" or the "Label" column are
    discarded before anything is extracted.

    Args:
        csv_path: Path of the CSV file; it must contain "Body" and "Label" columns.

    Returns:
        A tuple ``(X, y, df)`` where ``X`` is the "Body" column as a Python
        list, ``y`` is the "Label" column as a NumPy array, and ``df`` is the
        DataFrame left after dropping incomplete rows.
    """
    required_columns = ["Body", "Label"]
    frame = pd.read_csv(csv_path).dropna(subset=required_columns)

    texts = frame["Body"].tolist()
    labels = frame["Label"].to_numpy()
    return texts, labels, frame

In [None]:
def build_pipeline(mode = "word"):
    """Build a TF-IDF + logistic-regression pipeline.

    Args:
        mode: Which TF-IDF analyzer configuration to use.
            "word" - default word analyzer with 1-2 grams and no lowercasing.
            "char" - "char_wb" analyzer with 3-5 character n-grams.

    Returns:
        An unfitted ``Pipeline`` with two steps, "tfidf" and "clf".

    Raises:
        ValueError: If ``mode`` is neither "word" nor "char".
    """
    if mode == "word":
        tfidf = TfidfVectorizer(
            # NOTE(review): presumably the text is lowercased upstream
            # (e.g. by basic_clean) - confirm against the caller.
            lowercase=False,
            ngram_range=(1, 2),
            max_df=0.95,
            min_df=2,
            # The keyword is `sublinear_tf`; `sublinear` is rejected with a TypeError.
            sublinear_tf=True,
        )
    elif mode == "char":
        tfidf = TfidfVectorizer(
            analyzer="char_wb",
            ngram_range=(3, 5),
            max_df=0.95,
            min_df=2,
            sublinear_tf=True,
        )
    else:
        raise ValueError("must be word or char")

    # The classifier is the same for both modes, so it is built once here.
    clf = LogisticRegression(
        max_iter=2000,
        C=2.0,
        class_weight="balanced",
        n_jobs=-1,
    )
    # Both steps must be in ONE list: Pipeline takes a single `steps` sequence.
    return Pipeline([("tfidf", tfidf), ("clf", clf)])


In [None]:
def evaluate(y_true, proba, thr=0.5, title=""):
    """Print classification metrics at a threshold and plot the precision-recall curve.

    Args:
        y_true: Ground-truth labels. Presumably binary 0/1, since predictions
            are produced as integer 0/1 - confirm against the dataset.
        proba: Positive-class scores, one per sample. Any array-like is
            accepted; it is converted to a NumPy array before thresholding.
        thr: Scores greater than or equal to this value are predicted as 1.
        title: Text appended to the plot title.

    Returns:
        None. The report, confusion matrix and PR-AUC are printed and the
        figure is shown.
    """
    # A plain Python list does not support elementwise `>=`, so coerce up front.
    proba = np.asarray(proba)
    y_pred = (proba >= thr).astype(int)

    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion matrix: \n", confusion_matrix(y_true, y_pred))
    pr_auc = average_precision_score(y_true, proba)
    print("PR-AUC: %.4f" % pr_auc)

    # The thresholds array is not needed for the plot.
    precision, recall, _ = precision_recall_curve(y_true, proba)

    fig, ax = plt.subplots()
    ax.plot(recall, precision, label=f"PR curve (AP={pr_auc:.3f})")
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title(f"Precision-Recall {title}")
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
def main():
    