## Install packages

In [None]:
%%time
# Install the third-party packages this notebook imports. The commands run in
# order; the scipy re-install near the end deliberately comes after the other
# packages so the pinned version is the one left installed.

# Sentence embedding models (imported below as sentence_transformers).
# hf_xet is presumably a Hugging Face download helper — TODO confirm it is needed.
!pip install sentence-transformers hf_xet
# jina-embeddings benefit from FlashAttention-2 (optional; both variants are
# left disabled):
#!pip install flash-attn --no-build-isolation
#!pip install -v -U flash-attn
# pyarrow is the engine used for the cached embedding .parquet files.
# NOTE(review): ninja looks like a build-time helper for flash-attn — confirm.
!pip install ninja pyarrow
# pyod provides HBOS (one-class model); catboost provides CatBoostClassifier.
!pip install pyod catboost
# ftfy and emoji are used by fix_punctuation; einops is not imported directly
# by this notebook (presumably required by a remote-code model — verify).
!pip install ftfy emoji einops
# Provides the %%capture_text cell magic used to write log files.
!pip install jupyter_capture_output
# Installed without its dependencies so it cannot change already-installed
# packages (reason for needing dask-expr is not visible here — TODO confirm).
!pip install --no-deps dask-expr
# Replace whatever scipy is installed with a pinned version.
# NOTE(review): the reason for pinning 1.11.4 is not documented — confirm
# which dependency requires it.
!pip uninstall -y scipy
!pip cache purge  # Clear cached versions
!pip install --upgrade --force-reinstall scipy==1.11.4
# betacal provides BetaCalibration; roc_utils provides compute_roc / plot_roc.
!pip install betacal roc_utils
print("\n")

## Load packages

In [None]:
%%time
import time, os, re, torch, ftfy, emoji, jupyter_capture_output
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd
import multiprocessing as mp
from tqdm import tqdm
from tqdm.dask import TqdmCallback
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from pyod.models.hbos import HBOS
from catboost import CatBoostClassifier
from betacal import BetaCalibration
from sklearn.metrics import roc_auc_score, classification_report, accuracy_score, cohen_kappa_score, precision_recall_curve, average_precision_score
from roc_utils import *
import seaborn as sns
#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning, module="flash_attn.ops.triton.layer_norm")

## Load dataset

In [None]:
# Research code for the HIPSTer project: Hybrid, Information, Psychological,
# Societal Threats handling system for public security domain practitioners,
# businesses, and education (HIPSTer).

# --- Data selection and folders -------------------------------------------
dataset = "LtHate"  # text dataset to process: LtHate, RuToxic or DynaHate
datasetFolder = './data/'  # input files and cached embedding .parquet files
resultsFolder = './results/'  # logs, tables and figures are written here

# --- Text vectorization -----------------------------------------------------
# Sentence vectorizers to evaluate; choose among: gte, snow, jina, e5.
vectors = ["gte", "snow", "jina", "e5"]
# Number of texts encoded per chunk and per model batch. Use large values if
# gte is not used or there are many texts, small values if GPU resources are
# limited.
chunk_size = 1024
batch_size_setting = 64
# Compute device: GPU when CUDA is available, otherwise CPU.
if torch.cuda.is_available():
    compDevice = 'cuda'
else:
    compDevice = 'cpu'

# --- Modelling switches -----------------------------------------------------
useDimensionalityReduction = True  # apply PCA before fitting the models
useCatBoostOutputCallibration = True  # beta-calibrate CatBoost probabilities

# --- Hyper-parameters -------------------------------------------------------
k = 10  # number of folds for StratifiedKFold, i.e. 10-fold CV
num_vars = 64  # number of variables kept after PCA
num_tree = 200  # maximum number of trees CatBoost may grow

In [None]:
# Output artefact paths, all derived from the results folder and dataset name.
cvLogFilename = resultsFolder + dataset + "-CV-log.txt"  # cross-validation log
resultsTableFile = resultsFolder + dataset + "-table.txt"  # per-model text report
resultsTableSummary = resultsFolder + dataset + "-table.csv"  # summary metrics
rocPlotFilename = resultsFolder + dataset + "-fig-ROC.png"
prcPlotFilename = resultsFolder + dataset + "-fig-PRC.png"

# Create the results folder if needed. exist_ok=True replaces the separate
# "exists?" check, so there is no window between checking and creating.
os.makedirs(resultsFolder, exist_ok=True)

In [None]:
def fix_punctuation(text, toneDown=True):
    """
    Repair encoding problems and normalise punctuation in a single text.

    Parameters:
    - text: Text to clean. Any value without a length (e.g. None or a float
      NaN) is treated as missing and yields an empty string.
    - toneDown (bool, default=True): When True, every '?' and '!' left after
      the other rules is replaced with '.'.

    Returns:
    - str: The cleaned, stripped text, or '' for missing input.
    """
    # Missing values (no length) are mapped to an empty string.
    if not hasattr(text, '__len__'):
        return ''

    # Substitutions are applied strictly in this order.
    substitutions = [
        # Drop http and https links
        (r'https?://\S+', ''),
        # Collapse runs of 3+ identical punctuation marks to two (keeps e.g. "!!")
        (r'([,\.?!])\1{2,}', r'\1\1'),
        # Insert a space after , . ? ! when a non-space character follows
        (r'([,\.?!])(?=[^\s])', r'\1 '),
        # Drop whitespace in front of , . ? !
        (r'\s+([,\.?!])', r'\1'),
        # Collapse repeated whitespace into a single space
        (r'\s{2,}', ' '),
        # Separate digits from adjacent text, except next to punctuation or a hyphen
        (r'(\d)(?=[^\s\d,\.?!-])', r'\1 '),
        (r'(?<=[^\s\d-])(\d)', r' \1'),
    ]
    if toneDown:
        substitutions.append((r'[?!]', '.'))

    # Fix encoding issues first, then turn emoji into ::shortcode:: tokens
    cleaned = ftfy.fix_text(text)
    cleaned = emoji.demojize(cleaned, delimiters=(" ::", ":: "))

    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

In [None]:
def clean_and_vectorize(df, fix_punctuation, sentvec="e5", device="cuda", normalize_embeddings=False, chunk_size=256, batch_size=32):
    """
    Cleans text data and vectorizes it using a sentence transformer model.

    Parameters:
    - df (pd.DataFrame): Input pandas DataFrame; the text is read from the
      second column of every row (positional index 1).
    - fix_punctuation (function): Function applied to each text before encoding.
    - sentvec (str, default="e5"): Vectorizer identifier: "jina", "snow" or
      "gte"; any other value is handled as "e5".
    - device (str, default="cuda"): Device passed to the sentence transformer
      model, e.g., "cpu" or "cuda".
    - normalize_embeddings (bool, default=False): Whether to normalize embeddings.
    - chunk_size (int, default=256): Number of text samples per chunk for encoding.
    - batch_size (int, default=32): Batch size for encoding in the transformer.

    Returns:
    - pd.DataFrame: One row per input row and one column per embedding
      dimension, with columns named X1, X2, ...
    """

    # Load the sentence transformer model based on `sentvec`
    if sentvec == "jina":
        st = SentenceTransformer('jinaai/' + sentvec + '-embeddings-v3', trust_remote_code=True, device=device)
    elif sentvec == "snow":
        st = SentenceTransformer('Snowflake/' + sentvec + 'flake-arctic-embed-l-v2.0', device=device)
    elif sentvec == "gte":
        st = SentenceTransformer('Alibaba-NLP/' + sentvec + '-Qwen2-1.5B-instruct', device=device)
    else:  # "e5"
        st = SentenceTransformer('intfloat/multilingual-' + sentvec + '-large-instruct', device=device)

    # Clean the texts in parallel, one dask partition per CPU core
    n_partitions = mp.cpu_count()
    ddf = dd.from_pandas(df, npartitions=n_partitions)

    with TqdmCallback(desc="Cleaning text"):
        texts = ddf.apply(lambda x: fix_punctuation(x.iloc[1]), axis=1, meta=pd.Series(dtype="str")).compute(scheduler="processes").tolist()

    # NOTE(review): no prompt/prefix is passed to encode() for any model. A
    # "query: " prefix for e5 used to be computed here but was never used, so
    # the dead assignment was removed. Confirm whether e5 should be prompted
    # before wiring one in: it would change the embeddings and invalidate any
    # cached .parquet files.

    # Encode in chunks so only `chunk_size` texts are processed per encode() call
    results = []
    total_chunks = (len(texts) + chunk_size - 1) // chunk_size  # ceiling division
    width = len(str(total_chunks))  # zero-padding width for the progress counter

    for i in range(0, len(texts), chunk_size):
        torch.cuda.empty_cache()  # release cached GPU memory between chunks
        current_chunk = i // chunk_size + 1  # 1-based chunk number
        print(f"\rChunk: {current_chunk:0{width}}/{total_chunks:0{width}} ", end='', flush=True)
        chunk = texts[i:i + chunk_size]
        result = st.encode(chunk, batch_size=batch_size, normalize_embeddings=normalize_embeddings, show_progress_bar=True)
        results.append(result)

    # Stack the per-chunk arrays and name the columns X1..Xd
    X = np.concatenate(results, axis=0)
    df_embeddings = pd.DataFrame(X)
    df_embeddings.columns = [f'X{i+1}' for i in range(df_embeddings.shape[1])]

    return df_embeddings

In [None]:
%%time

# Load text comments dataset
if dataset == 'DynaHate':
    df = pd.read_csv(datasetFolder + 'DynaHate.csv', engine='python', usecols=['text','label'])
    df = df[df.columns[::-1]]
    df.columns = [0, 1]
    di = {"nothate": 0, "hate": 1}
    y = df[0].map(di)
elif dataset == 'LtHate':
    dfA = pd.read_csv(datasetFolder + 'LtHate.csv', engine='python')
    dfA = dfA[dfA.columns[::-1]]
    dfA.columns = [0, 1]
    di = {"No": 0, "Yes": 1}
    yA = dfA[0].map(di)
    dfB = pd.read_csv(datasetFolder + 'Semantika_2.txt', sep='__', header=None, usecols=[2], engine='python')
    dfB = dfB[2].str.split(" ", n=1, expand=True)
    di = {"neutral": 0, "offensive": 1}
    yB = dfB[0].map(di)
    df = pd.concat([dfA, dfB], ignore_index=True)
    y = pd.concat([yA, yB], ignore_index=True)        
elif dataset == 'LtEmocionalumas':
    df = pd.read_csv(datasetFolder + 'LtEmocionalumas.csv', engine='python')
    df = df[df.columns[::-1]]
    df.columns = [0, 1]
    di = {"No": 0, "Low": 0, "Medium": 1, "High": 1, "Critical": 1}
    y = df[0].map(di)
elif dataset == 'LtManipuliacijos':
    excel_file = pd.ExcelFile(datasetFolder + 'LtManipuliacijos.xlsx')
    all_comments = []
    all_labels = []
    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(datasetFolder + 'LtManipuliacijos.xlsx', sheet_name=sheet_name, usecols=[1])
        sheet_comments = df['Komentaras'].astype(str).str.strip()
        sheet_comments = sheet_comments[sheet_comments != ''].tolist()
        all_comments.extend(sheet_comments)
        if sheet_name == 'Manipuliaciniai':
            all_labels.extend([1] * len(sheet_comments))
        else:
            all_labels.extend([0] * len(sheet_comments))
    df = pd.DataFrame({0: all_labels, 1: all_comments})
    y = df[0]      
elif dataset == 'RuToxic':
    df = pd.read_csv(datasetFolder + 'RuToxic.csv', engine='python')
    df = df[df.columns[::-1]]
    df.columns = [0, 1]
    y = df[0]
else:
    df = pd.read_csv('Semantika_2.txt', sep='__', header=None, usecols=[2], engine='python')
    df = df[2].str.split(" ", n=1, expand=True)
    di = {"neutral": 0, "offensive": 1}
    y = df[0].map(di)

In [None]:
%%time

# Vectorize text data
for sentvec in vectors:
    xVarFilename = datasetFolder + dataset + "-X-" + sentvec + ".parquet"
    files_exist_condition = os.path.exists(xVarFilename)
    if not files_exist_condition:
        try:
            print("Vectorizing text on GPU 0...")
            torch.cuda.set_device(0)
            torch.cuda.empty_cache()
            df_post = clean_and_vectorize(df, fix_punctuation, sentvec, compDevice, False, chunk_size, batch_size_setting)
            df_post.to_parquet(xVarFilename, engine='pyarrow')
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print("Vectorizing text on GPU 1...")
                torch.cuda.set_device(1)
                torch.cuda.empty_cache()
                df_post = clean_and_vectorize(df, fix_punctuation, sentvec, compDevice, False, chunk_size, batch_size_setting)
                df_post.to_parquet(xVarFilename, engine='pyarrow')
            else:
                # Re-raise if it's not an OOM error
                raise
        torch.cuda.empty_cache()

# Read vectorized data
XX = []
for sentvec in vectors:
    xVarFilename = datasetFolder + dataset + "-X-" + sentvec + ".parquet"
    df_post = dd.read_parquet(xVarFilename, engine='pyarrow')
    XX.append(df_post.compute().to_numpy())

## Machine learning (CV using FOR loop)

In [None]:
def autoflip_score(y_true, y_scores):
    """
    Return `y_scores` or `1 - y_scores`, whichever gives the higher ROC AUC.

    Parameters:
    - y_true: Ground-truth labels passed to roc_auc_score.
    - y_scores: Scores to orient; must support `1 - y_scores`.

    Returns:
    - The scores as given when their ROC AUC is strictly higher than that of
      the flipped scores; otherwise (including ties) the flipped scores.
    """
    auc_as_given = roc_auc_score(y_true, y_scores)
    flipped_scores = 1 - y_scores
    auc_flipped = roc_auc_score(y_true, flipped_scores)
    return y_scores if auc_as_given > auc_flipped else flipped_scores

In [None]:
%%time
%%capture_text --path $cvLogFilename

X = range(len(y))
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
cb_task_type = 'GPU' if compDevice == 'cuda' else 'CPU' #torch.cuda.set_device(1)

# Predicted scores to concatenate results across validation folds
tst_idx, ground_truth = [], []
os_scores = [[] for _ in vectors]  # One list per vectorizer
cb_scores = [[] for _ in vectors]  # One list per vectorizer

i = 0 # Perform the k-fold cross-validation
for train_index, test_index in skf.split(X, y):

    y_train, y_test = y[train_index], y[test_index]
    tst_idx.extend(test_index)
    ground_truth.extend(y_test.tolist())

    print("\nCV fold %d/%d" % (i + 1, k), flush=True)

    os_fold_scores = []
    cb_fold_scores = []

    # One-class (1c) classification: Histogram-based outlier score (HBOS)
    for j, sentvec in enumerate(vectors):
        start = pd.Timestamp.now()
        X_train, X_test = XX[j][train_index], XX[j][test_index]
        if useDimensionalityReduction:
            pca = PCA(n_components=num_vars)
            X_train = pca.fit_transform(X_train)
            X_test = pca.transform(X_test)
        clf = HBOS(contamination=0.01)
        clf.fit(X_train[y_train==1,:])
        score = -1 * clf.decision_function(X_test) / 10000 # make score more aesthetic (calibration)
        score = autoflip_score(y_test, score) # HBOS is 1c but ROC/PRC are for 2c so some sanity check
        print("pyodHBOS %s AUC=%5.3f" % (vectors[j], roc_auc_score(y_test, score)))
        os_scores[j].extend(score.tolist())
        print(str(pd.Timestamp.now()-start))
        

    # Two-class (2c) classification: CatBoost classifier (detection task)
    for j, sentvec in enumerate(vectors):
        start = pd.Timestamp.now()
        X_train, X_test = XX[j][train_index], XX[j][test_index]
        if useDimensionalityReduction:
            pca = PCA(n_components=num_vars)
            X_train = pca.fit_transform(X_train)
            X_test = pca.transform(X_test)
        model = CatBoostClassifier(iterations=num_tree, learning_rate=0.1, task_type=cb_task_type, allow_writing_files=False)
        X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)
        model.fit(X_trn, y_trn, eval_set=(X_val, y_val), use_best_model=True, early_stopping_rounds=10, verbose=False)
        score = model.predict_proba(X_test)[:, 1]
        if useCatBoostOutputCallibration:
            prob_pos_val = model.predict_proba(X_val)[:, 1]
            beta_calibrator = BetaCalibration(parameters="abm")
            beta_calibrator.fit(prob_pos_val, y_val)
            score = beta_calibrator.predict(score)
    
        print("catBoost %s AUC=%5.3f" % (vectors[j], roc_auc_score(y_test, score)))
        cb_scores[j].extend(score.tolist())
        print(str(pd.Timestamp.now()-start))
        
    i = i + 1
    
torch.cuda.empty_cache()
print("\n")

## Detection task results

In [None]:
# Plot ROC curves for all models on one axis and save the figure.
# roc_results keeps the computed ROC objects in plotting order: all one-class
# (1c) models first, then all two-class (2c) models.
thrType = "minopt"
showOpt = False
roc_results = []
fig, ax3 = plt.subplots()
colors = sns.color_palette("pastel", len(os_scores))

# (legend prefix, per-vectorizer scores, per-vectorizer line colors):
# pastel palette for the 1c models, matplotlib cycle colors C0, C1, ... for 2c
curve_groups = (
    ("1c", os_scores, list(colors)),
    ("2c", cb_scores, [f"C{n}" for n in range(len(cb_scores))]),
)
for prefix, group_scores, group_colors in curve_groups:
    for vec_name, vec_scores, line_color in zip(vectors, group_scores, group_colors):
        roc = compute_roc(X=vec_scores, y=ground_truth, pos_label=1, objective=thrType)
        roc_results.append(roc)
        plot_roc(roc, label=f"{prefix} {vec_name}", color=line_color, ax=ax3, show_opt=showOpt)

ax3.set_title(f"{dataset} (n={len(y)}, target={100*np.sum(y==1)/len(y):.2f}%) → ROC")
ax3.legend()
plt.show()
fig.savefig(rocPlotFilename)

In [None]:
# Plot precision-recall curves for all models on one axis and save the figure.
# prc_results keeps the average-precision values in plotting order: all
# one-class (1c) models first, then all two-class (2c) models.
prc_results = []
fig, ax3 = plt.subplots()
colors = sns.color_palette("pastel", len(os_scores))
# One-class (HBOS) models, drawn in the pastel palette
for i, vec_scores in enumerate(os_scores):
    precision, recall, _ = precision_recall_curve(ground_truth, vec_scores, pos_label=1)
    ap = average_precision_score(ground_truth, vec_scores, pos_label=1)
    prc_results.append(ap)
    label = f"1c {vectors[i]} (AUC={ap:.3f})"
    ax3.plot(recall, precision, label=label, color=colors[i])
# Two-class (CatBoost) models, drawn in the matplotlib cycle colors
for i, vec_scores in enumerate(cb_scores):
    precision, recall, _ = precision_recall_curve(ground_truth, vec_scores, pos_label=1)
    ap = average_precision_score(ground_truth, vec_scores, pos_label=1)
    prc_results.append(ap)
    # "2c" prefix: these are the two-class models (was mislabelled "1c")
    label = f"2c {vectors[i]} (AUC={ap:.3f})"
    ax3.plot(recall, precision, label=label, color=f"C{i}")
ax3.set_title(f"{dataset} (n={len(y)}, target={100*np.sum(y==1)/len(y):.2f}%) → PRC")
ax3.set_xlabel("Recall")
ax3.set_ylabel("Precision")
# Small margin around the unit square so curves on the border stay visible
ax3.set_xlim([-0.02, 1.02])
ax3.set_ylim([-0.02, 1.02])
ax3.set_aspect('equal', adjustable='box')
ax3.grid(color='darkgrey', linestyle='-', linewidth=0.5)
ax3.legend(loc='lower left')
plt.show()
fig.savefig(prcPlotFilename)

In [None]:
%%capture_text --path $resultsTableFile

# Constants
kappa_line = "cohens kappa                           %4.2f\n"
title_line = '\n-----------------------------------------------------\n'

# Ultra-compact score collections
models = [('1c', os_scores), ('2c', cb_scores)]

# Initialize list to store CSV data
csv_data = []

# Process all scores
result_idx = 0
for prefix, scores_list in models:
    for i, vec_scores in enumerate(scores_list):
        roc, ap = roc_results[result_idx], prc_results[result_idx]
        variant_name = vectors[i]
        
        # Calculate predictions and metrics
        predictions = vec_scores > roc.opd[thrType].opt
        accuracy = accuracy_score(ground_truth, predictions)
        kappa = cohen_kappa_score(ground_truth, predictions)
        
        # Print stylized results (original format)
        print(f'\n'
              f'{prefix} {variant_name} : {dataset} AUC-ROC = {roc.auc:.3f}\n'
              f'{prefix} {variant_name} : {dataset} AUC-PRC = {ap:.3f}'
              f'{title_line}'
              f'{classification_report(ground_truth, predictions)}'
              f'{kappa_line % kappa}'
              f'{title_line}')
        
        # Collect data for CSV
        csv_data.append({
            'Model': f'{prefix} {variant_name}',
            'Accuracy': f'{accuracy * 100:.2f} %',
            'Kappa': kappa,
            'AUC-ROC': roc.auc,
            'AUC-PRC': ap
        })
              
        result_idx += 1

# Save results to CSV
df = pd.DataFrame(csv_data)
csv_filename = resultsTableSummary  # Using the resultsTableSummary string as filename
df.to_csv(csv_filename, index=False, float_format='%.3f')
print(f'\nResults saved to CSV: {csv_filename}')