In [9]:
import os

# Restrict CUDA device visibility to index "2". This is set ahead of the
# remaining imports. NOTE(review): presumably so the GPU-backed tagger
# imported below only sees that device -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import sys
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import precision_recall_curve, f1_score, precision_score, recall_score

# Put the project checkout on the import path so the `modeling` package
# imported on the next line can be resolved.
sys.path.append('/home/kalkiek/projects/biber-multidimensional-register-analysis/')

from modeling.neurobiber.tagger import load_model_and_tokenizer, predict_batch

# Input JSONL with the held-out test records, and the directory where the
# evaluation arrays are written.
TEST_FP = '/shared/3/projects/hiatus/tagged_data/biber-aggregate/binary_test.jsonl'
RESULTS_DIR = '/shared/3/projects/hiatus/tagged_data/biber-aggregate/evaluation/'

# Per-tag counts for the train/dev/test splits. The column and row labels are
# overwritten below, so whatever labels the CSV itself carries are discarded.
tag_counts = pd.read_csv('/shared/3/projects/hiatus/tagged_data/biber-aggregate/tag_counts.csv')
# Names of the 96 binary tags. This list is applied by position both to the
# rows of `tag_counts` here and to the label columns in the per-tag metrics.
tag_names = [
    "BIN_QUAN", "BIN_QUPR", "BIN_AMP", "BIN_PASS", "BIN_XX0", "BIN_JJ", 
    "BIN_BEMA", "BIN_CAUS", "BIN_CONC", "BIN_COND", "BIN_CONJ", "BIN_CONT", 
    "BIN_DPAR", "BIN_DWNT", "BIN_EX", "BIN_FPP1", "BIN_GER", "BIN_RB", 
    "BIN_PIN", "BIN_INPR", "BIN_TO", "BIN_NEMD", "BIN_OSUB", "BIN_PASTP", 
    "BIN_VBD", "BIN_PHC", "BIN_PIRE", "BIN_PLACE", "BIN_POMD", "BIN_PRMD", 
    "BIN_WZPRES", "BIN_VPRT", "BIN_PRIV", "BIN_PIT", "BIN_PUBV", "BIN_SPP2", 
    "BIN_SMP", "BIN_SERE", "BIN_STPR", "BIN_SUAV", "BIN_SYNE", "BIN_TPP3", 
    "BIN_TIME", "BIN_NOMZ", "BIN_BYPA", "BIN_PRED", "BIN_TOBJ", "BIN_TSUB", 
    "BIN_THVC", "BIN_NN", "BIN_DEMP", "BIN_DEMO", "BIN_WHQU", "BIN_EMPH", 
    "BIN_HDG", "BIN_WZPAST", "BIN_THAC", "BIN_PEAS", "BIN_ANDC", "BIN_PRESP", 
    "BIN_PROD", "BIN_SPAU", "BIN_SPIN", "BIN_THATD", "BIN_WHOBJ", "BIN_WHSUB", 
    "BIN_WHCL", "BIN_ART", "BIN_AUXB", "BIN_CAP", "BIN_SCONJ", "BIN_CCONJ", 
    "BIN_DET", "BIN_EMOJ", "BIN_EMOT", "BIN_EXCL", "BIN_HASH", "BIN_INF", 
    "BIN_UH", "BIN_NUM", "BIN_LAUGH", "BIN_PRP", "BIN_PREP", "BIN_NNP", 
    "BIN_QUES", "BIN_QUOT", "BIN_AT", "BIN_SBJP", "BIN_URL", "BIN_WH", 
    "BIN_INDA", "BIN_ACCU", "BIN_PGAS", "BIN_CMADJ", "BIN_SPADJ", "BIN_X"
]

# Relabel purely by position: the i-th CSV row is assumed to belong to
# tag_names[i]. Nothing here checks that assumption.
# NOTE(review): the stored outputs suggest the CSV rows may NOT be in
# tag_names order -- the row labelled BIN_PASS has test count 3926831, which
# equals the Support reported for BIN_JJ in the per-tag metrics table, and the
# BIN_XX0 test count 3420386 equals the BIN_BEMA Support. Confirm the CSV row
# order before relying on these row labels (including the "rarest tags" list).
tag_counts.columns = ['train', 'dev', 'test']
tag_counts.index = tag_names

# Display the updated DataFrame
tag_counts.head()

Unnamed: 0,train,dev,test
BIN_QUAN,22938268,2867473,2867377
BIN_QUPR,8669526,1084629,1084727
BIN_AMP,19393951,2423794,2425078
BIN_PASS,31412182,3926511,3926831
BIN_XX0,27367095,3420176,3420386


In [10]:
# Order the tags from least to most frequent in the training split and keep
# the first ten rows; only the training column is shown in the report.
tags_by_train_count = tag_counts.sort_values(by='train', ascending=True)
rare_tags = tags_by_train_count.head(10)

print("10 rarest tags in training set:")
train_only = rare_tags[['train']]
print(train_only.to_string())

10 rarest tags in training set:
             train
BIN_X        87358
BIN_ACCU    371909
BIN_QUOT    737904
BIN_SPADJ   786012
BIN_INDA    865263
BIN_URL    1908162
BIN_SBJP   2248697
BIN_CMADJ  2319931
BIN_CCONJ  2736093
BIN_PGAS   2747924


**Load in the test data**

In [11]:
def tag_test_json(input_file, batch_size=128, text_key="text", show_progress=True):
    """Run the tagger over a JSONL file and collect predictions with gold labels.

    Every non-blank line is parsed as a JSON object that must provide the text
    under ``text_key`` and a ``"features"`` mapping. Each feature whose key ends
    in ``"_mean"`` is binarised (non-zero -> 1.0, zero -> 0.0) to build the gold
    label vector. Texts are passed to ``predict_batch`` in groups of
    ``batch_size``; a final partial group is processed after the file ends.

    The label column order is taken from the first valid record and reused for
    every later record, so a record whose feature keys are serialised in a
    different order still lands in the same columns.

    Args:
        input_file: Path to the JSONL file to read (opened as UTF-8).
        batch_size: Number of texts handed to ``predict_batch`` per call.
        text_key: Key of the text field in each JSON record.
        show_progress: Wrap the file iterator in ``tqdm`` when true.

    Returns:
        Tuple ``(predictions, labels)`` of NumPy arrays built from the
        collected per-record predictions and label vectors.

    Raises:
        ValueError: If a record's set of ``*_mean`` feature keys differs from
            the first record's, since its labels could not be aligned.
        KeyError: If a record lacks ``"features"`` or ``text_key``.
    """
    model, tokenizer = load_model_and_tokenizer()
    all_predictions = []
    all_labels = []
    # Column order for the label matrix; fixed by the first valid record.
    label_keys = None

    def flush(texts, text_labels):
        # Tag one batch and append its predictions and labels in step.
        all_predictions.extend(predict_batch(model, tokenizer, texts))
        all_labels.extend(text_labels)

    with open(input_file, 'r', encoding='utf-8') as fin:
        batch = []
        label_batch = []

        iterator = tqdm(fin, desc="Processing texts") if show_progress else fin

        for line_no, line in enumerate(iterator, start=1):
            line = line.strip()
            if not line:
                continue

            # Only the JSON parse is best-effort; everything else should fail loudly.
            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON line: {e}")
                continue

            features = data["features"]
            record_keys = [k for k in features if k.endswith("_mean")]
            if label_keys is None:
                label_keys = record_keys
            elif record_keys != label_keys and set(record_keys) != set(label_keys):
                # Fail here rather than after the whole file has been tagged.
                raise ValueError(
                    f"Line {line_no}: '_mean' feature keys differ from the first record; "
                    f"expected {len(label_keys)} keys, found {len(record_keys)}"
                )

            # Index by the pinned key order, not this record's own key order.
            labels = [1.0 if float(features[key]) != 0.0 else 0.0 for key in label_keys]

            batch.append(data[text_key])
            label_batch.append(labels)

            if len(batch) >= batch_size:
                flush(batch, label_batch)
                batch = []
                label_batch = []

        # Handle remaining batch
        if batch:
            flush(batch, label_batch)

    return np.array(all_predictions), np.array(all_labels)

In [12]:
# Tag the whole test file. The stored progress output for this cell shows
# 3,960,277 records processed in about 3h49m, so this is the expensive step.
predictions, labels = tag_test_json(TEST_FP)

2025-02-17 13:09:42.069683: I tensorflow/core/platform/cpu_feature_guard.cc:181] Beginning TensorFlow 2.15, this package will be updated to install stock TensorFlow 2.15 alongside Intel's TensorFlow CPU extension plugin, which provides all the optimizations available in the package and more. If a compatible version of stock TensorFlow is present, only the extension will get installed. No changes to code or installation setup is needed as a result of this change.
More information on Intel's optimizations for TensorFlow, delivered as TensorFlow extension plugin can be viewed at https://github.com/intel/intel-extension-for-tensorflow.
2025-02-17 13:09:42.069715: I tensorflow/core/platform/cpu_feature_guard.cc:192] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Processing texts: 3960277it [3:48:58, 288.25i

In [13]:
# Report the row and column counts of both arrays so they can be compared.
label_rows, label_cols = labels.shape[0], labels.shape[1]
print(f"Total samples: {label_rows}")
print(f"Number of features: {label_cols}")

pred_rows, pred_cols = predictions.shape[0], predictions.shape[1]
print(f"Total prediction samples: {pred_rows}")
print(f"Number of prediction features: {pred_cols}")

Total samples: 3960277
Number of features: 96
Total prediction samples: 3960277
Number of prediction features: 96


In [14]:
def save_results(predictions, labels, output_dir="results"):
    """Write predictions and labels to ``output_dir`` as compressed ``.npz`` files.

    The directory is created if it is missing. ``predictions`` is stored in
    ``predictions.npz`` under the key ``predictions`` and ``labels`` in
    ``labels.npz`` under the key ``labels``. A short summary with both array
    shapes is printed afterwards.
    """
    os.makedirs(output_dir, exist_ok=True)

    # (file name, archive key, array), written in this order.
    targets = (
        ("predictions.npz", "predictions", predictions),
        ("labels.npz", "labels", labels),
    )
    for file_name, array_key, array in targets:
        destination = os.path.join(output_dir, file_name)
        np.savez_compressed(destination, **{array_key: array})

    print(f"Results saved to {output_dir}/")
    print(f"Predictions shape: {predictions.shape}")
    print(f"Labels shape: {labels.shape}")

In [15]:
# Write the predictions and labels to RESULTS_DIR as compressed .npz files.
save_results(predictions, labels, RESULTS_DIR)

Results saved to /shared/3/projects/hiatus/tagged_data/biber-aggregate/evaluation//
Predictions shape: (3960277, 96)
Labels shape: (3960277, 96)


In [16]:
# Element-wise agreement across every (sample, tag) cell, plus F1 pooled over
# all cells (micro) and averaged over tags (macro).
cell_matches = predictions == labels
accuracy = np.mean(cell_matches)

overall_f1 = {
    averaging: f1_score(labels, predictions, average=averaging)
    for averaging in ('micro', 'macro')
}
micro_f1 = overall_f1['micro']
macro_f1 = overall_f1['macro']

print(f"\nOverall Metrics:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Micro F1: {micro_f1:.3f}")
print(f"Macro F1: {macro_f1:.3f}")


Overall Metrics:
Accuracy: 0.985
Micro F1: 0.985
Macro F1: 0.971


In [17]:
# Calculate per-tag metrics
per_tag_f1 = f1_score(labels, predictions, average=None)

# Create DataFrame with metrics
per_tag_metrics = pd.DataFrame({
    'Tag': tag_names,
    'F1': per_tag_f1,
    'Support': np.sum(labels, axis=0)  # Count occurrences of each tag
}).sort_values('F1', ascending=False)

print("\nTop 5 performing tags:")
print(per_tag_metrics.head().to_string(index=False))
print("\nBottom 5 performing tags:")
print(per_tag_metrics.tail(10).to_string(index=False))


Top 5 performing tags:
     Tag       F1   Support
  BIN_NN 0.999932 3959706.0
 BIN_CAP 0.999885 3947805.0
BIN_PREP 0.999781 3949092.0
 BIN_PIN 0.999781 3949092.0
 BIN_ART 0.999576 3927303.0

Bottom 5 performing tags:
      Tag       F1   Support
 BIN_EMOT 0.924797   92134.0
BIN_PRESP 0.921507 1075079.0
 BIN_THAC 0.920486  341645.0
BIN_WHOBJ 0.918361 1086660.0
   BIN_AT 0.916206   98066.0
   BIN_UH 0.912957 1042887.0
  BIN_GER 0.909906  745031.0
BIN_PASTP 0.878411  530723.0
 BIN_EMOJ 0.841737   10753.0
 BIN_DPAR 0.832298  281544.0


In [18]:
# Calculate mean and median F1 scores
mean_f1 = per_tag_metrics['F1'].mean()
median_f1 = per_tag_metrics['F1'].median()

print("\nOverall Tag Performance:")
print(f"Mean F1: {mean_f1:.3f}")
print(f"Median F1: {median_f1:.3f}")


Overall Tag Performance:
Mean F1: 0.971
Median F1: 0.980


In [19]:
# Print full table of results
print("\nAll tags performance metrics:")
print(per_tag_metrics.to_string(index=False))


All tags performance metrics:
       Tag       F1   Support
    BIN_NN 0.999932 3959706.0
   BIN_CAP 0.999885 3947805.0
  BIN_PREP 0.999781 3949092.0
   BIN_PIN 0.999781 3949092.0
   BIN_ART 0.999576 3927303.0
   BIN_DET 0.999457 3948204.0
  BIN_AUXB 0.999169 3819762.0
  BIN_CONJ 0.998942 3495470.0
  BIN_INDA 0.998848 3642094.0
   BIN_PRP 0.998823 3874095.0
  BIN_SBJP 0.998823 3874095.0
 BIN_CCONJ 0.998665 3813029.0
    BIN_JJ 0.997772 3926831.0
  BIN_PRIV 0.997652 3289775.0
  BIN_VPRT 0.997092 3874272.0
  BIN_TPP3 0.996948 2628001.0
    BIN_RB 0.996328 3829366.0
   BIN_PIT 0.996207 3075323.0
  BIN_CONT 0.995388 2550978.0
   BIN_INF 0.995191 3764495.0
    BIN_TO 0.995151 3295186.0
  BIN_PUBV 0.995094 2473490.0
    BIN_WH 0.994814 3246688.0
  BIN_QUAN 0.994628 2867377.0
 BIN_SCONJ 0.994600 3419624.0
  BIN_NOMZ 0.994553 2972320.0
  BIN_BEMA 0.994250 3420386.0
  BIN_PGAS 0.993453 3385617.0
  BIN_FPP1 0.993247 3175405.0
  BIN_PEAS 0.992497 3402485.0
   BIN_XX0 0.989690 2661517.0
  BIN_DEM