# FoRC Task 1, Subtask A

- finetuned approach with custom TwinBERT Model
- with enrichments (S2AG, OpenAlex, CrossRef)

# 0. Imports and Setup

In [None]:
# Notebook shell commands: install `datasets` and upgrade `accelerate` and
# `transformers` before the import cells below are run.
!pip install datasets
!pip install accelerate -U
!pip install -U transformers

In [None]:
# import packages
import torch
import numpy as np
import pandas as pd
import pyarrow as pa
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertConfig,
    get_scheduler,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [None]:
# import custom TwinBERTModel
from twinbert import TwinBertForSequenceClassification

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Cuda-Device: {device}")

# 1. Data preparation

## 1.1 Load datasets

In [None]:
def load_dataset(f: Path, features: list) -> pd.DataFrame:
    """Read a CSV file and return only the requested columns.

    Args:
        f: Path of the CSV file to read.
        features: Names of the columns to keep, in the order they should
            appear in the result. Any iterable of names (list, tuple, ...)
            is accepted.

    Returns:
        A DataFrame holding exactly the ``features`` columns of the file.

    Raises:
        KeyError: If a requested column is not present in the file.
    """
    data = pd.read_csv(f)
    # Materialise the names as a list: pandas interprets a tuple key as one
    # single column label instead of a selection of several columns.
    return data[list(features)]

In [None]:
# Location of the cleaned/preprocessed (enriched) CSV files
path_data = Path("../datasets")
f_train = path_data / "train_cleaned_enriched.csv"
f_val = path_data / "val_cleaned_enriched.csv"

# Columns to keep from each CSV file
features = ["abstract", "title", "label", "doi_canon"]
features += ["concepts", "topics", "subtopics", "fos"]
features += ["crossref_categories", "crossref_journal_title"]

# Read both splits with the same column selection
df_train, df_val = (load_dataset(csv_file, features) for csv_file in (f_train, f_val))

print(f"Train has {len(df_train)} samples")
print(f"Validation has {len(df_val)} samples")

## 1.2 Clean and prepare datasets

In [None]:
def count_nan(df):
    """Return the number of rows of ``df`` that contain at least one NaN."""
    rows_with_nan = df.isna().any(axis=1)
    return int(rows_with_nan.sum())


print("Before cleaning:")
print(f"train-samples with NaN:{count_nan(df_train)}")
print(f"val-samples with NaN:{count_nan(df_val)}")

# Replace NaN values (in place) with an empty string (we only have string
# values here).
df_train.fillna("", inplace=True)
df_val.fillna("", inplace=True)

print("\nAfter cleaning:")
print(f"train-samples with NaN:{count_nan(df_train)}")
# Count the validation frame here; counting the train frame a second time
# would hide NaN values that are still left in the validation split.
print(f"val-samples with NaN:{count_nan(df_val)}")

In [None]:
# Display the (rows, columns) shape of the training frame.
df_train.shape

In [None]:
# Map the string labels to integer ids: the encoder is fitted on the training
# labels only and then applied to both splits.
le = LabelEncoder().fit(df_train["label"])
df_train["labels"], df_val["labels"] = (
    le.transform(frame["label"]) for frame in (df_train, df_val)
)
df_train["labels"][:5], df_val["labels"][:5]

In [None]:
# Create the tokenizer from the `allenai/specter2_base` checkpoint
# (pretrained for scientific papers). Its separator token is reused below to
# join the text fields.
tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")

In [None]:
# Show the original string label next to its encoded integer id
df_train[["label", "labels"]]

In [None]:
# Display the column names of the training frame.
df_train.keys()

In [None]:
# Prepare BERT Model 1 input (text_1): Title + Abstract
# Prepare BERT Model 2 input (text_2): Enrichments, each field with a prefix
# All parts are joined with the tokenizer's separator token.

# (prefix, source column) pairs in the order in which they appear in `text_2`
_ENRICHMENT_FIELDS = [
    ("Fields Of Research: ", "fos"),
    ("Topics: ", "topics"),
    ("Concepts: ", "concepts"),
    ("Subtopics: ", "subtopics"),
    ("Journal Title: ", "crossref_journal_title"),
    ("Categories: ", "crossref_categories"),
]


def _add_text_inputs(df, sep):
    """Add the ``text_1`` and ``text_2`` model input columns to ``df`` in place.

    Args:
        df: Frame providing the ``title`` and ``abstract`` columns plus every
            column named in ``_ENRICHMENT_FIELDS``; the columns are expected
            to hold strings (NaN values filled beforehand).
        sep: Separator string placed between the individual text parts.
    """
    df["text_1"] = df["title"] + sep + df["abstract"]

    text_2 = ""
    for prefix, column in _ENRICHMENT_FIELDS:
        # Every field is followed by a separator, including the last one.
        text_2 = text_2 + prefix + df[column] + sep
    df["text_2"] = text_2


# Same construction for both splits, so the model sees identical formatting
# during training and validation.
_add_text_inputs(df_train, tokenizer.sep_token)
_add_text_inputs(df_val, tokenizer.sep_token)

In [None]:
# Keep only the encoded label and the two text inputs; the remaining columns
# are not needed from here on.
df_train, df_val = (
    frame[["labels", "text_1", "text_2"]] for frame in (df_train, df_val)
)

In [None]:
# Create Dataset (train) with the documented pandas factory instead of
# passing a raw Arrow table to the low-level constructor.
ds_train = Dataset.from_pandas(df_train)
ds_train

In [None]:
# Create Dataset (validation) with the documented pandas factory instead of
# passing a raw Arrow table to the low-level constructor.
ds_val = Dataset.from_pandas(df_val)
ds_val

In [None]:
# Bundle both splits into one DatasetDict so they can be processed together
dd = DatasetDict(train=ds_train, validation=ds_val)
dd

In [None]:
# tokenize function
def tokenize_text_1(row):
    """Tokenize ``text_1`` and ``text_2`` and return the outputs under suffixed keys.

    Both texts are tokenized with padding, truncation and a maximum length of
    512. The ``input_ids``, ``token_type_ids`` and ``attention_mask`` entries
    of each result are returned with ``_1`` / ``_2`` appended to the key name.
    """
    encoded = {
        suffix: tokenizer(row[f"text_{suffix}"], padding=True, truncation=True, max_length=512)
        for suffix in ("1", "2")
    }

    # Flatten into one dictionary: all `_1` keys first, then all `_2` keys
    return {
        f"{key}_{suffix}": output[key]
        for suffix, output in encoded.items()
        for key in ("input_ids", "token_type_ids", "attention_mask")
    }

In [None]:
# Tokenize the dataset in batched mode, handing up to 42000 rows to each call
# of the tokenize function
dd_tokenized = dd.map(function=tokenize_text_1, batched=True, batch_size=42_000)

In [None]:
# Have the tokenized dataset return its columns as torch tensors
dd_tokenized.set_format(type="torch")

In [None]:
# Pull the tokenized splits back out for the Trainer
ds_train, ds_val = (dd_tokenized[split] for split in ("train", "validation"))

In [None]:
# Display the tokenized DatasetDict (splits and their columns).
dd_tokenized

# 2. Training

## 2.1 Build model and prepare dataset for iteration

In [None]:
# Create the model (here a custom TwinBERT with DenseLayer on top)
# Note: For more than 2 Labels the standard loss_fn is CrossEntropyLoss()
# Adjust number of hidden layers according to your needs
model = TwinBertForSequenceClassification.from_pretrained(
    "allenai/specter2_base",
    # One output per class known to the label encoder, so the classification
    # head always matches the ids produced by `le.transform`.
    num_labels=len(le.classes_),
    num_hidden_layers=6,
)

## 2.2 Setup loss and optimizer


In [None]:
# Send model to the device selected above (GPU if available, else CPU)
model.to(device)

## 2.3 Train model

In [None]:
# Metrics for evaluation
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        p: Pair ``(predictions, labels)``. The predicted class of each sample
            is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    # Classes that never occur among the predictions (or the labels) have an
    # undefined score. Count them as 0 explicitly instead of relying on the
    # warning-emitting default.
    weighted = {"average": "weighted", "zero_division": 0}

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, **weighted)
    precision = precision_score(y_true=labels, y_pred=pred, **weighted)
    f1 = f1_score(y_true=labels, y_pred=pred, **weighted)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Train with trainer
import inspect

# The argument selecting the evaluation schedule was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases.
# Use whichever name the installed version accepts, so the cell works both
# before and after the rename.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define Trainer
args = TrainingArguments(
    output_dir="",  # add output directory
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=0.001,
    load_best_model_at_end=True,
    # Evaluate once per epoch, matching the save schedule above
    **{_eval_strategy_arg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training with the arguments and datasets configured above.
trainer.train()

## 2.4 Write model

In [None]:
# Persist the trained model; the empty string is a placeholder that has to be
# replaced with the target directory.
trainer.save_model("")  # add save path for model

# 3. Predict

## 3.1 Load test file

In [None]:
# Load the enriched test file and keep only the columns needed for prediction.
# Note: `f_val` and `features` are re-used here and now refer to the test data.
f_val = path_data / "test_cleaned_enriched.csv"
features = cols = (
    ["data_index", "title"]
    + ["concepts", "topics", "subtopics", "fos"]
    + ["crossref_journal_title", "crossref_categories"]
    + ["abstract"]
)
df_test = pd.read_csv(f_val)[features]

In [None]:
# Display the column names of the test frame.
df_test.keys()

## 3.2 Create predictions

In [None]:
# Replace NaN values in every column of the test frame with an empty string
df_test.fillna("", inplace=True)

# Prepare BERT text input: text_1 = title + abstract, text_2 = enrichments.
# All parts are joined with the tokenizer's separator token.
df_test["text_1"] = df_test["title"] + tokenizer.sep_token + df_test["abstract"]

# Build text_2 field by field; each field is followed by a separator,
# including the last one.
_text_2 = ""
for _prefix, _column in (
    ("Fields Of Research: ", "fos"),
    ("Topics: ", "topics"),
    ("Concepts: ", "concepts"),
    ("Subtopics: ", "subtopics"),
    ("Journal Title: ", "crossref_journal_title"),
    ("Categories: ", "crossref_categories"),
):
    _text_2 = _text_2 + _prefix + df_test[_column] + tokenizer.sep_token
df_test["text_2"] = _text_2

In [None]:
# Keep only the two text inputs; the test frame carries no label column.
test_data = df_test[["text_1", "text_2"]]

## 3.3 Convert to HF dataset and make DatasetDict

In [None]:
# Build the test Dataset with the documented pandas factory instead of
# passing a raw Arrow table to the low-level constructor, then wrap it in a
# DatasetDict.
test_dataset = Dataset.from_pandas(test_data)
dd_test = DatasetDict({"test": test_dataset})

In [None]:
# Tokenize with the same map batch size as the training data.
# NOTE: `tokenize_text_1` uses `padding=True`, i.e. the padded length is
# determined per map batch, while the DataLoader built further below has no
# custom collate function and therefore needs rows of equal length within a
# batch. Tokenizing in one large map batch keeps the padded length uniform
# (as long as the split has at most 42000 rows).
test_tokenized = dd_test.map(tokenize_text_1, batched=True, batch_size=42000)

In [None]:
# Display the tokenized test DatasetDict.
test_tokenized

In [None]:
# Drop the raw text columns so that only the tokenizer outputs remain; every
# remaining column is later passed to the model as a keyword argument.
test_tokenized = test_tokenized.remove_columns(["text_1", "text_2"])

In [None]:
# Have the tokenized test set return its columns as torch tensors
test_tokenized.set_format(type="torch")

In [None]:
# Iterate the tokenized test split in its stored order (no shuffling, so the
# predictions line up with the rows of `df_test`), 8 rows per batch.
test_dataloader = DataLoader(test_tokenized["test"], shuffle=False, batch_size=8)

## 3.4 Write predictions to file

In [None]:
# eval loop: collect the predicted class ids of every test batch
test_preds = []

# Put the model into evaluation mode before predicting, so that layers which
# behave differently during training (e.g. dropout) do not make the
# predictions non-deterministic.
model.eval()

for batch in test_dataloader:
    # Move every tensor of the batch to the device the model lives on
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # Store the results on the CPU so that accumulated predictions do not keep
    # occupying accelerator memory.
    test_preds.append(predictions.cpu())

In [None]:
# Flatten the per-batch prediction tensors into one list of Python ints.
# Concatenating first converts everything in a single step instead of
# converting element by element; `torch.cat` rejects an empty list, so that
# case is handled separately.
test_preds_flat = torch.cat(test_preds).tolist() if test_preds else []

In [None]:
# Translate the predicted class ids back to their label strings with a single
# vectorised call instead of one encoder call per prediction.
test_preds_text = le.inverse_transform(test_preds_flat).tolist()

In [None]:
# Display the predicted label strings.
test_preds_text

In [None]:
# Output path for the predictions; the empty string is a placeholder that has
# to be replaced before running this cell.
f_val = ""  # add where to store predictions in csv-format
# Attach the predicted label strings to the test frame
df_test["target"] = test_preds_text
# NOTE(review): `to_csv` is called without an `index` argument, so the frame
# index is presumably written as an additional first column - confirm that
# this matches the expected submission format.
df_test[["data_index", "target"]].to_csv(f_val)

In [None]:
# Display the test frame including the new `target` column.
df_test