In [None]:
#!pip install transformers torch scikit-learn pandas

In [1]:
!pip install chardet
!pip install seqeval -q
!pip install nltk

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0


# Import the libraries / Modules

In [4]:
import pandas as pd
#import os
import numpy as np
from sklearn.model_selection import train_test_split
import gc
import json
import os
import random
import re
import warnings
from collections import defaultdict
from functools import partial
from typing import Dict

from sklearn.metrics import classification_report
from seqeval.metrics import f1_score, precision_score, recall_score
#from transformers import BertTokenizer, BertForSequenceClassification, AdamW, AutoTokenizer
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    AdamW,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments
)
import torch
from datasets import Dataset as HF_dataset
#from torch.utils.data import DataLoader, Dataset

# Set CFG and Seed

In [5]:
# Label set for the token classifier; a label's position in this list is its integer class id.
all_labels = ["Task", "Dataset", "Metric", "Score", "O"]

# Forward (label -> id) and reverse (id -> label) lookups derived from the list order.
label2id = dict(zip(all_labels, range(len(all_labels))))
id2label = dict(enumerate(all_labels))

print(id2label)

{0: 'Task', 1: 'Dataset', 2: 'Metric', 3: 'Score', 4: 'O'}


In [10]:
class Config:
    """Experiment settings, read throughout the notebook as class attributes.

    The class is used as a plain namespace (``Config.seed``, ``Config.model_name``, ...).
    Values that depend on other settings (``n_splits``, ``fp16``, the derived paths and
    ``save_path``) are computed once, when the class body is executed.
    """

    # debug: shrinks n_splits, num_train_epochs and max_steps for a quick run
    debug = False

    # cross validation
    do_cv = True
    fold = 0
    n_splits = 2 if debug else 4

    # gpu
    gpu = torch.cuda.is_available()

    # seed
    seed = 42

    # negative sample frac
    neg_frac = 1  # 0.3

    # external dataset (only used below as a component of save_path)
    external_name = "Last_epoch"
    # other names that were tried:
    #     "tonyarobertson", "mpware", "valentin", "moth", "pjmathematician"

    # TODO adjust folders
    # directory path
    # Paths are joined with os.path.join: plain string concatenation dropped the
    # separator and produced e.g. "/kaggle/workingcomp_dir".
    input_dir = "/kaggle/working"
    comp_dir = os.path.join(input_dir, "comp_dir")
    fold_dir = os.path.join(input_dir, "sota/dataset/train/")
    external_dir = os.path.join(input_dir, "external_dir")
    output_dir = "/kaggle/working/output/"

    # file path
    comp_path = os.path.join(comp_dir, "train.json")
    external_path = os.path.join(external_dir, "datamix.json")

    # tokenizer
    train_max_length = 512  # 1536
    eval_max_length = 512  # 3500
    train_stride = None
    eval_stride = 256

    # model
    model_name = "allenai/scibert_scivocab_uncased"  # ALSO cased scibert
    #     model_name = "microsoft/deberta-v3-base"
    #     model_name = "microsoft/deberta-v3-large"
    num_train_epochs = 1 if debug else 3
    # NOTE: max_steps is not forwarded to TrainingArguments in the visible notebook
    # (that keyword is commented out there).
    max_steps = 5 if debug else 3000
    fp16 = bool(gpu)  # mixed precision only when CUDA is available
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 2
    learning_rate = 2e-5
    warmup_ratio = 0.1
    weight_decay = 0.01

    # postprocessing
    threshold = 0.95

    # save path: "<model>-<external_name>-<train_max_length>[-<train_stride>]-<seed>[-<fold>]"
    save_path = f"{model_name.split('/')[-1]}-{external_name}-{train_max_length}"
    if train_stride is not None:
        save_path = f"{save_path}-{train_stride}"
    save_path = f"{save_path}-{seed}"
    if do_cv:
        save_path = f"{save_path}-{fold}"

def fix_seed(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    CUDA), exports ``PYTHONHASHSEED``, and sets the two cuDNN flags below.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Each of these takes the seed as its single argument.
    for seed_generator in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        seed_generator(seed)

    # cuDNN: deterministic mode on, benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

fix_seed(Config.seed)

# load data

In [13]:
# NOTE(review): in the visible notebook train_df is only assigned by the read_pickle cell
# further down, so on a fresh kernel (Restart & Run All) this cell raises NameError.
# TODO: move it below the load cell, or drop it if the pickle already exists.
train_df.to_pickle("train_preprocessed_v2")

In [14]:
# NOTE(review): in the visible notebook val_df is only assigned by the read_pickle cell
# further down, so on a fresh kernel (Restart & Run All) this cell raises NameError.
# TODO: move it below the load cell, or drop it if the pickle already exists.
val_df.to_pickle("val_preprocessed_v2")

In [6]:
# NOTE(review): os and pandas are already imported in the import cell at the top.
import os
import pandas as pd

# Load the preprocessed training frame and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only read pickles from a trusted source.

train_df = pd.read_pickle('/kaggle/input/sota-preprocessed-v3/train_preprocessed_v2')

train_df.head(3)

Unnamed: 0,document,full_text,labels_1,tokens,trailing_whitespace,token_labels,provided_labels,word_labels,input_ids,token_type_ids,attention_mask,offset_mapping,labels,length
0,1501.07800,"\documentclass{elsarticle}\n\usepackage{color,...",,"[\documentclass, {, elsarticle, }, \usepackage...","[False, False, False, True, False, False, Fals...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[102, 4088, 3367, 5717, 1342, 847, 22851, 2664...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 1), (1, 9), (9, 14), (14, 15), (1...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...",512
1,1509.04927,"\documentclass[12pt,twoside,a4paper]{article}\...",,"[\documentclass, [, 12pt, ,, twoside, ,, a4pap...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[102, 4088, 3367, 5717, 260, 760, 489, 422, 50...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 1), (1, 9), (9, 14), (14, 15), (1...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...",512
2,2204.01208v1,\begin{filecontents*}{example.eps}\ngsave\nnew...,[{'LEADERBOARD': {'Task': 'GZSL Video Classifi...,"[\begin, {, filecontents, *, }, {, example.eps...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, Score, Score, O...","[O, O, O, O, O, O, O, O, O, O, Score, Score, O...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[102, 4088, 3973, 1342, 4433, 13741, 30113, 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(0, 0), (0, 1), (1, 6), (6, 7), (7, 11), (11,...","[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...",512


In [7]:
# Validation frame; it is converted to a dataset and passed as the evaluation set when training.
# NOTE(review): unpickling can execute arbitrary code; only read pickles from a trusted source.
val_df = pd.read_pickle('/kaggle/input/sota-preprocessed-v3/val_preprocessed_v2')

# Modeling

In [11]:
# Tokenizer for the checkpoint named in Config.model_name.
tokenizer = AutoTokenizer.from_pretrained(Config.model_name)
# Collator used by the Trainer to build batches.
# NOTE(review): pad_to_multiple_of=16 presumably pads each batch up to a multiple of 16 tokens —
# confirm against the DataCollatorForTokenClassification docs.
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
# Training hyper-parameters; most values come from Config so they can be tuned in one place.
args = TrainingArguments(
    output_dir=Config.output_dir, 
    fp16=Config.fp16,
    per_device_train_batch_size=Config.per_device_train_batch_size,
    gradient_accumulation_steps=Config.gradient_accumulation_steps,
    num_train_epochs=Config.num_train_epochs,
    # max_steps is disabled, so the run length is set by num_train_epochs above.
#     max_steps=Config.max_steps,
    learning_rate=Config.learning_rate,
    warmup_ratio=Config.warmup_ratio,
    weight_decay=Config.weight_decay,
#     group_by_length=True,
    #evaluation_strategy="no",
    # Evaluate and checkpoint on the same step schedule (every 5 steps in debug, else every 100).
    evaluation_strategy='steps',
    save_strategy='steps',
    eval_steps=5 if Config.debug else 100,
    save_steps=5 if Config.debug else 100,
    # NOTE(review): a float below 1 is presumably read as a fraction of total training steps —
    # confirm for the installed transformers version.
    logging_steps=0.05,
    #save_strategy="no",
    save_total_limit=300,
    lr_scheduler_type="cosine",
    # "f1yue" is the key under which compute_metrics returns its F1 value.
    metric_for_best_model="f1yue",
    load_best_model_at_end=True,
    report_to="none",
    seed=Config.seed,
)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [12]:
def freeze(module):
    """Turn off gradient tracking for every parameter yielded by ``module.parameters()``.

    Args:
        module: Object exposing ``parameters()``; each yielded parameter is
            modified in place.
    """
    for weight in module.parameters():
        weight.requires_grad = False


def compute_metrics(res, all_labels):
    """Compute recall, precision and F1 for one evaluation pass.

    Args:
        res: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; positions whose label is ``-100`` are ignored.
        all_labels: Sequence mapping an integer class id to its label string.

    Returns:
        dict with keys ``"recall"``, ``"precision"`` and ``"f1yue"``. ``"f1yue"``
        is the name referenced by ``metric_for_best_model`` in the training
        arguments. It is ``0.0`` when precision and recall are both zero.
    """
    scores, label_ids = res
    pred_ids = np.argmax(scores, axis=2)

    # Drop ignored positions (label == -100) and map ids to label strings.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, label_ids):
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        true_predictions.append([all_labels[p] for p, _ in kept])
        true_labels.append([all_labels[l] for _, l in kept])

    # NOTE(review): the labels used in this notebook carry no B-/I- prefix —
    # confirm seqeval groups them into entities the way you intend.
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F1 (harmonic mean). Guard the 0/0 case: without it the result is NaN or a
    # ZeroDivisionError (depending on the float type), which breaks
    # best-checkpoint selection on "f1yue".
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0

    return {"recall": recall, "precision": precision, "f1yue": f1}


def train(model_name, all_labels, id2label, label2id, ds, eval_ds, args, data_collator, tokenizer, save_path):
    """Fine-tune a token-classification model with the Hugging Face Trainer and save it.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        all_labels: Label strings; its length sets ``num_labels`` and it is
            forwarded to ``compute_metrics``.
        id2label: Mapping from class id to label string, stored in the model config.
        label2id: Mapping from label string to class id, stored in the model config.
        ds: Training dataset.
        eval_ds: Evaluation dataset.
        args: ``TrainingArguments`` instance.
        data_collator: Collator used to build batches.
        tokenizer: Tokenizer handed to the Trainer.
        save_path: Directory where the final model is written.

    Returns:
        None. The trained model is only available on disk at ``save_path``.
    """
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(all_labels),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    # Re-initialise the classifier weights with a smaller spread
    # (default: mean=0.0, std=0.02).
    model.classifier.weight.data.normal_(mean=0.0, std=0.01)

    #if model_name == "microsoft/deberta-v3-large":
    #    # freezing embeddings and first 4 layers of encoder
    #    freeze(model.deberta.embeddings)
    #    for i, layer in enumerate(model.deberta.encoder.layer[:4]):
    #        print(f"freeze layer {i+1} of encoder")
    #        freeze(layer)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds,
        eval_dataset=eval_ds,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=partial(compute_metrics, all_labels=all_labels),
    )
    trainer.train()
    trainer.save_model(save_path)

    # Release memory: drop the references, collect garbage first so tensors kept
    # alive only by reference cycles are freed, then return the cached CUDA
    # blocks to the driver.
    del model, trainer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Wrap both frames as Hugging Face datasets without carrying over the pandas index.
ds = HF_dataset.from_pandas(train_df, preserve_index=False)
ds_val = HF_dataset.from_pandas(val_df, preserve_index=False)

# Fine-tune and save the model; arguments are spelled out by name for readability.
train(
    model_name=Config.model_name,
    all_labels=all_labels,
    id2label=id2label,
    label2id=label2id,
    ds=ds,
    eval_ds=ds_val,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    save_path=Config.save_path,
)

# Write the tokenizer files into the same directory as the saved model.
tokenizer.save_pretrained(Config.save_path)


In [None]:
!ls -tl output

# DONE! If you don't want to use the Hugging Face Trainer, you have to write the training loop yourself.

## (Load data)

In [None]:
# pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/train.json').shape

In [None]:
# Split the data into train and validation sets
#train_df, val_df = train_test_split(df_train, test_size=0.2, random_state=42)

## (Build dataset)

In [None]:
'''class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['full_text']
        label = self.dataframe.iloc[idx]['document']
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)  # Convert label to tensor
        }'''

In [None]:
# Initialize the BERT tokenizer
#tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


In [None]:
'''# Define the maximum sequence length
max_length = 128
# batch size
batch_size = 16'''

In [None]:
#train_df.iloc[4750].full_text

In [None]:
#tokenizer(train_df.iloc[4750].tokens,add_special_tokens=False)

In [None]:
#tokenizer.tokenize(train_df.iloc[4750].full_text,)

In [None]:
#tokenizer.convert_ids_to_tokens(tokenizer(train_df.iloc[4750].full_text).input_ids)

## (Build dataloader)

In [None]:
'''# Create instances of the custom dataset class for training and validation
train_dataset = CustomDataset(train_df, tokenizer, max_length)
val_dataset = CustomDataset(val_df, tokenizer, max_length)'''

In [None]:
'''train_df['document'].max()'''

In [None]:
'''# Create data loaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)'''

## (Build model)

In [None]:
'''# Initialize the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df_train['document'].unique()))'''

## (Build optimizer and loss)

In [None]:
'''# optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define the loss function
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)'''

## (set device)

In [None]:
'''# Define the device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")'''

In [None]:
'''model.to(device)'''

## (Define training process)

In [None]:
'''def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler=None):
    model.train()
    losses = []
    correct_predictions = 0

    for batch in data_loader:  # Iterate over batches in the data loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)

        # Use ignore_index to handle potential unseen labels during training
        loss = loss_fn(outputs.logits, labels, ignore_index=-1)  # Set -1 as ignore_index (optional)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if scheduler:
            scheduler.step()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)'''

## (Define Evaluation)

In [None]:
'''# Define the evaluation function
def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:  # Iterate over batches in the data loader
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            loss = loss_fn(outputs.logits, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)'''

In [None]:
'''import torch.nn.functional as F'''

## (Train and eval)

In [None]:
'''# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    train_losses = []
    correct_predictions = 0
    total_samples = 0
  
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        print(input_ids.shape,labels.shape)
      
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        print(logits.shape)
        logits = outputs.logits
      
        # Calculate the loss
        loss = F.cross_entropy(logits, labels, ignore_index=-1)
      
        loss.backward()
        optimizer.step()
      
        train_losses.append(loss.item())
        correct_predictions += torch.sum(torch.argmax(logits, dim=1) == labels)
        total_samples += labels.size(0)
      
    train_accuracy = correct_predictions.double() / total_samples
    train_loss = sum(train_losses) / len(train_losses)
  
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Accuracy: {train_accuracy:.4f}, Train Loss: {train_loss:.4f}')'''