<a href="https://colab.research.google.com/github/chizuchizu/IOAI/blob/main/Task2/task2_complete_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# !cp /content/drive/MyDrive/fasttext_model.bin .

Mounted at /content/drive


In [24]:
# ====================================================
# CFG
# ====================================================

class CFG:
    num_workers = 4
    project = "IOAI_Task2_classification"
    name = "009_redrock_complete_task2_cbow_lr_5e5"

    base_model_name = "google-bert/bert-base-multilingual-uncased"
    base_tokenizer_name = "google-bert/bert-base-multilingual-uncased"

    # None -> training, otherwise -> load the model
    pretrained_model_name = None  # "ioai2024japan/fast_chizu_010_task2_complete_pretrain"

    fasttext_path = "/content/drive/MyDrive/fasttext_2_skip.bin"

    dropout = 0.3

    num_classes = 5

    # training
    pretrain_epochs = 2
    grid_epochs = 20
    classification_epochs = 35
    mlm_probability = 0.15

    if_grid = False
    scheduler='cosine_with_warmup' # ["linear", "cosine", "linear_with_warmup", "cosine_with_warmup"]
    pretrain_lr = 5e-5
    finetune_lr = 2e-5
    eps = 1e-8

    # dataset
    max_length = 256

    pretrain_train_batch_size = 32
    pretrain_eval_batch_size = 32

    finetune_train_batch_size = 64
    finetune_eval_batch_size = 64

    seed=42
    train=True

    pseudo_size = 60000
    pseudo_select_size = 1500

    if_wandb = True

    if_fasttext_tokenizer = True

# for wandb
cfg = dict(vars(CFG))
cfg = {k: v for k, v in cfg.items() if "__" not in k}

In [2]:
from google.colab import userdata

read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write_tt')

import importlib

import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0s
  !pip install evaluate==0.4.2
  !pip install accelerate -U

if importlib.util.find_spec('wandb') is None:
  !pip install wandb -q

if importlib.util.find_spec('fasttext') is None:
  !pip install fasttext -q

import os
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.functional import F
import torch.cuda.amp as amp

import torchvision
from torchvision import datasets, transforms, models

from tqdm.auto import tqdm
from transformers import DataCollatorForLanguageModeling

from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, get_scheduler, BertForMaskedLM, BertTokenizer, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, BertConfig

from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets

import evaluate

import wandb

import fasttext

from joblib import Parallel, delayed

import logging
logging.basicConfig(level=logging.DEBUG)

from huggingface_hub import login

if CFG.if_wandb:
    wandb.login(key=userdata.get('wandb_token'))

login(token=read_access_token)

f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

def to_device(batch, device):
    output = {}
    for k, v in batch.items():
        try:
            output[k] = v.to(device)
        except:
            output[k] = v
    return output


[34m[1mwandb[0m: Currently logged in as: [33mtoukyou-tochiji[0m ([33mtoukyou-tochiji-[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

seed_everything(CFG.seed)

In [4]:
def train_tokenizer(raw_dataset):
    train_corpus = []

    num_cores = 8

    train_corpus = [text for text in raw_dataset['train']["text"]]

    base_tokenizer = AutoTokenizer.from_pretrained(CFG.base_tokenizer_name)

    tokenizer = base_tokenizer.train_new_from_iterator(train_corpus, base_tokenizer.vocab_size)

    return tokenizer


In [5]:
def train_one_epoch(model, scheduler, train_loader, optimizer, if_grid, fp16=False):
    model.train()
    running_loss = 0.0
    progress_bar = tqdm(train_loader, dynamic_ncols=True, leave=(not if_grid))
    scaler = torch.cuda.amp.GradScaler()

    for step, batch in enumerate(progress_bar):
        batch = to_device(batch, "cuda")

        if fp16:
            with amp.autocast():
                outputs = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    labels=batch["labels"],
                )
                loss = outputs.loss

            # Scale loss for fp16 training
            scaler.scale(loss).backward()

            # Optimizer step with gradient scaling
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
        else:
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["labels"],
            )
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        if CFG.if_wandb and (not if_grid):
            wandb.log(
                {
                    "train_loss": loss,
                    "lr": optimizer.param_groups[0]["lr"],
                    "step": step,
                }
            )

        text = f"step {step}, loss: {loss:.5f}"
        progress_bar.set_description(text)

def evaluate_model(model, eval_loader):
    model.eval()
    predictions = []
    labels = []
    for batch in eval_loader:
        batch = to_device(batch, "cuda")
        with torch.no_grad():
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
                labels=batch["labels"],
            )

        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1)
        predictions.append(prediction.cpu().numpy())
        labels.append(batch["labels"].cpu().numpy())

    predictions = np.concatenate(predictions)
    labels = np.concatenate(labels)
    f1_score = f1.compute(predictions=predictions, references=labels, average='macro')
    return f1_score, predictions

def test_model(model, eval_loader):
    model.eval()
    predictions = []
    labels = []
    for batch in eval_loader:
        batch = to_device(batch, "cuda")
        with torch.no_grad():
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                token_type_ids=batch["token_type_ids"],
            )

        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1)
        predictions.append(prediction.cpu().numpy())

    predictions = np.concatenate(predictions)
    return predictions

In [6]:
def pretrain(raw_dataset, fasttext_model, tokenizer, transform_raw, fp16=True):
    print("=== Pretrain ===")

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=CFG.mlm_probability
    )

    tokenized_data = raw_dataset.with_transform(transform_raw)

    train_dataset = tokenized_data["train"]

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.pretrain_train_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=True,
        drop_last=False,
        collate_fn=data_collator,
    )

    model = BertForMaskedLM.from_pretrained(
        CFG.base_model_name
    )

    words = tokenizer.vocab
    fasttext_embedding = model.bert.embeddings.word_embeddings.weight
    with torch.no_grad():
        for word, i in words.items():  # key: word, value: i
            fasttext_embedding[i] = torch.tensor(fasttext_model.get_word_vector(word))
        model.bert.embeddings.word_embeddings.weight = fasttext_embedding

    model.cuda()

    num_training_steps = CFG.pretrain_epochs * len(train_loader)
    optimizer = optim.AdamW(model.parameters(), lr=CFG.pretrain_lr, eps=1e-08, betas=(0.9, 0.999))
    if CFG.scheduler == 'cosine_with_warmup':
        scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps/10, num_training_steps=num_training_steps)
    elif CFG.scheduler == 'linear_with_warmup':
        scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps/10, num_training_steps=num_training_steps)
    else:
        scheduler = get_scheduler(name=scheduler, optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    for i in range(CFG.pretrain_epochs):
        train_one_epoch(model, scheduler, train_loader, optimizer, False, fp16)
        print(f'Epoch {i+1}')

    model_path = f"{CFG.name}_pretrain"
    model.save_pretrained(model_path)
    return model_path

In [12]:
def finetune(base_model, train_dataset, eval_dataset, device, lr, scheduler, eps, fp16=True):
    print("=== Finetune ===")

    config = BertConfig.from_pretrained(base_model, token=userdata.get("hf_read_tt"), hidden_dropout_prob=CFG.dropout, attention_probs_dropout_prob=CFG.dropout, num_labels=CFG.num_classes)

    model = AutoModelForSequenceClassification.from_pretrained(
        base_model, config=config, token=userdata.get('hf_read_tt'),
    ).cuda()

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.finetune_train_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=True,
        drop_last=True,
    )

    eval_loader = DataLoader(
        eval_dataset,
        batch_size=CFG.finetune_eval_batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=False,
        drop_last=False,
    )

    num_training_steps = CFG.classification_epochs * len(train_loader)
    optimizer = optim.AdamW(model.parameters(), lr=lr, eps=eps, betas=(0.9, 0.999))

    if scheduler == 'cosine_with_warmup':
        scheduler = get_cosine_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps/10, num_training_steps=num_training_steps)
    elif scheduler == 'linear_with_warmup':
        scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_training_steps/10, num_training_steps=num_training_steps)
    else:
        scheduler = get_scheduler(name=scheduler, optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    best_f1 = 0.0
    best_model_path = None

    model.to(device)
    for i in range(CFG.classification_epochs):
        train_one_epoch(model, scheduler, train_loader, optimizer, False)
        f1_score, _ = evaluate_model(model, eval_loader)
        f1_score = f1_score["f1"]

        if f1_score > best_f1:
            best_f1 = f1_score
            best_model_path = f"{CFG.name}_finetune_epoch_{i+1}"
            model.save_pretrained(best_model_path)


        if CFG.if_wandb:
            wandb.log(
                {
                    "epoch": i+1,
                    "f1": f1_score
                }
            )
        print(f'Epoch {i+1} {f1_score}')

    # model_path = f"{CFG.name}_finetune"
    # model.save_pretrained(model_path)
    return best_model_path, best_f1

In [13]:
def up_to_hub(model_name, model_path, tokenizer):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=CFG.num_classes
    )
    model.push_to_hub(
        f"***/{model_name}",
        token=write_access_token, private=True
    )
    tokenizer.push_to_hub(
        f"***/{model_name}",
        token=write_access_token, private=True
    )

In [14]:
def grid_search(grid_search_params, base_model, train_dataset, eval_dataset, device):
    print("=== GRID SEARCH===")

    best_f1 = 0.0

    config = BertConfig.from_pretrained(base_model, hidden_dropout_prob=CFG.dropout, attention_probs_dropout_prob=CFG.dropout, num_labels=CFG.num_classes, token=userdata.get('hf_read_tt'))

    for lr in grid_search_params[0]:
        print(f"lr: {lr}")
        for scheduler in grid_search_params[1]:
          print(f"scheduler: {scheduler}")
          for eps in grid_search_params[2]:
              _, f1_score = finetune(base_model, train_dataset, eval_dataset, device, lr=lr, scheduler=scheduler, config=config, eps=eps)
              print(f'eps: {eps}, f1: {f1_score}')
              if f1_score > best_f1:
                  best_f1 = f1_score
                  best_scheduler = scheduler
                  best_lr = lr
                  best_eps = eps

    return best_lr, best_scheduler, best_eps

In [23]:
def main():
    raw_dataset = load_dataset('InternationalOlympiadAI/NLP_problem_raw', token=read_access_token)
    classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    fasttext_model = None

    if CFG.pretrained_model_name is not None:
        tokenizer = AutoTokenizer.from_pretrained(CFG.pretrained_model_name, token=userdata.get('hf_read_tt'))

    else:
        tokenizer = train_tokenizer(raw_dataset)
        # Train tokenizer to Lang X
        if CFG.if_fasttext_tokenizer:
            if os.path.exists(CFG.fasttext_path):
                fasttext_model = fasttext.load_model(CFG.fasttext_path)
            else:
                # switch model embeddings with FastText
                with open('file.txt', 'w') as f:
                    for line in raw_dataset['train']["text"]:
                        f.write(f"{line}\n")
                corpus_file = 'file.txt'
                fasttext_model = fasttext.train_unsupervised(corpus_file, dim=768, minCount=2)

                fasttext_model.save_model("fasttext.bin")
                # fasttext_model = fasttext.load_mo


    # for raw set
    def transform_raw(example_batch):
        # example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
        inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
        return inputs

    # for problem set
    def transform(example_batch):
        # example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
        inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
        inputs["labels"] = example_batch["label"]
        return inputs

    tokenized_data = classification_dataset.with_transform(transform)

    train_dataset = tokenized_data["train"]
    eval_dataset = tokenized_data["dev"]

    # Continual Pre-Training of MLM
    if CFG.pretrained_model_name is None:
        if CFG.if_wandb:
            wandb.init(
                name=CFG.name,
                project="IOAI_Task2_pretrain",
                config=cfg
            )
        pretrained_model_path = pretrain(raw_dataset, fasttext_model, tokenizer, transform_raw, fp16=True)
        up_to_hub(f"{CFG.name}_pretrain", pretrained_model_path, tokenizer)
    else:
        pretrained_model_path = CFG.pretrained_model_name

    if CFG.if_wandb:
        wandb.init(
            name=CFG.name,
            project="IOAI_Task2_finetune",
            config=cfg
        )

    # Finetune with normal dataset

    grid_search_params = [
        [1e-5, 2e-5],
        ['cosine', 'cosine_with_warmup'],
        [1e-08, 1e-06]
    ]

    if CFG.if_grid:
        best_lr, best_scheduler, best_eps = grid_search(grid_search_params, pretrained_model_path, train_dataset, eval_dataset, device)
    else:
        best_lr = CFG.finetune_lr
        best_scheduler = CFG.scheduler
        best_eps = CFG.eps

    finetuned_model_path, _ = finetune(pretrained_model_path, train_dataset, eval_dataset, device, best_lr, best_scheduler, best_eps)

    if CFG.if_wandb:
        wandb.finish()


    return finetuned_model_path, tokenizer

In [25]:
final_model_name, tokenizer = main()

=== Finetune ===




  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 1 0.23857252896089043


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 2 0.5220474988675339


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 3 0.6508242886689855


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 4 0.8907674884627644


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 5 0.9015328257568689


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 6 0.8922469635993119


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 7 0.8967462644881999


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 8 0.8758270271210471


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 9 0.9034528477168603


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 10 0.8775831541964265


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 11 0.8991636289394558


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 12 0.893630286806036


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 13 0.8931982676717446


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 14 0.9025149926738502


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 15 0.9022473861970621


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 16 0.9044209727060798


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 17 0.9034206283348809


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 18 0.898813729387439


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 19 0.8785461145646414


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 20 0.8783818011496567


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 21 0.8926896304469268


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 22 0.8883340564579807


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 23 0.883054305353957


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 24 0.8881040382297767


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 25 0.8883721849486108


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 26 0.8985341425057737


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 27 0.8988793180768004


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 28 0.8995285148413792


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 29 0.8836988615249485


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 30 0.8937229532868451


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 31 0.8995285148413792


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 32 0.89428574452147


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 33 0.89428574452147


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 34 0.89428574452147


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 35 0.8897052941167469


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
f1,▁▄▅████████████████████████████████
lr,▂▃▅▇██████▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁
step,▃▁▁▆▇▅▅▃▃█▁▆▆▇▅▅▃▃█▁▆▆▄▅▂▃▃█▁▆▆▄▅▂▃██▆▆▆
train_loss,██▇▅▃▃▂▂▂▂▁▂▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,35.0
f1,0.88971
lr,0.0
step,22.0
train_loss,0.0202


In [None]:
up_to_hub(CFG.name, final_model_name, tokenizer)

In [None]:
def terminate_session():
    # Terminate this session

    from google.colab import runtime
    runtime.unassign()

terminate_session()

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/2.43M [00:00<?, ?B/s]

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    final_model_name, num_labels=CFG.num_classes
)

In [None]:
# run the trained model on a dev/test split
classification_dataset = load_dataset('InternationalOlympiadAI/NLP_problem', token=read_access_token)

def transform_raw(example_batch):
    # example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
    inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
    return inputs

data_split = "dev"
tokenized_data = classification_dataset.with_transform(transform_raw)
test_dataset = tokenized_data[data_split]

test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.finetune_eval_batch_size,
    num_workers=0,
    pin_memory=True,
    shuffle=False,
    drop_last=False,
)
model.cuda()

In [None]:
predictions = test_model(model, test_loader)

In [None]:
# write the predictions to a file
with open('{}_predictions.txt'.format(data_split), 'w') as outfile:
  outfile.write('\n'.join([str(p) for p in predictions.tolist()]))

In [None]:
predictions

In [None]:
# UPDATE THIS CELL ACCORDINGLY

# define a funciton to load your tokenizer and model from a HF path
# the path variables can be strings or lists of strings (for ensemble solutions)
def load_model(path_to_tokenizer, path_to_model, token):
  # Example:
  tokenizer = AutoTokenizer.from_pretrained(path_to_tokenizer, token=token)
  model = AutoModelForSequenceClassification.from_pretrained(path_to_model, token=token)
  model.eval()

  return tokenizer, model

# define a "predict" function that takes the model and a list of input strings
# and returns the outputs as a list of integer classes
def predict(tokenizer, model, input_texts):
  #Example:
  predictions = []
  for input_text in input_texts:

    # input_ids = tokenizer(input_text, return_tensors="pt")
    # devanagari_text = transliterate_brahmi_to_devanagari(input_text)
    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=CFG.max_length, padding="max_length")
        # example_batch["text"] = [transliterate_brahmi_to_devanagari(x) for x in example_batch["text"]]
        # inputs = tokenizer([x for x in example_batch["text"]],  truncation=True, max_length=CFG.max_length, padding="max_length", return_tensors="pt")
    with torch.no_grad():
      logits = model(**input_ids).logits

    predictions.append(logits.argmax().item())

  return predictions


# set variables
path_to_model = "ioai2024japan/chizu_010_task2_complete" # can be a list instead
path_to_tokenizer = "ioai2024japan/chizu_010_task2_complete" # can be a list instead
model_access_token = read_access_token # a fine-grained token with read rights for your model repository
data_split = "test"

In [None]:
# DO NOT CHANGE THIS CELL!!!

tokenizer, model = load_model(path_to_model, path_to_tokenizer, token=model_access_token)

test_data = load_dataset("InternationalOlympiadAI/NLP_problem_test")['test']['text']

predictions = predict(tokenizer, model, test_data)

with open('{}_predictions.txt'.format(data_split), 'w') as outfile:
  outfile.write('\n'.join([str(p) for p in predictions]))