## Setup

In [None]:
# Mount Google Drive so the dataset and checkpoint paths under /content/drive are reachable.
from google.colab import drive
# force_remount=True requests a fresh mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
! pip install transformers==4.28.0 datasets accelerate

import pandas as pd
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
!pip install datasets

from datasets import load_dataset

!pip install transformers 
! pip install sentencepiece

!pip install sacrebleu

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

## Load AGENDA

In [None]:
%cd /content/drive/MyDrive/AGENDA/agenda_preprocess/processed
with open("training-src.txt", "r") as file:
  train_source = file.readlines()
with open("training-tgt.txt", "r") as file:
  train_target = file.readlines()
with open("test-src.txt", "r") as file:
  test_source = file.readlines()
with open("test-tgt.txt", "r") as file:
  test_target = file.readlines()
with open("dev-src.txt", "r") as file:
  val_source = file.readlines()
with open("dev-tgt.txt", "r") as file:
  val_target = file.readlines()

/content/drive/MyDrive/AGENDA/agenda_preprocess/processed


In [None]:
def _paired_frame(sources, targets):
    """Build a two-column frame pairing each source line with its target line."""
    rows = list(zip(sources, targets))
    return pd.DataFrame(rows, columns=['source', 'target'])


# One frame per split: row i holds the i-th source line and the i-th target line.
train_df = _paired_frame(train_source, train_target)
val_df = _paired_frame(val_source, val_target)
test_df = _paired_frame(test_source, test_target)

In [None]:
# NOTE(review): this import rebinds `Dataset`, which an earlier cell imported
# from torch.utils.data; from here on `Dataset` is the `datasets` class.
from datasets import Dataset

# Sequence length passed as max_length to the tokenizer by the tokenization
# functions defined below.
MAX_TOKENS = 225

# Wrap each split's DataFrame in a `datasets.Dataset`.
train_data, val_data, test_data = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

def _tokenize_column(examples, column):
    """Tokenize ``examples[column]`` and attach loss labels.

    Each text is tokenized with special tokens added and truncated/padded to
    ``MAX_TOKENS`` ids. ``labels`` is a copy of ``input_ids`` in which every
    padding id is replaced by -100.

    Args:
        examples: batch mapping from column name to a list of strings, as
            handed over by ``Dataset.map(..., batched=True)``.
        column: name of the text column to tokenize.

    Returns:
        The tokenizer output with an additional ``labels`` entry.
    """
    output = tokenizer(examples[column], add_special_tokens=True, max_length=MAX_TOKENS,
                       truncation=True, padding='max_length')

    pad_id = tokenizer.pad_token_id
    # -100 is a reserved value to ignore these tokens when calculating the loss
    output["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in ids]
        for ids in output["input_ids"]
    ]
    return output


def tokenize_function(examples):
    """Tokenize the ``source`` column of a batch and add ``labels``."""
    return _tokenize_column(examples, "source")


def tokenize_function_2(examples):
    """Tokenize the ``target`` column of a batch and add ``labels``."""
    return _tokenize_column(examples, "target")

# NOTE(review): tokenize_function_2 returns the same keys (input_ids,
# token_type_ids, attention_mask, labels) as tokenize_function, so the second
# pass appears to replace the columns written by the first and only the
# 'target' tokenization reaches the Trainer — confirm this is intended.
def _tokenize_split(split):
    """Apply the source pass and then the target pass to one dataset split."""
    return split.map(
        tokenize_function,
        batched=True).map(
        tokenize_function_2,
        batched=True)


# All three splits go through the same two passes, in the same order.
train_data = _tokenize_split(train_data)
val_data = _tokenize_split(val_data)
test_data = _tokenize_split(test_data)

Map:   0%|          | 0/38720 [00:00<?, ? examples/s]

Map:   0%|          | 0/38720 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized test split's summary (feature names and row count).
test_data

Dataset({
    features: ['source', 'target', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [None]:
# Show the raw (untokenized) source text of the first test example.
test_data[0]['source']

'<TITLE> hierarchical semantic classification : word sense disambiguation with world knowledge . <H> learning architecture <R> USED-FOR <T> lexical semantic classification problems\n'

## Train with PyTorch Trainer

In [None]:
from transformers import AutoModelForMaskedLM

# Load the bert-base-cased checkpoint through the masked-LM auto class.
model = AutoModelForMaskedLM.from_pretrained("bert-base-cased")

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
%cd /content/drive/MyDrive/AGENDA/AGENDA_train
from transformers import TrainingArguments, Trainer

model.resize_token_embeddings(len(tokenizer))
# Note the batch size of 4 to make sure we have multiple steps per epoch. This generally speeds up training
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/AGENDA/AGENDA_train/test1", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=2, # number of training epochs
    per_device_train_batch_size=10, # batch size for training
    per_device_eval_batch_size=10,  # batch size for evaluation
    load_best_model_at_end=True,
    warmup_steps=len(train_data) // 5,  # number of warmup steps for learning rate scheduler,
    weight_decay = 0.0,
    #max_steps = 200,
    learning_rate = 5e-4,
    logging_steps=1,
    log_level='info',
    evaluation_strategy='epoch',
    save_strategy='epoch'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


/content/drive/MyDrive/AGENDA/AGENDA_train


In [None]:
# Run training as configured in training_args (2 epochs; evaluation and a checkpoint at the end of each epoch).
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: source, target. If source, target are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 38,720
  Num Epochs = 2
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 7,744
  Number of trainable parameters = 108,340,804


Epoch,Training Loss,Validation Loss
1,6.6244,6.705758
2,6.7557,6.691794


The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: source, target. If source, target are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 10


Saving model checkpoint to /content/drive/MyDrive/AGENDA/AGENDA_train/test1/checkpoint-3872
Configuration saved in /content/drive/MyDrive/AGENDA/AGENDA_train/test1/checkpoint-3872/config.json
Configuration saved in /content/drive/MyDrive/AGENDA/AGENDA_train/test1/checkpoint-3872/generation_config.json
Model weights saved in /content/drive/MyDrive/AGENDA/AGENDA_train/test1/checkpoint-3872/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: source, target. If source, target are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 10
Saving model checkpoint to /content/drive/MyDrive/AGENDA/AGENDA_train/test1/checkpoint-7744
Configuration saved in /content/drive/MyDrive/AGENDA/AGENDA_train/test1/checkpoint-7744/config.json
Configuration saved in /content/drive/MyDrive/AGENDA/AGENDA_train/test1/checkp

TrainOutput(global_step=7744, training_loss=3.825598368722841, metrics={'train_runtime': 3897.2865, 'train_samples_per_second': 19.87, 'train_steps_per_second': 1.987, 'total_flos': 8957029251456000.0, 'train_loss': 3.825598368722841, 'epoch': 2.0})

In [None]:
# Save the model held by the trainer to training_args.output_dir.
trainer.save_model()

Saving model checkpoint to /content/drive/MyDrive/AGENDA/AGENDA_train/test1
Configuration saved in /content/drive/MyDrive/AGENDA/AGENDA_train/test1/config.json
Configuration saved in /content/drive/MyDrive/AGENDA/AGENDA_train/test1/generation_config.json
Model weights saved in /content/drive/MyDrive/AGENDA/AGENDA_train/test1/pytorch_model.bin


### Evaluate

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the validation split) and display the returned metrics.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: source, target. If source, target are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 10


{'eval_loss': 6.691793918609619,
 'eval_runtime': 15.2366,
 'eval_samples_per_second': 65.632,
 'eval_steps_per_second': 6.563,
 'epoch': 2.0}