In [1]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from transformers import RobertaModel, RobertaTokenizer

import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import transformers
from transformers import (
    HfArgumentParser, Seq2SeqTrainingArguments, AutoConfig, AutoTokenizer, 
    AutoModelForSequenceClassification, Seq2SeqTrainer
)

import json
from tqdm import tqdm
from pathlib import Path

from dataset import ClaimsData
from arguments import ModelArguments, DataTrainingArguments, EvalArguments

# TensorBoard writer shared by the notebook (Logger.log further down writes through it);
# "runs/exp1" is the directory argument handed to SummaryWriter.
writer = SummaryWriter("runs/exp1")

## Parameters

In [2]:
# Hyper-parameters referenced by the data-preparation cells below.
MAX_LEN = 256  # third argument passed to every ClaimsData split

# Per-split batch sizes consumed by the DataLoader keyword dictionaries.
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE, TEST_BATCH_SIZE = 8, 4, 8

# EPOCHS = 1  (disabled)
LEARNING_RATE = 1e-5

## Components

In [3]:
## Loading Components
# Parse train.json into one dataclass instance per argument group.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, EvalArguments, Seq2SeqTrainingArguments))
model_args, data_args, eval_args, training_args = parser.parse_json_file(json_file="train.json")

# Size the classification head from the configured label count. Without `num_labels`
# the config keeps the library default and the value supplied in train.json is
# silently ignored. The int() call is there because the ModelArguments copy defined
# later in this notebook declares `num_labels` as a string.
# NOTE(review): assumes arguments.ModelArguments exposes `num_labels` as well — confirm.
config = AutoConfig.from_pretrained(
    model_args.config_name if model_args.config_name else model_args.model_name,
    num_labels=int(model_args.num_labels),
    cache_dir=model_args.cache_dir,
)

# Fall back to the model identifier when no dedicated tokenizer name is configured.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name,
    use_fast=False,
    cache_dir=model_args.cache_dir,
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name,
    config=config,
    cache_dir=model_args.cache_dir,
)
# Switch to training mode; binding the result to `_` keeps the module repr out of the cell output.
_ = model.train()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

## Data

In [4]:
## Reading data
# Read the whole claims table once; the PARTITION column decides which split a row belongs to.
data = pd.read_csv("../datasets/cards_waterloo.csv", low_memory=False)


def _partition_dataset(partition_name):
    """Wrap the rows whose PARTITION equals `partition_name` in a ClaimsData instance."""
    rows = data[data["PARTITION"] == partition_name]
    return ClaimsData(rows.reset_index(), tokenizer, MAX_LEN)


train_dataset = _partition_dataset("TRAIN")
valid_dataset = _partition_dataset("VALID")
test_dataset = _partition_dataset("TEST")

# Keyword arguments intended for torch DataLoader, one dictionary per split.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
valid_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0)

In [5]:
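# Manual DataLoaders are disabled; the Trainer cells below receive the datasets directly.
# NOTE: `collate_fn` is not defined in the visible part of this notebook.
# train_loader = DataLoader(train_dataset, **train_params, collate_fn=collate_fn)
# valid_loader = DataLoader(valid_dataset, **valid_params)
# test_loader = DataLoader(test_dataset, **test_params)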

In [7]:
# Sanity check: length of the `input_ids` entry of the first training example.
first_example = train_dataset[0]
len(first_example["input_ids"])

256

In [8]:
# Training
from transformers import Trainer

# Gather the constructor arguments first so the wiring is readable at a glance.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)

# Left as the last expression so the notebook displays the call's return value.
trainer.train()

***** Running training *****
  Num examples = 58590
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 18310
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mcrarojasca[0m ([33msoloteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [6]:
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelArguments:
    """
    Selects the pretrained model, config and tokenizer to fine-tune from.

    ``model_name``, ``task_type``, ``num_labels`` and ``mode`` have no default and
    must always be supplied; every remaining field is optional.
    """

    # --- required fields (no default) ---
    model_name: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )
    task_type: str = field(
        metadata=dict(help="Task type, can be either generation or classification"),
    )
    # Declared as ``str`` even though the help text describes a count.
    num_labels: str = field(
        metadata=dict(help="Number of labels, used for sequence classification"),
    )
    mode: str = field(
        metadata=dict(help="mode, can be either train or test"),
    )

    # --- optional overrides, all defaulting to None ---
    config_name: Optional[str] = field(
        default=None,
        metadata=dict(help="Pretrained config name or path if not the same as model_name"),
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata=dict(help="Where do you want to store the pretrained models downloaded from huggingface.co"),
    )

    # --- freezing switches, both off by default (help texts reproduced verbatim) ---
    freeze_encoder: bool = field(
        default=False,
        metadata=dict(help="Whether tp freeze the encoder."),
    )
    freeze_embeds: bool = field(
        default=False,
        metadata=dict(help="Whether  to freeze the embeddings."),
    )

@dataclass
class DataTrainingArguments:
    """
    Describes the data fed to the model for training and evaluation: where it lives,
    the task name, the sequence-length limits and optional caps on example counts.

    Only ``data_dir`` is required; every other field carries a default.
    """

    # --- location and task ---
    data_dir: str = field(
        metadata=dict(
            help="The input data dir. Should contain the .tsv files (or other data files) for the task."
        ),
    )
    test_type: Optional[str] = field(
        default="test",
        metadata=dict(help="The type_path of the test file, test.seen, test.unseen etc."),
    )
    task: Optional[str] = field(
        default="summarization",
        metadata=dict(
            help="Task name, summarization (or summarization_{dataset} for pegasus) or translation"
        ),
    )

    # --- sequence-length limits ---
    max_source_length: Optional[int] = field(
        default=128,
        metadata=dict(
            help=(
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        ),
    )
    max_target_length: Optional[int] = field(
        default=64,
        metadata=dict(
            help=(
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        ),
    )
    val_max_target_length: Optional[int] = field(
        default=64,
        metadata=dict(
            help=(
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. "
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        ),
    )
    test_max_target_length: Optional[int] = field(
        default=300,
        metadata=dict(
            help=(
                "The maximum total sequence length for test target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        ),
    )

    # --- optional caps on example counts (None means use all) ---
    n_train: Optional[int] = field(
        default=None,
        metadata=dict(help="# training examples. None means use all."),
    )
    n_val: Optional[int] = field(
        default=None,
        metadata=dict(help="# validation examples. None means use all."),
    )
    n_test: Optional[int] = field(
        default=None,
        metadata=dict(help="# test examples. None means use all."),
    )

    # --- evaluation / loss details ---
    eval_beams: Optional[int] = field(
        default=None,
        metadata=dict(help="# num_beams to use for evaluation."),
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata=dict(
            help="If only pad tokens should be ignored. This assumes that `config.pad_token_id` is defined."
        ),
    )

@dataclass
class EvalArguments:
    """
    Evaluation-time settings: decoding method, metric, and beam / sample counts.

    Every field has a default, so the class can be instantiated without arguments.
    """

    decode: Optional[str] = field(
        default="beam_search",
        metadata=dict(help="Decoding method used, take in value of beam_search, nucleus"),
    )
    metric: Optional[str] = field(
        default="bleu",
        metadata=dict(help="The metric used to evaluate the model, takes in value of bleu, rouge, meteor etc"),
    )
    compute_metric: Optional[bool] = field(
        default=False,
        metadata=dict(
            help="whether to compute metrics while generating the outputs, must be False if num_samples > 1"
        ),
    )
    num_beams: Optional[int] = field(
        default=5,
        metadata=dict(help="beam size used to decode"),
    )
    num_samples: Optional[int] = field(
        default=1,
        metadata=dict(help="Number of decoded sequence for each input"),
    )


In [7]:
from transformers import HfArgumentParser, Seq2SeqTrainingArguments

# Rebuild the four argument groups from train.json; the tuple order fixes the unpacking order.
argument_groups = (ModelArguments, DataTrainingArguments, EvalArguments, Seq2SeqTrainingArguments)
parser = HfArgumentParser(argument_groups)
model_args, data_args, eval_args, training_args = parser.parse_json_file(json_file="train.json")

NVIDIA A40 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70 sm_75.
If you want to use the NVIDIA A40 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [8]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

# Size the classification head from the configured label count. Without `num_labels`
# the config keeps the library default and the value supplied through ModelArguments
# is silently ignored. `num_labels` is declared as a string, hence the int() call.
config = AutoConfig.from_pretrained(
    model_args.config_name if model_args.config_name else model_args.model_name,
    num_labels=int(model_args.num_labels),
    cache_dir=model_args.cache_dir,
)

# Fall back to the model identifier when no dedicated tokenizer name is configured.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name,
    cache_dir=model_args.cache_dir,
)

In [9]:
# Load the sequence-classification model for the configured checkpoint; the log below
# reports that the classifier weights are newly initialised.
checkpoint_name = model_args.model_name
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name, config=config, cache_dir=model_args.cache_dir
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [10]:
from transformers import Seq2SeqTrainer

# Seq2SeqTrainer wired to the model, arguments and datasets prepared above. No custom
# collator is passed; the disabled Seq2SeqDataCollator call is kept for reference only:
# data_collator=Seq2SeqDataCollator(tokenizer, config.decoder_start_token_id,model_args.task_type, model_args.mode, data_args),
seq2seq_trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": valid_dataset,
    "tokenizer": tokenizer,
}
trainer = Seq2SeqTrainer(**seq2seq_trainer_kwargs)

In [11]:
# from torch import nn

# def freeze_params(model: nn.Module):
#     """Set requires_grad=False for each of model.parameters()"""
#     for par in model.parameters():
#         par.requires_grad = False

# freeze_params(model.get_encoder())

In [12]:
# Run the training loop, then write the model held by the trainer to <output_dir>/best-epoch.
# NOTE(review): whether these are really the best-epoch weights depends on the training
# arguments in train.json — confirm.
trainer.train()
best_epoch_dir = Path(training_args.output_dir) / "best-epoch"
trainer.save_model(best_epoch_dir)

***** Running training *****
  Num examples = 58590
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 18310
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mcrarojasca[0m ([33msoloteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


RuntimeError: CUDA error: no kernel image is available for execution on the device

In [None]:
class Logger:
    """Writes indexed groups of scalars through a TensorBoard-style writer.

    Args:
        evaluate: Truthy selects the "Test" tag prefix, falsy selects "Train".
        summary_writer: Optional object exposing ``add_scalars(tag, scalars, step)``.
            When omitted, the notebook-level ``writer`` is looked up at log time,
            which is the original behaviour. Passing it explicitly removes the
            dependency on the cell that creates the global having been executed.
    """

    def __init__(self, evaluate, summary_writer=None):
        self.partition = "Test" if evaluate else "Train"
        self.summary_writer = summary_writer

    def log(self, step, name, metric):
        """Record every element of ``metric`` as its own entry at ``step``.

        Args:
            step: Step value passed through to ``add_scalars``.
            name: Metric name; the tag becomes ``"<partition> <name>"``.
            metric: Iterable of values; element ``i`` is stored under key ``str(i)``.
        """
        scalars = {str(index): value for index, value in enumerate(metric)}
        # Fall back to the module-level writer only when none was injected.
        target = self.summary_writer if self.summary_writer is not None else writer
        target.add_scalars(f"{self.partition} {name}", scalars, step)

In [18]:
from datasets import load_dataset

In [19]:
# Load the "mrpc" configuration of the "glue" dataset; per the log below, the data is
# downloaded on first use and subsequent calls reuse the prepared copy.
raw_datasets = load_dataset("glue", "mrpc")

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /home/crojasca/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /home/crojasca/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
# Display the returned DatasetDict (splits, feature names and row counts).
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})