In [1]:
from dataclasses import dataclass, field
from transformers.models.electra.modeling_electra import ElectraClassificationHead
from transformers.trainer_utils import EvaluationStrategy
from typing import Optional
import sys
import os
from logical_fallacy.codes_for_models.finetune.util import *
from logical_fallacy.codes_for_models.finetune.evaluate import *
from pathlib import Path

from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoModelForMultipleChoice,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    ElectraTokenizerFast,
    Seq2SeqTrainingArguments,
    set_seed,
)

In [2]:
@dataclass
class ModelArguments:
    """
    Arguments selecting the pretrained model, config and tokenizer to fine-tune or evaluate.

    ``model_name``, ``task_type``, ``num_labels`` and ``mode`` have no default and must
    always be supplied; every other field is an optional override.
    """

    # Checkpoint path or hub identifier; also the fallback for config/tokenizer lookup.
    model_name: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    # 'generation' selects the seq2seq model; anything else is treated as classification.
    task_type: str = field(
        metadata={"help": "Task type, can be either generation or classification"}
    )
    # Kept as a string for interface compatibility; the notebook casts it with int() before use.
    num_labels: str = field(
        metadata={"help": "Number of labels, used for sequence classification"}
    )
    # The training split is only loaded when this equals 'train'.
    mode: str = field(
        metadata={"help": "mode, can be either train or test"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    # Help texts below previously contained typos ("tp", doubled space); corrected here.
    freeze_encoder: bool = field(default=False, metadata={"help": "Whether to freeze the encoder."})
    freeze_embeds: bool = field(default=False, metadata={"help": "Whether to freeze the embeddings."})
        

@dataclass
class DataTrainingArguments:
    """
    Options describing the data fed to the model: where it lives, how long the
    tokenized sequences may be, and how many examples of each split to use.
    """

    # --- data location / task selection ---
    data_dir: str = field(
        metadata=dict(
            help="The input data dir. Should contain the .tsv files (or other data files) for the task."
        )
    )
    test_type: Optional[str] = field(
        metadata=dict(help="The type_path of the test file, test.seen, test.unseen etc."),
        default="test",
    )
    task: Optional[str] = field(
        metadata=dict(
            help="Task name, summarization (or summarization_{dataset} for pegasus) or translation"
        ),
        default="summarization",
    )

    # --- sequence length limits (in tokens) ---
    max_source_length: Optional[int] = field(
        metadata=dict(
            help=(
                "The maximum total input sequence length after tokenization. "
                "Sequences longer than this will be truncated, sequences shorter will be padded."
            )
        ),
        default=128,
    )
    max_target_length: Optional[int] = field(
        metadata=dict(
            help=(
                "The maximum total sequence length for target text after tokenization. "
                "Sequences longer than this will be truncated, sequences shorter will be padded."
            )
        ),
        default=64,
    )
    val_max_target_length: Optional[int] = field(
        metadata=dict(
            help=(
                "The maximum total sequence length for validation target text after tokenization. "
                "Sequences longer than this will be truncated, sequences shorter will be padded. "
                "This argument is also used to override the ``max_length`` param of ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        ),
        default=64,
    )
    test_max_target_length: Optional[int] = field(
        metadata=dict(
            help=(
                "The maximum total sequence length for test target text after tokenization. "
                "Sequences longer than this will be truncated, sequences shorter will be padded."
            )
        ),
        default=300,
    )

    # --- per-split example caps (None keeps every example) ---
    n_train: Optional[int] = field(
        metadata=dict(help="# training examples. None means use all."), default=None
    )
    n_val: Optional[int] = field(
        metadata=dict(help="# validation examples. None means use all."), default=None
    )
    n_test: Optional[int] = field(
        metadata=dict(help="# test examples. None means use all."), default=None
    )

    # --- evaluation / loss behaviour ---
    eval_beams: Optional[int] = field(
        metadata=dict(help="# num_beams to use for evaluation."), default=None
    )
    ignore_pad_token_for_loss: bool = field(
        metadata=dict(
            help="If only pad tokens should be ignored. This assumes that `config.pad_token_id` is defined."
        ),
        default=True,
    )

@dataclass
class EvalArguments:
    """
    Options controlling how outputs are decoded and scored at evaluation time.
    """

    # Decoding strategy name.
    decode: Optional[str] = field(
        metadata=dict(help="Decoding method used, take in value of beam_search, nucleus"),
        default="beam_search",
    )
    # Name of the scoring metric.
    metric: Optional[str] = field(
        metadata=dict(
            help="The metric used to evaluate the model, takes in value of bleu, rouge, meteor etc"
        ),
        default="bleu",
    )
    # Toggle for scoring during generation.
    compute_metric: Optional[bool] = field(
        metadata=dict(
            help="whether to compute metrics while generating the outputs, must be False if num_samples > 1"
        ),
        default=False,
    )
    # Beam width for decoding.
    num_beams: Optional[int] = field(
        metadata=dict(help="beam size used to decode"),
        default=5,
    )
    # How many sequences to decode per input.
    num_samples: Optional[int] = field(
        metadata=dict(help="Number of decoded sequence for each input"),
        default=1,
    )

In [3]:
# One parser covering all four argument groups; results come back in this same order.
parser = HfArgumentParser(
    (
        ModelArguments,
        DataTrainingArguments,
        EvalArguments,
        Seq2SeqTrainingArguments,
    )
)

In [4]:
# Read every argument group from train.json (path is relative to the working directory).
(
    model_args,
    data_args,
    eval_args,
    training_args,
) = parser.parse_json_file(json_file="train.json")

In [5]:
# Load the model config; an explicit config_name wins, otherwise fall back to model_name.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name,
    cache_dir=model_args.cache_dir,
)

In [6]:
# Load the tokenizer; an explicit tokenizer_name wins, otherwise fall back to model_name.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name,
    cache_dir=model_args.cache_dir,
)

In [7]:
# Instantiate the model that matches the requested task type.
if model_args.task_type == 'generation':
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name,
        config=config,
        cache_dir=model_args.cache_dir
    )
else:
    # `config.num_labels` is never None: per the transformers PretrainedConfig docs it
    # defaults to 2 and is derived from `id2label`. The previous `is None` guard therefore
    # never fired and `model_args.num_labels` was silently ignored, leaving a 2-way head
    # on base checkpoints. Apply the requested label count whenever it differs instead.
    # DialogRPT-updown keeps the label count stored in its own config, as before.
    if model_args.model_name != 'microsoft/DialogRPT-updown':
        requested_num_labels = int(model_args.num_labels)
        if config.num_labels != requested_num_labels:
            config.num_labels = requested_num_labels
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name,
        config=config,
        cache_dir=model_args.cache_dir
    )

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.d

In [8]:
# Build the training split; outside of train mode no training data is loaded at all.
if model_args.mode == 'train':
    train_dataset = Seq2SeqDataset(
        tokenizer,
        type_path="train",
        task_type=model_args.task_type,
        mode=model_args.mode,
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        # A missing/empty config prefix is normalised to the empty string.
        prefix=model.config.prefix or "",
    )
else:
    train_dataset = None

FileNotFoundError: [Errno 2] No such file or directory: 'logical_fallacy/data/train.source'