In [1]:
import pandas as pd
from dataclasses import dataclass, field
from transformers.models.electra.modeling_electra import ElectraClassificationHead
from transformers.trainer_utils import EvaluationStrategy
from typing import Optional
import sys
import os
from logical_fallacy.codes_for_models.finetune.util import *
from logical_fallacy.codes_for_models.finetune.evaluate import *
from pathlib import Path

from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoModelForSequenceClassification,
    AutoModelForMultipleChoice,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    HfArgumentParser,
    Seq2SeqTrainer,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    ElectraTokenizerFast,
    Seq2SeqTrainingArguments,
    set_seed,
)

In [4]:
path = "logical_fallacy/data/"
train = pd.read_csv(path + "climate_train_mh.csv")
dev = pd.read_csv(path + "climate_dev_mh.csv")
test = pd.read_csv(path + "climate_test_mh.csv")
train.shape[0] + dev.shape[0] + test.shape[0]

1079

In [2]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    task_type: str = field(
        metadata={"help": "Task type, can be either generation or classification"}
    )
    num_labels: str = field(
        metadata={"help": "Number of labels, used for sequence classification"}
    )
    mode: str = field(
        metadata={"help": "mode, can be either train or test"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    freeze_encoder: bool = field(default=False, metadata={"help": "Whether tp freeze the encoder."})
    freeze_embeds: bool = field(default=False, metadata={"help": "Whether  to freeze the embeddings."})
        

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    data_dir: str = field(
        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
    )
    test_type: Optional[str] = field(
        default="test", metadata={"help": "The type_path of the test file, test.seen, test.unseen etc."}
    )
    task: Optional[str] = field(
        default="summarization",
        metadata={"help": "Task name, summarization (or summarization_{dataset} for pegasus) or translation"},
    )
    max_source_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_target_length: Optional[int] = field(
        default=64,
        metadata={
            "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    val_max_target_length: Optional[int] = field(
        default=64,
        metadata={
            "help": "The maximum total sequence length for validation target text after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded. "
            "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
            "during ``evaluate`` and ``predict``."
        },
    )
    test_max_target_length: Optional[int] = field(
        default=300,
        metadata={
            "help": "The maximum total sequence length for test target text after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    n_train: Optional[int] = field(default=None, metadata={"help": "# training examples. None means use all."})
    n_val: Optional[int] = field(default=None, metadata={"help": "# validation examples. None means use all."})
    n_test: Optional[int] = field(default=None, metadata={"help": "# test examples. None means use all."})
    eval_beams: Optional[int] = field(default=None, metadata={"help": "# num_beams to use for evaluation."})
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata={"help": "If only pad tokens should be ignored. This assumes that `config.pad_token_id` is defined."},
    )

@dataclass
class EvalArguments:
    """
    Arguments pertaining to the evaluation of the model.
    """

    decode: Optional[str] = field(
        default='beam_search', metadata={"help": "Decoding method used, take in value of beam_search, nucleus"}
    )
    metric: Optional[str] = field(
        default='bleu', metadata={"help": "The metric used to evaluate the model, takes in value of bleu, rouge, meteor etc"}
    )
    compute_metric: Optional[bool] = field(
        default=False, metadata={"help": "whether to compute metrics while generating the outputs, must be False if num_samples > 1"}
    )
    num_beams: Optional[int] = field(
        default=5, metadata={"help": "beam size used to decode"}
    )
    num_samples: Optional[int] = field(
        default=1, metadata={"help": "Number of decoded sequence for each input"}
    )

In [3]:
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, EvalArguments, Seq2SeqTrainingArguments))

In [4]:
model_args, data_args, eval_args, training_args = parser.parse_json_file(json_file="train.json")

  return torch._C._cuda_getDeviceCount() > 0


In [5]:
training_args.__dict__

{'output_dir': 'LF_train/results/fallacy/propa/deberta',
 'overwrite_output_dir': True,
 'do_train': True,
 'do_eval': True,
 'do_predict': False,
 'evaluation_strategy': <IntervalStrategy.STEPS: 'steps'>,
 'prediction_loss_only': False,
 'per_device_train_batch_size': 8,
 'per_device_eval_batch_size': 128,
 'per_gpu_train_batch_size': None,
 'per_gpu_eval_batch_size': None,
 'gradient_accumulation_steps': 2,
 'eval_accumulation_steps': None,
 'eval_delay': 0,
 'learning_rate': 1e-05,
 'weight_decay': 0.0,
 'adam_beta1': 0.9,
 'adam_beta2': 0.999,
 'adam_epsilon': 1e-08,
 'max_grad_norm': 1.0,
 'num_train_epochs': 5,
 'max_steps': -1,
 'lr_scheduler_type': <SchedulerType.LINEAR: 'linear'>,
 'warmup_ratio': 0.0,
 'warmup_steps': 1931,
 'log_level': 'passive',
 'log_level_replica': 'passive',
 'log_on_each_node': True,
 'logging_dir': 'LF_train/results/fallacy/propa/logs',
 'logging_strategy': <IntervalStrategy.STEPS: 'steps'>,
 'logging_first_step': False,
 'logging_steps': 300,
 'loggi

In [6]:
config = AutoConfig.from_pretrained(
    model_args.config_name if model_args.config_name else model_args.model_name,
    cache_dir=model_args.cache_dir
)

In [29]:
model_args.model_name

'microsoft/deberta-base'

In [27]:
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name,
    cache_dir=model_args.cache_dir,
)

loading configuration file config.json from cache at /home/crarojasca/.cache/huggingface/hub/models--microsoft--deberta-base/snapshots/0d1b43ccf21b5acd9f4e5f7b077fa698f05cf195/config.json
Model config DebertaConfig {
  "_name_or_path": "microsoft/deberta-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "transformers_version": "4.23.1",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

loading file vocab.json from cache at /home/crarojasca/.cache/huggingface/hub/models--micr

In [8]:
if model_args.task_type == 'generation':
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name,
        config=config,
        cache_dir=model_args.cache_dir
    )
else:
    if model_args.model_name != 'microsoft/DialogRPT-updown' and config.num_labels is None:
        config.num_labels = int(model_args.num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name,
        config=config,
        cache_dir=model_args.cache_dir
    )

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['pooler.dense.weight', 'classi

In [9]:
# Get datasets
train_dataset = Seq2SeqDataset(
    tokenizer,
    type_path="train",
    task_type = model_args.task_type,
    mode = model_args.mode,
    data_dir=data_args.data_dir,
    n_obs=data_args.n_train,
    max_target_length=data_args.max_target_length,
    max_source_length=data_args.max_source_length,
    prefix=model.config.prefix or "",
)

In [10]:
data_args.data_dir

'logical_fallacy/data/'

In [11]:
eval_dataset = Seq2SeqDataset(
    tokenizer,
    type_path="dev",
    task_type = model_args.task_type,
    mode = model_args.mode,
    data_dir=data_args.data_dir,
    n_obs=data_args.n_val,
    max_target_length=data_args.val_max_target_length,
    max_source_length=data_args.max_source_length,
    prefix=model.config.prefix or "",
)

In [12]:
test_dataset = (
        Seq2SeqDataset(
            tokenizer,
            type_path=data_args.test_type,
            task_type = model_args.task_type,
            mode = model_args.mode,
            data_dir=data_args.data_dir,
            n_obs=data_args.n_test,
            max_target_length=data_args.test_max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )
    )

In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset if training_args.do_predict else eval_dataset,
    data_collator=Seq2SeqDataCollator(tokenizer, config.decoder_start_token_id,model_args.task_type, model_args.mode, data_args),
    tokenizer=tokenizer,
)

In [14]:
# if model_args.mode == 'train':
#     check_output_dir(training_args)#check if output_dir exists and raises error if it exists over_wirte=False

#     set_seed(training_args.seed)#set training seed
#     if model_args.freeze_embeds:
#         freeze_embeds(model)
#     if model_args.freeze_encoder:
#         freeze_params(model.get_encoder())
#         assert_all_frozen(model.get_encoder())
#     trainer.train()
#     trainer.save_model(Path(training_args.output_dir).joinpath("best-epoch"))#save best epoch

# elif model_args.mode == 'output_prob':
#     out_prob(model, tokenizer, trainer.get_test_dataloader(test_dataset), Path(training_args.output_dir).joinpath("prob.txt"))

In [15]:
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [16]:
model =  AutoModelForSequenceClassification.from_pretrained(
    '/home/crarojasca/Monash/MDFI_misinformation/logical_fallacy/saved_models/electra-logicclimate', 
    num_labels=3
)

loading configuration file /home/crarojasca/Monash/MDFI_misinformation/logical_fallacy/saved_models/electra-logicclimate/config.json
Model config ElectraConfig {
  "_name_or_path": "/home/crarojasca/Monash/MDFI_misinformation/logical_fallacy/saved_models/electra-logicclimate",
  "architectures": [
    "ElectraForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 768,
  "finetuning_task": "mnli",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classifi

In [25]:
test_dataset.src_file = "./logical_fallacy/data/test.source"
test_dataset.tgt_file = "./logical_fallacy/data/test.target"
test_dataset.len_file = 1350

In [24]:
test_dataset

<logical_fallacy.codes_for_models.finetune.util.Seq2SeqDataset at 0x7fc83e7dbdf0>

In [23]:
test_dataset

<logical_fallacy.codes_for_models.finetune.util.Seq2SeqDataset at 0x7fc83e7dbdf0>

In [26]:
for batch in test_dataset:
    print(test_dataset)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



AssertionError: empty source line for index 1351

In [20]:
for inputs in tqdm(list(test_dataset)):
    print(inputs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



AssertionError: empty source line for index 1351

In [18]:
with torch.no_grad():
    for inputs in tqdm(list(test_dataset)):
        print(inputs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



AssertionError: empty source line for index 1351

In [19]:
with torch.no_grad():
    for inputs in tqdm(list(test_dataset)):
        inputs = {k: v.cuda() for k, v in inputs.items() if k!='labels'}
        outputs = model(**inputs)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



AssertionError: empty source line for index 1351

In [33]:
import linecache

test_dataset.src_file = "./logical_fallacy/data/test.source"
test_dataset.tgt_file = "./logical_fallacy/data/test.target"
test_dataset.len_file = "./logical_fallacy/data/test.len"

In [30]:
test_dataset.__dict__

{'src_file': PosixPath('logical_fallacy/data/dev.source'),
 'tgt_file': PosixPath('logical_fallacy/data/dev.target'),
 'len_file': PosixPath('logical_fallacy/data/dev.len'),
 'src_lens': [127,
  150,
  150,
  108,
  270,
  120,
  70,
  297,
  275,
  333,
  74,
  99,
  122,
  110,
  38,
  73,
  129,
  56,
  122,
  374,
  89,
  173,
  326,
  128,
  141,
  104,
  209,
  208,
  341,
  120,
  268,
  235,
  144,
  170,
  193,
  172,
  75,
  311,
  229,
  329,
  351,
  72,
  295,
  89,
  91,
  133,
  249,
  240,
  237,
  438,
  281,
  169,
  119,
  223,
  171,
  154,
  203,
  163,
  83,
  87,
  141,
  172,
  131,
  114,
  245,
  125,
  166,
  232,
  432,
  179,
  369,
  315,
  284,
  174,
  143,
  220,
  305,
  422,
  469,
  115,
  311,
  187,
  253,
  138,
  337,
  197,
  190,
  109,
  250,
  287,
  124,
  208,
  141,
  158,
  275,
  406,
  139,
  253,
  201,
  385,
  222,
  165,
  367,
  217,
  129,
  393,
  274,
  153,
  202,
  198,
  50,
  307,
  158,
  390,
  204,
  175,
  79,
  254,
  1

In [34]:
linecache.getline(str("./logical_fallacy/data/test.source"), 1)

'the text has the logical fallacy of cherry-picking </s> green activists are at war with the greatest american foe since the axis powers — or so they say . the latest democratic party platform compares the fight against global warming to world war ii . using terms such as " battlefield , " " siege , " and " front , " those opposed this " war effort " have been labeled anything from nazis to holocaust deniers . ( i personally have been called a sociopath by climate activist joe romm of the center for american progress , another story . ) the upcoming election has inspired dire concern . do n’t “ vote for climate catastrophe ” warned a washington post editorialist . “ at this point , ” stated michael klare , professor of peace and world security at hampshire college , “ electing green - minded leaders , stopping climate deniers ( or ignorers ) from capturing high office , and opposing fossil -fueled ultranationalism is the only realistic path to a habitable planet . ” desperation backfir

In [26]:
import os

os.path.isfile("/home/crarojasca/Monash/MDFI_misinformation/logical_fallacy/data/test.source")

True