In [1]:
import functools
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import tqdm
import transformers
from transformers_interpret import SequenceClassificationExplainer

In [2]:
# Load the labelled corpus and rename the `is_fake` column to `label`, the
# column name carried through the tokenized splits into training.
cols = ['label', 'text']
df = pd.read_csv('../data/greek_fake_news.csv').rename(columns={"is_fake": "label"})

# 80/20 train/test split. The seed is fixed so that a fresh
# "Restart Kernel -> Run All" reproduces the same split, and therefore the
# same evaluation numbers, instead of drawing a new random split every run.
raw_datasets = Dataset.from_pandas(df[cols]).train_test_split(train_size=0.8, seed=42)


In [3]:
# Identifier of the pretrained checkpoint; the same name is passed to the
# model loader further down so tokenizer and model stay in sync.
transformer_name = 'nlpaueb/bert-base-greek-uncased-v1'

# Tokenizer that belongs to that checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    transformer_name,
)

In [4]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the notebook's tokenizer.

    Sequences are padded to ``max_length`` and truncated, with ``max_length``
    set to 512, so every encoded example has the same length.
    """
    encode_options = {"padding": "max_length", "max_length": 512, "truncation": True}
    return tokenizer(examples["text"], **encode_options)


# Apply the tokenizer to every split, in batched mode.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# Show the tokenized splits (feature names and row counts) as the cell output.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 80
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 20
    })
})

In [6]:
# NOTE: the "small" and "full" names are aliases of the very same split
# objects -- no subsampling happens here.
small_train_dataset = full_train_dataset = tokenized_datasets["train"]
small_eval_dataset = full_eval_dataset = tokenized_datasets["test"]

In [7]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint with a sequence-classification head sized
# for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    transformer_name,
    num_labels=2,
)

Some weights of the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

In [8]:
from transformers import TrainingArguments

# Fine-tuning configuration: outputs are written to ./model (overwriting any
# previous run), batches of 4 per device for both training and evaluation,
# 5 epochs.
training_args = TrainingArguments(
    output_dir="model",
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
)

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Accuracy is computed directly with NumPy rather than through
    ``datasets.load_metric``, which is deprecated in the ``datasets`` library
    and needs a metric-script download for this one-line calculation.

    Parameters
    ----------
    eval_pred
        A pair that unpacks into ``(logits, labels)``. ``logits`` is reduced
        with ``argmax`` over its last axis to obtain the predicted class ids.

    Returns
    -------
    dict
        ``{"accuracy": <float>}`` -- the fraction of predictions equal to
        the reference labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Element-wise comparison -> boolean array; its mean is the accuracy.
    accuracy = np.mean(predictions == np.asarray(labels))
    return {"accuracy": float(accuracy)}

In [10]:
from transformers import Trainer

# Bundle the model, the training configuration, both data splits and the
# metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

# Run fine-tuning; as the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 80
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 100


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=0.1457185935974121, metrics={'train_runtime': 35.8508, 'train_samples_per_second': 11.157, 'train_steps_per_second': 2.789, 'total_flos': 105244422144000.0, 'train_loss': 0.1457185935974121, 'epoch': 5.0})

In [11]:
# Evaluate on the eval split given to the Trainer; the returned metrics dict
# (eval_loss, eval_accuracy, runtime statistics) is shown as the cell output.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 4


{'eval_loss': 0.329243928194046,
 'eval_accuracy': 0.95,
 'eval_runtime': 0.622,
 'eval_samples_per_second': 32.154,
 'eval_steps_per_second': 8.039,
 'epoch': 5.0}

In [28]:
# Sample sentence whose prediction we want to explain.
txt = """
Σύμφωνα με το σχέδιο, οι εργοδότες του Μίτσιγκαν θα μπορούσαν να χρησιμοποιούν μικροτσίπ, αλλά δεν θα μπορούσαν (προς το παρόν) να υποχρεώσουν τους εργαζομένους να εμφυτεύσουν τέτοιες συσκευές.
"""

# Build an explainer around the fine-tuned model and its tokenizer, then
# compute per-token attributions for the sample text.
cls_explainer = SequenceClassificationExplainer(
    model,
    tokenizer)
word_attributions = cls_explainer(txt)

# visualize() renders the attribution table itself; the trailing semicolon
# stops Jupyter from also echoing the call's return value, which previously
# made the same table appear twice in the cell output.
cls_explainer.visualize();

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,LABEL_1 (1.00),LABEL_1,2.63,"[CLS] συμφωνα με το σχεδιο , οι εργοδοτες του μιτ ##σι ##γκαν θα μπορουσαν να χρησιμοποιουν μικρο ##τσι ##π , αλλα δεν θα μπορουσαν ( προς το παρον ) να υπο ##χρεω ##σουν τους εργαζομενους να εμ ##φυτευ ##σουν τετοιες συσκευες . [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,LABEL_1 (1.00),LABEL_1,2.63,"[CLS] συμφωνα με το σχεδιο , οι εργοδοτες του μιτ ##σι ##γκαν θα μπορουσαν να χρησιμοποιουν μικρο ##τσι ##π , αλλα δεν θα μπορουσαν ( προς το παρον ) να υπο ##χρεω ##σουν τους εργαζομενους να εμ ##φυτευ ##σουν τετοιες συσκευες . [SEP]"
,,,,
