# Persuasion Techniques in Text of Memes - Inference with multi-label models



## Environment Setup

##### Disk Setup

In [1]:
# Mount Google Drive at /content/drive; the project paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Project root on the mounted Drive. Data, output and scorer paths are built by
# plain string concatenation, so the trailing slash is required.
folder_name = "/content/drive/MyDrive/persuasion_technique_detection/"

##### Imports

In [3]:
!pip install transformers datasets wandb evaluate accelerate -qU sklearn_hierarchical_classification

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.7/251.7 kB[0m [31

In [4]:
import json
import numpy as np
import pandas as pd
import torch
import subprocess
import json
import wandb
import os

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_dataset

In [6]:
# Pick the compute device: CUDA when torch reports it as available, otherwise the CPU.
# AVAIL_GPUS ends up as the CUDA device count, or 0 on a CPU-only runtime.
if not torch.cuda.is_available():
    AVAIL_GPUS = 0
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    AVAIL_GPUS = torch.cuda.device_count()
    print(f'There are {AVAIL_GPUS} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla T4


In [7]:
# Authenticate against Weights & Biases (prompts for an API key if none is stored).
wandb.login()
# setup wandb environment variables
# NOTE(review): the artifact lookup in the prediction cell passes a bare artifact
# name, so it presumably relies on these two variables to resolve entity/project — confirm.
os.environ['WANDB_PROJECT'] = "subtask1_transformer_encoder_classification"
os.environ['WANDB_ENTITY'] = "tumnlp"
# NOTE(review): presumably a training-time setting (upload the model when a run ends);
# nothing in the inference cells below reads it directly — confirm it is still needed.
os.environ["WANDB_LOG_MODEL"]= "end"

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Pre-trained Transformer Name

In [8]:
# Hugging Face checkpoint id used to load the tokenizer; keep exactly one line active.
# NOTE(review): the model weights themselves come from the W&B artifact named in the
# prediction cell, so this id must match the architecture that artifact was trained from.
checkpoint = "bert-base-cased"
#checkpoint = "xlm-roberta-base"
#checkpoint = "xlnet-base-cased"
#checkpoint = "microsoft/deberta-v3-base"
#checkpoint = "albert-base-v2"


## Data Preprocessing

In [9]:
# Subtask 1 input files: the validation split (also used as the gold file for the
# scorer) and the dev_unlabeled split that predictions are generated for.
val_st1 = folder_name+"data/subtask1/validation.json"
dev_st1 = folder_name+"data/subtask1/dev_unlabeled.json"

In [10]:
# pandas copies of both files; used later only to build the output records
# (ids plus predicted labels), not for inference.
val_data1 = pd.read_json(val_st1)
dev_data1 = pd.read_json(dev_st1)

#### Load into huggingface datasets

In [11]:
# Load the same two JSON files as Hugging Face datasets, exposed under the split
# names "validation" and "test" (the "test" split is the dev_unlabeled file).
dataset_val_files = {"validation": val_st1}
dataset_test_files = {"test": dev_st1}
dataset_val = load_dataset("json",data_files=dataset_val_files)
dataset_test = load_dataset("json",data_files=dataset_test_files)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [12]:
# The persuasion-technique label names, wrapped in an outer list so the whole set
# can be handed to MultiLabelBinarizer.fit as one sample.
# NOTE(review): the binarizer decides its own class order, so the logit columns are
# presumably interpreted in that order rather than in the order written here —
# confirm it matches the order used at training time.
techniques = [['Black-and-white Fallacy/Dictatorship', 'Loaded Language',
       'Glittering generalities (Virtue)', 'Thought-terminating cliché',
       'Whataboutism', 'Slogans', 'Causal Oversimplification', 'Smears',
       'Name calling/Labeling', 'Appeal to authority',
       'Exaggeration/Minimisation', 'Repetition', 'Flag-waving',
       'Appeal to fear/prejudice', 'Reductio ad hitlerum', 'Doubt',
       "Misrepresentation of Someone's Position (Straw Man)",
       'Obfuscation, Intentional vagueness, Confusion', 'Bandwagon',
       'Presenting Irrelevant Data (Red Herring)']]
# Sanity check: number of labels (also used as num_labels when loading the model).
len(techniques[0])

20

### Preprocess Multi-Labels

In [13]:
# Fit the binarizer on the single sample holding every technique so that it knows
# all label names; it is used later only to map 0/1 indicator rows back to names.
mlb = MultiLabelBinarizer()
mlb.fit(techniques)

#### Tokenize

In [14]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# NOTE(review): this collator is created but not passed to the Trainer in any cell
# visible here; padding is done inside tokenize_function instead.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated (to the tokenizer's maximum length) and padded to the
    longest sequence of the batch handed in by ``Dataset.map``.
    """
    return tokenizer(examples["text"], truncation=True, padding=True)

# Drop the gold labels before inference. The check makes the cell safe to re-run:
# without it the second execution fails because the column is already gone.
if "labels" in dataset_val["validation"].column_names:
    dataset_val = dataset_val.remove_columns(["labels"])

tokenized_dataset_val = dataset_val.map(tokenize_function, batched=True)
tokenized_dataset_test = dataset_test.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

### Predict validation set and create output json file

In [15]:
def get_preds(mlb,predicted_logits,threshold):
  """Turn raw logits into (labels, probabilities) pairs, one per example.

  Args:
    mlb: fitted binarizer whose ``inverse_transform`` maps a 0/1 indicator
      matrix of shape (n_examples, n_labels) to one tuple of label names per row.
    predicted_logits: numpy array of logits whose last axis is the label axis,
      normally (n_examples, n_labels). Extra singleton axes and a single 1-D
      row are accepted as well.
    threshold: a label is kept when its sigmoid probability is strictly
      greater than this value.

  Returns:
    A list with one ``(labels, probs)`` tuple per example: ``labels`` is the
    list of selected label names and ``probs`` the matching probabilities, in
    the same (column) order.
  """
  logits = torch.from_numpy(predicted_logits)
  # Normalise to (n_examples, n_labels) by keeping the label axis fixed. A blanket
  # squeeze() would also drop the example axis when there is exactly one example,
  # which breaks both inverse_transform and the per-row indexing below.
  probs = torch.sigmoid(logits).reshape(-1, logits.shape[-1])
  selected = probs > threshold
  # Hand the binarizer a plain NumPy indicator matrix rather than a torch tensor.
  labels_per_example = mlb.inverse_transform(selected.int().numpy())
  return [
      (list(labels), row[keep].tolist())
      for labels, row, keep in zip(labels_per_example, probs, selected)
  ]

In [None]:
# W&B artifact (name:version) holding the fine-tuned model, and the probability
# cut-off above which a label counts as predicted.
model_name = "model-bert-base-cased_2:v0"
threshold = 0.5

# Download the artifact to a local directory and load it as a multi-label
# sequence classifier with one output per technique.
api = wandb.Api()
artifact=api.artifact(model_name)
model_dir=artifact.download()
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=len(techniques[0]), problem_type="multi_label_classification")
# Trainer with default arguments, used here only for its predict() loop.
trainer = Trainer(model=model)
# NOTE(review): Trainer presumably already places the model on the available
# device, which would make this explicit move redundant — confirm before removing.
if torch.cuda.is_available():
  trainer.model = model.cuda()

# Raw logits -> list of (labels, probabilities) per example, for both splits.
trainer_val_predictions = get_preds(mlb,trainer.predict(tokenized_dataset_val['validation']).predictions, threshold)
trainer_test_predictions = get_preds(mlb,trainer.predict(tokenized_dataset_test['test']).predictions, threshold)

Keep only the `top_k` highest-probability labels for each example.

In [None]:
def get_top_k_preds(preds, top_k=9):
  """Cap every prediction at ``top_k`` labels.

  Args:
    preds: iterable of ``(labels, probs)`` pairs, where ``probs[i]`` is the
      probability of ``labels[i]``.
    top_k: maximum number of labels to keep per example.

  Returns:
    One label list per example. Examples with at most ``top_k`` labels are
    returned as given (original order, same list object); longer ones are cut
    to the ``top_k`` most probable labels, ordered by descending probability
    with ties kept in their original order.
  """
  def _limit(entry):
    labels = entry[0]
    if len(labels) <= top_k:
      return labels
    ranked = list(zip(labels, entry[1]))
    ranked.sort(key=lambda pair: -pair[1])
    return [label for label, _ in ranked[:top_k]]

  return [_limit(entry) for entry in preds]

### Create Prediction Output File

In [None]:
# Build the validation prediction file: one record per example with its id and
# the (top-k capped) predicted labels.
predicted_labels = get_top_k_preds(trainer_val_predictions)
val_preds = val_data1.drop(['labels', 'link', 'text'], axis=1)
# Attach the frame's own index so predictions are matched to rows by position;
# a length mismatch then raises instead of silently producing NaN labels.
val_preds.insert(1,'labels',pd.Series(predicted_labels, index=val_preds.index))
val_preds['id'] = val_preds['id'].astype(str)

val_preds_output = val_preds.to_dict(orient='records')
val_output_file = folder_name + "subtask1/output/validation_output.json"
# Make sure the output folder exists on a fresh Drive checkout.
os.makedirs(os.path.dirname(val_output_file), exist_ok=True)
# ensure_ascii=False writes non-ASCII label names (e.g. "cliché") verbatim, so the
# file encoding must be explicit rather than the platform default.
with open(val_output_file, "w", encoding="utf-8") as output_file:
    json.dump(val_preds_output, output_file, indent=2,ensure_ascii=False)

In [None]:
# Build the dev prediction file: one record per example with its id and the
# (top-k capped) predicted labels.
predicted_labels = get_top_k_preds(trainer_test_predictions)
test_preds = dev_data1.drop(['text'], axis=1)
# Attach the frame's own index so predictions are matched to rows by position;
# a length mismatch then raises instead of silently producing NaN labels.
test_preds.insert(1,'labels',pd.Series(predicted_labels, index=test_preds.index))
test_preds['id'] = test_preds['id'].astype(str)

test_preds_output = test_preds.to_dict(orient='records')
test_output_file = folder_name + "subtask1/output/dev_output.json"
# Make sure the output folder exists on a fresh Drive checkout.
os.makedirs(os.path.dirname(test_output_file), exist_ok=True)
# ensure_ascii=False writes non-ASCII label names (e.g. "cliché") verbatim, so the
# file encoding must be explicit rather than the platform default.
with open(test_output_file, "w", encoding="utf-8") as output_file:
    json.dump(test_preds_output, output_file, indent=2,ensure_ascii=False)

### Evaluate using the scorer script

In [None]:
# Path to the scorer script that is run as a subprocess on the validation predictions.
scorer = folder_name + "subtask1/subtask_1_2a.py"

In [None]:
# Score the validation predictions against the gold file with the scorer script.
# An argument list (no shell) keeps paths with spaces or shell metacharacters
# intact and avoids shell interpretation of interpolated values.
command = [
    'python3', scorer,
    '--gold_file_path', val_st1,
    '--pred_file_path', val_output_file,
]

result = subprocess.run(command, check=True, stdout=subprocess.PIPE, text=True)
output = result.stdout.strip()

# The output is read as tab-separated "name=value" fields, taken in the order
# F1, precision, recall.
parts = output.split('\t')
if len(parts) < 3 or any('=' not in part for part in parts[:3]):
    raise ValueError(f"Unexpected scorer output: {output!r}")
f1_h, prec_h, rec_h = (float(part.split('=')[1]) for part in parts[:3])

hierarchical_metrics = {"f1_hierarchical": f1_h, "precision_hierarchical": prec_h, "recall_hierarchical": rec_h}
hierarchical_metrics