# Persuasion Techniques in Text of Memes

## Environment Setup

##### Disk Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
folder_name = "/content/drive/MyDrive/tumnlp/"

##### Imports

In [3]:
!pip install transformers datasets wandb evaluate accelerate -qU sklearn_hierarchical_classification

In [4]:
import gc
import json
import numpy as np
import pandas as pd
import random
import torch
import subprocess
import json

In [5]:
from sklearn import preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

In [6]:
# Select the compute device once; `device` and `AVAIL_GPUS` are set on both paths.
if not torch.cuda.is_available():
    # CPU fallback: no CUDA device was detected.
    device, AVAIL_GPUS = torch.device("cpu"), 0
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    AVAIL_GPUS = torch.cuda.device_count()
    print(f'There are {AVAIL_GPUS} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla T4


In [7]:
import wandb
import os
wandb.login()
# setup wandb environment variables
os.environ['WANDB_PROJECT'] = "subtask1_transformer_encoder_classification"
os.environ['WANDB_ENTITY'] = "tumnlp"
os.environ["WANDB_LOG_MODEL"]= "end"

[34m[1mwandb[0m: Currently logged in as: [33mmahmudfami[0m ([33mtumnlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Pre-trained Transformer Name

In [8]:
checkpoint = "bert-base-cased"
#checkpoint = "xlm-roberta-base"
#checkpoint = "xlnet-base-cased"
#checkpoint = "microsoft/deberta-v3-base"
#checkpoint = "albert-base-v2"


## Data Preprocessing

In [9]:
dataset_folder = folder_name + "datasets/json_data/subtask1/"
train_st1 = dataset_folder + "train.json"
val_st1 = dataset_folder + "validation.json"
dev_st1 = dataset_folder + "dev_unlabeled.json"

In [10]:
train_data1=pd.read_json(train_st1)
val_data1 = pd.read_json(val_st1)
dev_data1 = pd.read_json(dev_st1)

In [11]:
train_data1

Unnamed: 0,id,text,labels,link
0,65635,THIS IS WHY YOU NEED\n\nA SHARPIE WITH YOU AT ...,[Black-and-white Fallacy/Dictatorship],https://www.facebook.com/photo/?fbid=402355213...
1,67927,GOOD NEWS!\n\nNAZANIN ZAGHARI-RATCLIFFE AND AN...,"[Loaded Language, Glittering generalities (Vir...",https://www.facebook.com/amnesty/photos/531198...
2,68031,PAING PHYO MIN IS FREE!,[],https://www.facebook.com/amnesty/photos/427419...
3,77490,Move your ships away!\n\noooook\n\nMove your s...,[],https://www.facebook.com/rightpatriots/photos/...
4,67641,"WHEN YOU'RE THE FBI, THEY LET YOU DO IT.",[Thought-terminating cliché],https://www.facebook.com/AddictingInfoOrg/phot...
...,...,...,...,...
6995,67360,If your doctor prescribes you medication witho...,"[Loaded Language, Causal Oversimplification, T...",https://www.facebook.com/TheControversia/photo...
6996,70579,DEFENDS TRUMP. \nMADE ALLEGATIONS OF ELECTION ...,"[Loaded Language, Whataboutism]",https://www.facebook.com/PatriotFetch/photos/p...
6997,70305,I'm having trouble selling our incredibly enor...,[],https://www.facebook.com/PatriotFetch/photos/p...
6998,77769,I'm so happy we live in a world without slaver...,[],https://www.facebook.com/communism101/photos/5...


#### Load into huggingface datasets

In [12]:
from datasets import load_dataset
data_files = {"train": train_st1, "validation": val_st1}
st1_dataset = load_dataset("json",data_files=data_files)
st1_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'link', 'id'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['labels', 'text', 'link', 'id'],
        num_rows: 500
    })
})

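##### Discard all training samples without a persuasion technique
*Open question: should they be kept as non-persuasive samples instead? Note that only the train split is filtered; the validation split still contains samples with no labels.*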

In [13]:
st1_dataset['train'] = st1_dataset['train'].filter(lambda x : len(x['labels']) != 0)
st1_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'link', 'id'],
        num_rows: 5736
    })
    validation: Dataset({
        features: ['labels', 'text', 'link', 'id'],
        num_rows: 500
    })
})

In [14]:
techniques = [['Black-and-white Fallacy/Dictatorship', 'Loaded Language',
       'Glittering generalities (Virtue)', 'Thought-terminating cliché',
       'Whataboutism', 'Slogans', 'Causal Oversimplification', 'Smears',
       'Name calling/Labeling', 'Appeal to authority',
       'Exaggeration/Minimisation', 'Repetition', 'Flag-waving',
       'Appeal to fear/prejudice', 'Reductio ad hitlerum', 'Doubt',
       "Misrepresentation of Someone's Position (Straw Man)",
       'Obfuscation, Intentional vagueness, Confusion', 'Bandwagon',
       'Presenting Irrelevant Data (Red Herring)']]
num_labels = len(techniques[0])
num_labels

20

### Preprocess Multi-Labels

In [15]:
mlb = MultiLabelBinarizer()
mlb.fit(techniques)

#### Tokenize

In [16]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(examples):
    """Tokenize a batch of texts and replace its labels with multi-hot vectors.

    Meant to be used with `Dataset.map(..., batched=True)`: `examples["text"]`
    is passed to the module-level `tokenizer` (with truncation enabled) and
    `examples["labels"]` to the fitted module-level `mlb`.

    Returns:
        The tokenizer output with an added `labels` entry holding one float32
        multi-hot list per sample.
    """
    multi_hot = mlb.transform(examples['labels'])
    batch_encoding = tokenizer(examples["text"], truncation=True)
    # Cast to float32 before converting to nested Python lists.
    batch_encoding['labels'] = multi_hot.astype(np.float32).tolist()
    return batch_encoding

tokenized_datasets = st1_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'text', 'link', 'id', 'input_ids', 'attention_mask'],
        num_rows: 5736
    })
    validation: Dataset({
        features: ['labels', 'text', 'link', 'id', 'input_ids', 'attention_mask'],
        num_rows: 500
    })
})

Sanity check: collate a few tokenized samples and inspect the resulting tensor shapes.

In [17]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["id", "text", "link"]}
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([8, 20]),
 'input_ids': torch.Size([8, 80]),
 'attention_mask': torch.Size([8, 80])}

## Training

In [18]:
def save_json(data, file_path):
    """Serialize `data` as JSON to `file_path`, creating missing parent folders.

    Args:
        data: Any object that `json.dump` can serialize.
        file_path: Destination path; an existing file is overwritten.
    """
    # open(..., 'w') raises FileNotFoundError when the target folder is missing
    # (e.g. a fresh Drive without "output/tmp"), so create it up front.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w') as f:
        json.dump(data, f)

def create_json_output(y_pred, y_true):
    """Write predictions and gold labels to the JSON files read by the scorer.

    Both multi-hot inputs are decoded to label names with the fitted
    module-level `mlb`; every row is keyed by its position (as a string).

    Args:
        y_pred: Multi-hot predictions accepted by `mlb.inverse_transform`.
        y_true: Multi-hot gold labels accepted by `mlb.inverse_transform`.

    Returns:
        Tuple `(predictions_file, gold_labels_file)` with the written paths.
    """
    def to_records(indicator_matrix):
        # Decode multi-hot rows into label names and key each row by position.
        decoded_rows = mlb.inverse_transform(indicator_matrix)
        return [{"id": str(row_idx), "labels": row_labels}
                for row_idx, row_labels in enumerate(decoded_rows)]

    pred_records = to_records(y_pred)
    gold_records = to_records(y_true)

    # Fixed temporary locations under the project folder.
    predictions_file = folder_name + "subtask1/output/tmp/predictions.json"
    gold_labels_file = folder_name + "subtask1/output/tmp/gold_labels.json"
    save_json(pred_records, predictions_file)
    save_json(gold_records, gold_labels_file)
    return predictions_file, gold_labels_file

In [19]:
def calculate_hierarchical_metrics(predictions_file, gold_labels_file):
    """Run the task scorer script and return its hierarchical scores.

    Args:
        predictions_file: Path of the predictions JSON file.
        gold_labels_file: Path of the gold-label JSON file.

    Returns:
        Tuple `(f1_h, prec_h, rec_h)` of floats parsed from the scorer's stdout.

    Raises:
        subprocess.CalledProcessError: If the scorer exits with a non-zero status.
        ValueError: If stdout does not start with three tab-separated
            `name=value` fields.
    """
    scorer = folder_name + "subtask1/subtask_1_2a.py"
    # Pass an argument list instead of a shell string so that paths containing
    # spaces or shell metacharacters reach the scorer unchanged.
    command = [
        'python3', scorer,
        '--gold_file_path', str(gold_labels_file),
        '--pred_file_path', str(predictions_file),
    ]
    result = subprocess.run(command, check=True, stdout=subprocess.PIPE, text=True)
    output = result.stdout.strip()

    # Expected stdout: three tab-separated "name=value" fields (F1, precision, recall).
    parts = output.split('\t')
    if len(parts) < 3 or any('=' not in part for part in parts[:3]):
        raise ValueError(f'Unexpected scorer output: {output!r}')
    f1_h, prec_h, rec_h = (float(part.split('=')[1]) for part in parts[:3])

    return f1_h, prec_h, rec_h

In [20]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute flat and hierarchical multi-label metrics for one evaluation pass.

    Args:
        predictions: Raw model outputs of shape (batch_size, num_labels); a
            sigmoid is applied to them here.
        labels: Gold multi-hot labels.
        threshold: A label is predicted when its probability is >= this value.

    Returns:
        Dict with `f1_hierarchical`, `precision_hierarchical`,
        `recall_hierarchical`, `f1_micro` and `accuracy`.

    Side effect: predictions and gold labels are written to JSON files via
    `create_json_output`, which the hierarchical scorer then reads.
    """
    probabilities = torch.sigmoid(torch.Tensor(predictions))
    # Binarize into a float64 0/1 matrix.
    y_pred = (probabilities >= threshold).numpy().astype(np.float64)

    # The hierarchical scorer works on files, so export both label sets first.
    pred_path, gold_path = create_json_output(y_pred, labels)

    flat_scores = {
        'f1_micro': f1_score(y_true=labels, y_pred=y_pred, average='micro'),
        'accuracy': accuracy_score(labels, y_pred),
    }
    f1_h, prec_h, rec_h = calculate_hierarchical_metrics(pred_path, gold_path)

    return {
        'f1_hierarchical': f1_h,
        'precision_hierarchical': prec_h,
        'recall_hierarchical': rec_h,
        **flat_scores,
    }

def compute_metrics(p: EvalPrediction):
    """Metric callback for the `Trainer`: score predictions against gold labels.

    When `p.predictions` is a tuple, only its first element is scored.

    Returns:
        The metrics dict produced by `multi_label_metrics`.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)

In [21]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(techniques[0]), problem_type="multi_label_classification")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import TrainingArguments

output_folder = folder_name + "subtask1/output"
training_args = TrainingArguments(
    report_to = 'wandb',                    # enable logging to W&B
    run_name = 'bert-base_cased_2',          # name of the W&B run
    load_best_model_at_end = True,
    output_dir = output_folder,
    overwrite_output_dir = True,
    evaluation_strategy = 'steps',
    learning_rate = 5e-5,
    num_train_epochs = 50,
    #max_steps = 50000,
    logging_steps = 100,
    eval_steps = 1000,
    # The best model can only be restored from a checkpoint that was actually
    # saved. With save_steps = 30000 at most one checkpoint is written during
    # the run (roughly 36k steps at the default batch size of 8), so
    # `load_best_model_at_end` could never pick the best evaluation.
    # Saving on the evaluation schedule fixes that.
    save_strategy = 'steps',
    save_steps = 1000,
    # Keep disk usage bounded on Drive; the best checkpoint is always retained
    # when `load_best_model_at_end` is enabled.
    save_total_limit = 2,
    metric_for_best_model = 'f1_hierarchical',
)

In [23]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
trainer.evaluate()

### Predict validation set and create output json file

In [None]:
trainer_predictions = trainer.predict(tokenized_datasets["validation"])

In [None]:
# Transform logits into probabilities
predicted_logits = trainer_predictions.predictions
# Same convention as compute_metrics: if the model returns a tuple, the first
# element is the one to score.
if isinstance(predicted_logits, tuple):
    predicted_logits = predicted_logits[0]
# No squeeze(): it would drop the batch axis when a single sample is predicted,
# and inverse_transform needs a 2-D (samples x labels) matrix.
probs = torch.sigmoid(torch.from_numpy(predicted_logits))
# Get predictions whose probability reaches the threshold. `>=` matches the
# rule used by multi_label_metrics during training-time evaluation.
threshold = 0.5
predictions = (probs >= threshold).int().numpy()
# Get labels in text form
predicted_labels = mlb.inverse_transform(predictions)

### Example

In [None]:
predicted_labels[20]

In [None]:
st1_dataset['validation']['labels'][20]

### Create Prediction Output File

In [None]:
# The binarizer yields one tuple per sample; convert each to a plain list.
predicted_labels = [list(label_tuple) for label_tuple in predicted_labels]
# Keep only the sample ids, then attach the predicted labels as the second column.
val_preds = val_data1.drop(columns=['labels', 'link', 'text'])
val_preds.insert(1, 'labels', pd.Series(predicted_labels))
# Ids are exported as strings.
val_preds['id'] = val_preds['id'].astype(str)

In [None]:
val_preds_output = val_preds.to_dict(orient='records')
val_output_file = folder_name + "subtask1/output/validation_output.json"
with open(val_output_file, "w") as output_file:
    json.dump(val_preds_output, output_file, indent=2,ensure_ascii=False)

### Evaluate using the scorer script

In [None]:
scorer = folder_name + "subtask1/subtask_1_2a.py"

In [None]:
# Score the exported validation predictions against the gold validation file.
# This reuses calculate_hierarchical_metrics (defined in the Training section)
# instead of repeating the subprocess call and output parsing here.
f1_h, prec_h, rec_h = calculate_hierarchical_metrics(
    predictions_file=val_output_file,
    gold_labels_file=val_st1,
)

hierarchical_metrics = {"f1_hierarchical": f1_h, "precision_hierarchical": prec_h, "recall_hierarchical": rec_h}
hierarchical_metrics

In [None]:
wandb.finish()