In [1]:
# Instalación de paquetes necesarios
!pip3 install sentencepiece
!pip3 install pytorch-lightning
!pip3 install --upgrade accelerate
!pip3 install emoji
!pip3 install framework-reproducibility
!pip3 install transformers
!pip3 install sacremoses # for data augmentation by back-translation
!pip3 install -U easynmt # for data augmentation by back-translation
!pip3 install transformers datasets
!pip3 install contractions
!pip3 install tensorflow
!pip3 install scikit-learn
!pip3 install matplotlib
!pip3 install seaborn
!pip3 install sklearn-hierarchical-classification # para ejecutar subtask_1_2a.py
!pip3 install tf-keras
!pip3 install optuna



In [2]:
# Importar librerías
import json
import torch
import numpy as np
import os
import subprocess
import string # remove punctuations
import re # remove punctuations
import contractions # convert abbreviations to their complete forms
import tensorflow as tf
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, EarlyStoppingCallback

# Maximum number of tokens per text; passed to the tokenizer as max_length (with truncation) in model_inference.
MAX_LENGTH = 128

# Label names in the order used for model fine-tuning. perform_model_inference zips this list
# with the columns of the predicted labels, so the order must not change.
labels = ['Appeal to authority', 'Appeal to fear/prejudice', 'Bandwagon',
          'Black-and-white Fallacy/Dictatorship', 'Causal Oversimplification',
          'Doubt', 'Exaggeration/Minimisation', 'Flag-waving',
          'Glittering generalities (Virtue)', 'Loaded Language',
          "Misrepresentation of Someone's Position (Straw Man)",
          'Name calling/Labeling',
          'Obfuscation, Intentional vagueness, Confusion',
          'Presenting Irrelevant Data (Red Herring)', 'Reductio ad hitlerum',
          'Repetition', 'Slogans', 'Smears', 'Thought-terminating cliché',
          'Whataboutism']

# Root path: the working directory at the moment this cell runs (start the notebook from the project root).
root_path = os.getcwd()

# Directory holding the datasets (trailing slash included, later paths are appended directly).
data_path = f'{root_path}/data/'

# Directory holding the fine-tuned models (trailing slash included).
model_path = f'{root_path}/fine_tuned_models/'

  from .autonotebook import tqdm as notebook_tqdm


# Carga de datos

In [3]:
def _load_json(path):
    """
    Read a JSON document from ``path`` and return the decoded Python object.

    The file is opened explicitly as UTF-8 (the encoding JSON is defined in) so
    that loading does not depend on the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Test sets per language; the non-English sets are the English-translated versions.
ar_test = _load_json(f'{data_path}scorer-baseline/arabic/ar_subtask1_majority_en_translated.txt')
bg_test = _load_json(f'{data_path}scorer-baseline/bulgarian/bg_subtask1_majority_en_translated.txt')
en_test = _load_json(f'{data_path}scorer-baseline/english/en_subtask1_test_majority_smears.txt')
mk_test = _load_json(f'{data_path}scorer-baseline/north_macedonian/mk_subtask1_majority_en_translated.txt')
ar_test

[{'id': '00407',
  'text': "\nYour job is a surprise for your birthday. \nAnd the mercy of my parents, and don't even go through me.",
  'labels': ['Smears']},
 {'id': '00051',
  'text': 'Abdoun District polling centre',
  'labels': ['Smears']},
 {'id': '00022',
  'text': "♪ When you're a foreign soldier and they send you to fight in Syria ♪ ♪ And there they meet with a groove ♪ ♪ And they wear jeans and sandals ♪",
  'labels': ['Smears']},
 {'id': '00573',
  'text': "When you're back home late, your dad picks you up.\n",
  'labels': ['Smears']},
 {'id': '00005',
  'text': "Israel\nMurder, demolition and arrests\n* Settlements\n== sync, corrected by elderman == @elder_man \n♪ He stole the Palestinians' money ♪\nFOR THE PALESTINIAN AUTHORITY \nWe'll reconsider our relationship with the Aristocrats.",
  'labels': ['Smears']},
 {'id': '00008',
  'text': 'Arab citizen\nGovernment \nSomething simple of his rights.\nGod let us go home and the leader of the country.',
  'labels': ['Smears']},

# Preprocesamiento de los textos

In [4]:
# Funciones de limpieza
def remove_linebreaks(s): # s to avoid the word string
  r"""
  Collapse line breaks and every other run of whitespace into single spaces.

  Escaped newlines that appear as the two literal characters ``\`` + ``n`` are
  first turned into real newlines so that they are collapsed as well. Leading
  and trailing whitespace is dropped.

  Args:
    s (str): Input string

  Returns:
    s (str): Input with all whitespace runs replaced by one space
  """
  # some texts contain a literal backslash followed by 'n' instead of a newline
  with_real_newlines = s.replace('\\n', '\n')
  tokens = with_real_newlines.split()
  return ' '.join(tokens)

def remove_links(s):
    """
    Remove web links and link placeholders from a string.

    Removed items: anything starting with ``http`` up to the next whitespace,
    ``bit.ly/...`` short links, ``pic.twitter...`` links, and the literal
    placeholders ``[link]`` and ``[url]``.

    Args:
        s (str): Input string

    Returns:
        s (str): String with the links removed (surrounding whitespace is kept)
    """
    link_patterns = (
        r'http\S+',         # http / https links
        r'bit\.ly/\S+',     # bitly links; the dot is escaped so words like "rabbitfly/..." survive
        r'\[link\]',        # [link] placeholder
        r'\[url\]',         # [url] placeholder
        r'pic\.twitter\S+', # twitter picture links; dot escaped for the same reason
    )
    for pattern in link_patterns:
        s = re.sub(pattern, '', s)
    return s

def remove_users(s):
    """
    Remove retweet markers, @user mentions and the ``[user]`` placeholder.

    A mention is only matched when the name starts with an ASCII letter and is
    at least two characters long (letters, digits, ``-`` and ``_`` allowed
    after the first letter).

    Args:
        s (str): Input string

    Returns:
        s (str): String with the matched spans removed
    """
    user_patterns = (
        r'(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)',  # "RT @name" retweet markers
        r'(@[A-Za-z]+[A-Za-z0-9-_]+)',      # remaining @name mentions
        r'\[user\]',                        # [user] placeholder
    )
    for pattern in user_patterns:
        s = re.sub(pattern, '', s)
    return s

def remove_hashtags(s):
    """
    Remove hashtags from a string.

    A hashtag is only matched when ``#`` is followed by an ASCII letter and at
    least one more character (letters, digits, ``-`` or ``_``).

    Args:
        s (str): Input string

    Returns:
        s (str): String with the matched hashtags removed
    """
    hashtag_pattern = re.compile('(#[A-Za-z]+[A-Za-z0-9-_]+)')
    return hashtag_pattern.sub('', s)

def remove_av(s):
    """
    Remove the literal markers ``VIDEO:`` and ``AUDIO:`` from a string.

    Every occurrence is removed, wherever it appears in the string; matching is
    case-sensitive.

    Args:
        s (str): Input string

    Returns:
        s (str): String without the markers
    """
    for marker in ('VIDEO:', 'AUDIO:'):
        s = s.replace(marker, '')
    return s

# Compiled once instead of on every call.
# The previous character class contained the range U+24C2-U+1F251, which spans
# almost the entire upper Basic Multilingual Plane and therefore also deleted
# CJK ideographs, kana, Hangul and Arabic presentation forms. The class below
# keeps the symbol blocks and drops that over-broad range.
_EMOJI_PATTERN = re.compile(
    "["
    "\u24C2-\u2BFF"             # enclosed alphanumerics (from U+24C2), box drawing, geometric shapes,
                                # miscellaneous symbols, dingbats, braille, arrows, math symbols
    "\u3030\u303D\u3297\u3299"  # the emoji that live inside the CJK symbol blocks
    "\uFE0F"                    # variation selector-16 (emoji presentation)
    "\U00010000-\U0010FFFF"     # every supplementary-plane character (all pictographic emoji blocks)
    "]+"
)

def remove_emojis(s):
    """
    Remove emojis and pictographic symbols from a string.

    All characters of the symbol blocks U+24C2-U+2BFF, a few emoji from the CJK
    symbol blocks, the emoji variation selector and every character above
    U+FFFF are removed. Letters of BMP scripts (e.g. CJK, Hangul, Arabic) are
    left untouched.

    NOTE: the supplementary-plane range is kept from the original pattern; it
    also removes non-emoji characters above U+FFFF (e.g. mathematical
    alphanumerics).

    Args:
        s (str): Input string

    Returns:
        s (str): String without the matched characters
    """
    return _EMOJI_PATTERN.sub('', s)

def remove_punctuations(s): # s to avoid the word string
  """
  Delete every ASCII punctuation character (``string.punctuation``) from a string.

  Args:
    s (str): Input string

  Returns:
    s (str): String without ASCII punctuation; all other characters are kept
  """
  drop_table = str.maketrans('', '', string.punctuation)
  return s.translate(drop_table)

# Contraction expansion helper
def expand_contraction(s):
    """
    Expand contractions in a string by delegating to ``contractions.fix``.

    Args:
        s (str): Input string

    Returns:
        s (str): Result of ``contractions.fix(s)``
    """
    return contractions.fix(s)

def process_text(s):
  """
  Run the full cleaning pipeline on a string and lower-case the result.

  The steps run in this order: line breaks, links, users, hashtags,
  AUDIO/VIDEO markers, emojis, punctuation, contraction expansion.

  Args:
    s (str): Input string

  Returns:
    s(str): Cleaned, lower-cased string
  """
  # NOTE(review): punctuation (apostrophes included) is stripped before
  # expand_contraction runs, so "we're" reaches it as "were" and is not expanded.
  # The order is kept as-is here because the fine-tuned models were presumably
  # trained with the same preprocessing - confirm before reordering.
  cleaning_steps = (
      remove_linebreaks,
      remove_links,
      remove_users,
      remove_hashtags,
      remove_av,
      remove_emojis,
      remove_punctuations,
      expand_contraction,
  )
  for step in cleaning_steps:
    s = step(s)
  return s.lower()

# Quick smoke test of the cleaning pipeline.
# The leading backslash is written as '\\' because '\T' is not a valid escape
# sequence; the string value is unchanged.
test = '''\\This is why we're free\n\nThis is why!'''
print(process_text(test))

this is why were free this is why


  test = '''\This is why we're free\n\nThis is why!'''


In [5]:
# Per-language lookup tables: ids and cleaned texts, kept in the same record order.
lang_ids_dict = {}
lang_processed_texts_dict = {}

test_sets = {'ar': ar_test, 'bg': bg_test, 'en': en_test, 'mk': mk_test}
for lang, data in test_sets.items():
    lang_ids_dict[lang] = [d['id'] for d in data]
    lang_processed_texts_dict[lang] = [process_text(d['text']) for d in data]

# Carga del modelo

In [6]:
# Language code -> directory name under data/scorer-baseline/; the code is also used as the file-name prefix there.
language_abbrev_dict = {'ar':'arabic','en':'english','bg':'bulgarian','mk':'north_macedonian'}

def model_inference(model_name,processed_texts,batch_size=32):
    """
    Run a fine-tuned sequence-classification model on a list of processed texts.

    All texts are tokenized together (so every row is padded exactly as before),
    then pushed through the model in slices of ``batch_size`` rows so that large
    test sets do not have to fit into a single forward pass.

    Args:
        model_name (str): Path to the fine-tuned model (e.g. google-bert/bert-base-uncased/original_train/)
        processed_texts (list of str): List of processed texts
        batch_size (int): Number of texts per forward pass; must be positive

    Returns:
        pred_probs (torch.Tensor): Sigmoid of the logits, one row per text, always
            on the CPU so it can be handed to NumPy code directly
    """
    # Load the fine-tuned model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Move the model to the appropriate device and make sure dropout is disabled
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    n_texts = len(processed_texts)
    if n_texts == 0:
        # nothing to predict; keep the column count consistent with the model head
        return torch.empty((0, model.config.num_labels))

    # Tokenize the input sentences (kept on the CPU; only the current slice goes to the device)
    inputs = tokenizer(processed_texts, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors="pt")

    prob_chunks = []
    with torch.no_grad():
        for start in range(0, n_texts, batch_size):
            batch = {k: v[start:start + batch_size].to(device) for k, v in inputs.items()}
            logits = model(**batch).logits
            # Convert logits to probabilities and bring them back to the CPU right away
            prob_chunks.append(torch.sigmoid(logits).cpu())

    return torch.cat(prob_chunks, dim=0)

def convert_probs_to_labels(pred_probs,label_optim_thresholds):
    """
    Use a dictionary of labels with their optimal thresholds to call predicted probabilities as
    1s or 0s. The order of the labels in label_optim_thresholds is important: the i-th entry of
    the dictionary is applied to the i-th column of pred_probs.

    Args:
        pred_probs (np.array or torch.Tensor): nrow x n_labels predicted probabilities. Tensors
            (including ones that live on a GPU) are converted to a NumPy array first.
        label_optim_thresholds (dict): Optimal thresholds of individual labels derived using the dev_dataset

    Returns:
        pred_labels (np.array): nrow x n_labels array of 1.0 / 0.0; a value is 1.0 when the
            probability is >= the label's threshold
    """
    if hasattr(pred_probs, 'detach'):
        # torch tensor: NumPy cannot read GPU memory, so detach and copy to the CPU first
        probs = pred_probs.detach().cpu().numpy()
    else:
        probs = np.asarray(pred_probs)

    pred_labels = np.zeros(probs.shape)

    for col, optim_threshold in enumerate(label_optim_thresholds.values()):
        pred_labels[:, col] = np.where(probs[:, col] >= optim_threshold, 1, 0)

    return pred_labels

def perform_model_inference(language,ids,processed_texts,model_name,label_optim_thresholds):
    """
    Wrapper that runs one fine-tuned model on one test set and returns the predicted label names.

    Args:
        language (str): Language abbreviation of the test set (not used inside the function)
        ids (list): Sample ids, in the same order as processed_texts
        processed_texts (list of str): Cleaned texts to classify
        model_name (str): Path to the fine-tuned model
        label_optim_thresholds (dict): Per-label decision thresholds

    Returns:
        pred_dict (dict): Maps each id to the list of label names whose prediction is 1
    """
    probabilities = model_inference(model_name,processed_texts)
    binary_rows = convert_probs_to_labels(probabilities,label_optim_thresholds)

    # pair every column with its name from the module-level `labels` list
    return {
        sample_id: [name for name, flag in zip(labels, row) if flag == 1]
        for sample_id, row in zip(ids, binary_rows)
    }

## Infer with all fine-tuned models in order to get the ensembled labels

In [7]:
# Base checkpoints whose fine-tuned versions are ensembled
model_checkpoints = [
        'google-bert/bert-base-uncased',
        'microsoft/deberta-v3-base',
        'FacebookAI/roberta-base',
        'distilbert/distilbert-base-uncased',
        'xlnet/xlnet-base-cased',
        'openai-community/gpt2',
        ]

# use the models finetuned on the original train dataset
train_df_name = 'original_train'

# language -> checkpoint -> {id: predicted label names}
lang_model_prediction_dict = {}

# NOTE: perform_model_inference loads the model from disk on every call, so each
# checkpoint is loaded once per language.
for language in language_abbrev_dict:
    per_checkpoint = {}
    lang_model_prediction_dict[language] = per_checkpoint
    ids = lang_ids_dict.get(language)
    processed_texts = lang_processed_texts_dict.get(language)
    for checkpoint in model_checkpoints:
        model_name = f'{model_path}{checkpoint}/{train_df_name}/'
        # thresholds are stored next to the fine-tuned model
        thresholds_file = f"{model_name}validation_set_label_optimal_thresholds.json"
        with open(thresholds_file,'r') as f:
            label_optim_thresholds = json.load(f)
        per_checkpoint[checkpoint] = perform_model_inference(language,ids,processed_texts,model_name,label_optim_thresholds)

In [8]:
lang_model_prediction_dict['mk']

{'google-bert/bert-base-uncased': {'mk_memes_2': [],
  'mk_memes_3': ['Name calling/Labeling'],
  'mk_memes_4': ['Black-and-white Fallacy/Dictatorship', 'Repetition'],
  'mk_memes_5': [],
  'mk_memes_6': ['Smears'],
  'mk_memes_7': [],
  'mk_memes_9': ['Name calling/Labeling'],
  'mk_memes_10': [],
  'mk_memes_11': ['Name calling/Labeling'],
  'mk_memes_13': [],
  'mk_memes_14': ['Doubt'],
  'mk_memes_15': ['Flag-waving'],
  'mk_memes_16': [],
  'mk_memes_17': ['Loaded Language'],
  'mk_memes_19': ['Name calling/Labeling', 'Smears'],
  'mk_memes_20': ['Name calling/Labeling'],
  'mk_memes_21': [],
  'mk_memes_22': [],
  'mk_memes_23': [],
  'mk_memes_25': [],
  'mk_memes_26': [],
  'mk_memes_28': ['Black-and-white Fallacy/Dictatorship',
   'Loaded Language',
   'Slogans'],
  'mk_memes_29': ['Loaded Language'],
  'mk_memes_30': ['Name calling/Labeling'],
  'mk_memes_31': [],
  'mk_memes_32': ['Name calling/Labeling'],
  'mk_memes_33': [],
  'mk_memes_34': [],
  'mk_memes_35': ['Name cal

## Ensemble by union

This method of ensembling simply takes the union of the labels predicted by all models and uses that union as the predicted labels.

In [9]:
# language -> {id: union of the labels predicted by all models}
lang_ensemble_by_union = {}

for language, model_prediction_dict in lang_model_prediction_dict.items():
    ensemble_by_union = {}
    # Every model was run on the same ids of this language, so the first model's
    # predictions are enough to enumerate them.
    reference_predictions = next(iter(model_prediction_dict.values()), {})
    for pred_id in reference_predictions:
        ens_pred_labels = set()
        for prediction_dict in model_prediction_dict.values():
            # an id missing from one model contributes no labels instead of raising
            ens_pred_labels.update(prediction_dict.get(pred_id, ()))
        # sorted: the iteration order of a set of strings differs between
        # interpreter runs, which made the saved label order non-reproducible
        ensemble_by_union[pred_id] = sorted(ens_pred_labels)
    lang_ensemble_by_union[language] = ensemble_by_union

In [10]:
lang_ensemble_by_union

{'ar': {'00407': ['Loaded Language'],
  '00051': [],
  '00022': ['Loaded Language'],
  '00573': [],
  '00005': ['Name calling/Labeling',
   'Loaded Language',
   'Whataboutism',
   'Smears',
   'Black-and-white Fallacy/Dictatorship'],
  '00008': ['Name calling/Labeling',
   'Smears',
   'Black-and-white Fallacy/Dictatorship',
   'Slogans'],
  '00079': ['Name calling/Labeling', 'Loaded Language', 'Smears'],
  '00368': ['Name calling/Labeling', 'Loaded Language', 'Smears'],
  '00506': ['Smears'],
  '00318': ['Glittering generalities (Virtue)'],
  '00405': ['Smears', 'Slogans'],
  '00027': ['Name calling/Labeling'],
  '00404': ['Loaded Language', 'Smears'],
  '00170': ['Appeal to authority',
   'Name calling/Labeling',
   'Flag-waving',
   'Loaded Language'],
  '00134': ['Smears'],
  '00012': ['Name calling/Labeling', 'Loaded Language', 'Smears'],
  '00021': ['Smears'],
  '00056': ['Thought-terminating cliché',
   'Loaded Language',
   'Repetition',
   'Black-and-white Fallacy/Dictatorshi

In [11]:
# Save the ensembled predictions per language and score them with the SemEval
# hierarchical scorer; the scorer's stdout is stored next to the predictions.
for lang, language in language_abbrev_dict.items():
    lang_dir = f'{data_path}scorer-baseline/{language}/'
    pred_file = f'{lang_dir}{lang}_subtask1_majority_predicted_ensemble_by_union.txt'
    # the English gold file follows a different naming scheme than the other languages
    gold_suffix = 'test_majority_smears' if lang == 'en' else 'majority'
    gold_file = f'{lang_dir}{lang}_subtask1_{gold_suffix}.txt'
    eval_file = f'{lang_dir}SemEval_hierarchical_eval_ensemble_by_union.txt'

    with open(pred_file, 'w') as f:
        json.dump([{'id': str(k), 'labels': v} for k, v in lang_ensemble_by_union[lang].items()], f, indent=4)

    # Argument list instead of a shell string (no quoting issues), and run() waits
    # for the scorer to finish before the output file is closed.
    scorer_cmd = [
        'python3', f'{data_path}scorer-baseline/subtask_1_2a.py',
        '-g', gold_file,
        '-p', pred_file,
    ]
    with open(eval_file, 'w') as eval_out:
        completed = subprocess.run(scorer_cmd, stdout=eval_out)
    if completed.returncode != 0:
        print(f'Scorer exited with status {completed.returncode} for {language}; see {eval_file}')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av