**Manually set environment variable.**

In [None]:
# Project root exported to the kernel environment; the configuration cell reads it back with `%env FND_ROOT`.
%env FND_ROOT=/workspace/fnd-building

**Set randomness sources first with original seed, for full reproducibility of results.**

In [None]:
import torch
import random
import numpy as np

In [None]:
# Single seed value handed to the NumPy, PyTorch and `random` generators.
OG_SEED = 30082010

In [None]:
# Seed NumPy, PyTorch and Python's `random` (in that order) with the same value.
for _seed_rng in (np.random.seed, torch.manual_seed, random.seed):
    _seed_rng(OG_SEED)

In [None]:
import gc
import json
import os
import transformers

from datetime import datetime

from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

from matplotlib import pyplot as plt

from sentence_transformers import SentenceTransformer, models

from transformers import AutoModel, AutoTokenizer

from torch import cuda
from torch.utils.data import Dataset, DataLoader

# Log the library versions this run is executed with.
print(
    f"Pytorch version: {torch.__version__}",
    f"Transformers version: {transformers.__version__}",
    sep="\n",
)

**Configuration constants.**

In [None]:
# Index of the CUDA device used to build the `cuda:<index>` device string.
CUDA_DEVICE = 0
# Project root, read back from the environment variable set in the first cell.
FND_ROOT=%env FND_ROOT
# Dataset root handed to loadAllPosts (one sub-folder per label is listed there).
PLURALISMO_ROOT=f'{FND_ROOT}/datasets/datasets-fnd-pluralismo/OnlyRepliesTree'
# Free-form tag that becomes part of every saved embedding file name.
TAG = 'TF'
# When True, every tree is padded/truncated to a fixed number of posts.
SEQ_PADDING = True

# Folder where the generated X_*.npy / y_*.npy files are written.
EMBEDDINGS_ROOT = f"{FND_ROOT}/experiments/embeddings"
# Timestamp of this run. NOTE(review): not referenced anywhere else in the visible notebook.
RUN_SUFFIX = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

**Clear cuda cache and perform garbage collection.**

In [None]:
# Collect unreachable Python objects first: tensors that are only kept alive by
# uncollected garbage still occupy their blocks, so emptying the CUDA cache
# before the collection would leave that memory reserved.
gc.collect()
torch.cuda.empty_cache()

**Set up the CUDA device if a GPU is available.**

In [None]:
# Use the configured GPU when CUDA is available, otherwise fall back to the CPU.
device = f"cuda:{CUDA_DEVICE}" if cuda.is_available() else 'cpu'
print(device)
# The device name can only be queried when CUDA is usable; calling it on a
# CPU-only machine raises, which would defeat the fallback above.
if device != 'cpu':
    print(torch.cuda.get_device_name(CUDA_DEVICE))

**Configure ekphrasis text preprocessor.**

In [None]:
# ekphrasis pre-processor applied to every tweet text before it is embedded.
text_processor = TextPreProcessor(
    # Terms that will be normalized.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number'],
    # Term annotation is left disabled:
    # annotate={"hashtag", "allcaps", "elongated", "repeated", 'emphasis', 'censored'},
    # Fix HTML tokens.
    fix_html=True,
    # Corpus whose word statistics are used for word segmentation.
    segmenter="twitter",
    # Spell correction corpus is left disabled: corrector="twitter"
    # Perform word segmentation on hashtags.
    unpack_hashtags=True,
    # Contraction unpacking (can't -> can not) is left disabled: unpack_contractions=True
    # No spell correction for elongated words.
    spell_correct_elong=False,
    # Tokenizer: any callable taking a string and returning a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    # Dictionaries used to replace tokens extracted from the text with other
    # expressions; more than one dictionary may be passed.
    dicts=[emoticons],
)

In [None]:
def loadAllPosts(PLURALISMO_ROOT):
    """
    Load every labeled conversation and its propagation tree from disk.

    ``PLURALISMO_ROOT`` must contain the sub-folders ``false`` and ``true``.
    Inside each one, every ``*.json`` file that does not end in ``_minf.json``
    is read as a conversation whose tweet id is the second ``_``-separated
    token of the file name. The matching ``treefile_<tweet id>_minf.json``
    file in the same folder is read as its propagation tree.

    Parameters
    ----------
    PLURALISMO_ROOT : str
        Root folder of the dataset.

    Returns
    -------
    tuple
        ``(all_posts, labeled_posts, number_of_tweets, tree_max_num_seq, seqs_lens)``

        - all_posts: dict indexed by tweet id (root tweets and nested replies).
        - labeled_posts: dict indexed by root tweet id holding
          ``(label, [first tree node] + kept nodes)``. Kept nodes are the
          remaining tree nodes whose id is not the root id (retweets) and is
          present in ``all_posts``.
        - number_of_tweets: number of trees stored in ``labeled_posts``.
        - tree_max_num_seq: mean of ``seqs_lens`` truncated to ``int``.
        - seqs_lens: number of kept nodes per tree.

    Raises
    ------
    AssertionError
        If the number of opened tree files differs from the number of labeled
        conversations.
    ValueError
        If no propagation tree could be loaded at all.
    """

    def parseTwitterTree(tree_file):
        """Return the left-hand node of every ``<prefix>: <node> -> <node>`` line."""
        tree_data = []
        for line in tree_file:
            if not line.strip():
                # A blank line used to abort (and discard) the whole tree.
                continue
            f_first_part, _second_part = line.split('->')
            _prefix, first_part = f_first_part.split(":")
            # Nodes are written with single quotes; swap them so the node parses as JSON.
            tree_data.append(json.loads(first_part.strip().replace("'", "\"")))
        return tree_data

    def retrieve_replies(tweet_info, accumulator):
        """Return ``accumulator`` followed by every nested reply, depth first."""
        collected = list(accumulator)
        for tweet_reply in tweet_info.get("replies") or []:
            collected += retrieve_replies(tweet_reply, [tweet_reply])
        return collected

    # Dictionary with every post, plus the label of every root tweet.
    all_posts = {}
    labels = {}

    subtrees = ['false', 'true']
    subtrees_count = {
        'false': 0,
        'true': 0,
        'imprecise': 0
    }

    for subtree_label in subtrees:
        subtree_dir = os.path.join(PLURALISMO_ROOT, subtree_label)
        # NOTE(review): os.listdir order is filesystem dependent, so the row
        # order of everything derived from `labels` is too. Confirm that no
        # other artifact relies on that order before sorting here.
        subtree_file_list = os.listdir(subtree_dir)
        print(f"Working with folder {subtree_label} with {len(subtree_file_list)} files")
        for file in subtree_file_list:
            if not file.endswith(".json") or file.endswith("_minf.json"):
                continue
            print(f"Working with file {file}")
            try:
                tweet_id = file.split("_")[1].split(".")[0]
                with open(os.path.join(subtree_dir, file), 'r', encoding='utf-8') as f:
                    tweet_dic = json.load(f)

                # The root is the entry whose conversation id equals the id in
                # the file name (the last match wins, as before).
                for tweet_info in tweet_dic["data"]:
                    if "conversation_id" in tweet_info and tweet_info["conversation_id"] == tweet_id:
                        all_posts[tweet_id] = tweet_info

                if tweet_id not in all_posts:
                    raise KeyError(f"root tweet {tweet_id} not found in {file}")

                nested_replies = retrieve_replies(all_posts[tweet_id], [])
                print(f"Got {len(nested_replies)} nested replies")
                # Build the mapping first so a reply without "id" aborts the
                # file before any of its replies is registered.
                all_posts.update({reply["id"]: reply for reply in nested_replies})

                subtrees_count[subtree_label] += 1
                labels[tweet_id] = subtree_label

            except Exception as exc:
                # Best effort: report the offending file and keep loading the rest.
                print("EXC:")
                print(f"{file}: {exc!r}")

    print("all_posts after:")
    print(len(all_posts))
    print("Tweets etiquetados      : ", len(labels), " ", subtrees_count)

    seqs_lens = []
    labeled_posts = {}
    number_of_tweets = 0
    number_of_retweets = 0
    number_of_invalid_tweets = 0
    no_in_data = 0
    opened_files = 0

    for tweet_id, subtree_label in labels.items():
        try:
            if tweet_id in all_posts:
                tree_path = os.path.join(PLURALISMO_ROOT, subtree_label, f"treefile_{tweet_id}_minf.json")
                with open(tree_path, encoding='utf-8') as tree_file:
                    opened_files += 1
                    print(f"File {tree_path} opened correctly")
                    tree_data = parseTwitterTree(tree_file)

                first = tree_data[0]

                # Remove retweets: nodes that carry the id of the root tweet.
                without_rt = [node for node in tree_data[1:] if node[0] != tweet_id]
                number_of_retweets += len(tree_data) - 1 - len(without_rt)
                # Remove nodes whose tweet was never loaded into all_posts.
                only_valid = [node for node in without_rt if node[0] in all_posts]
                number_of_invalid_tweets += len(without_rt) - len(only_valid)
                seqs_lens.append(len(only_valid))

                labeled_posts[tweet_id] = (labels[tweet_id], [first] + only_valid)
                number_of_tweets += 1
            else:
                no_in_data += 1

        except Exception as e:
            print(e)

    assert opened_files == len(labels)

    print("no_in_data              : ", no_in_data)  # labeled, but missing from the posts
    print("number_of_tweets        : ", number_of_tweets)
    print("all_posts               : ", len(all_posts))
    print("number_of_retweets      : ", number_of_retweets)  # within the propagation tree
    print("number_of_invalid_tweets: ", number_of_invalid_tweets)  # within the propagation tree

    if not seqs_lens:
        # np.bincount / min / max below would fail with an obscure message.
        raise ValueError(f"No propagation tree could be loaded from {PLURALISMO_ROOT}")

    # The neural network needs a fixed sequence length for its input, so
    # report statistics that help to choose one. seqs_lens only covers the
    # trees that were actually loaded (753 according to the original note).
    counts = np.bincount(seqs_lens)
    mode_seq_len = np.argmax(counts)
    mean_seq_len = int(np.mean(seqs_lens))
    min__seq_len = min(seqs_lens)
    max__seq_len = max(seqs_lens)

    print("len(seqs_lens)   : ", len(seqs_lens))
    print("min__seq_len: ", min__seq_len)
    print("max__seq_len: ", max__seq_len)
    print("mean_seq_len: ", mean_seq_len)
    print("mode_seq_len: ", mode_seq_len)

    tree_max_num_seq = mean_seq_len

    return (all_posts, labeled_posts, number_of_tweets, tree_max_num_seq, seqs_lens)

**Set up papermill parameters.** The cell below must be tagged with the 'parameters' tag. See: https://papermill.readthedocs.io/en/latest/usage-parameterize.html

In [None]:
# papermill parameters

# Must be set to True when running via papermill; it selects which tqdm
# flavour gets imported further down.
PAPERMILL = False

In [None]:
# Run settings.
# NOTE(review): "CHECKPOINT" is not read anywhere in the visible notebook; the
# generation loop iterates over the `checkpoints` list instead — confirm
# whether this dict is still needed.
settings = {
    "CHECKPOINT": "bert-base-uncased",
}

In [None]:
# Pick the tqdm flavour according to the PAPERMILL flag.
if not PAPERMILL:
    print("Importing auto tqdm")
    from tqdm.auto import tqdm
else:
    print("Importing plain tqdm")
    from tqdm import tqdm

In [None]:
# Load the posts and propagation trees once; later cells reuse these names.
_loaded = loadAllPosts(PLURALISMO_ROOT)
all_posts, labeled_posts, number_of_tweets, tree_max_num_seq, seqs_lens = _loaded

In [None]:
# Keep only the trees with fewer than 320 kept nodes and use the floor of
# their mean length as the fixed sequence length.
regular_seqs_lens = np.array([seq_len for seq_len in seqs_lens if seq_len < 320])
tree_max_num_seq = int(np.floor(regular_seqs_lens.mean()))
tree_max_num_seq

In [None]:
# Box plot and mean of the tree lengths, first for all trees and then for the
# trees that survived the length filter.
for _lens in (seqs_lens, regular_seqs_lens):
    plt.boxplot(_lens)
    plt.show()
    print(np.mean(_lens))

### Labels: true, false (omit 'imprecise' label)

In [None]:
# Labels kept for classification; 'imprecise' is deliberately left out.
categories = ['true', 'false']
# One index per category. Deriving the indices from `categories` keeps both
# lists the same length (a hard-coded [0, 1, 2] only worked because zip
# silently dropped the extra index).
encoded_labels = list(range(len(categories)))
idx2label = categories
label2idx = dict(zip(idx2label, encoded_labels))
num_categories = len(categories)

print(f"Number of categories: {num_categories}")
print(f"Labels to indices: {label2idx}")

In [None]:
def show_examples_per_label():
    """Print, for every label in ``idx2label``, its index and how many trees in ``labeled_posts`` carry it."""
    tree_labels = [entry[0] for entry in labeled_posts.values()]
    for label in idx2label:
        print(f"Label {label2idx[label]} ({label}): {tree_labels.count(label)}")

In [None]:
# Report how many labeled trees exist for each category.
show_examples_per_label()

In [None]:
# Module-level switches read by generate_XY_with_BERT.
# EMBEDDER: "RAW_BERT" or "SENTENCE_BERT" (None falls back to "RAW_BERT").
EMBEDDER = None
# EMBEDDER_STRATEGY: how the embedding is extracted for the chosen embedder
# (None falls back to that embedder's default strategy).
EMBEDDER_STRATEGY = None

In [None]:
def generate_XY_with_BERT(
    _all_posts,
    _emb_size,
    _number_of_tweets,
    _labeled_posts,
    _tree_max_num_seq,
    _categories,
    _bert_tokenizer,
    _bert_model,
    _sentence_model
):
    """
    Generate the X, Y matrices with embeddings, ready to feed a neural network.

    Every tree in ``_labeled_posts`` contributes one row: the texts of its
    first ``_tree_max_num_seq`` posts are pre-processed with the global
    ``text_processor`` and embedded one by one, and the label is one-hot
    encoded against ``_categories``.

    It relies on the global variables EMBEDDER and EMBEDDER_STRATEGY (their
    defaults are written back to the globals when they are None), and on the
    globals SEQ_PADDING and device:

    - if EMBEDDER is 'RAW_BERT' (default), ``_bert_tokenizer`` and
      ``_bert_model`` are used. EMBEDDER_STRATEGY selects the vector:
      'embedding' (default, token mean of the embedding-layer output),
      'pooler' (pooler output), 'second_to_last' (token mean of the
      second-to-last hidden layer) or 'sum_four_last' (sum of the token means
      of the last four hidden layers). The model must return hidden states.

    - if EMBEDDER is 'SENTENCE_BERT', ``_sentence_model.encode`` is used and
      the only strategy is 'default'.

    Parameters
    ----------
    _all_posts : dict
        Tweet id -> tweet payload; the "text" entry is embedded.
    _emb_size : int
        Size of one embedding vector.
    _number_of_tweets : int
        Number of rows allocated in X and Y.
    _labeled_posts : dict
        Root tweet id -> (label, list of tree nodes whose first item is a tweet id).
    _tree_max_num_seq : int
        Number of posts kept per tree (second dimension of X).
    _categories : list
        Labels, in the order of the one-hot columns of Y.
    _bert_tokenizer, _bert_model
        Tokenizer and model used by the 'RAW_BERT' embedder.
    _sentence_model
        Model used by the 'SENTENCE_BERT' embedder.

    Returns
    -------
    (X, Y)
        X: float32 array of shape (_number_of_tweets, _tree_max_num_seq, _emb_size).
        Y: float32 array of shape (_number_of_tweets, len(_categories)).
    """

    def to_category_vector(_category, _categories):
        """One-hot encode ``_category``; all zeros when it is not in ``_categories``."""
        vector = np.zeros(len(_categories)).astype(np.float32)
        for i, candidate in enumerate(_categories):
            if candidate == _category:
                vector[i] = 1.0
                break
        return vector

    def padOrTruncate(empty_tensor, max_num, orig_tensor):
        """Truncate to ``max_num`` rows or pad at the end with ``empty_tensor`` rows."""
        if not SEQ_PADDING:
            return orig_tensor

        missing = max_num - orig_tensor.size(0)
        if missing < 0:
            return orig_tensor[:max_num]
        return torch.cat((orig_tensor, empty_tensor.repeat(missing, 1)))

    def generateBERTEmbedding(docTexts):
        """Return a CPU tensor with one embedding row per text in ``docTexts``."""
        global EMBEDDER
        global EMBEDDER_STRATEGY

        # Resolve and validate the configuration once instead of once per text.
        if EMBEDDER is None:
            EMBEDDER = "RAW_BERT"
        assert EMBEDDER in ["RAW_BERT", "SENTENCE_BERT"]

        if EMBEDDER == "RAW_BERT":
            if EMBEDDER_STRATEGY is None:
                EMBEDDER_STRATEGY = "embedding"
            assert EMBEDDER_STRATEGY in ["embedding", "pooler", "second_to_last", "sum_four_last"]
        else:
            if EMBEDDER_STRATEGY is None:
                EMBEDDER_STRATEGY = "default"
            assert EMBEDDER_STRATEGY in ["default"]

        result = []
        for t in docTexts:
            preprocessed_text = " ".join(text_processor.pre_process_doc(t))

            if EMBEDDER == "RAW_BERT":
                tokenized = _bert_tokenizer.encode_plus(
                    preprocessed_text,
                    padding=False,
                    truncation=True,
                )

                ids = torch.LongTensor(tokenized["input_ids"]).unsqueeze(0).to(device)
                mask = torch.LongTensor(tokenized["attention_mask"]).unsqueeze(0).to(device)
                type_ids = (
                    torch.LongTensor(tokenized["token_type_ids"]).unsqueeze(0).to(device)
                )
                with torch.no_grad():
                    # https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutput
                    out = _bert_model(
                        input_ids=ids, attention_mask=mask, token_type_ids=type_ids
                    )
                # Index 0 is the embedding-layer output, index -1 the last layer.
                hidden_states = out["hidden_states"]

                if EMBEDDER_STRATEGY == "pooler":
                    embedding = out["pooler_output"].squeeze(0)
                elif EMBEDDER_STRATEGY == "sum_four_last":
                    per_layer = [torch.mean(hidden_states[i], dim=1) for i in (-1, -2, -3, -4)]
                    embedding = torch.stack(per_layer).sum(dim=0).squeeze(0)
                else:
                    # "second_to_last" must count from the end: index 2 (used
                    # before) is the output of the second encoder layer.
                    layer = 0 if EMBEDDER_STRATEGY == "embedding" else -2
                    embedding = torch.mean(hidden_states[layer], dim=1).squeeze(0)

                result.append(embedding.cpu().detach())

            elif EMBEDDER == "SENTENCE_BERT":
                # Use the model received as a parameter, not a same-named global.
                result.append(torch.Tensor(_sentence_model.encode(preprocessed_text)))

        # Every item is a 1-D vector, so stacking yields (len(docTexts), emb_size).
        return torch.stack(result)

    empty_tensor = torch.zeros([1, _emb_size])
    _num_categories = len(_categories)

    # Embed the first _tree_max_num_seq posts of every tree.
    print("Pre labeled_posts_awe")
    labeled_posts_awe = {
        k: (
            v[0],
            generateBERTEmbedding([_all_posts[node[0]]["text"] for node in v[1][0:_tree_max_num_seq]]),
        )
        for k, v in _labeled_posts.items()
    }
    print("Post labeled_posts_awe")

    # Pad or truncate every sequence.
    print("Pre padded_labeled_posts_awe")
    padded_labeled_posts_awe = {
        k: (v[0], padOrTruncate(empty_tensor, _tree_max_num_seq, v[1]))
        for k, v in labeled_posts_awe.items()
    }
    print("Post padded_labeled_posts_awe")

    # Allocate X and Y with zeros and the final shape, then fill them row by row.
    X = np.zeros(shape=(_number_of_tweets, _tree_max_num_seq, _emb_size)).astype(
        np.float32
    )
    Y = np.zeros(shape=(_number_of_tweets, _num_categories)).astype(np.float32)

    for idx, (label, embeddings) in enumerate(padded_labeled_posts_awe.values()):
        # Without padding a tree may hold fewer rows than _tree_max_num_seq;
        # the remaining rows of X stay at zero.
        kept = min(embeddings.size(0), _tree_max_num_seq)
        X[idx, :kept, :] = embeddings[:kept].numpy()
        Y[idx, :] = to_category_vector(label, _categories)

    print("X.shape: ", np.shape(X))
    print("Y.shape: ", np.shape(Y))
    return X, Y

In [None]:
# Task chains of the CLM01..CLM15 checkpoints, in numbering order.
_CLM_TASK_CHAINS = [
    "BotEN",
    "BotES",
    "StanceEN",
    "BotEN_BotES",
    "BotES_BotEN",
    "BotEN_StanceEN",
    "BotES_StanceEN",
    "StanceEN_BotEN",
    "StanceEN_BotES",
    "BotEN_BotES_StanceEN",
    "BotES_BotEN_StanceEN",
    "BotEN_StanceEN_BotES",
    "BotES_StanceEN_BotEN",
    "StanceEN_BotEN_BotES",
    "StanceEN_BotES_BotEN",
]

# (checkpoint name, short tag used in file names): the base model followed by
# the CLM checkpoints.
checkpoints = [("bert-base-multilingual-uncased", "BASE0")] + [
    (f"eprovidel/CLM{_number:02d}_v2_{_chain}", f"CLM{_number:02d}")
    for _number, _chain in enumerate(_CLM_TASK_CHAINS, start=1)
]

# (embedder, strategy) combinations to generate for every checkpoint.
embedders_strategy = [
    ("RAW_BERT", _raw_strategy)
    for _raw_strategy in ("embedding", "pooler", "second_to_last", "sum_four_last")
] + [("SENTENCE_BERT", "default")]

In [None]:
await send_telegram_notification(telegram_token, chat_id, "Started generating Pluralismo embeddings...")

In [None]:
# np.save does not create missing folders, so make sure the target exists
# before spending time on the first checkpoint.
os.makedirs(EMBEDDINGS_ROOT, exist_ok=True)

# Generate and save X / y for every (embedder, strategy) x checkpoint pair.
for _embedder, _strategy in embedders_strategy:
    # generate_XY_with_BERT reads these two globals.
    EMBEDDER = _embedder
    EMBEDDER_STRATEGY = _strategy
    print(">" * 80)
    print(f"Embedder: {_embedder} with strategy: {_strategy}")

    for _checkpoint, _shortcheckpoint in checkpoints:
        SAVE_SUFFIX = f"PLR_{_shortcheckpoint}_{TAG}_{EMBEDDER}_{EMBEDDER_STRATEGY}"
        print("#" * 60)
        print(f"Creating embedding: {SAVE_SUFFIX}")

        emb_size = None

        sentence_model = None
        word_embedding_model = None
        pooling_model = None
        if EMBEDDER == 'SENTENCE_BERT':
            word_embedding_model = models.Transformer(_checkpoint, max_seq_length=512)
            # Public accessor for the hidden size, instead of reaching into _modules.
            emb_size = word_embedding_model.get_word_embedding_dimension()
            pooling_model = models.Pooling(emb_size)
            sentence_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

        bert_model = None
        bert_tokenizer = None
        if EMBEDDER == 'RAW_BERT':
            bert_tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
            bert_model = AutoModel.from_pretrained(_checkpoint, output_hidden_states=True)
            bert_model.to(device)
            emb_size = bert_model.config.hidden_size

        print("Recomputing Xy")
        X, y = generate_XY_with_BERT(
            all_posts,
            emb_size,
            number_of_tweets,
            labeled_posts,
            tree_max_num_seq,
            categories,
            bert_tokenizer,
            bert_model,
            sentence_model,
        )
        np.save(f"{EMBEDDINGS_ROOT}/X_{SAVE_SUFFIX}.npy", X, allow_pickle=False)
        np.save(f"{EMBEDDINGS_ROOT}/y_{SAVE_SUFFIX}.npy", y, allow_pickle=False)

        print("Finished generating Xy")
        print(X.shape)
        print(y.shape)

        # Drop every reference to the models (the sentence model's sub-modules
        # included), collect them, and only then release the cached GPU memory.
        del bert_tokenizer
        del bert_model
        del sentence_model
        del word_embedding_model
        del pooling_model
        gc.collect()
        torch.cuda.empty_cache()

In [None]:
# Marker printed once every cell above has run.
print("End")