In [33]:
from transformers import BartTokenizer, BartForConditionalGeneration
import numpy as np

In [50]:
import random
import string

import pandas as pd
from nltk.corpus import stopwords

omitwords = [s for s in string.punctuation] + stopwords.words("english")


def _read(file):
    with open(file, "r") as f:
        return f.readlines()


def get_weights_and_n_remove(words, prop=0.15):
    """
    Gets weights for the sampling of the random words
    that will be masked. For that, it sets the weights
    of words belonging to punctuation to 0.
    Additionally, gets the number of words to be removed.
    """
    words_lower = [w.lower() for w in words]
    pos_omitwords = [(i, w) for i, w in enumerate(words_lower) if w in omitwords]
    pos_omitwords = [elem[0] for elem in pos_omitwords]
    divide = len(words) - len(pos_omitwords)
    weight = 1 / divide
    weights = [weight] * len(words_lower)
    for pos in pos_omitwords:
        weights[pos] = 0.0
    number_words = len(words) - len(pos_omitwords)
    n_remove = max(2, int(prop * number_words))
    return weights, n_remove


def _to_str(l):
    """Transforms sentence in list format to string, and replace multiple spaces"""
    return " ".join(l).replace(" +", " ")

def replace_multiple_masks(lista):
    newlist = []
    for i in range(len(lista)):
        if i == 0:
            newlist.append(lista[0])
        else:
            if lista[i] == "<mask>":
                if lista[i-1] != "<mask>":
                    newlist.append(lista[i])
            else:
                newlist.append(lista[i])
    return newlist

def perturb_sentence(s):
    """
    Perturb sentence in T5-fashion.
    First, it gets the weights for words sampling for masking, and the
    number of words to mask aswell. Then, it sets those word to <mask>.
    After that, masked tokens are replaced by extra_ids, concatenating
    those that appear together.
    With the masked sentence in list format (before substituting for
    extra_ids), we get the opposite sentence, which has words where
    masked sentence has <mask> and <mask> where masked sentence has
    words. Then, those masks in the opposite sentence (which will be
    the labels), are also transformed to extra_ids.

    Parameters
    ----------
    s: str
        Sentence to perturb.

    Returns
    -------
    new_sentence: str
        Sentence perturbed.
    opposite_sentence: str
        Labels of perturbed sentence.
    """
    s = s.replace("\n", "")
    new_sentence = []
    words = s.split(" ")
    weights, n_remove = get_weights_and_n_remove(words)
    words_mask = random.choices(words, weights=weights, k=n_remove)
    spans_lengths = np.random.poisson(lam=3.0, size=n_remove)
    print(spans_lengths)
    indexes_mask = []
    search = words.copy()
    for i in range(len(words_mask)):
        word_to_mask = words_mask[i]
        idx = search.index(word_to_mask)
        # search = search[idx + 1:]
        allidx = list(set([idx]+[idx+i_ for i_ in range(spans_lengths[i])]))
        indexes_mask.extend(allidx)
    new_sentence = words.copy()
    for idx in set(indexes_mask):
        print(new_sentence[idx])
        new_sentence[idx] = "<mask>"
    #opposite_sentence = get_opposite_sentence(new_sentence, words)
    #final_sentence = get_definitive_list(new_sentence)
    #final_opposite = get_definitive_list(opposite_sentence)
    new_sentence = replace_multiple_masks(new_sentence)
    return _to_str(new_sentence), s


def get_features_df(file):
    texts = _read(file)
    text_pairs = list(map(perturb_sentence, texts))
    df = pd.DataFrame(
        {
            "sentence": [pair[0] for pair in text_pairs],
            "labels": [pair[1] for pair in text_pairs],
        }
    )
    return df


In [3]:
texts_tr = _read("./data_2401/train.source")

In [4]:
texts_val = _read("./data_2401/val.source")

In [5]:
features_tr = get_features_df("./data_2401/train.source")

In [8]:
features_tr.iloc[0,0]

'Waffle-textured <mask> with a straight-cut neckline and adjustable straps with button fastening. a chest pouch pocket with a slogan print.'

In [9]:
texts_tr[0]

'Waffle-textured dungarees with a straight-cut neckline and adjustable straps with button fastening. Featuring a chest pouch pocket with a slogan print.\n'

In [51]:
perturb_sentence(texts_tr[0])

[1 3]
Waffle-textured
button
fastening.
Featuring


('<mask> dungarees with a straight-cut neckline and adjustable straps with <mask> a chest pouch pocket with a slogan print.',
 'Waffle-textured dungarees with a straight-cut neckline and adjustable straps with button fastening. Featuring a chest pouch pocket with a slogan print.')

In [13]:
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

In [14]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

In [15]:
device = "cuda:0"

In [16]:
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024, padding_idx=1)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_a

In [17]:
t1 = 'Waffle-textured dungarees with a straight-cut neckline and adjustable straps with <mask> fastening. Featuring a <mask> pouch pocket with a slogan print.'

In [18]:
t1_enc = tokenizer(t1, return_tensors="pt").input_ids

In [19]:
l1_enc = tokenizer("<mask> button <mask> chest <mask>", return_tensors="pt").input_ids

In [29]:
l2_enc = tokenizer('Waffle-textured dungarees with a straight-cut neckline and adjustable straps with button fastening. Featuring a chest pouch pocket with a slogan print.', return_tensors="pt").input_ids

In [30]:
out = model(input_ids=t1_enc.to(device), labels=l2_enc.to(device))

In [31]:
out.loss

tensor(0.9043, device='cuda:0', grad_fn=<NllLossBackward>)

In [23]:
out.loss

tensor(12.6317, device='cuda:0', grad_fn=<NllLossBackward>)

In [25]:
dir(tokenizer)

['SPECIAL_TOKENS_ATTRIBUTES',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_tokens',
 '_additional_special_tokens',
 '_batch_encode_plus',
 '_batch_prepare_for_model',
 '_bos_token',
 '_cls_token',
 '_convert_id_to_token',
 '_convert_token_to_id',
 '_convert_token_to_id_with_added_voc',
 '_decode',
 '_encode_plus',
 '_eos_token',
 '_eventual_warn_about_too_long_sequence',
 '_from_pretrained',
 '_get_padding_truncation_strategies',
 '_mask_token',
 '_pad',
 '_pad_token',
 '_pad_token_type_id',
 '_save_pretrained',
 '_sep_token',
 '_tokenize',
 '_unk_token',
 'add_prefix_space',
 'add_special_tokens',
 'add_tokens',
 '

In [26]:
tokenizer.SPECIAL_TOKENS_ATTRIBUTES

['bos_token',
 'eos_token',
 'unk_token',
 'sep_token',
 'pad_token',
 'cls_token',
 'mask_token',
 'additional_special_tokens']

In [27]:
tokenizer.mask_token

'<mask>'

In [32]:
def bart_perturb_sentence(s):
    """Perturbs a sentence in a BART-like fashion."""
    labels = s
    pass

In [None]:
from utils_bart_perturbation import get_features_df

In [None]:
df_tr = get_features_df("./data_2401/train.source")