In [1]:
import torch
from transformers import DistilBertTokenizerFast
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [42]:
path_test = 'data_model_mlm/202206171000_test.csv'

In [43]:
def read_passages(path_data, test_size=0, max_length=None, mask_token_id=103):
    """Load pre-tokenized passages from a CSV and build masked-LM samples.

    The CSV must provide two columns whose cells are Python-literal lists
    stored as strings: ``embedding`` (the token ids of a passage) and
    ``masks`` (the positions inside that list to hide from the model).

    Args:
        path_data: Path of the CSV file to read.
        test_size: Forwarded to ``train_test_split`` when greater than 0.
            With the default 0 no split is made and the complete set is
            returned as both the train and the eval portion.
        max_length: Length every sequence is right-padded to with id 0.
            ``None`` (default) uses the notebook-level ``MAX_LENGTH``
            constant. Sequences that are already longer are not truncated.
        mask_token_id: Id written at the masked positions. The default 103
            is ``[MASK]`` in the ``distilbert-base-uncased`` vocabulary.

    Returns:
        ``(samples_train, samples_eval, labels_train, labels_eval)``. Each
        sample is an ``(input_ids, attention_mask)`` pair of 1-D
        ``torch.LongTensor`` of equal length; each label is the padded id
        sequence before masking.
    """
    if max_length is None:
        # Fall back to the constant the notebook defines outside this cell.
        max_length = MAX_LENGTH
    # end

    df = pd.read_csv(path_data)

    samples = []
    labels = []

    for str_masks, str_embedding in zip(df['masks'], df['embedding']):
        # NOTE(security): eval() executes whatever the CSV cell contains.
        # Only feed this function files you produced yourself; for untrusted
        # files switch to ast.literal_eval, which accepts the same list syntax.
        masks = eval(str_masks)
        embedding = eval(str_embedding)

        n_tokens = len(embedding)
        n_padding = max(0, max_length - n_tokens)

        token_ids = list(embedding) + [0] * n_padding
        # Attend to the real tokens only. The previous code started from an
        # all-ones mask and then tested the mask itself for 103 / 0, which can
        # never match, so padding was attended to like real input.
        attention = [1] * n_tokens + [0] * n_padding

        pt_label = torch.LongTensor(token_ids)
        pt_embedding = torch.LongTensor(token_ids)
        pt_attention = torch.LongTensor(attention)

        # Hide the selected positions in the model input; the label tensor
        # keeps the original ids at those positions.
        pt_embedding.index_fill_(0, torch.LongTensor(masks), mask_token_id)

        samples.append((pt_embedding, pt_attention))
        labels.append(pt_label)
    # end

    if test_size > 0:
        return train_test_split(samples, labels, test_size=test_size, random_state=234)
    # end

    # No split requested: the same lists serve as train and eval portions.
    return (samples, samples, labels, labels)
# end

In [45]:
samples_train, samples_eval, labels_train, labels_eval = read_passages(path_test, 0.2)

In [47]:
samples_train[0]

(tensor([  101,  2335, 15464,  2361,  3478,  2012,  2377, 21296,  1058,  2213,
          1041,  8873,  1050,  2615,  4168,  2335, 15464,  2361,  4708,  4638,
          1058,  2213,  3231,  3645,  2193, 12997,  4769,  4708,  4130,  2188,
          7309,  2573, 15327,  2019, 19307, 26237,  3645,  2193,  2019, 19307,
          5443, 27921,  2063,  2175,  2015, 27354,  2691,  1058,  2213,  2131,
         12997,  1061, 19968,  2193, 10611,  2334, 15006,  2102,  3478,  3478,
          2000,  2131,  1058,  2213,  3231,  3645,  2193, 12997,  4769,  2335,
         15464,  2361,  4708,  5604,  6164,  2349,  2000,  4945,  4708,  4130,
          2188,  7309,  2573, 15327,  2019, 19307, 26237,  3645,  2193,  2019,
         19307,  5443, 27921,  2063,  2175,  2015, 27354,  2691,  3231,  5343,
          1061, 19968,  2193, 10611,  2334, 15006,  2102,  3478,  6164,  5604,
          2043,  6164,  5604,  2043,  8246,  2003,  2275,  2000,  2995,  1999,
          3231,  2553, 21296,  1058,  2213,  1041,  

In [4]:
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)


pairs = [['good morning', 'my baby'],['good evening', 'my baby']]
tokenizer.batch_encode_plus(pairs, truncation=True, padding='max_length', max_length=512,
                                                  return_tensors='pt')

{'input_ids': tensor([[ 101, 2204, 2851,  ...,    0,    0,    0],
        [ 101, 2204, 3944,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [26]:
torch.IntTensor([1,2]) == 1

In [51]:
torch.IntTensor([1,2]).unsqueeze(0)

In [5]:
result = tokenizer.encode_plus('good morning my friend', truncation=True, max_length=512,
                                                  return_tensors='pt')

In [6]:
a = result.input_ids
b = a.squeeze(0).index_fill_(0, torch.LongTensor([1,2]), 0)
b.masked_fill_(b == 0, 1)

tensor([ 101,    1,    1, 2026, 2767,  102])

In [21]:
(a.squeeze(0) == 1).nonzero(as_tuple=True)[0]

tensor([1, 2])

In [23]:
a = torch.Tensor([[1,2],[3,4]])

In [73]:
torch.LongTensor([1]).tolist()[0]

1

In [83]:
# Original category label -> plain English word used as its stand-in.
DICT_labelOrigin_labelTarget = dict(
    infra='infrastructure',
    targetvm='machine',
    testcase='test',
    nimbus='environment',
    usererror='user',
    product='product',
    testbed='server',
)

In [84]:
# Map each original label to the vocabulary id of its stand-in word.
dict_labelorigin_codetarget = {
    label_origin: tokenizer.convert_tokens_to_ids(label_target)
    for label_origin, label_target in DICT_labelOrigin_labelTarget.items()
}
# convert_tokens_to_ids returns the [UNK] id for a word that is not a single
# vocabulary entry; fail loudly here instead of carrying [UNK] as a label.
assert tokenizer.unk_token_id not in dict_labelorigin_codetarget.values(), \
    'a target label is not a single token in the tokenizer vocabulary'

In [85]:
dict_labelorigin_codetarget

{'infra': 6502,
 'targetvm': 3698,
 'testcase': 3231,
 'nimbus': 4044,
 'usererror': 5310,
 'product': 4031,
 'testbed': 8241}

In [86]:
# Reverse lookup: vocabulary id -> original label.
dict_code_label = {code: label for label, code in dict_labelorigin_codetarget.items()}
# Two labels sharing one id would silently overwrite each other above.
assert len(dict_code_label) == len(dict_labelorigin_codetarget), \
    'two labels map to the same token id; the reverse lookup would drop one'

In [87]:
dict_code_label

{6502: 'infra',
 3698: 'targetvm',
 3231: 'testcase',
 4044: 'nimbus',
 5310: 'usererror',
 4031: 'product',
 8241: 'testbed'}

In [88]:
index_labels = sorted(dict_code_label.keys())

In [89]:
labels = [dict_code_label[i] for i in index_labels]

In [90]:
index_labels

[3231, 3698, 4031, 4044, 5310, 6502, 8241]

In [91]:
labels

['testcase', 'targetvm', 'product', 'nimbus', 'usererror', 'infra', 'testbed']

In [92]:
torch.index_select(torch.LongTensor([1,2,3]), 0, torch.LongTensor([0,2]))

tensor([1, 3])

In [93]:
torch.Tensor([1]).squeeze(0).item()

1.0

In [97]:
tokenizer.convert_ids_to_tokens([3231, 3698, 4031, 4044, 5310, 6502, 8241])

['test',
 'machine',
 'product',
 'environment',
 'user',
 'infrastructure',
 'server']

In [98]:
tokenizer.encode('good mornning asd d server')

[101, 2204, 22822, 23500, 3070, 2004, 2094, 1040, 8241, 102]