In [1]:
#create python env

#conda install pip
#pip install transformers
#https://huggingface.co/docs/transformers/installation
#conda install -c anaconda pandas
#conda install numpy
#conda install -c conda-forge tqdm
#conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch
#conda install -c conda-forge matplotlib

#Dataset - SemEval 2010 task 8
#https://semeval2.fbk.eu/semeval2.php?location=data


In [2]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.optim import Adam

In [3]:
# Relation labels of the dataset. The position of a name in this list is the
# integer class id assigned when the files are parsed (labels_names.index(...)),
# so the order below must not be changed.
labels_names = [
    'Cause-Effect(e1,e2)',
    'Cause-Effect(e2,e1)',
    'Instrument-Agency(e1,e2)',
    'Instrument-Agency(e2,e1)',
    'Product-Producer(e1,e2)',
    'Product-Producer(e2,e1)',
    'Content-Container(e1,e2)',
    'Content-Container(e2,e1)',
    'Entity-Origin(e1,e2)',
    'Entity-Origin(e2,e1)',
    'Entity-Destination(e1,e2)',
    'Entity-Destination(e2,e1)',
    'Component-Whole(e1,e2)',
    'Component-Whole(e2,e1)',
    'Member-Collection(e1,e2)',
    'Member-Collection(e2,e1)',
    # NOTE: unlike the pairs above, the (e2,e1) direction comes first here
    'Message-Topic(e2,e1)',
    'Message-Topic(e1,e2)',
    'Other',
]

print(f"Number of relations {len(labels_names)}")

# Dataset dimensions: number of records read for each split
N_SAMPLE_TRAIN, N_SAMPLE_VAL, N_SAMPLE_TEST = 7109, 891, 2717

Number of relations 19


In [4]:
# Load the training file: the first N_SAMPLE_TRAIN records go to the training
# split, the following N_SAMPLE_VAL records go to the validation split.
# NOTE(review): PATH is a machine-specific absolute path; adjust it before
# running the notebook on another machine.
PATH = '/home/isiragusa/irene/NLP General/'
train_complete = []
val_complete = []
# The context manager guarantees the file handle is closed, even if parsing
# raises (unknown label, missing entity tag, ...).
with open(PATH+"dataset/SemEval2010/TRAIN_FILE.TXT", "r") as f:
    line = f.readline()
    for i in range(N_SAMPLE_TRAIN + N_SAMPLE_VAL):
        # a record is processed only if its first line starts with a digit
        if re.match(r'[0-9]', line) :
            # the sentence is the second tab-separated field
            phrase = line.split("\t")[1]
            # drop the first character and the last three characters
            # (presumably the opening quote and the closing `."` + newline
            # -- TODO confirm against the raw file)
            phrase = phrase[1:-3]

            # text enclosed by the <e1>/<e2> tags, in that order
            entities = []

            phs = re.search('<e1>(.+?)</e1>', phrase)
            if phs:
                entities.append(phs.group(1))
            phs = re.search('<e2>(.+?)</e2>', phrase)
            if phs:
                entities.append(phs.group(1))

            # remove the entity markup from the sentence
            phrase = phrase.replace("<e1>","").replace("</e1>","").replace("<e2>","").replace("</e2>","")

            # next line holds the relation name (trailing newline stripped)
            line = f.readline()
            rel = line[:-1]
            # indexing labels: raises ValueError if the name is not in labels_names
            rel = labels_names.index(rel)
            # read comment
            line = f.readline()
            # read space between phrases
            line = f.readline()
            # append the extracted elements; the split is decided by record position
            if i>= N_SAMPLE_TRAIN:
                val_complete.append((phrase, entities[0], entities[1], rel))
            else:
                train_complete.append((phrase, entities[0], entities[1], rel))
            # preparing for next block
            line = f.readline()

print("Number of train phrases "+str(len(train_complete)))
print("Number of validation phrases "+str(len(val_complete)))

Number of train phrases 7109
Number of validation phrases 891


In [5]:
# Load the test file: N_SAMPLE_TEST records, each parsed into
# (phrase, entity 1, entity 2, label id). Sentence ids, phrases and labels are
# also collected in separate parallel lists.
phrases_test = []
labels_test = []
id_test = []
test_complete = []
# The context manager guarantees the file handle is closed, even if parsing
# raises (unknown label, missing entity tag, ...).
with open(PATH+"dataset/SemEval2010/TEST_FILE.TXT", "r") as f:
    line = f.readline()
    for i in range(N_SAMPLE_TEST):
        # a record is processed only if its first line starts with a digit
        if re.match(r'[0-9]', line) :
            phrase = line.split("\t")
            # NOTE(review): `id` shadows the builtin for the rest of the
            # notebook; the name is kept so module-level state is unchanged.
            id = phrase[0]
            phrase = phrase[1]
            # drop the first character and the last three characters
            # (presumably the opening quote and the closing `."` + newline
            # -- TODO confirm against the raw file)
            phrase = phrase[1:-3]
            # text enclosed by the <e1>/<e2> tags, in that order
            entities = []

            phs = re.search('<e1>(.+?)</e1>', phrase)
            if phs:
                entities.append(phs.group(1))
            phs = re.search('<e2>(.+?)</e2>', phrase)
            if phs:
                entities.append(phs.group(1))

            # remove the entity markup from the sentence
            phrase = phrase.replace("<e1>","").replace("</e1>","").replace("<e2>","").replace("</e2>","")

            # next line holds the relation name (trailing newline stripped)
            line = f.readline()
            rel = line[:-1]
            # indexing labels: raises ValueError if the name is not in labels_names
            rel = labels_names.index(rel)
            # read comment
            line = f.readline()
            # read space between phrases
            line = f.readline()
            # append the new tuple
            phrases_test.append([phrase])
            labels_test.append([rel])
            id_test.append(id)
            test_complete.append((phrase, entities[0], entities[1],rel))
            # preparing for next block
            line = f.readline()

print("Number of test phrases "+str(len(test_complete)))

Number of test phrases 2717


In [6]:
# Turn the parsed tuples into one DataFrame per split; all three share the
# same column layout.
record_columns = ['Phrase', 'E1', 'E2', 'Label']
df_train = pd.DataFrame.from_records(train_complete, columns=record_columns)
df_val = pd.DataFrame.from_records(val_complete, columns=record_columns)
df_test = pd.DataFrame.from_records(test_complete, columns=record_columns)

# Persist every split as CSV (without the index column), in train/val/test order.
for frame, target in (
    (df_train, "Esercitazione/train_cured.csv"),
    (df_val, "Esercitazione/val_cured.csv"),
    (df_test, "Esercitazione/test_cured.csv"),
):
    frame.to_csv(PATH+target, index=False)

In [7]:
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Checkpoint identifier shared by the tokenizer, the configuration and the encoder
model_name = "bert-base-uncased"

# The tokenizer does not depend on the configuration, so it is loaded first
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The encoder is instantiated from the checkpoint together with its configuration
config = AutoConfig.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name, config=config)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
print("Loading "+model_name+" ...")
# Always bind `device`: use the first GPU when CUDA is available, otherwise
# fall back to the CPU so that the following cells (which call `.to(device)`)
# also run on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print(model_name+" loaded")

Loading bert-base-uncased ...
bert-base-uncased loaded


In [12]:
# Tokenization demo on a single training sentence
i = 7
sent = df_train.loc[i, 'Phrase']
print(sent)

# word-piece tokens as strings
tokens = tokenizer.tokenize(sent)
print(tokens)

# encoded ids first without, then with, the special tokens; after the loop
# `tokens_ids` holds the encoding that includes the special tokens
for with_special in (False, True):
    tokens_ids = tokenizer(sent, add_special_tokens=with_special, return_tensors="pt")
    print(tokens_ids)

People have been moving back into downtown
['people', 'have', 'been', 'moving', 'back', 'into', 'downtown']
{'input_ids': tensor([[2111, 2031, 2042, 3048, 2067, 2046, 5116]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[ 101, 2111, 2031, 2042, 3048, 2067, 2046, 5116,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [13]:
# Forward pass of the demo sentence through the encoder, without gradients.
input = tokens_ids['input_ids'].to(device)
att_mask = tokens_ids['attention_mask'].to(device)
with torch.no_grad():
    last_hidden_states = model(input, attention_mask=att_mask)

# bring the input tensors back to the CPU
input, att_mask = input.detach().cpu(), att_mask.detach().cpu()

# element 0 of the model output, first sentence, first token position
emb = last_hidden_states[0][0, 0].detach().cpu()

In [None]:
# Display the raw object returned by the forward pass of the demo sentence
last_hidden_states

In [15]:
# Shape of element 0 of the model output; for the demo sentence this is
# 1 sentence x 9 token ids (special tokens included) x 768 features
last_hidden_states[0].shape

torch.Size([1, 9, 768])

In [17]:
# `emb` is the vector at token position 0 of the demo sentence
emb.shape

torch.Size([768])

In [None]:
def gen_tokens_mask(df, tokenizer):
    """Tokenize every entry of the 'Phrase' column, one call per phrase.

    Args:
        df: object indexable with 'Phrase', yielding the phrases to encode.
        tokenizer: callable invoked as
            tokenizer(phrase, add_special_tokens=True, return_tensors="pt").

    Returns:
        list with one tokenizer result per phrase, in the original order.
    """
    # materialize the column so the progress bar knows the total
    sentences = list(df['Phrase'])
    return [
        tokenizer(sentence, add_special_tokens=True, return_tensors="pt")
        for sentence in tqdm(sentences, desc='generating tokens')
    ]

# Tokenize the three splits, in train / validation / test order
tokenized_train, tokenized_val, tokenized_test = (
    gen_tokens_mask(split_df, tokenizer)
    for split_df in (df_train, df_val, df_test)
)

def gen_embeddings_doc(tok_attention_mask, model, device):
    """Encode each tokenized phrase and collect the vector of its first token.

    Every phrase is run through `model` on its own, without gradient tracking.
    From element 0 of the model output, the vector at batch row 0 and token
    position 0 is moved to the CPU and kept.

    Args:
        tok_attention_mask: sequence of tokenizer results; each item is
            indexed with 'input_ids' and 'attention_mask'. Values may be
            tensors (on any device), NumPy arrays or nested lists; they are
            assumed to be shaped (1, seq_len) since only batch row 0 is read
            -- TODO confirm with the caller.
        model: callable invoked as model(input_ids, attention_mask=...).
            NOTE(review): the function does not switch the model to eval
            mode; the caller is expected to have done so if needed.
        device: device on which the forward pass is executed.

    Returns:
        Tensor stacking one CPU vector per phrase, in input order.
        torch.stack is applied to the collected list, so an empty
        `tok_attention_mask` is not supported.
    """
    phrase_embeddings = []
    for encoded in tqdm(tok_attention_mask, desc='gen embeddings phrases'): # for sentence
        # as_tensor reuses an existing tensor and converts arrays/lists,
        # avoiding a tensor -> NumPy -> tensor round trip
        input_ids = torch.as_tensor(encoded['input_ids']).to(device)
        att_mask = torch.as_tensor(encoded['attention_mask']).to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=att_mask)
        # keep only the first-token vector, detached and on the CPU
        phrase_embeddings.append(outputs[0][0, 0, :].detach().cpu())

    return torch.stack(phrase_embeddings)

# Compute the phrase embeddings of the three splits, in train / validation / test order
embeddings_train, embeddings_val, embeddings_test = (
    gen_embeddings_doc(tokenized_split, model, device)
    for tokenized_split in (tokenized_train, tokenized_val, tokenized_test)
)