# (Colab)

In [None]:
# Colab-only: mount Google Drive so that the project folder is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the Hugging Face `transformers` package (provides AutoTokenizer / AutoModel imported below).
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 36.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.1-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
# Default project root on Google Drive. The "Path assignment" cell further down re-assigns
# dir_path from user input, so this value only matters if that cell is skipped.
dir_path = "/content/drive/My Drive/Tirocinio NLP e tesi"

# Imports

In [None]:
import copy

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
import torch
import random
import numpy as np
import pandas as pd
import re
import os
import itertools

# Reproducibility

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def make_reproducible(seed=0):
    """Seed every random number generator used here and request deterministic kernels.

    Args:
        seed: Integer seed applied to ``torch``, ``random`` and ``numpy``.
            Defaults to 0, the value this notebook has always used, so calling
            ``make_reproducible()`` without arguments behaves as before.
    """
    if device == 'cuda':
        # cuBLAS needs a fixed workspace configuration to be deterministic
        # (see the PyTorch reproducibility notes).
        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":16:8"
    torch.manual_seed(seed)
    torch.backends.cudnn.benchmark = False  # autotuning may pick different kernels run to run
    torch.use_deterministic_algorithms(True)
    random.seed(seed)
    np.random.seed(seed)
make_reproducible()

# Functions

### Data loading

In [None]:
def load_data(goal_class, goal_phen, data_folder_path):
    """Read every ``*.tsv`` file of a folder and build token / label sequences.

    Each file is expected to have one header line, ``#``-prefixed lines between
    sentences and one tab-separated row per token. Column 0 is the token id,
    column 1 the token form, columns 10/11 the negation cue/scope and columns
    12/13 the speculation cue/scope. Rows whose id contains ``-`` or whose
    target column contains ``DESCR.`` are skipped, and blank lines are ignored.

    Files are processed in sorted path order so that the returned lists (and
    therefore any later seeded split) do not depend on the arbitrary order in
    which ``os.scandir`` yields directory entries.

    Args:
        goal_class: ``'cue'`` or ``'scope'``.
        goal_phen: ``'neg'`` or ``'spec'``.
        data_folder_path: Folder containing the ``.tsv`` files.

    Returns:
        ``(sentences, labels)``. For ``'cue'`` every sentence is a list of token
        forms and every label list holds the raw cue tags. For ``'scope'`` one
        copy of the sentence is emitted per cue referenced in the scope column,
        with the cue tokens prefixed by a marker string and binary labels
        (1 = token is in the scope of that cue).

    Raises:
        ValueError: If ``goal_class`` or ``goal_phen`` is not an accepted value,
            or if a scope refers to a token id missing from its sentence.
    """
    if goal_class not in ('cue','scope'):
        raise ValueError("'goal_class' must be either 'cue' or 'scope'.")
    if goal_phen not in ('neg','spec'):
        raise ValueError("'goal_phen' must be either 'neg' or 'spec'.")

    def check_and_augment_data(begin_cue, inside_cue, begin_augment, inside_augment, cues, sentence_labels, sentence, token_ids, sentences, labels):
        """Append one marked copy of ``sentence`` (plus 0/1 scope labels) per referenced cue."""
        if begin_cue in cues:
            unique_values = [] # a list (not a set) keeps the order stable across executions
            for scope in sentence_labels:
                for individual_scope in scope.split(','):
                    individual_scope = individual_scope.strip()
                    if individual_scope != '_' and individual_scope not in unique_values:
                        unique_values.append(individual_scope)
            for value in sorted(unique_values):
                # Position of the cue inside the (possibly filtered) sentence. Using the
                # position instead of int(value) keeps cues/tokens aligned even when
                # some rows of the sentence were skipped.
                cue_position = token_ids.index(value)
                sentence_augmented = list(sentence)
                sentence_augmented[cue_position] = begin_augment + sentence[cue_position]
                for position in range(cue_position + 1, len(cues)):
                    if cues[position] == begin_cue:
                        break # the next cue starts here
                    if cues[position] == inside_cue:
                        sentence_augmented[position] = inside_augment + sentence[position]
                sentences.append(sentence_augmented)
                scopes = []
                for scope in sentence_labels:
                    token_scopes = [individual_scope.strip() for individual_scope in scope.split(',')]
                    scopes.append(1 if str(value) in token_scopes else 0)
                labels.append(scopes)
        return sentences, labels

    def read_sentences(filepath, class_index):
        """Yield ``(token_ids, forms, cues, targets)`` for every non-empty sentence of a file."""
        with open(filepath, 'r', encoding='utf-8') as file:
            file.readline() # throw away headers
            token_ids, forms, cues, targets = [], [], [], []
            for line in file:
                if not line.strip():
                    continue # blank separator line: nothing to parse
                if line[0] == '#':
                    # a comment line closes the sentence collected so far
                    if forms:
                        yield token_ids, forms, cues, targets
                        token_ids, forms, cues, targets = [], [], [], []
                    continue
                split_line = line.split('\t')
                if ('-' not in split_line[0]) and ('DESCR.' not in split_line[class_index]):
                    token_ids.append(split_line[0])
                    forms.append(split_line[1])
                    cues.append(split_line[class_index-1])
                    targets.append(split_line[class_index])
            if forms:
                yield token_ids, forms, cues, targets

    if goal_phen == 'neg':
        cue_index, scope_index = 10, 11
        begin_cue, inside_cue = 'BN', 'IN'
        begin_augment, inside_augment = "token[B-NEG]", "token[I-NEG]"
    else:
        cue_index, scope_index = 12, 13
        begin_cue, inside_cue = 'BS', 'IS'
        begin_augment, inside_augment = "token[B-SPEC]", "token[I-SPEC]"
    class_index = cue_index if goal_class == 'cue' else scope_index

    with os.scandir(data_folder_path) as entries:
        filepaths = sorted(entry.path for entry in entries if entry.name.endswith('tsv'))

    sentences = []
    labels = []
    for filepath in filepaths:
        for token_ids, forms, cues, targets in read_sentences(filepath, class_index):
            if goal_class == 'cue':
                sentences.append(forms)
                labels.append(targets)
            else:
                sentences, labels = check_and_augment_data(begin_cue, inside_cue, begin_augment, inside_augment, cues, targets, forms, token_ids, sentences, labels)

    return sentences, labels

In [None]:
def load_undersampled_data(data_folder_path, goal_class = 'cue', goal_phen = 'spec', num_zero_sentences = 0):
    """Load speculation-cue sentences, keeping only a capped number of cue-free ones.

    Every sentence containing at least one ``'BS'`` tag (column 12) is kept.
    Sentences without one are kept only until ``num_zero_sentences`` of them
    have been collected across all files. Files are read in sorted path order
    so the selection is reproducible; blank lines and empty sentences are
    skipped.

    Args:
        data_folder_path: Folder containing the ``.tsv`` files.
        goal_class: Only ``'cue'`` is implemented.
        goal_phen: ``'neg'`` is not implemented; any other value reads column 12.
        num_zero_sentences: Maximum number of cue-free sentences to keep.

    Returns:
        ``(sentences, labels)``: parallel lists of token forms and raw cue tags.

    Raises:
        NotImplementedError: For ``goal_class != 'cue'`` or ``goal_phen == 'neg'``.
    """
    if goal_class != 'cue' or goal_phen == 'neg':
        raise NotImplementedError()
    class_index = 12

    sentences = []
    labels = []
    undersampling_counter = 0

    def keep_or_discard(sentence, sentence_labels):
        """Store the sentence if it has a cue or the cue-free quota is not exhausted."""
        nonlocal undersampling_counter
        if not sentence:
            return
        if 'BS' in sentence_labels:
            sentences.append(sentence)
            labels.append(sentence_labels)
        elif undersampling_counter < num_zero_sentences:
            sentences.append(sentence)
            labels.append(sentence_labels)
            undersampling_counter += 1

    with os.scandir(data_folder_path) as entries:
        filepaths = sorted(entry.path for entry in entries if entry.name.endswith('tsv'))

    for filepath in filepaths:
        with open(filepath, 'r', encoding='utf-8') as file:
            file.readline() # throw away headers
            sentence = []
            sentence_labels = []
            for line in file:
                if not line.strip():
                    continue # blank separator line
                if line[0] == '#':
                    # a comment line closes the sentence collected so far
                    keep_or_discard(sentence, sentence_labels)
                    sentence = []
                    sentence_labels = []
                    continue
                split_line = line.split('\t')
                if '-' not in split_line[0] and 'DESCR.' not in split_line[class_index]:
                    sentence.append(split_line[1])
                    sentence_labels.append(split_line[class_index])
            # last sentence of the file (no closing comment line follows it)
            keep_or_discard(sentence, sentence_labels)

    return sentences, labels

### Label mapping (for cue detection)

In [None]:
def label_mapping(labels):
    """Convert IOB2 cue tags to integer classes.

    ``'O'`` becomes 0, any tag starting with ``'B'`` becomes 1 and any tag
    starting with ``'I'`` becomes 2.

    Args:
        labels: List of per-sentence lists of tag strings.

    Returns:
        A new list of lists with the same shape holding the integer classes.

    Raises:
        ValueError: If a tag is neither ``'O'`` nor starts with ``'B'``/``'I'``.
    """
    def to_class(tag):
        if tag == 'O':
            return 0
        prefix = tag[0]
        if prefix == 'B':
            return 1
        if prefix == 'I':
            return 2
        raise ValueError("Found inconsistent value in 'labels' (labels are supposed to be IOB2 tags).")

    # Validate and translate each distinct tag once, before building the output.
    distinct_tags = {tag for sentence_tags in labels for tag in sentence_tags}
    class_of = {tag: to_class(tag) for tag in distinct_tags}
    return [[class_of[tag] for tag in sentence_tags] for sentence_tags in labels]

### BERT tokenization

In [None]:
def bert_tokenization(tokenizer, text, labels):
    """Tokenize pre-split sentences and expand word labels to one label per token.

    The alignment is driven by the tokenizer's offset mapping:

    * a token whose end offset is 0 (special token or padding) gets ``-100``;
    * a token whose start offset is 0 opens a new word and takes that word's label;
    * any other token continues the current word and repeats its label.

    Unlike the previous implementation the input ``labels`` lists are left
    untouched; the aligned labels are returned as new lists whose length always
    equals the number of tokens (so truncated sentences no longer carry
    left-over word labels at the end).

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``'offset_mapping'``.
        text: List of sentences, each a list of words.
        labels: List of per-sentence word labels, parallel to ``text``.

    Returns:
        ``(tokens_dict, aligned_labels)``.

    Raises:
        IndexError: If a sentence has more words than labels.
        ValueError: If a continuation token appears before any word start.
    """
    tokens_dict = tokenizer(text, return_tensors ='pt', is_split_into_words=True, return_offsets_mapping=True, padding='max_length', truncation=True)
    aligned_labels = []
    for sentence_counter, mapping in enumerate(tokens_dict['offset_mapping']):
        word_labels = labels[sentence_counter]
        next_word = 0
        have_word = False
        label = None
        sentence_aligned = []
        for indices in mapping:
            start, end = int(indices[0]), int(indices[1])
            if end == 0:
                sentence_aligned.append(-100)
            elif start == 0:
                label = word_labels[next_word]
                next_word += 1
                have_word = True
                sentence_aligned.append(label)
            else:
                if not have_word:
                    raise ValueError("Found a sub-word continuation token before the first word of the sentence.")
                sentence_aligned.append(label)
        aligned_labels.append(sentence_aligned)
    return tokens_dict, aligned_labels

### Dev/train/test split

In [None]:
def split(text, labels, test_size, random_state, stratify=None):
    """Split sentences and labels into two partitions with ``train_test_split``.

    Args:
        text: Either a dict with the keys ``'input_ids'``, ``'token_type_ids'``,
            ``'attention_mask'`` and ``'offset_mapping'`` (tokenizer output), or
            any sequence accepted by ``train_test_split``.
        labels: Per-sentence label lists, parallel to ``text``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.
        stratify: ``None`` for a plain split. ANY other value (its content is
            ignored) requests a stratified split on a sentence-level flag.

    Returns:
        The list returned by ``train_test_split``: for a dict input, two
        partitions for each of the four tensors followed by the two label
        partitions; otherwise the two text and the two label partitions.
    """
    if stratify is not None:
        # The labels in 'labels' refer to tokens, but whole sentences are allocated to
        # dev/train/test, so stratification needs a sentence-level label: 1 if the
        # sentence contains at least one label equal to 1, 0 otherwise.
        stratify = [1 if 1 in label_list else 0 for label_list in labels]
    if isinstance(text, dict):
        return train_test_split(text['input_ids'], text['token_type_ids'], text['attention_mask'], text['offset_mapping'], labels, test_size = test_size, random_state = random_state, stratify = stratify)
    return train_test_split(text, labels, test_size = test_size, random_state = random_state, stratify = stratify)

### Functions for training

In [None]:
def train_and_evaluate(model, num_epochs, loss_function, optim, gradient_accumulation_steps, goal_class, goal_phen, models_folder_path, logs_folder_path, patience, use_f1_as_stopping_metric, hyper_dict, data_loader_train, data_loader_val, device):
    """Train ``model`` with early stopping, logging every epoch and saving the best weights.

    A run id is derived from the log file ``log_{goal_class}_{goal_phen}.txt``
    (last ``***ID_NUMBER=n***`` entry plus one, or 0 when there is none). The
    hyper-parameters and the per-epoch reports are appended to that log, and
    the best weights are saved to
    ``best_model_{goal_class}_{goal_phen}_{id}.pt`` in ``models_folder_path``.

    Args:
        model: Module called as ``model(batch)`` and returning one row of class
            scores per token of the batch.
        num_epochs: Maximum number of epochs.
        loss_function: Called as ``loss_function(scores, flat_labels)``.
        optim: Optimizer over the model parameters.
        gradient_accumulation_steps: Number of batches whose gradients are
            summed before each optimizer step (the loss is not rescaled).
        goal_class: ``'cue'`` or ``'scope'``.
        goal_phen: ``'neg'`` or ``'spec'``.
        models_folder_path: Folder receiving the saved weights.
        logs_folder_path: Folder holding the log file.
        patience: Number of non-improving epochs tolerated before stopping.
        use_f1_as_stopping_metric: If truthy, model selection uses validation
            F1 (higher is better); otherwise validation loss (lower is better).
        hyper_dict: Mapping written to the log header.
        data_loader_train: Sized iterable of training batches; each batch is a
            mapping with ``'labels'`` and ``'offset_mapping'`` tensors.
        data_loader_val: Sized iterable of validation batches.
        device: Device the scores are moved to before computing the loss.

    Raises:
        ValueError: If ``goal_class`` or ``goal_phen`` is not an accepted value.
    """

    def print_log(epoch_number, train_loss, val_loss, cr_train, cm_train, cr_val, cm_val, log_file_path):
        """Append the epoch reports to the log file and print a one-line summary."""
        cr_train = cr_train.split('\n')
        cr_train[0] = cr_train[0]+" "*2+f"loss: {train_loss:.8f}"
        cr_train = "\n".join(cr_train)
        cr_val = cr_val.split('\n')
        cr_val[0] = cr_val[0]+" "*2+"loss:"+" "*18+f"{val_loss:.8f}"
        cr_val = "\n".join(cr_val)
        with open(log_file_path,'a') as file:
            print(f"Epoch {epoch_number} (training)", file=file)
            print(cr_train, file=file)
            print(cm_train, file=file)
            print(f"Epoch {epoch_number} (validation)", file=file)
            print(cr_val, file=file)
            print(cm_val, file=file)
        print(f"Epoch {epoch_number} (training/validation): loss: {train_loss}/{val_loss}")

    def predict_for_original_tokens(batch, predicted_labels, y_true, y_pred):
        """Collapse token-level scores to one prediction per original word.

        The class probabilities of all word pieces of a word are averaged (for
        any number of classes, including the last word of the batch) and the
        arg-max is appended to ``y_pred``; the label of the first word piece is
        appended to ``y_true``.
        """
        probs = torch.nn.Softmax(dim=1)(predicted_labels).detach().cpu().tolist()
        batch_labels = torch.reshape(batch['labels'],(-1,)).tolist()
        offsets = batch['offset_mapping'].flatten(end_dim=1).tolist()
        word_prob_sums = []     # per word: running sum of class probabilities
        word_piece_counts = []  # per word: number of word pieces summed so far
        for position, (start, end) in enumerate(offsets):
            if start == 0 and end != 0:
                # first word piece of a new word
                word_prob_sums.append(list(probs[position]))
                word_piece_counts.append(1)
                y_true.append(batch_labels[position])
            elif start != 0:
                # continuation word piece: accumulate into the current word
                running = word_prob_sums[-1]
                for class_index, prob in enumerate(probs[position]):
                    running[class_index] += prob
                word_piece_counts[-1] += 1
        if word_prob_sums:
            word_probs = [[total / count for total in sums] for sums, count in zip(word_prob_sums, word_piece_counts)]
            y_pred.extend(torch.argmax(torch.tensor(word_probs), dim=1).tolist())
        return y_true, y_pred

    if goal_class not in ('cue','scope'):
        raise ValueError("'goal_class' must be either 'cue' or 'scope'.")
    if goal_phen not in ('neg','spec'):
        raise ValueError("'goal_phen' must be either 'neg' or 'spec'.")
    # Speculation-cue detection is the only 3-class task and is scored with macro F1.
    cue_spec_detection = goal_class == 'cue' and goal_phen == 'spec'

    log_file_path = os.path.join(logs_folder_path, f'log_{goal_class}_{goal_phen}.txt')
    id_model = 0  # also used when the log exists but holds no id line yet
    if os.path.exists(log_file_path):
        with open(log_file_path, 'r') as file:
            for line in file:
                if "***ID_NUMBER=" in line:
                    id_model = int(re.search(r'[0-9]+',line)[0])+1
    with open(log_file_path, 'a') as file:
        print('------------------------------------------------------------------------------', file=file)
        print(f"***ID_NUMBER={id_model}***",file=file)
        for key in hyper_dict:
            print(f'{key} : {hyper_dict[key]}', file=file)

    best_model_file_path = os.path.join(models_folder_path, f'best_model_{goal_class}_{goal_phen}_{id_model}.pt')
    best_val_loss = float('inf')
    best_val_f1 = float('-inf')
    best_model_string = "No model was saved: the stopping metric never improved."
    patience_counter_val = patience
    early_stopping = False
    num_train_batches = len(data_loader_train)

    for epoch in range(num_epochs):
        # ---- training ----
        y_true = []
        y_pred = []
        model.train()
        train_loss = 0
        optim.zero_grad()  # never start an epoch with stale gradients
        for batch_counter, batch in enumerate(data_loader_train):
            predicted_labels = model(batch)
            loss = loss_function(predicted_labels.to(device), torch.reshape(batch['labels'],(-1,)))
            loss.backward()
            train_loss += loss.item()
            y_true, y_pred = predict_for_original_tokens(batch, predicted_labels, y_true, y_pred)
            # Step AFTER the gradients of the current batch are accumulated, every
            # `gradient_accumulation_steps` batches and once more on the last batch.
            if (batch_counter + 1) % gradient_accumulation_steps == 0 or (batch_counter + 1) == num_train_batches:
                optim.step()
                optim.zero_grad()
        train_loss /= num_train_batches
        cr_train = classification_report(y_true, y_pred, zero_division=0)
        cm_train = confusion_matrix(y_true, y_pred)

        # ---- validation ----
        model.eval()
        y_true = []
        y_pred = []
        val_loss = 0
        with torch.no_grad():
            for batch in data_loader_val:
                predicted_labels = model(batch)
                val_loss += loss_function(predicted_labels.to(device), torch.reshape(batch['labels'],(-1,))).item()
                y_true, y_pred = predict_for_original_tokens(batch, predicted_labels, y_true, y_pred)
        val_loss /= len(data_loader_val)
        cr_val = classification_report(y_true, y_pred, zero_division=0)
        cm_val = confusion_matrix(y_true, y_pred)
        print_log(epoch, train_loss, val_loss, cr_train, cm_train, cr_val, cm_val, log_file_path)

        # ---- model selection / early stopping ----
        if cue_spec_detection:
            val_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        else:
            val_f1 = f1_score(y_true, y_pred, zero_division=0)
        if use_f1_as_stopping_metric:
            improved = val_f1 >= best_val_f1
            if improved:
                best_val_f1 = val_f1
        else:
            improved = val_loss <= best_val_loss
            if improved:
                best_val_loss = val_loss
        if improved:
            torch.save(model.state_dict(), best_model_file_path)
            patience_counter_val = patience
            best_model_string = f"Best model was obtained at epoch {epoch}, with validation f1 {val_f1} and loss {val_loss}"
        elif patience_counter_val>0:
            patience_counter_val -= 1
        if patience_counter_val == 0:
            print(f"Early stopping at epoch {epoch}. Model parameters saved in 'best_model_{goal_class}_{goal_phen}_{id_model}.pt'. {best_model_string}")
            early_stopping = True
            with open(log_file_path, 'a') as file:
                print(f"Early stopping at epoch {epoch}. {best_model_string}", file=file)
            break

    if not early_stopping:
        print(f"Training completed. Model parameters saved in 'best_model_{goal_class}_{goal_phen}_{id_model}.pt'. {best_model_string}")
        with open(log_file_path, 'a') as file:
            print(f"{best_model_string}", file=file)

In [None]:
def grid_search(parameters, goal_class, goal_phen, models_folder_path, logs_folder_path, device, data_folder_path=None, num_epochs=100, patience=30):
    """Train and evaluate one model for every combination of hyper-parameters.

    Args:
        parameters: Sequence of nine iterables, in this order: RNN hidden size,
            number of RNN layers, dropout probability, learning rate, weight
            decay, batch size, gradient accumulation steps, name of the
            pretrained encoder, and whether F1 is the stopping metric.
        goal_class: ``'cue'`` or ``'scope'``.
        goal_phen: ``'neg'`` or ``'spec'``.
        models_folder_path: Folder receiving the saved weights.
        logs_folder_path: Folder holding the training log.
        device: Device the model is moved to.
        data_folder_path: Folder with the ``.tsv`` data. When omitted, the
            notebook-level variable of the same name is used, as before.
        num_epochs: Maximum number of epochs per configuration.
        patience: Early-stopping patience per configuration.
    """
    if data_folder_path is None:
        # Backward compatibility: earlier versions read the notebook-level global.
        data_folder_path = globals()['data_folder_path']

    configs = list(itertools.product(*parameters))
    print(f"There's {len(configs)} configurations in total. Gonna start running them now...")
    for num_config, config in enumerate(configs):
        make_reproducible() # every configuration starts from the same RNG state

        (hidden_size_rnn, num_layers_rnn, dropout_prob, learning_rate, weight_decay,
         batch_size, gradient_accumulation_steps, bert_name, use_f1_as_stopping_metric) = config[:9]

        # The data is reloaded for every configuration because the tokenization
        # depends on the encoder chosen by this configuration.
        sentences, labels = load_data(goal_class, goal_phen, data_folder_path)
        if goal_class == 'cue':
            labels = label_mapping(labels)

        tokenizer = AutoTokenizer.from_pretrained(bert_name)
        bert = AutoModel.from_pretrained(bert_name)

        # Output size of the whole model: speculation-cue detection is currently the
        # only task with 3 possible classes (all others have 2).
        output_dim = 3 if goal_class == 'cue' and goal_phen == 'spec' else 2
        # -100 is the label assigned to the encoder's special tokens ([CLS], [SEP]) and padding
        loss_function = torch.nn.CrossEntropyLoss(ignore_index=-100).to(device)

        tokens_dict, labels = bert_tokenization(tokenizer, sentences, labels)
        tokens_dict = dict(tokens_dict)

        # First split: dev vs. test; second split: train vs. validation (inside dev).
        stratify = True if goal_class == 'cue' else None
        x_dev = {}
        x_test = {}
        x_dev['input_ids'], x_test['input_ids'], x_dev['token_type_ids'], x_test['token_type_ids'], x_dev['attention_mask'], x_test['attention_mask'], x_dev['offset_mapping'], x_test['offset_mapping'], labels_dev, labels_test = split(tokens_dict, labels, test_size=0.2, random_state=1, stratify = stratify)
        x_train = {}
        x_val = {}
        x_train['input_ids'], x_val['input_ids'], x_train['token_type_ids'], x_val['token_type_ids'], x_train['attention_mask'], x_val['attention_mask'], x_train['offset_mapping'], x_val['offset_mapping'], labels_train, labels_val = split(x_dev, labels_dev, test_size=0.2, random_state=1, stratify = stratify)

        data_loader_train = torch.utils.data.DataLoader(Dataset(x_train, labels_train),
                                                        batch_size = batch_size, shuffle = True)
        data_loader_val = torch.utils.data.DataLoader(Dataset(x_val, labels_val),
                                                      batch_size = batch_size, shuffle = False)

        model = Cue_scope_predictor(bert,hidden_size_rnn, num_layers_rnn, output_dim, dropout_prob)
        model.to(device)
        optim = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

        hyper_dict = {'INITIAL_WEIGHTS' : "Pytorch default",
                      'HIDDEN_SIZE_RNN': hidden_size_rnn,
                      'NUM_LAYERS_RNN': num_layers_rnn,
                      'DROPOUT_PROB': dropout_prob,
                      'LEARNING_RATE': learning_rate,
                      'WEIGHT_DECAY': weight_decay,
                      'LOSS_FUNCTION': str(loss_function),
                      # effective batch size: samples contributing to one optimizer step
                      'BATCH_SIZE': batch_size*gradient_accumulation_steps,
                      'GRADIENT_ACCUMULATION_STEPS': gradient_accumulation_steps,
                      'NUM_EPOCHS': num_epochs,
                      'PATIENCE': patience,
                      'USE_F1_AS_STOPPING_METRIC': use_f1_as_stopping_metric,
                      'OPTIMISER': str(optim).replace('\n',''),
                      'BERTS_NAME' : bert_name}

        train_and_evaluate(model, num_epochs, loss_function, optim, gradient_accumulation_steps, goal_class, goal_phen, models_folder_path, logs_folder_path, patience, use_f1_as_stopping_metric, hyper_dict, data_loader_train, data_loader_val, device)
        print('-'*100)
        print()
        print(f"Just finished configuration number {num_config}. {len(configs)-1-num_config} still to go.")
        print()
        print('-'*100)

# Classes

### Model

In [None]:
class Cue_scope_predictor(torch.nn.Module):
    """Token classifier: pretrained encoder, optional bidirectional LSTM, linear output layer.

    Args:
        bert: Pretrained encoder whose output exposes ``last_hidden_state`` and
            whose ``config`` reports ``hidden_size``.
        hidden_size_rnn: Hidden size of each LSTM direction.
        num_layers_rnn: Number of LSTM layers; 0 disables the LSTM entirely.
        output_dim: Number of classes predicted per token.
        dropout_prob: Dropout probability applied right before the output
            layer; 0 disables dropout.
    """
    def __init__(self, bert, hidden_size_rnn, num_layers_rnn, output_dim, dropout_prob):
        super().__init__()
        self.embedder = bert
        self.num_layers_rnn = num_layers_rnn
        embedding_dim = bert.config.to_dict()['hidden_size']
        if self.num_layers_rnn != 0:
            self.rnn = torch.nn.LSTM(input_size = embedding_dim, hidden_size = hidden_size_rnn, num_layers=self.num_layers_rnn, batch_first=True, bidirectional=True)
            self.final = torch.nn.Linear(2*hidden_size_rnn, output_dim)  # 2x: forward + backward states
        else:
            self.final = torch.nn.Linear(embedding_dim, output_dim)
        self.dropout_prob = dropout_prob
        if self.dropout_prob!= 0:
            self.dropout = torch.nn.Dropout(dropout_prob)

    def forward(self, tokens):
        """Score every token of a batch.

        Args:
            tokens: Mapping with ``'input_ids'``, ``'attention_mask'`` and
                ``'token_type_ids'`` tensors, each with the sentences of the
                batch along the first dimension.

        Returns:
            A 2-D tensor with one row of ``output_dim`` scores per token; the
            tokens of all sentences are laid out one sentence after the other.
        """
        # The whole batch goes through the encoder in one call instead of one
        # sentence at a time.
        features = self.embedder(input_ids=tokens['input_ids'],
                                 attention_mask=tokens['attention_mask'],
                                 token_type_ids=tokens['token_type_ids']).last_hidden_state
        if self.num_layers_rnn != 0:
            features = self.rnn(features)[0]
        if self.dropout_prob != 0:
            features = self.dropout(features)
        preds = self.final(features)
        # TODO: decide what to do with the embeddings of tokens that are split into
        # sub-words: keep only the first sub-word's, combine them by averaging, sum them...?
        return preds.reshape(-1, preds.shape[-1])

### Pytorch Dataset (for batching)

In [None]:
# based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
class Dataset(torch.utils.data.Dataset):
    """Pairs tokenizer tensors with per-sentence label lists for batching.

    ``tokens`` maps field names to tensors indexed by sentence and must contain
    an ``'offset_mapping'`` entry; ``cues`` holds one label list per sentence.
    """

    def __init__(self, tokens, cues):
        self.cues = cues
        self.tokens = tokens

    def __len__(self):
        return len(self.cues)

    def __getitem__(self, id_):
        item = {}
        for key, val in self.tokens.items():
            if key == 'offset_mapping':
                continue  # added last and left on its original device
            item[key] = val[id_].to(device)
        item['labels'] = torch.tensor(self.cues[id_]).to(device)
        item['offset_mapping'] = self.tokens['offset_mapping'][id_]
        return item

# Path assignment

In [None]:
# Keep asking until the user supplies a path that exists on this machine.
input_ok = False
while not input_ok:
    dir_path = os.path.realpath(input("Insert your path-string (final slash not important)"))
    if os.path.exists(dir_path):
        input_ok = True
    else:
        print(f'dir_path {dir_path} does not exist on this machine. Insert correct path.')

# The data folder must already exist; the output folders are created on demand.
data_folder_path = os.path.join(dir_path, 'Data')
assert os.path.exists(data_folder_path), f'All data should be put in a namesake folder ("Data") located at {data_folder_path}'

models_folder_path = os.path.join(dir_path, 'Models')
if not os.path.exists(models_folder_path):
    print("Folder 'Models' not found in expected path. Proceeding to create it. All trained models will be saved here.")
    os.makedirs(models_folder_path)

logs_folder_path = os.path.join(dir_path, 'Logs')
if not os.path.exists(logs_folder_path):
    print("Folder 'Logs' not found in expected path. Proceeding to create it. All training logs will be saved here.")
    os.makedirs(logs_folder_path)

Insert your path-string (final slash not important)/content/drive/My Drive/Tirocinio NLP e tesi


# Testing

In [None]:
# Load the tokenizer and the pretrained encoder that the checkpoint under test was trained with.
bert_name = 'dbmdz/bert-base-italian-xxl-cased' # insert BERT model used
tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert = AutoModel.from_pretrained(bert_name)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Architecture arguments: hidden_size_rnn=200, num_layers_rnn=1, output_dim=2, dropout_prob=0.
# They must match the configuration the checkpoint was trained with.
model = Cue_scope_predictor(bert, 200, 1, 2, 0)
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime as well.
model.load_state_dict(torch.load(os.path.join(models_folder_path,'best_model_cue_neg_4.pt'), map_location=device)) # insert model name
model.to(device)
goal_class = 'cue'# insert goal_class
goal_phen = 'neg'# insert goal_phen

In [None]:
# Rebuild the dev/test partition with the same loader, tokenization and split arguments
# (test_size=0.2, random_state=1) that grid_search uses, so that x_test is the held-out
# portion - provided the data folder and the tokenizer are unchanged since training.
sentences, labels = load_data(goal_class, goal_phen, data_folder_path)
if goal_class == 'cue':
    labels = label_mapping(labels)
tokens_dict, labels = bert_tokenization(tokenizer, sentences, labels)
tokens_dict = dict(tokens_dict)
stratify = True if goal_class == 'cue' else None
x_dev = {} 
x_test = {}
x_dev['input_ids'], x_test['input_ids'], x_dev['token_type_ids'], x_test['token_type_ids'], x_dev['attention_mask'], x_test['attention_mask'], x_dev['offset_mapping'], x_test['offset_mapping'], labels_dev, labels_test = split(tokens_dict, labels, test_size=0.2, random_state=1, stratify = stratify)

In [None]:
# Batches of 10 test sentences, in their original order (no shuffling).
data_loader_test = torch.utils.data.DataLoader(
    Dataset(x_test, labels_test),
    batch_size=10,
    shuffle=False,
)

In [None]:
def evaluate(model, data_loader_test):
    """Print a word-level classification report for ``model`` on the test batches.

    Args:
        model: Module called as ``model(batch)`` and returning one row of class
            scores per token of the batch.
        data_loader_test: Iterable of batches; each batch is a mapping with
            ``'labels'`` and ``'offset_mapping'`` tensors.
    """

    def predict_for_original_tokens(batch, predicted_labels, y_true, y_pred):
        """Collapse token-level scores to one prediction per original word.

        The class probabilities of all word pieces of a word are averaged (for
        any number of classes, including the last word of the batch) and the
        arg-max is appended to ``y_pred``; the label of the first word piece is
        appended to ``y_true``.
        """
        probs = torch.nn.Softmax(dim=1)(predicted_labels).detach().cpu().tolist()
        batch_labels = torch.reshape(batch['labels'],(-1,)).tolist()
        offsets = batch['offset_mapping'].flatten(end_dim=1).tolist()
        word_prob_sums = []     # per word: running sum of class probabilities
        word_piece_counts = []  # per word: number of word pieces summed so far
        for position, (start, end) in enumerate(offsets):
            if start == 0 and end != 0:
                # first word piece of a new word
                word_prob_sums.append(list(probs[position]))
                word_piece_counts.append(1)
                y_true.append(batch_labels[position])
            elif start != 0:
                # continuation word piece: accumulate into the current word
                running = word_prob_sums[-1]
                for class_index, prob in enumerate(probs[position]):
                    running[class_index] += prob
                word_piece_counts[-1] += 1
        if word_prob_sums:
            word_probs = [[total / count for total in sums] for sums, count in zip(word_prob_sums, word_piece_counts)]
            y_pred.extend(torch.argmax(torch.tensor(word_probs), dim=1).tolist())
        return y_true, y_pred

    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in data_loader_test:
            predicted_labels = model(batch)
            y_true, y_pred = predict_for_original_tokens(batch, predicted_labels, y_true, y_pred)
    print(classification_report(y_true, y_pred))

In [None]:
evaluate(model, data_loader_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10082
           1       0.95      0.99      0.97        92

    accuracy                           1.00     10174
   macro avg       0.97      0.99      0.98     10174
weighted avg       1.00      1.00      1.00     10174



------------------------

In [None]:
# less memory efficient ('cos all lines of all files stored in memory at once), more readable
class TSVDataset:
    """pandas-based loader for the annotated ``.tsv`` files of a folder.

    All files are concatenated (in sorted path order) into one frame. Rows
    whose ``ID`` starts with ``#``, rows whose ``ID`` contains ``-`` and rows
    whose ``CUE_NEG`` equals ``'DESCR.'`` are removed to obtain the token frame.

    NOTE(review): the ``'DESCR.'`` filter looks only at ``CUE_NEG``, whatever
    phenomenon is requested later - confirm this is intended.
    NOTE(review): ``pd.read_csv`` is called with its default NA handling, so a
    token spelled like a missing-value marker may be read as NaN - confirm the
    corpus contains no such tokens.
    """

    def __init__(self, data_folder_path):
        """Read every ``*.tsv`` file found in ``data_folder_path``.

        Raises:
            FileNotFoundError: If the folder contains no ``.tsv`` file.
        """
        self.data_folder_path = data_folder_path
        self.filepaths = sorted([file.path for file in os.scandir(self.data_folder_path) if file.name.endswith('tsv')])
        print(f'Using dataset from {self.data_folder_path}')
        print(f'Files loaded: {[os.path.basename(file) for file in self.filepaths]}')
        if not self.filepaths:
            raise FileNotFoundError(f'No .tsv file found in {self.data_folder_path}')

        # quoting=3 (csv.QUOTE_NONE): quote characters are ordinary token text.
        # One concat replaces the row-by-row DataFrame.append (removed in pandas 2).
        frames = [pd.read_csv(filepath, sep='\t', quoting = 3) for filepath in self.filepaths]
        df = pd.concat(frames, ignore_index=True)
        self.original_df = df.copy()

        ids = df['ID'].astype(str)
        is_comment = ids.str.startswith('#')
        is_multitoken = ids.str.contains('-', regex=False)
        is_descr = df['CUE_NEG'] == 'DESCR.'
        self.tokens_df = df[~(is_comment | is_multitoken | is_descr)].reset_index(drop=True)

    def get_original_df(self):
        """Return the unfiltered concatenation of all files."""
        return self.original_df

    def get_tokens_df(self):
        """Return the filtered frame holding one row per token."""
        return self.tokens_df

    def preprocess_data(self, goal_class, goal_phen):
        """Build sentences and labels for cue or scope detection.

        A sentence starts at every row whose ``ID`` is ``'1'`` and runs up to
        the next such row; the last sentence runs to the end of the frame.

        Args:
            goal_class: ``'cue'`` for cue detection; any other value selects
                scope resolution.
            goal_phen: ``'neg'`` or ``'spec'`` (case-insensitive); selects the
                ``CUE_*`` / ``SCOPE_*`` columns.

        Returns:
            ``(sentences, labels)``. For ``'cue'``: token forms and raw cue tags
            per sentence. Otherwise: one copy of the sentence per begin-cue,
            with the cue tokens prefixed by a marker string, and binary labels
            (1 = the token's scope column lists the id of that cue).
        """
        df = self.tokens_df
        sentences = []
        labels = []
        goal_phen = goal_phen.upper()
        cue_column = f'CUE_{goal_phen}'

        # Sentence boundaries as positions in the token frame (its index is 0..n-1).
        sentence_starts = [position for position, token_id in enumerate(df['ID'].astype(str)) if token_id == '1' and position != 0]
        boundaries = [0] + sentence_starts + [len(df)]
        spans = [(begin, end) for begin, end in zip(boundaries[:-1], boundaries[1:]) if begin != end]

        if goal_class == 'cue':
            for begin_sentence, end_sentence in spans:
                sentences.append(df['FORM'].iloc[begin_sentence:end_sentence].tolist())
                labels.append(df[cue_column].iloc[begin_sentence:end_sentence].tolist())

        else:
            scope_column = f'SCOPE_{goal_phen}'
            begin_cue = 'BS' if goal_phen == 'SPEC' else 'BN'
            inside_cue = 'IS' if goal_phen == 'SPEC' else 'IN'
            begin_augment = 'token[B-SPEC]' if goal_phen =='SPEC' else 'token[B-NEG]'
            inside_augment = 'token[I-SPEC]' if goal_phen =='SPEC' else 'token[I-NEG]'

            for begin_sentence, end_sentence in spans:
                forms = df['FORM'].iloc[begin_sentence:end_sentence].tolist()
                cues = df[cue_column].iloc[begin_sentence:end_sentence].tolist()
                token_ids = df['ID'].iloc[begin_sentence:end_sentence].astype(str).tolist()
                # Scope cells hold comma-separated cue ids (or a placeholder).
                token_scopes = [[scope.strip() for scope in str(cell).split(',')]
                                for cell in df[scope_column].iloc[begin_sentence:end_sentence]]
                cue_positions = [position for position, cue in enumerate(cues) if cue == begin_cue]
                for counter, cue_position in enumerate(cue_positions):
                    sentence_tokens = list(forms)
                    sentence_tokens[cue_position] = begin_augment + sentence_tokens[cue_position]
                    # Inside-cue tokens belong to this cue up to the next begin-cue (or sentence end).
                    next_cue_position = cue_positions[counter+1] if counter < len(cue_positions)-1 else len(forms)
                    for position in range(cue_position + 1, next_cue_position):
                        if cues[position] == inside_cue:
                            sentence_tokens[position] = inside_augment + sentence_tokens[position]
                    cue_id = token_ids[cue_position]
                    sentences.append(sentence_tokens)
                    labels.append([1 if cue_id in scopes else 0 for scopes in token_scopes])
        return sentences, labels

In [None]:
# Build the pandas-based dataset from the 'Data' folder under dir_path.
ds = TSVDataset(f'{dir_path}/Data')