In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import torch.nn as nn
from torch import optim
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup, BertModel
import torch
import copy
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import time, sys
import numpy as np

In [None]:
# Colab-only: expose the user's Google Drive under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
class DataPrep(Dataset):
    """Dataset of (token ids, target-span indices, label id, category id) examples.

    The input file is read as tab-separated text without a header row, with the
    columns ``opinion``, ``categories``, ``word``, ``position`` and ``sentence``.
    ``position`` is a ``"start:end"`` pair of character offsets into ``sentence``
    delimiting the target term.

    Args:
        filedata: Path (or anything ``pd.read_csv`` accepts) of the TSV file.
        tokenizer: Callable tokenizer that also exposes ``encode_plus``; both
            must return a mapping with an ``'input_ids'`` entry.
        label_to_id: Optional existing ``opinion -> id`` mapping. When omitted,
            one is built from the opinions found in the file, in order of
            first appearance.
        category_to_id: Optional existing ``category -> id`` mapping, built the
            same way from the file when omitted.
    """

    def __init__(self, filedata, tokenizer, label_to_id=None, category_to_id=None):
        super().__init__()
        self.tokenizer = tokenizer
        self.filedata = filedata
        self.data = pd.read_csv(
            filedata,
            sep='\t',
            names=['opinion', 'categories', 'word', 'position', 'sentence'],
        )

        # Reuse the caller's vocabularies when given (e.g. at prediction time),
        # otherwise derive them from this file.
        if label_to_id is None:
            label_to_id = {
                label: idx
                for idx, label in enumerate(self.data.opinion.unique().tolist())
            }
        self.label_to_id = label_to_id

        if category_to_id is None:
            category_to_id = {
                category: idx
                for idx, category in enumerate(self.data.categories.unique().tolist())
            }
        self.category_to_id = category_to_id

        # Reverse lookups, used to turn predicted ids back into names.
        self.id_to_label = {idx: label for label, idx in self.label_to_id.items()}
        self.id_to_category = {idx: category for category, idx in self.category_to_id.items()}

        # Encoding the empty string padded to length 3 is expected to yield the
        # [begin, end, pad] special-token ids -- assumption about the tokenizer,
        # confirm if a non-BERT tokenizer is plugged in.
        special_ids = self.tokenizer(
            "",
            return_token_type_ids=False,
            return_attention_mask=False,
            padding="max_length",
            max_length=3,
        )['input_ids']
        self.token_id_begin, self.token_id_end, self.token_id_pad = special_ids

        token_rows, span_rows, label_rows, category_rows = [], [], [], []
        rows = zip(
            self.data.sentence.tolist(),
            self.data.position.tolist(),
            self.data.opinion.tolist(),
            self.data.categories.tolist(),
        )
        for sentence, char_span, opinion, category in rows:
            bounds = char_span.split(":")
            char_start = int(bounds[0])
            char_end = int(bounds[1])
            tokens, tok_start, tok_end = self.convert_sentence_to_tokens(sentence, char_start, char_end)
            token_rows.append(tokens)
            span_rows.append([tok_start, tok_end])
            label_rows.append(self.label_to_id[opinion])
            category_rows.append(self.category_to_id[category])

        self.sentences_tokenized = token_rows
        self.ind_words_to_guess = span_rows
        self.labels = label_rows
        self.category_list = category_rows

    def __getitem__(self, index):
        """Return ``(token_ids, [span_start, span_end], label_id, category_id)``."""
        return (
            self.sentences_tokenized[index],
            self.ind_words_to_guess[index],
            self.labels[index],
            self.category_list[index],
        )

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

    def convert_sentence_to_tokens(self, sentence, ind0, ind1):
        """Tokenize ``sentence`` while tracking where the target term lands.

        The text before, inside and after the character span ``[ind0, ind1)`` is
        encoded separately so that the token positions of the target are known.

        Returns:
            ``(tokens_list, ind_start, ind_end)`` where ``tokens_list`` starts
            with the begin id and ends with the end id, and
            ``tokens_list[ind_start:ind_end]`` are the target term's tokens.
        """
        def _encode_piece(text):
            # Drop the first and last ids that encode_plus adds around each piece.
            encoded = self.tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                return_token_type_ids=False,
                return_attention_mask=False,
            )
            return encoded['input_ids'][1:-1]

        tokens_list = [self.token_id_begin]
        if ind0 > 0:
            tokens_list.extend(_encode_piece(sentence[:ind0]))

        ind_start = len(tokens_list)
        tokens_list.extend(_encode_piece(sentence[ind0:ind1]))
        ind_end = len(tokens_list)

        if ind1 < len(sentence):
            tokens_list.extend(_encode_piece(sentence[ind1:]))

        tokens_list.append(self.token_id_end)
        return tokens_list, ind_start, ind_end

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder with one small classification head per category.

    For every example the mean hidden state of the target-term tokens is
    concatenated with the hidden state at position 0, projected through a
    shared linear layer + ReLU, and then scored by the head that belongs to
    the example's category.

    Args:
        pretrained_name: Checkpoint passed to ``BertModel.from_pretrained``.
        n_categories: Number of category-specific heads (``linear_0`` ...).
        n_labels: Number of output classes of every head.

    The defaults reproduce the original hard-coded configuration
    (``bert-base-uncased``, 12 heads, 3 classes), and the sub-module names are
    unchanged so existing ``state_dict`` checkpoints still load.
    """

    def __init__(self, pretrained_name='bert-base-uncased', n_categories=12, n_labels=3):
        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)
        hidden_size = self.bert.config.hidden_size  # 768 for bert-base-uncased

        # Shared projection of [target-span mean ; position-0 state].
        self.common_linear = nn.Linear(2 * hidden_size, hidden_size)

        # One head per category, registered as attributes linear_0, linear_1, ...
        for i in range(n_categories):
            setattr(self, 'linear_%i' % i, nn.Linear(hidden_size, n_labels))

    def forward(self, data):
        """Score a batch.

        Args:
            data: Mapping with
                ``"data"``       -- padded token ids (presumably ``(batch, seq)``),
                ``"words_id"``   -- per-example ``[start, end)`` token span of the target,
                ``"categories"`` -- per-example category id selecting the head.

        Returns:
            Tensor of stacked per-example logits, one row per example.
        """
        input_ids = data["data"]

        # Without a mask BERT attends to padding tokens, which makes the
        # representation of a sentence depend on how much it was padded in its
        # batch. Mask out positions holding the model's pad id.
        pad_id = self.bert.config.pad_token_id
        attention_mask = None if pad_id is None else (input_ids != pad_id).long()

        features = self.bert(input_ids, attention_mask=attention_mask)[0]
        last_hidden_state_cls = features[:, 0, :]

        output = []
        for k, feat in enumerate(features):
            span_start = data["words_id"][k][0]
            span_end = data["words_id"][k][1]
            # NOTE(review): an empty span (start == end) makes this mean NaN.
            mean_feat = torch.mean(feat[span_start:span_end, :], dim=0)
            mean_feat = torch.cat((mean_feat, last_hidden_state_cls[k, :]), 0)
            mean_feat = torch.relu(self.common_linear(mean_feat))
            head = getattr(self, 'linear_' + str(int(data["categories"][k])))
            output.append(head(mean_feat))

        return torch.stack(output)

In [None]:
def custom_collate_fn(batch, pad_id):
    """Collate dataset items into four tensors, right-padding the token lists.

    Args:
        batch: Sequence of items shaped like
            ``(token_ids, [span_start, span_end], label_id, category_id)``.
        pad_id: Token id appended to shorter sequences so that every row has
            the length of the longest one in the batch.

    Returns:
        ``(sentences, ind_words_to_guess, labels, category_list)`` as tensors.
    """
    longest = max(len(sample[0]) for sample in batch)
    padded_sentences = [
        sample[0] + [pad_id] * (longest - len(sample[0])) for sample in batch
    ]
    spans = [sample[1] for sample in batch]
    targets = [sample[2] for sample in batch]
    category_ids = [sample[3] for sample in batch]
    return (
        torch.tensor(padded_sentences),
        torch.tensor(spans),
        torch.tensor(targets),
        torch.tensor(category_ids),
    )

def train_epch(dataloader, criterion, optimizer, model, device, scheduler, epch):
    """Run one optimisation pass over ``dataloader`` and print the summed loss.

    Args:
        dataloader: Iterable of ``(data, words_id, labels, categories)`` tensor batches.
        criterion: Loss callable applied to ``(model outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        model: Module called with a dict holding ``"data"``, ``"words_id"``
            and ``"categories"``.
        device: Device every batch tensor is moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        epch: Epoch number, used only in the progress message.
    """
    model.train()
    running_loss = 0.
    for batch in dataloader:
        data, words_id, labels, categories = batch
        data = data.to(device)
        words_id = words_id.to(device)
        labels = labels.to(device)
        categories = categories.to(device)

        model_inputs = {"data": data, "words_id": words_id, "categories": categories}
        batch_loss = criterion(model(model_inputs), labels)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    # The reported value is the sum of the per-batch losses, not their mean.
    print("Training done epoch {}!     Loss : {}".format(epch, round(running_loss,3)))

def eval_epch(dataloader,criterion, model, device):
    """Evaluate ``model`` on ``dataloader`` and print the accuracy.

    Works for any batch size: the previous implementation called
    ``int(predicted)``, which only succeeds for single-example batches.

    Args:
        dataloader: Iterable of ``(data, words_id, labels, categories)`` tensor batches.
        criterion: Loss callable applied to ``(model outputs, labels)``.
        model: Module called with a dict holding ``"data"``, ``"words_id"``
            and ``"categories"``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(loss, predicted_list)`` where ``loss`` is a 0-dim tensor holding the
        batch-size-weighted average of the per-batch losses (this is the mean
        per-example loss when ``criterion`` averages over the batch) and
        ``predicted_list`` is a flat list of predicted class ids, in dataloader
        order. For an empty dataloader the loss is NaN, the list is empty and
        the reported accuracy is 0.0.
    """
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0.0
    predicted_list = []
    with torch.no_grad():
        for data, words_id, labels, categories in dataloader:
            data = data.to(device)
            words_id = words_id.to(device)
            categories = categories.to(device)
            labels = labels.to(device)

            outputs = model({"data":data, "words_id":words_id, "categories":categories})

            _, predicted = torch.max(outputs, 1)
            # extend + tolist handles batches of any size.
            predicted_list.extend(predicted.tolist())

            batch_size = labels.size(0)
            total += batch_size
            correct += (predicted == labels).sum().item()
            # Accumulate over all batches instead of keeping only the last one.
            loss_sum += criterion(outputs, labels).item() * batch_size

    accuracy = correct / total * 100 if total else 0.0
    print("Evaluation done! Accuracy : {}%".format(round(accuracy,1)))

    loss = torch.tensor(loss_sum / total if total else float("nan"))
    return  loss, predicted_list




In [None]:
class Classifier:
    """Train/predict wrapper around ``BertClassifier``.

    Attributes:
        tokenizer: ``bert-base-uncased`` tokenizer shared by training and prediction.
        model: The ``BertClassifier``; ``None`` until ``train`` has been called.
        loss_fn: Cross-entropy loss used for training and evaluation.
        label_to_id / category_to_id: Vocabularies learned from the training
            file and reused when encoding prediction data.
    """

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = None
        self.loss_fn = nn.CrossEntropyLoss()

        # Filled in by train().
        self.label_to_id = None
        self.category_to_id = None

    def initialize_model(self,train_dataloader, epochs, device, catego = True, freeze_bert=False):
        """Create the model, its optimizer and the learning-rate schedule.

        Args:
            train_dataloader: Training loader; its length sets the number of
                scheduler steps per epoch.
            epochs: Number of epochs the schedule should span.
            device: Device the model is moved to.
            catego: Unused; kept for backward compatibility.
            freeze_bert: When true, the BERT encoder's weights are frozen and
                only the layers on top of it are trained.
        """
        self.model = BertClassifier().to(device)

        if freeze_bert:
            # Frozen parameters receive no gradient, so the optimizer skips them.
            for param in self.model.bert.parameters():
                param.requires_grad = False

        self.optimizer = AdamW(self.model.parameters(),
                        lr=8e-5,
                        eps=1e-8
                        )

        # The scheduler is stepped once per batch, hence batches * epochs.
        total_steps = len(train_dataloader) * epochs

        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                    num_warmup_steps=4,
                                                    num_training_steps=total_steps)

    #############################################
    def train(self, train_filename, dev_filename, device):
        """Train the classifier on the examples stored in ``train_filename``.

        ``dev_filename`` is accepted for interface compatibility but is not
        used: no validation or model selection happens during training.
        """
        batch_size = 32
        epochs = 3
        train_dataset = DataPrep(train_filename, self.tokenizer)
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=lambda b: custom_collate_fn(b, train_dataset.token_id_pad),
        )

        # Remember the vocabularies so prediction data is encoded identically.
        self.label_to_id = train_dataset.label_to_id
        self.category_to_id = train_dataset.category_to_id
        self.initialize_model(train_dataloader, epochs, device)

        for epch in range(epochs):
            train_epch(train_dataloader, self.loss_fn, self.optimizer, self.model, device,self.scheduler, epch)

    def predict(self, data_filename, device):
        """Predict class labels for the examples in ``data_filename``.

        Returns:
            List of predicted label names, one per example, in file order.

        Raises:
            RuntimeError: If called before ``train``.
        """
        if self.model is None:
            raise RuntimeError("Classifier.predict() called before train(): no trained model available.")

        val_dataset = DataPrep(data_filename, self.tokenizer, self.label_to_id, self.category_to_id)
        # One example per batch, so no padding is involved at prediction time.
        val_dataloader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=1,
            shuffle=False,
            collate_fn=lambda b: custom_collate_fn(b, val_dataset.token_id_pad),
        )
        _, predicted_list = eval_epch(val_dataloader,self.loss_fn, self.model, device)
        return [val_dataset.id_to_label[pred] for pred in predicted_list]

In [None]:
def set_reproducible():
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, Python's ``random`` module and PyTorch. PyTorch was previously
    left unseeded, so weight initialisation and DataLoader shuffling differed
    between runs.
    """
    import random as rn
    import os
    # NOTE: setting PYTHONHASHSEED here cannot change hashing in the already
    # running interpreter; it is only inherited by processes started later.
    os.environ['PYTHONHASHSEED'] = '0'
    # Start NumPy generated random numbers in a well-defined initial state.
    np.random.seed(17)
    # Start core Python generated random numbers in a well-defined state.
    rn.seed(12345)
    # Seed PyTorch's generators (model init, shuffling, dropout).
    torch.manual_seed(17)



def load_label_output(filename):
    """Read the first tab-separated field of every non-blank line of a file.

    Lines are stripped of surrounding whitespace before being split; lines
    that are empty after stripping are skipped.
    """
    first_fields = []
    with open(filename, 'r', encoding='UTF-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            first_fields.append(stripped.split("\t")[0])
    return first_fields



def eval_list(glabels, slabels):
    """Return the accuracy (in percent) of system labels against gold labels.

    Args:
        glabels: Gold labels.
        slabels: System (predicted) labels.

    Only the first ``min(len(glabels), len(slabels))`` pairs are compared; a
    warning is printed when the lengths differ. When there is nothing to
    compare the accuracy is reported as 0.0 instead of dividing by zero.
    """
    if (len(glabels) != len(slabels)):
        print("\nWARNING: label count in system output (%d) is different from gold label count (%d)\n" % (
        len(slabels), len(glabels)))
    n = min(len(slabels), len(glabels))
    if n == 0:
        return 0.0
    # zip stops at the shorter list, i.e. after exactly n pairs.
    incorrect_count = sum(1 for gold, system in zip(glabels, slabels) if system != gold)
    acc = (n - incorrect_count) / n
    return acc*100





In [None]:
def train_and_eval(classifier, trainfile, devfile, testfile, run_id, device):
    """Train ``classifier`` and report its accuracy on the dev (and test) set.

    Args:
        classifier: Object exposing ``train(trainfile, devfile, device)`` and
            ``predict(datafile, device)``.
        trainfile: Path of the training file.
        devfile: Path of the dev file; its first column holds the gold labels.
        testfile: Path of the test file, or ``None`` to skip test evaluation.
        run_id: Identifier used in the progress messages.
        device: Device forwarded to ``train`` and ``predict``.

    Returns:
        ``(devacc, testacc)`` in percent; ``testacc`` is ``-1`` when
        ``testfile`` is ``None``.
    """
    print(f"\nRUN: {run_id}")
    print("  %s.1. Training the classifier..." % str(run_id))
    classifier.train(trainfile, devfile, device)
    print()
    print("  %s.2. Eval on the dev set..." % str(run_id), end="")
    slabels = classifier.predict(devfile, device)
    glabels = load_label_output(devfile)
    devacc = eval_list(glabels, slabels)
    print(" Acc.: %.2f" % devacc)
    testacc = -1
    if testfile is not None:
        # Evaluation on the test data
        print("  %s.3. Eval on the test set..." % str(run_id), end="")
        # predict() requires the device; it was previously omitted here, which
        # raised a TypeError as soon as a test file was supplied.
        slabels = classifier.predict(testfile, device)
        glabels = load_label_output(testfile)
        testacc = eval_list(glabels, slabels)
        print(" Acc.: %.2f" % testacc)
    print()
    return (devacc, testacc)




In [None]:
# Run configuration (the notebook replacement for the script's command-line flags).
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
n_runs = 1
set_reproducible()
datadir = "/content/"
trainfile =  datadir + "traindata.csv"
devfile =  datadir + "devdata.csv"
# Set to a path (e.g. datadir + "testdata.csv") to also evaluate on the test set.
testfile = None

In [None]:
# Train and evaluate a fresh classifier `n_runs` times, then summarise.
start_time = time.perf_counter()
devaccs, testaccs = [], []
for i in range(1, n_runs + 1):
    classifier = Classifier()
    devacc, testacc = train_and_eval(
        classifier, trainfile, devfile, testfile, i, device
    )
    # Accuracies are stored rounded to two decimals.
    devaccs.append(np.round(devacc, 2))
    testaccs.append(np.round(testacc, 2))
print('\nCompleted %d runs.' % n_runs)
total_exec_time = time.perf_counter() - start_time

# Per-run accuracies, followed by mean (standard deviation) and timing.
print("Dev accs:", devaccs)
print("Test accs:", testaccs)
print()
print("Mean Dev Acc.: %.2f (%.2f)" % (np.mean(devaccs), np.std(devaccs)))
print("Mean Test Acc.: %.2f (%.2f)" % (np.mean(testaccs), np.std(testaccs)))
print("\nExec time: %.2f s. ( %d per run )" % (total_exec_time, total_exec_time / n_runs))


RUN: 1
  1.1. Training the classifier...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training done epoch 0!     Loss : 35.774
Training done epoch 1!     Loss : 21.922
Training done epoch 2!     Loss : 12.851

  1.2. Eval on the dev set...Evaluation done! Accuracy : 85.4%
[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,