In [56]:
# !pip install torch torchvision transformers



In [57]:
import numpy as np
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import torch
from transformers import BertTokenizer, get_linear_schedule_with_warmup
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from transformers import BertModel

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
# Read the three CSV splits into DataFrames.
# NOTE(review): these are absolute Colab paths; the files must already exist under
# /content/sample_data for this cell to run.
# NOTE(review): the name `train` is rebound to a function by the later `def train(...)`
# cell, so this DataFrame is only reachable under that name before that cell runs.
train = pd.read_csv('/content/sample_data/train_label.csv')
dev = pd.read_csv('/content/sample_data/dev_label.csv')
test = pd.read_csv('/content/sample_data/test.csv')

In [59]:
# Keep only the columns used downstream: 'text' and 'label' for train/dev,
# and 'text' alone for test (no 'label' column is selected for it).
train = train[['text', 'label']]
dev = dev[['text', 'label']]
test = test[['text']]

In [60]:
#Stopword set and tokenizer are built on first use and then shared by every call
_TWEET_RESOURCES = {}


def _tweet_resources():
    """Return the (stopword set, tokenizer) pair, building it on the first call only."""
    if not _TWEET_RESOURCES:
        #frozenset gives constant-time membership tests instead of scanning a list per token
        _TWEET_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
        _TWEET_RESOURCES['tokenizer'] = TweetTokenizer(preserve_case=False, strip_handles=True,
                                                       reduce_len=True)
    return _TWEET_RESOURCES['stopwords'], _TWEET_RESOURCES['tokenizer']


def process_tweet(tweet):
    """Clean a single tweet and return it as one space-joined string.

    Steps, in order: strip '$TICKER'-style tokens, a leading "RT", anything
    starting with "http", and '#' characters; tokenize with a TweetTokenizer
    configured with preserve_case=False, strip_handles=True, reduce_len=True;
    drop English stopwords and punctuation tokens; join what is left with
    single spaces.

    Inputs:
        -tweet : str, the raw tweet text
    Returns:
        str with the kept tokens separated by single spaces ('' if none are kept)
    """
    stopwords_english, tokenizer = _tweet_resources()

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'http\S+', '', tweet)
    # only removing the hash # sign from the word, the hashtag text is kept
    tweet = re.sub(r'#', '', tweet)

    tweet_tokens = tokenizer.tokenize(tweet)

    # `word not in string.punctuation` is a substring test against the punctuation
    # string, so it drops single punctuation marks but keeps tokens such as '...'
    tweets_clean = [
        word for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return ' '.join(tweets_clean)

In [61]:
# Run process_tweet over the 'text' column of each split (one cleaned string per row).
# NOTE(review): in the cells visible here these lists are never read again; the CSV
# files written in the next cell come from the DataFrames themselves. Confirm whether
# the cleaned text was meant to be used for training.
processed_train = list(map(process_tweet, train.text))
processed_dev = list(map(process_tweet, dev.text))
processed_test = list(map(process_tweet, test.text))

In [62]:
# Write each split to a CSV in the working directory (without the index column) so
# that SSTDataset can load it by filename.
# NOTE(review): what is written is the column-selected DataFrame as loaded; the
# processed_* lists produced by process_tweet are not written here. Confirm intent.
train.to_csv('train_text.csv', index = False)
dev.to_csv('dev_text.csv', index = False)
test.to_csv('test_text.csv', index = False)

In [63]:
class SSTDataset(Dataset):
    """Dataset that turns the rows of a CSV file into fixed-length BERT inputs.

    The CSV needs a 'text' column; a 'label' column is optional. Items are
    (token_ids, attention_mask, label) when 'label' exists and
    (token_ids, attention_mask) otherwise.
    """

    def __init__(self, filename, maxlen):
        """
        Inputs:
            -filename : path of the CSV file to load
            -maxlen : number of tokens every returned sequence is padded or truncated to
        """
        #Rows are looked up by index label in __getitem__
        self.df = pd.read_csv(filename)

        #Tokenizer matching the 'bert-base-uncased' checkpoint
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        """Number of rows in the underlying data frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded sentence (and its label, if any) stored at row `index`."""
        has_label = 'label' in self.df.columns

        row_text = self.df.loc[index, 'text']
        row_label = self.df.loc[index, 'label'] if has_label else None

        #Wrap the word pieces in the [CLS] ... [SEP] markers
        pieces = ['[CLS]', *self.tokenizer.tokenize(row_text), '[SEP]']

        missing = self.maxlen - len(pieces)
        if missing > 0:
            #Too short: fill up with [PAD] tokens
            pieces.extend(['[PAD]'] * missing)
        else:
            #Too long (or exactly full): cut and make sure the last token is [SEP]
            pieces = pieces[:self.maxlen - 1]
            pieces.append('[SEP]')

        #Vocabulary indices as a tensor
        ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(pieces))

        #1 for every position whose id is non-zero, 0 elsewhere (id 0 is treated as padding)
        mask = (ids != 0).long()

        if has_label:
            return ids, mask, row_label
        return ids, mask

In [64]:
#Creating instances of the training, development and test sets
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
#NOTE(review): SSTDataset pads every sequence to maxlen, so 512 makes each batch
#16 x 512 token ids; consider a smaller value if memory is tight
train_set = SSTDataset(filename = 'train_text.csv', maxlen = 512)
dev_set = SSTDataset(filename = 'dev_text.csv', maxlen = 512)
test_set = SSTDataset(filename = 'test_text.csv', maxlen = 512)

#Creating instances of the training, development and test dataloaders
#Only the training loader is shuffled, so each epoch visits the samples in a new order;
#dev/test keep file order so predictions line up with the rows of the CSV
train_loader = DataLoader(train_set, batch_size = 16, shuffle = True, num_workers = 2)
dev_loader = DataLoader(dev_set, batch_size = 16, num_workers = 2)
test_loader = DataLoader(test_set, batch_size = 16, num_workers = 2)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [65]:
class TextClassifier(nn.Module):
    """BERT encoder followed by one linear unit applied to the first ([CLS]) position."""

    def __init__(self):
        super().__init__()
        #Pretrained encoder loaded from the 'bert-base-uncased' checkpoint
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        #Classification head: 768-dimensional [CLS] embedding in,
        #a single logit out (binary classification)
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks, passed to the encoder as attention_mask
        Returns:
            Output of the classification layer for the first position of every sequence
        '''

        #Contextualized representations of every position
        encoded = self.bert_layer(seq, attention_mask = attn_masks, return_dict=True)

        #Representation of the first token ([CLS]) of each sequence
        first_token = encoded.last_hidden_state[:, 0]

        #One logit per sequence
        return self.cls_layer(first_token)

In [66]:
gpu = 0 #gpu ID

# Build the classifier and move its parameters to the CUDA device selected by `gpu`.
# NOTE(review): the saved output of this cell ends in a RuntimeError whose message is
# not shown; net.cuda(gpu) needs a usable CUDA device, so confirm a GPU runtime is
# attached before running the cells below (they all call .cuda(gpu) as well).
print("Creating the text classifier, initialised with pretrained BERT-BASE parameters...")
net = TextClassifier()
net.cuda(gpu)
print("Done creating the sentiment classifier.")

Creating the text classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: ignored

In [None]:
import torch.nn as nn
import torch.optim as optim

num_epoch = 7

#The scheduler is stepped once per batch inside train(), so the schedule has to
#span every batch of every epoch
total_steps = len(train_loader) * num_epoch

#Binary cross-entropy on raw logits (the model has no sigmoid of its own)
#NOTE(review): no class re-weighting is applied to the loss
criterion = nn.BCEWithLogitsLoss()

#AdamW applies weight_decay as decoupled weight decay; plain Adam would fold it into
#the gradient as an L2 penalty, which interacts with the adaptive step sizes
opti = optim.AdamW(net.parameters(), lr = 5e-5, eps = 1e-8, weight_decay = 1e-2)

#Linear decay of the learning rate over total_steps, with no warmup phase
scheduler = get_linear_schedule_with_warmup(opti, num_warmup_steps = 0, num_training_steps = total_steps)

In [None]:
import time

def train(net, criterion, opti, scheduler, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune `net` and checkpoint it whenever the development accuracy improves.

    Inputs:
        -net : model called as net(seq, attn_masks)
        -criterion : loss applied to (logits.squeeze(-1), labels.float())
        -opti : optimizer, stepped once per batch
        -scheduler : learning-rate scheduler, stepped once per batch
        -train_loader : iterable of (seq, attn_masks, labels) training batches
        -dev_loader : loader handed to evaluate() at the end of every epoch
        -max_eps : number of epochs to run
        -gpu : CUDA device id the batches are moved to

    Side effects: prints progress every 100 iterations and after every epoch,
    and saves net.state_dict() to 'model_v2.dat' each time the development
    accuracy beats the best value seen so far. Returns nothing.
    """
    top_dev_acc = 0
    tick = time.time()

    for epoch in range(max_eps):

        #evaluate() switches the network to eval mode, so train mode is set again every epoch
        net.train()

        for step, (seq, attn_masks, labels) in enumerate(train_loader):
            #Start every batch from zeroed gradients
            opti.zero_grad()

            #Move the batch to the GPU
            seq = seq.cuda(gpu)
            attn_masks = attn_masks.cuda(gpu)
            labels = labels.cuda(gpu)

            #Forward pass and loss
            logits = net(seq, attn_masks)
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backward pass, then parameter and learning-rate updates
            loss.backward()
            opti.step()
            scheduler.step()

            #Progress is only reported on every 100th iteration
            if step % 100 != 0:
                continue

            acc = get_accuracy_from_logits(logits, labels)
            print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(step, epoch, loss.item(), acc, (time.time()-tick)))
            #Restart the timer so the next report covers only the following iterations
            tick = time.time()

        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(epoch, dev_acc, dev_loss))

        if dev_acc > top_dev_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(top_dev_acc, dev_acc))
            top_dev_acc = dev_acc
            torch.save(net.state_dict(), 'model_v2.dat')

In [None]:
def get_accuracy_from_logits(logits, labels):
    """Fraction of samples whose thresholded sigmoid output equals the label.

    Inputs:
        -logits : Tensor of raw model outputs, one value per sample (e.g. [B, 1] or [B])
        -labels : Tensor of 0/1 labels with one value per sample (e.g. [B] or [B, 1])
    Returns:
        0-dim float tensor holding the accuracy over the batch
    """
    probs = torch.sigmoid(logits)
    #Flatten predictions and labels to 1-D so they are compared element by element;
    #comparing a [B] tensor with a [B, 1] tensor would silently broadcast to [B, B]
    preds = (probs > 0.5).long().reshape(-1)
    acc = (preds == labels.reshape(-1)).float().mean()
    return acc

def evaluate(net, criterion, dataloader, gpu):
    """Compute the accuracy and loss of `net` over every sample of `dataloader`.

    Both values are averaged per sample: each batch contributes in proportion to
    its size, so a smaller final batch does not skew the result.

    Inputs:
        -net : model called as net(seq, attn_masks); it is switched to eval mode
        -criterion : loss applied to (logits.squeeze(-1), labels.float());
                     assumed to return the mean loss of the batch
        -dataloader : iterable of (seq, attn_masks, labels) batches
        -gpu : CUDA device id the batches are moved to
    Returns:
        (accuracy, loss) : accuracy as a 0-dim tensor, loss as a Python float
    Raises:
        ZeroDivisionError if the dataloader yields no samples
    """
    net.eval()

    acc_sum, loss_sum = 0, 0
    n_samples = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)
            batch_size = labels.size(0)
            #Undo the per-batch mean so that every sample carries the same weight
            loss_sum += criterion(logits.squeeze(-1), labels.float()).item() * batch_size
            acc_sum += get_accuracy_from_logits(logits, labels) * batch_size
            n_samples += batch_size

    return acc_sum / n_samples, loss_sum / n_samples

In [None]:
#fine-tune the model
# Runs num_epoch epochs; train() saves the weights to 'model_v2.dat' whenever the
# development accuracy improves.
# NOTE(review): the commented-out get_results call at the end of the notebook loads
# '/content/model.dat', which is not the file written here. Confirm the path.
train(net, criterion, opti, scheduler, train_loader, dev_loader, num_epoch, gpu)

In [None]:
def predict(net, dataloader, gpu):
    """Return the hard 0/1 prediction of `net` for every sample of `dataloader`.

    Inputs:
        -net : model called as net(seq, attn_masks); it is switched to eval mode
        -dataloader : iterable of (seq, attn_masks) batches (no labels)
        -gpu : CUDA device id the batches are moved to
    Returns:
        1-D numpy integer array with one prediction per sample, in loader order;
        an empty array when the dataloader yields no batches
    """
    net.eval()

    batch_preds = []
    with torch.no_grad():
        for seq, attn_masks in dataloader:
            seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
            logits = net(seq, attn_masks)
            probs = torch.sigmoid(logits)
            #reshape(-1) keeps one entry per sample even for a batch of size 1,
            #where a bare squeeze() would collapse the result to a 0-dim array
            hard_preds = (probs > 0.5).long().reshape(-1)
            batch_preds.append(hard_preds.cpu().detach().numpy())

    #np.concatenate refuses an empty list, so the no-data case is handled explicitly
    if not batch_preds:
        return np.empty(0, dtype=np.int64)

    return np.concatenate(batch_preds)

In [None]:
def get_results(net, test_loader, gpu, model_path, save_path):
    """Load saved weights into `net`, predict on `test_loader` and write the result as CSV.

    Inputs:
        -net : model whose state dict is replaced by the one stored at model_path
        -test_loader : loader handed to predict()
        -gpu : CUDA device id handed to predict()
        -model_path : file previously written with torch.save(net.state_dict(), ...)
        -save_path : destination of the CSV file

    The CSV has an index column named 'Id' and a 'Predicted' column. Returns nothing.
    """
    #Restore the checkpointed parameters
    state = torch.load(model_path)
    net.load_state_dict(state)

    #One prediction per test sample; the row position becomes the 'Id' column
    table = pd.DataFrame({'Predicted': predict(net, test_loader, gpu)})
    table.index.name = 'Id'
    table.to_csv(save_path)


In [None]:
# get_results(net, test_loader, gpu, '/content/model.dat', 'bertbase_test.csv')