In [None]:
# Print the current working directory before switching into the project folder.
!pwd
# Relative paths used later (./data/train.csv, ./data/vocab.txt, saved_model.bin)
# are resolved from this directory — presumably a mounted Google Drive; confirm.
%cd drive/MyDrive/tweet-sentiment-extraction/
# Installs transformers without a version pin, so the installed release may
# differ between runs.
!pip install transformers

In [None]:
import time

import numpy as np
import pandas as pd
import tokenizers
import torch
import transformers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Read the training CSV, discard rows containing missing values, renumber the
# remaining rows from 0, and finally remove the 'textID' column.
df = (
    pd.read_csv('./data/train.csv')
    .dropna()
    .reset_index(drop=True)
    .drop(columns='textID')
)

In [None]:
# Row labels to remove from df: per the author's note, keeping these rows would
# increase the MAX_LEN needed for the tokenized sentences.
a = [3292, 4105, 6256, 6783, 8612, 15546, 18345, 24991]

In [None]:
# Remove the listed rows one at a time. The index is renumbered after every
# removal, so each label is looked up in the frame as it stands after the
# preceding drops (not in the originally loaded frame).
# NOTE: re-running this cell removes further rows; it is not idempotent.
for row_label in a:
    df = df.drop(index=row_label).reset_index(drop=True)

In [None]:
import tokenizers
class TweetExtractionDataset(torch.utils.data.Dataset):
    """Question-answering style dataset for tweet sentiment span extraction.

    Every sample is encoded as ``<question ids> + <tweet ids without the leading
    special token>`` where the question is the sentiment label, then
    zero-padded up to ``max_len``. The targets are the positions, inside that
    combined sequence, of the first and last tweet token overlapping the
    selected text.

    Args:
        texts: Collection of tweets, looked up with the integer sample index.
        selected_texts: Collection of selected substrings, aligned with ``texts``.
        sentiments: Collection of sentiment labels, aligned with ``texts``.
        max_len: Length each encoded sample is padded to (default 64).
        vocab_path: WordPiece vocabulary file handed to the tokenizer.
    """

    def __init__(self, texts, selected_texts, sentiments, max_len=64, vocab_path="./data/vocab.txt"):
        self.texts = texts
        self.selected_texts = selected_texts
        self.sentiments = sentiments
        self.max_len = max_len
        self.tokenizer = tokenizers.BertWordPieceTokenizer(vocab_path, lowercase=True)

    def __getitem__(self, item):
        """Encode sample ``item`` and locate its answer span in token space.

        Returns:
            dict with long tensors ``ids``, ``token_type_ids``, ``masks``,
            ``start_idx`` and ``end_idx``, plus the whitespace-normalized
            strings ``text``, ``sentiment`` and ``selected_text``.

        Raises:
            ValueError: If the selected text is empty, does not occur in the
                tweet, or overlaps none of the tweet's tokens.
        """
        # Collapse whitespace runs so character positions in `context` and
        # `answer` are directly comparable.
        question = " ".join(str(self.sentiments[item]).split())
        context = " ".join(str(self.texts[item]).split())
        answer = " ".join(str(self.selected_texts[item]).split())

        if not answer:
            raise ValueError(f"Sample {item}: selected text is empty")
        # First occurrence of the answer inside the tweet (character offsets).
        span_start = context.find(answer)
        if span_start < 0:
            raise ValueError(f"Sample {item}: selected text does not occur in the tweet")
        span_end = span_start + len(answer)

        # Per-character flag: 1 for each non-space character of the answer span.
        char_context = [0] * len(context)
        for pos in range(span_start, span_end):
            if context[pos] != " ":
                char_context[pos] = 1

        encoded = self.tokenizer.encode(context)

        # The first and last entries are skipped as the tokenizer's special
        # tokens; a remaining token is flagged when its character range
        # touches the answer span.
        ids_vec = [int(any(char_context[o1:o2])) for o1, o2 in encoded.offsets[1:-1]]
        if 1 not in ids_vec:
            raise ValueError(f"Sample {item}: selected text overlaps no token of the tweet")

        start_idx = ids_vec.index(1)
        end_idx = len(ids_vec) - 1 - ids_vec[::-1].index(1)

        # The tweet tokens are shifted by however many ids the question
        # actually produced, instead of assuming it is always exactly three.
        question_ids = self.tokenizer.encode(question).ids
        offset = len(question_ids)

        ids = question_ids + encoded.ids[1:]
        token_type_ids = [0] * offset + [1] * (len(encoded.ids) - 1)
        masks = [1] * len(ids)

        # Only padding is applied: a sample longer than max_len is returned at
        # its full length, untruncated.
        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids = ids + [0] * pad_len
            token_type_ids = token_type_ids + [0] * pad_len
            masks = masks + [0] * pad_len

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "masks": torch.tensor(masks, dtype=torch.long),
            "text": context,
            "sentiment": question,
            "selected_text": answer,
            "start_idx": torch.tensor(start_idx + offset, dtype=torch.long),
            "end_idx": torch.tensor(end_idx + offset, dtype=torch.long),
        }

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)


# Dataset built over the complete frame, before it is split into train/test.
dataset = TweetExtractionDataset(
    texts=df.text,
    selected_texts=df.selected_text,
    sentiments=df.sentiment,
)

In [None]:
# Stratified 85/15 split on the sentiment label with a fixed seed; each part is
# renumbered from 0 so rows can be fetched by integer sample index.
# NOTE: the names `train` and `test` are rebound to functions in later cells.
train, test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=0.15,
        random_state=42,
        stratify=df.sentiment.values,
    )
)

In [None]:
# Wrap each split frame in its own dataset (text, selected text, sentiment).
train_dataset, test_dataset = (
    TweetExtractionDataset(frame.text, frame.selected_text, frame.sentiment)
    for frame in (train, test)
)

In [None]:
# Training batches are reshuffled at the start of every epoch so the model does
# not see the same batch composition 20 times; evaluation keeps a fixed order.
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0)

In [None]:
# Pretrained BERT with a start/end span-prediction head.
model = transformers.BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the defaults the transformers
# implementation used, so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# The scheduler is stepped once per training batch for 20 epochs, so the total
# must be counted from the training loader; counting every row of df (train and
# test together) would stop the linear decay short of zero.
total_steps = len(train_data_loader) * 20
schedular = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
def train(model, data_loader, optimizer, schedular, device):
    """Run one pass over ``data_loader`` in training mode.

    For each batch the tensors are moved to ``device``, the loss returned by the
    model is back-propagated, gradients are clipped to a norm of 1.0, and both
    the optimizer and the scheduler take one step. The mean batch loss is
    printed at the end.

    Args:
        model: Model called with ids, attention mask, token type ids and the
            start/end target positions; its output must expose ``.loss``.
        data_loader: Iterable of batches with keys ``ids``, ``masks``,
            ``token_type_ids``, ``start_idx`` and ``end_idx``.
        optimizer: Optimizer updating the model parameters.
        schedular: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
    """
    model.train()
    running_loss = 0
    n_batches = len(data_loader)

    for batch in tqdm(data_loader, total=n_batches):
        input_ids = batch['ids'].to(device)
        model_kwargs = {
            "attention_mask": batch['masks'].to(device),
            "token_type_ids": batch['token_type_ids'].to(device),
            "start_positions": batch['start_idx'].to(device),
            "end_positions": batch['end_idx'].to(device),
        }

        optimizer.zero_grad()
        batch_loss = model(input_ids, **model_kwargs).loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        # Clip before the update to keep the gradient norm at or below 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        schedular.step()

    avg_train_loss = running_loss / n_batches
    print(f"Average Loss during Training : {avg_train_loss}")

In [None]:
def test(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` and checkpoint it when it improves.

    Prints the mean batch loss and the mean of the start-position and
    end-position accuracies (each averaged over batches). When that accuracy
    exceeds the module-level ``best_accuarcy`` (spelled as the training cell
    defines it), the weights are saved to ``saved_model.bin`` and
    ``best_accuarcy`` is raised to the new value.

    Args:
        model: Model whose output exposes ``loss``, ``start_logits`` and
            ``end_logits``.
        data_loader: Iterable of batches with keys ``ids``, ``masks``,
            ``token_type_ids``, ``start_idx`` and ``end_idx``.
        device: Device the batch tensors are moved to.
    """
    # Declared global so the best score survives between calls. Assigning a
    # differently spelled local name left the global at its initial value and
    # caused the checkpoint to be overwritten on every epoch.
    global best_accuarcy

    model.eval()
    total_test_loss = 0
    start_accuracy = 0
    end_accuracy = 0

    for data in tqdm(data_loader, total=len(data_loader)):
        ids = data['ids'].to(device)
        masks = data['masks'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        start_logits_targets = data['start_idx'].to(device)
        end_logits_targets = data['end_idx'].to(device)

        with torch.no_grad():
            outputs = model(
                ids,
                attention_mask=masks,
                token_type_ids=token_type_ids,
                start_positions=start_logits_targets,
                end_positions=end_logits_targets,
            )

            total_test_loss += outputs.loss.item()

            # Predicted position = index of the largest logit per sample.
            start_pred = np.argmax(outputs.start_logits.cpu().detach().numpy(), axis=1)
            end_pred = np.argmax(outputs.end_logits.cpu().detach().numpy(), axis=1)

            start_targets = start_logits_targets.cpu().detach().numpy()
            end_targets = end_logits_targets.cpu().detach().numpy()

            start_accuracy += metrics.accuracy_score(start_targets, start_pred)
            end_accuracy += metrics.accuracy_score(end_targets, end_pred)

    avg_test_loss = total_test_loss / len(data_loader)

    # Short pause before printing — presumably lets the progress bar finish
    # rendering; TODO confirm it is still needed.
    time.sleep(3)
    print(f"Average Test Loss : {avg_test_loss}")

    start_accuracy = start_accuracy / len(data_loader)
    end_accuracy = end_accuracy / len(data_loader)

    accuracy = (start_accuracy + end_accuracy) / 2

    print(f"Accuracy : {accuracy}")
    # Exact-position accuracy is only a demo metric for this problem; the
    # author intends to replace it with a more suitable one.

    if accuracy > best_accuarcy:
        torch.save(model.state_dict(), "saved_model.bin")
        best_accuarcy = accuracy

In [None]:
# Best evaluation accuracy so far. The (misspelled) name is the one test()
# reads, so it must stay in sync with that function.
best_accuarcy = 0

# Use the GPU when one is present and fall back to the CPU otherwise, instead
# of failing on runtimes without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

# 20 epochs: one training pass followed by one evaluation pass each.
for i in range(20):
    print(f"Epochs : {i}")
    train(model, train_data_loader, optimizer, schedular, device)
    test(model, test_data_loader, device)