In [2]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
# Select the compute device: use the GPU when PyTorch can see one, else the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [166]:
# Load the LDA-augmented dataset, keep one row per `index` value and drop the
# cosine-similarity columns.
# NOTE: GroupBy.first() takes the first non-null entry of each column within a
# group, so it is not strictly "the first row" when a group has missing values.
train = (
    pd.read_csv('lda_final_dataset.csv')
    .groupby(by=['index'])
    .first()
    .reset_index()
    .drop(columns=['cosine_constitution', 'cosine_crpc', 'cosine_ipc'])
)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6076 entries, 0 to 6075
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   index                6076 non-null   int64  
 1   processed_text       6076 non-null   object 
 2   label                6076 non-null   int64  
 3   split                6076 non-null   object 
 4   Topic_0              6076 non-null   float64
 5   Topic_1              6076 non-null   float64
 6   Topic_2              6076 non-null   float64
 7   Topic_3              6076 non-null   float64
 8   Dominant_Topic       6076 non-null   object 
 9   Perc_Dominant_Topic  6076 non-null   float64
dtypes: float64(5), int64(2), object(3)
memory usage: 474.8+ KB


In [167]:
# Preview the numeric topic id parsed out of the `Dominant_Topic` labels
# (the pattern captures a single digit).
(
    train['Dominant_Topic']
    .str.extract(r'(?P<Dominant_Topic>\d)')
    .astype(int)
)

Unnamed: 0,Dominant_Topic
0,3
1,3
2,3
3,1
4,3
...,...
6071,3
6072,1
6073,3
6074,3


In [168]:
# Hyperparameters and tokenizer shared by the dataset, loader and optimizer cells.
MAX_LEN = 256            # max_length handed to the tokenizer for every document
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
LEARNING_RATE = 1e-5
# EPOCHS is assigned in the cell that runs the training loop.
# NOTE(review): `truncation` and `do_lower_case` are passed at load time here;
# confirm they actually influence RobertaTokenizer rather than being ignored.
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base',
    truncation=True,
    do_lower_case=True,
)

In [169]:
class LDAData(Dataset):
    """Torch dataset pairing tokenized text with an LDA-topic positional vector.

    Each item holds the tokenizer output for one ``processed_text`` row, its
    ``label`` as a float tensor, and a fixed-size vector obtained by summing
    the rows of a sinusoidal position table whose row count is
    ``dominant topic id * TOPIC_SEQ_SCALE``.

    Args:
        dataframe: frame with ``processed_text``, ``label``, ``Dominant_Topic``
            and ``Perc_Dominant_Topic`` columns. Rows are addressed by
            position, so the frame's index does not need to be reset.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: length every encoded sequence is padded / truncated to.
    """

    ENCODING_DIM = 768    # width of the topic vector returned with every item
    TOPIC_SEQ_SCALE = 10  # rows of the position table per unit of topic id

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.processed_text
        self.targets = self.data.label
        self.max_len = max_len
        self.perc_dominant_topic = dataframe.Perc_Dominant_Topic
        self.dominant_topic = dataframe.Dominant_Topic.str.extract(r'(?P<Dominant_Topic>\d)').astype(int)
        # 1-D integer array of topic ids for cheap positional lookups.
        self._topic_ids = self.dominant_topic.iloc[:, 0].to_numpy()
        # The topic vector depends only on the topic id, so build it once per
        # distinct id instead of recomputing the whole table for every item.
        self._topic_vectors = {
            int(topic): self.get_position_encoding(
                seq_len=int(topic) * self.TOPIC_SEQ_SCALE, d=self.ENCODING_DIM
            ).sum(axis=0)
            for topic in np.unique(self._topic_ids)
        }

    def get_position_encoding(self, seq_len, d, n=10000):
        """Return the ``(seq_len, d)`` sinusoidal position table.

        For row ``k``, column ``2*i`` holds ``sin(k / n**(2*i/d))`` and column
        ``2*i + 1`` the matching cosine. With an odd ``d`` the last column
        stays zero.
        """
        half = int(d / 2)
        table = np.zeros((seq_len, d))
        rows = np.arange(seq_len).reshape(-1, 1)
        denominators = np.power(n, 2 * np.arange(half) / d)
        angles = rows / denominators
        table[:, 0:2 * half:2] = np.sin(angles)
        table[:, 1:2 * half:2] = np.cos(angles)
        return table

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at position ``index``."""
        # Positional access: label-based lookup would fail (or pick the wrong
        # row) on a frame whose index is not 0..len-1.
        text = str(self.text.iloc[index])

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit replacements for the deprecated `pad_to_max_length`.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        position_encoding_for_previous_prediction = self._topic_vectors[int(self._topic_ids[index])]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
            'positional_encoding': torch.tensor(position_encoding_for_previous_prediction, dtype=torch.float)
        }

In [163]:
# Scratch check: binary digits of 3 without the '0b' prefix.
format(3, 'b')

'11'

In [170]:
# Keep only the columns the dataset class reads: text, label and LDA topic info.
new_df = train.loc[:, ['processed_text', 'label', 'Dominant_Topic', 'Perc_Dominant_Topic']]
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6076 entries, 0 to 6075
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   processed_text       6076 non-null   object 
 1   label                6076 non-null   int64  
 2   Dominant_Topic       6076 non-null   object 
 3   Perc_Dominant_Topic  6076 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 190.0+ KB


In [171]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_size = 0.8
train_data = new_df.sample(frac=train_size, random_state=200)
# Rows that were not sampled form the test split. This must run before the
# train index is reset, because it relies on the original row labels.
test_data = new_df.drop(index=train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# Wrap both splits in the torch dataset defined above.
training_set = LDAData(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = LDAData(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

FULL Dataset: (6076, 4)
TRAIN Dataset: (4861, 4)
TEST Dataset: (1215, 4)


In [172]:
# Loader settings: both loaders shuffle and request 32 workers; only the batch
# size differs between training and testing.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=32)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=32)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [264]:
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder with a small classification head that also takes an LDA vector.

    The first-token hidden state of the encoder is summed with ``lda_encoding``
    and passed through ``Linear -> ReLU -> Dropout -> Linear``.

    Args:
        num_labels: number of output logits. The training cell pairs this model
            with ``CrossEntropyLoss`` and a ``torch.max(..., dim=1)`` accuracy,
            both of which need one logit per class; a single logit makes any
            label >= 1 an out-of-range class index. Defaults to 2, assuming a
            binary ``label`` column (TODO confirm against the data).
    """

    def __init__(self, num_labels=2):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids, lda_encoding):
        """Return logits of shape ``(batch, num_labels)``.

        Args:
            input_ids, attention_mask, token_type_ids: forwarded to the encoder.
            lda_encoding: tensor of shape ``(batch, 768)`` added element-wise
                to the first-token hidden state; any floating dtype is accepted.
        """
        output_1 = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            )
        hidden_state = output_1[0]
        first_token_state = hidden_state[:, 0]
        # Cast so a float64 encoding does not promote the sum to float64 and
        # then clash with the float32 weights of the linear layers.
        lda_encoding = lda_encoding.to(dtype=first_token_state.dtype)
        # Add the whole (batch, 768) vector; slicing it with [:, 0] would leave
        # a (batch,) tensor that cannot be added to the (batch, 768) state.
        pooler = torch.add(first_token_state, lda_encoding)
        pooler = self.pre_classifier(pooler)
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [265]:
# Build the classifier and move its parameters to the selected device.
# NOTE(review): the recorded output of this cell is a CUDA device-side assert.
# As the message itself says, such errors can be reported at a later API call
# than the one that failed, so the culprit may be an earlier cell rather than
# this one.
model = RobertaClass()
model.to(device)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
# Loss and optimizer used by the training loop; the optimizer is bound to the
# parameters of whatever `model` refers to when this cell runs.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Re-create the model on the same device the training loop sends its batches
# to. Pinning it to 'cpu' while `device` is 'cuda' would make the forward pass
# fail with a device mismatch.
model = RobertaClass().to(device)
# `model` now refers to a fresh set of parameters, so the optimizer is rebuilt;
# one created from an earlier instance would never update this model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
model

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), e

In [None]:
def calcuate_accuracy(preds, targets):
    """Count the positions at which ``preds`` and ``targets`` agree.

    The result is the raw number of matches as a Python scalar, not a
    percentage.
    """
    matches = preds == targets
    hit_count = matches.sum()
    return hit_count.item()

In [None]:
# Defining the training function on the 80% of the dataset for tuning the RoBERTa model
# NOTE(review): this function reuses the name `train`, which the data-loading
# cell binds to the raw DataFrame. Once this cell has run, cells that expect
# the DataFrame cannot be re-run without reloading it.

def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Relies on the module-level ``model``, ``training_loader``,
    ``loss_function``, ``optimizer`` and ``device``. Prints the running loss
    and accuracy every 5000 steps and a summary at the end.

    Args:
        epoch: epoch number, used only in the summary printout.

    Returns:
        None.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    # Wrap the loader itself (not the enumerate object) so tqdm can read its
    # length and display a total.
    for step, data in enumerate(tqdm(training_loader), 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        # float32, matching the default dtype of the model's linear layers;
        # a float64 input makes them fail with a dtype mismatch.
        lda_positional_em = data['positional_encoding'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids, lda_positional_em)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        # Predicted class = index of the largest logit in each row.
        _, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if step%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f'The Total Accuracy for Epoch {epoch}: {epoch_accu}')
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# Number of full passes over the training loader; each pass calls train() once
# with the zero-based epoch number.
EPOCHS = 1
for epoch in range(EPOCHS):
    train(epoch)