<a href="https://colab.research.google.com/github/ganeshred/NLPAuthorshipAttributionNLM/blob/main/POS_tags_Task2_bi_LSTM_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6.0)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.1
    Uninstalling torchtext-0.15.1:
      Successfully uninstalled torchtext-0.15.1
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('/content/gdrive/MyDrive/balanced_task2.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,664,Netanyahu barely scratches out a victory in Is...,0
1,61,"In ""Hunters,"" the premiere episode of a new se...",0
2,518,. cricket ice cream is a luxury ice cream made...,0
3,248,Graciela Iturbide was born in Mexico City in 1...,0
4,435,A new study from the University of Rochester d...,0


In [6]:
import nltk
from nltk.tokenize import word_tokenize

# Example texts
texts = df['text']

# Example labels
labels = df['class']



In [7]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
import nltk
from nltk.tokenize import word_tokenize

pos_tagged_texts = []
for i in range(len(texts)):
    text = texts[i]
    label = labels[i]
    if text is not None and not isinstance(text, float): # make sure text is not None or NaN
      tokens = word_tokenize(text)
      pos_tags = nltk.pos_tag(tokens)
      pos_tagged_text = [(word, pos) for (word, pos) in pos_tags]
      pos_tagged_texts.append((pos_tagged_text, label))


In [9]:
pos_tagged_texts[0]

([('Netanyahu', 'NNP'),
  ('barely', 'RB'),
  ('scratches', 'VBZ'),
  ('out', 'RP'),
  ('a', 'DT'),
  ('victory', 'NN'),
  ('in', 'IN'),
  ('Israeli', 'NNP'),
  ('elections', 'NNS')],
 0)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader




class POSTaggedTextDataset(Dataset):
    def __init__(self, pos_tagged_texts, pos_to_index, label_to_index):
        self.pos_tagged_texts = pos_tagged_texts
        self.pos_to_index = pos_to_index
        self.label_to_index = label_to_index
        
    def __len__(self):
        return len(self.pos_tagged_texts)
    
    def __getitem__(self, index):
        text, label = self.pos_tagged_texts[index]
        pos_indices = [self.pos_to_index[pos] for word, pos in text]
        label_index = self.label_to_index[label]
        return torch.LongTensor(pos_indices), torch.LongTensor([label_index])


In [11]:
pos_to_index = {pos: i for i, pos in enumerate(set([pos for text, label in pos_tagged_texts for word, pos in text]))}
label_to_index = {label: i for i, label in enumerate(set([label for text, label in pos_tagged_texts]))}

In [12]:
pos_to_index

{'RBS': 0,
 'VBN': 1,
 'VBG': 2,
 'JJR': 3,
 ':': 4,
 'JJS': 5,
 'TO': 6,
 'RBR': 7,
 'VBD': 8,
 'MD': 9,
 ',': 10,
 '(': 11,
 'DT': 12,
 'NNPS': 13,
 'NN': 14,
 'VB': 15,
 '#': 16,
 ')': 17,
 'PRP$': 18,
 'WP': 19,
 'CC': 20,
 'SYM': 21,
 'EX': 22,
 'WDT': 23,
 'RP': 24,
 'WRB': 25,
 'VBZ': 26,
 'NNS': 27,
 'PRP': 28,
 'IN': 29,
 'VBP': 30,
 "''": 31,
 'JJ': 32,
 'POS': 33,
 'FW': 34,
 'PDT': 35,
 '.': 36,
 'RB': 37,
 'CD': 38,
 'UH': 39,
 'WP$': 40,
 '$': 41,
 '``': 42,
 'NNP': 43}

In [13]:
label_to_index

{0: 0, 1: 1}

In [14]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    pos_indices = [torch.tensor(sample[0], dtype=torch.long) for sample in batch]
    pos_indices_padded = pad_sequence(pos_indices, batch_first=True, padding_value=0)
    labels = torch.tensor([sample[1] for sample in batch], dtype=torch.long)
    return pos_indices_padded, labels


In [15]:
dataset = POSTaggedTextDataset(pos_tagged_texts, pos_to_index, label_to_index)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [16]:
# class POSBiLSTM(nn.Module):
#     def __init__(self, num_pos_tags, num_labels, hidden_size, num_layers):
#         super(POSBiLSTM, self).__init__()
#         self.embedding = nn.Embedding(num_pos_tags, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, bidirectional=True)
#         self.fc = nn.Linear(hidden_size*2, num_labels)
        
#     def forward(self, pos_indices):
#         x = self.embedding(pos_indices).permute(1, 0, 2) # shape: (seq_len, batch_size, hidden_size)

#         # h_n = h_n.permute(1, 0, 2)
#         # h_n = torch.cat([h_n[-1,:,:self.lstm.hidden_size], h_n[0,:,self.lstm.hidden_size:]], dim=-1) # shape: (batch_size, hidden_size*2)

#         _, (h_n, _) = self.lstm(x) # h_n has shape (num_layers * num_directions, batch_size, hidden_size)
#         h_n = h_n.permute(1, 0, 2).reshape(-1, self.lstm.hidden_size*2) # shape: (batch_size, hidden_size*2)
#         print(h_n.size())
#         out = self.fc(h_n) # shape: (batch_size, num_labels)
#         print(out.size())
#         return out


class POSBiLSTM(nn.Module):
    def __init__(self, num_pos_tags, num_labels, hidden_size, num_layers):
        super(POSBiLSTM, self).__init__()
        self.embedding = nn.Embedding(num_pos_tags, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_labels)
        
    def forward(self, pos_indices):
        x = self.embedding(pos_indices).permute(1, 0, 2) # shape: (seq_len, batch_size, hidden_size)
        lstm_out, _ = self.lstm(x) # lstm_out has shape (seq_len, batch_size, hidden_size*num_directions)
        last_hidden_state = self.get_last_hidden_state(lstm_out)
        out = self.fc(last_hidden_state) # shape: (batch_size, num_labels)
        return out
    
    def get_last_hidden_state(self, lstm_out):
        # Extract the last hidden state of the LSTM
        last_hidden_state_fw = lstm_out[-1, :, :self.lstm.hidden_size]
        last_hidden_state_bw = lstm_out[0, :, self.lstm.hidden_size:]
        last_hidden_state = torch.cat((last_hidden_state_fw, last_hidden_state_bw), dim=-1)
        return last_hidden_state



In [17]:
num_pos_tags = len(pos_to_index)
num_labels = len(label_to_index)
hidden_size = 128
num_layers = 2
model = POSBiLSTM(num_pos_tags, num_labels, hidden_size, num_layers)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())



In [18]:
# Train function
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    for pos_indices, labels in dataloader:
        pos_indices, labels = pos_indices.to(device), labels.to(device)
        # print(pos_indices.size())
        optimizer.zero_grad()
        outputs = model(pos_indices)
        # print(outputs.size())
        # print(labels.size())
        loss = criterion(outputs, labels.squeeze())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

# Evaluate function
def evaluate(model, dataloader, device):
    model.eval()
    num_correct = 0
    num_total = 0
    with torch.no_grad():
        for pos_indices, labels in dataloader:
            pos_indices, labels = pos_indices.to(device), labels.to(device)
            outputs = model(pos_indices)
            _, predicted = torch.max(outputs.data, 1)
            num_correct += (predicted == labels.squeeze()).sum().item()
            num_total += pos_indices.size(0)
    return num_correct / num_total


In [19]:
# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create data loaders for training and validation sets
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size,collate_fn=collate_fn)


In [20]:
# Train the model
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion, device)
    val_acc = evaluate(model, val_dataloader, device)
    print("Epoch {}/{} - Train Loss: {:.4f} - Val Acc: {:.4f}".format(epoch+1, num_epochs, train_loss, val_acc))


  pos_indices = [torch.tensor(sample[0], dtype=torch.long) for sample in batch]


Epoch 1/10 - Train Loss: 0.5727 - Val Acc: 0.8056
Epoch 2/10 - Train Loss: 0.4555 - Val Acc: 0.8080
Epoch 3/10 - Train Loss: 0.3853 - Val Acc: 0.8525
Epoch 4/10 - Train Loss: 0.3333 - Val Acc: 0.8642
Epoch 5/10 - Train Loss: 0.2870 - Val Acc: 0.8806
Epoch 6/10 - Train Loss: 0.2539 - Val Acc: 0.8852
Epoch 7/10 - Train Loss: 0.2176 - Val Acc: 0.8595
Epoch 8/10 - Train Loss: 0.1882 - Val Acc: 0.8642
Epoch 9/10 - Train Loss: 0.1665 - Val Acc: 0.8571
Epoch 10/10 - Train Loss: 0.1734 - Val Acc: 0.8525
