<a href="https://colab.research.google.com/github/ganeshred/NLPAuthorshipAttributionNLM/blob/main/POS_tags_Task3_bi_LSTM_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('/content/gdrive/MyDrive/Task3.csv')

In [5]:
df.head()

Unnamed: 0,Generation,label,label_int
0,Flint man facing 10 dog fighting and drug deal...,grover,5
1,the 30 most troubling lines from donald trump ...,ctrl,0
2,Wakayama rafting is a type of traditional raft...,instructgpt,7
3,The movies that predicted an outbreak like cor...,grover,5
4,companies that pay taxes are allowed to use ta...,pplm,8


In [6]:
import nltk
from nltk.tokenize import word_tokenize

# Example texts
texts = df['Generation']

# Example labels
labels = df['label']



In [7]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
import nltk
from nltk.tokenize import word_tokenize

pos_tagged_texts = []
for i in range(len(texts)):
    text = texts[i]
    label = labels[i]
    if text is not None and not isinstance(text, float): # make sure text is not None or NaN
      tokens = word_tokenize(text)
      pos_tags = nltk.pos_tag(tokens)
      pos_tagged_text = [(word, pos) for (word, pos) in pos_tags]
      pos_tagged_texts.append((pos_tagged_text, label))


In [9]:
pos_tagged_texts[0]

([('Flint', 'NNP'),
  ('man', 'NN'),
  ('facing', 'VBG'),
  ('10', 'CD'),
  ('dog', 'NN'),
  ('fighting', 'NN'),
  ('and', 'CC'),
  ('drug', 'NN'),
  ('dealing', 'NN'),
  ('charges', 'NNS'),
  ('flint', 'VBP'),
  ('mi', 'VB'),
  ('a', 'DT'),
  ('flint', 'NN'),
  ('man', 'NN'),
  ('has', 'VBZ'),
  ('been', 'VBN'),
  ('arrested', 'VBN'),
  ('and', 'CC'),
  ('charged', 'VBN'),
  ('with', 'IN'),
  ('possession', 'NN'),
  ('of', 'IN'),
  ('cocaine', 'JJ'),
  ('drug', 'NN'),
  ('dealing', 'NN'),
  ('and', 'CC'),
  ('possession', 'NN'),
  ('of', 'IN'),
  ('methamphetamine', 'NN'),
  ('in', 'IN'),
  ('connection', 'NN'),
  ('with', 'IN'),
  ('a', 'DT'),
  ('drug', 'NN'),
  ('den', 'NN'),
  ('where', 'WRB'),
  ('he', 'PRP'),
  ('allegedly', 'RB'),
  ('shot', 'VBD'),
  ('one', 'CD'),
  ('or', 'CC'),
  ('more', 'JJR'),
  ('dogs', 'NNS'),
  ('authorities', 'NNS'),
  ('said', 'VBD'),
  ('daniel', 'JJ'),
  ('holtz', 'NN'),
  ('47', 'CD'),
  ('appeared', 'VBD'),
  ('in', 'IN'),
  ('the', 'DT'),
  ('f

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader




class POSTaggedTextDataset(Dataset):
    def __init__(self, pos_tagged_texts, pos_to_index, label_to_index):
        self.pos_tagged_texts = pos_tagged_texts
        self.pos_to_index = pos_to_index
        self.label_to_index = label_to_index
        
    def __len__(self):
        return len(self.pos_tagged_texts)
    
    def __getitem__(self, index):
        text, label = self.pos_tagged_texts[index]
        pos_indices = [self.pos_to_index[pos] for word, pos in text]
        label_index = self.label_to_index[label]
        return torch.LongTensor(pos_indices), torch.LongTensor([label_index])


In [11]:
pos_to_index = {pos: i for i, pos in enumerate(set([pos for text, label in pos_tagged_texts for word, pos in text]))}
label_to_index = {label: i for i, label in enumerate(set([label for text, label in pos_tagged_texts]))}

In [14]:
pos_to_index

{'DT': 0,
 'WP$': 1,
 'RB': 2,
 ':': 3,
 '``': 4,
 'VBG': 5,
 'PRP': 6,
 ',': 7,
 'VBP': 8,
 'TO': 9,
 'EX': 10,
 'RBR': 11,
 'WRB': 12,
 'VBD': 13,
 '(': 14,
 'RP': 15,
 'WDT': 16,
 '$': 17,
 'VBZ': 18,
 'NNP': 19,
 'FW': 20,
 'POS': 21,
 'NN': 22,
 'MD': 23,
 'CC': 24,
 'JJR': 25,
 'SYM': 26,
 'WP': 27,
 'RBS': 28,
 'CD': 29,
 'NNPS': 30,
 'VBN': 31,
 ')': 32,
 'PRP$': 33,
 'IN': 34,
 'PDT': 35,
 'UH': 36,
 '.': 37,
 'VB': 38,
 "''": 39,
 'JJ': 40,
 '#': 41,
 'NNS': 42,
 'JJS': 43}

In [15]:
label_to_index

{'xlm': 0,
 'grover': 1,
 'fair': 2,
 'human': 3,
 'pplm': 4,
 'gpt': 5,
 'ctrl': 6,
 'xlnet': 7,
 'gpt3': 8,
 'instructgpt': 9,
 'gpt2': 10}

In [16]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    pos_indices = [torch.tensor(sample[0], dtype=torch.long) for sample in batch]
    pos_indices_padded = pad_sequence(pos_indices, batch_first=True, padding_value=0)
    labels = torch.tensor([sample[1] for sample in batch], dtype=torch.long)
    return pos_indices_padded, labels


In [17]:
dataset = POSTaggedTextDataset(pos_tagged_texts, pos_to_index, label_to_index)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [18]:
# class POSBiLSTM(nn.Module):
#     def __init__(self, num_pos_tags, num_labels, hidden_size, num_layers):
#         super(POSBiLSTM, self).__init__()
#         self.embedding = nn.Embedding(num_pos_tags, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, bidirectional=True)
#         self.fc = nn.Linear(hidden_size*2, num_labels)
        
#     def forward(self, pos_indices):
#         x = self.embedding(pos_indices).permute(1, 0, 2) # shape: (seq_len, batch_size, hidden_size)

#         # h_n = h_n.permute(1, 0, 2)
#         # h_n = torch.cat([h_n[-1,:,:self.lstm.hidden_size], h_n[0,:,self.lstm.hidden_size:]], dim=-1) # shape: (batch_size, hidden_size*2)

#         _, (h_n, _) = self.lstm(x) # h_n has shape (num_layers * num_directions, batch_size, hidden_size)
#         h_n = h_n.permute(1, 0, 2).reshape(-1, self.lstm.hidden_size*2) # shape: (batch_size, hidden_size*2)
#         print(h_n.size())
#         out = self.fc(h_n) # shape: (batch_size, num_labels)
#         print(out.size())
#         return out


class POSBiLSTM(nn.Module):
    def __init__(self, num_pos_tags, num_labels, hidden_size, num_layers):
        super(POSBiLSTM, self).__init__()
        self.embedding = nn.Embedding(num_pos_tags, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=num_layers, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_labels)
        
    def forward(self, pos_indices):
        x = self.embedding(pos_indices).permute(1, 0, 2) # shape: (seq_len, batch_size, hidden_size)
        lstm_out, _ = self.lstm(x) # lstm_out has shape (seq_len, batch_size, hidden_size*num_directions)
        last_hidden_state = self.get_last_hidden_state(lstm_out)
        out = self.fc(last_hidden_state) # shape: (batch_size, num_labels)
        return out
    
    def get_last_hidden_state(self, lstm_out):
        # Extract the last hidden state of the LSTM
        last_hidden_state_fw = lstm_out[-1, :, :self.lstm.hidden_size]
        last_hidden_state_bw = lstm_out[0, :, self.lstm.hidden_size:]
        last_hidden_state = torch.cat((last_hidden_state_fw, last_hidden_state_bw), dim=-1)
        return last_hidden_state



In [19]:
num_pos_tags = len(pos_to_index)
num_labels = len(label_to_index)
hidden_size = 128
num_layers = 2
model = POSBiLSTM(num_pos_tags, num_labels, hidden_size, num_layers)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())



In [20]:
# Train function
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    for pos_indices, labels in dataloader:
        pos_indices, labels = pos_indices.to(device), labels.to(device)
        # print(pos_indices.size())
        optimizer.zero_grad()
        outputs = model(pos_indices)
        # print(outputs.size())
        # print(labels.size())
        loss = criterion(outputs, labels.squeeze())
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)

# Evaluate function
def evaluate(model, dataloader, device):
    model.eval()
    num_correct = 0
    num_total = 0
    with torch.no_grad():
        for pos_indices, labels in dataloader:
            pos_indices, labels = pos_indices.to(device), labels.to(device)
            outputs = model(pos_indices)
            _, predicted = torch.max(outputs.data, 1)
            num_correct += (predicted == labels.squeeze()).sum().item()
            num_total += pos_indices.size(0)
    return num_correct / num_total


In [21]:
# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create data loaders for training and validation sets
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size,collate_fn=collate_fn)


In [22]:
# Train the model
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion, device)
    val_acc = evaluate(model, val_dataloader, device)
    print("Epoch {}/{} - Train Loss: {:.4f} - Val Acc: {:.4f}".format(epoch+1, num_epochs, train_loss, val_acc))


  pos_indices = [torch.tensor(sample[0], dtype=torch.long) for sample in batch]


Epoch 1/10 - Train Loss: 1.7442 - Val Acc: 0.4983
Epoch 2/10 - Train Loss: 1.2122 - Val Acc: 0.6037
Epoch 3/10 - Train Loss: 0.8274 - Val Acc: 0.6779
Epoch 4/10 - Train Loss: 0.6817 - Val Acc: 0.6852
Epoch 5/10 - Train Loss: 0.6387 - Val Acc: 0.7206
Epoch 6/10 - Train Loss: 0.5683 - Val Acc: 0.7419
Epoch 7/10 - Train Loss: 0.5479 - Val Acc: 0.7568
Epoch 8/10 - Train Loss: 0.5361 - Val Acc: 0.7543
Epoch 9/10 - Train Loss: 0.4986 - Val Acc: 0.7769
Epoch 10/10 - Train Loss: 0.4707 - Val Acc: 0.7880


In [23]:
for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, optimizer, criterion, device)
    val_acc = evaluate(model, val_dataloader, device)
    print("Epoch {}/{} - Train Loss: {:.4f} - Val Acc: {:.4f}".format(epoch+1, num_epochs, train_loss, val_acc))


  pos_indices = [torch.tensor(sample[0], dtype=torch.long) for sample in batch]


Epoch 1/10 - Train Loss: 0.4450 - Val Acc: 0.7995
Epoch 2/10 - Train Loss: 0.4394 - Val Acc: 0.7824
Epoch 3/10 - Train Loss: 0.4376 - Val Acc: 0.7944
Epoch 4/10 - Train Loss: 0.4008 - Val Acc: 0.8093
Epoch 5/10 - Train Loss: 0.3957 - Val Acc: 0.8003
Epoch 6/10 - Train Loss: 0.3684 - Val Acc: 0.8110
Epoch 7/10 - Train Loss: 0.3549 - Val Acc: 0.8242
Epoch 8/10 - Train Loss: 0.3441 - Val Acc: 0.7747
Epoch 9/10 - Train Loss: 0.3373 - Val Acc: 0.8102
Epoch 10/10 - Train Loss: 0.3120 - Val Acc: 0.8195


In [24]:
train_acc = evaluate(model, train_dataloader, device)

  pos_indices = [torch.tensor(sample[0], dtype=torch.long) for sample in batch]


In [25]:
torch.save(model.state_dict(), "/content/gdrive/MyDrive/posmodeltask3.pt")


In [26]:
train_acc

0.898335822487732