<a href="https://colab.research.google.com/github/ganeshred/NLPAuthorshipAttributionNLM/blob/main/bi_LSTM_task3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import torch
from torchtext import data
from torchtext import datasets

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize = 'spacy',tokenizer_language='en_core_web_sm', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('/content/Task3.csv')

In [6]:
len(df)

11726

In [7]:
df.head()

Unnamed: 0,Generation,label,label_int
0,Flint man facing 10 dog fighting and drug deal...,grover,5
1,the 30 most troubling lines from donald trump ...,ctrl,0
2,Wakayama rafting is a type of traditional raft...,instructgpt,7
3,The movies that predicted an outbreak like cor...,grover,5
4,companies that pay taxes are allowed to use ta...,pplm,8


In [8]:
df[['text']] = df[['Generation']]. astype(str)

In [9]:
df = df[['text','label']]

In [10]:
# text_field = data.Field(
#     sequential=True,
#     tokenize='basic_english', 
#     fix_length=500,
#     lower=True
# )
TEXT = data.Field(tokenize = 'spacy', tokenizer_language='en_core_web_sm',  include_lengths = True)
LABEL = data.LabelField(sequential=False, use_vocab=False, dtype = torch.float)

# label_field = data.Field(sequential=False, use_vocab=False)

# sadly have to apply preprocess manually
preprocessed_text = df['text'].apply(
    lambda x: TEXT.preprocess(x)
)

# load fastext simple embedding with 300d
TEXT.build_vocab(
    preprocessed_text, 
    max_size = 30000,
    vectors='glove.6B.50d',
    unk_init = torch.Tensor.normal_
)

# get the vocab instance
vocab = TEXT.vocab

In [11]:
ltoi = {l: i for i, l in enumerate(df['label'].unique())}

In [12]:
df['label1'] = df['label'].apply(lambda y: ltoi[y])

class DataFrameDataset(data.Dataset):
    def __init__(self, df: pd.DataFrame, fields: list):
        super(DataFrameDataset, self).__init__(
            [
                data.Example.fromlist(list(r), fields) 
                for i, r in df.iterrows()
            ], 
            fields
        )

In [13]:
df1=df[['text','label1']]

In [14]:
examples=[]
for i,row in df1.iterrows():
  # print(example)
  examples.append(data.Example.fromlist(row, fields=[('text', TEXT), ('label', LABEL)]))

In [15]:
# examples[2000].label
c=0
for i in examples:
  if type(i.label) != int:
    c+=1

In [16]:
c

0

In [17]:
dataset = data.Dataset(examples, fields=[('text', TEXT), ('label', LABEL)])

In [18]:
# dataset = data.Dataset(examples, fields=[('text', TEXT), ('label', LABEL)])
# train_dataset, test_dataset, valid_dataset = dataset.split(split_ratio=[0.65, 0.15, 0.2], stratified=True, strata_field='label')
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print("We are working with ", device)
# # device = 'cpu'
# train_iter, test_iter, valid_iter = data.BucketIterator.splits(
#     datasets=(train_dataset, test_dataset, valid_dataset), 
#     batch_size=32,
#     sort_within_batch = True,
#     sort_key=lambda x: len(x.text),
#     device=device
# )

In [19]:
train_dataset, test_dataset, valid_dataset = dataset.split(split_ratio=[0.65, 0.15, 0.2], stratified=True, strata_field='label')

In [20]:
len(train_dataset)

7623

In [21]:
ltoi

{'grover': 0,
 'ctrl': 1,
 'instructgpt': 2,
 'pplm': 3,
 'gpt2': 4,
 'xlnet': 5,
 'gpt': 6,
 'xlm': 7,
 'human': 8,
 'gpt3': 9,
 'fair': 10}

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("We are working with ", device)
# device = 'cpu'
train_iter, test_iter, valid_iter = data.BucketIterator.splits(
    datasets=(train_dataset, test_dataset, valid_dataset), 
    batch_size=16,
    sort_within_batch = True,
    sort_key=lambda x: len(x.text),
    device=device
)

We are working with  cuda


In [23]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        super().__init__()
        #1. Initialize Embedding Layer
        self.embedding = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim, padding_idx = pad_idx)
        #2. Initialize LSTM layer
        self.lstm = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_dim, num_layers = n_layers, dropout = dropout, bidirectional = bidirectional)
        #3. Initialize a fully connected layer with Linear transformation
        self.fc = nn.Linear(hidden_dim * n_layers, output_dim)
        #4. Initialize Dropout
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        #1. Apply embedding layer that matches each word to its vector and apply dropout. Dim [sent_len, batch_size, emb_dim]
        embedding = self.embedding(text)
        embedding_dropout = self.dropout(embedding)
        #2. Run the LSTM along the sentences of length sent_len. #output = [sent len, batch size, hid dim * num directions]; #hidden = [num layers * num directions, batch size, hid dim]
        output, (hidden,cell) = self.lstm(embedding_dropout)
        #3. Concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers and apply dropout
        concat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = self.dropout(concat)     
        return self.fc(hidden)

In [24]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 126
OUTPUT_DIM = 11
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = LSTM(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT,
            PAD_IDX)

In [25]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,065,347 trainable parameters


In [26]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([30002, 50])


In [27]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  0.7372,  0.3910,  0.5158],
        [ 0.1825, -0.0737,  0.3147,  ...,  1.3226, -2.6796, -0.1297],
        [ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        ...,
        [-0.9581,  0.5661,  0.2489,  ..., -0.4387, -0.5074,  1.0205],
        [-0.1401, -0.3614, -0.8919,  ...,  0.4649,  0.3368,  0.6853],
        [ 0.1328, -0.4302,  0.0057,  ..., -1.4743,  1.7457,  0.2557]])

In [28]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        ...,
        [-0.9581,  0.5661,  0.2489,  ..., -0.4387, -0.5074,  1.0205],
        [-0.1401, -0.3614, -0.8919,  ...,  0.4649,  0.3368,  0.6853],
        [ 0.1328, -0.4302,  0.0057,  ..., -1.4743,  1.7457,  0.2557]])


In [29]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [30]:
criterion = nn.CrossEntropyLoss()

# device='cpu'
model = model.to(device)
criterion = criterion.to(device)

In [31]:
def multiclass_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # get the index of the highest prediction for each instance
    _, preds = torch.max(preds, dim=1)
    correct = (preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc


In [32]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        # print(batch.text)
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths).squeeze(1)

        target = batch.label
        target = target.type(torch.LongTensor)

        # print(target)

        # print(predictions)
        
        loss = criterion(predictions.to(device), target.to(device))
        
        acc = multiclass_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [33]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            
            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            target = batch.label
            target = target.type(torch.LongTensor)
            
            loss = criterion(predictions.to(device), target.to(device))
            
            acc = multiclass_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()



    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [34]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [35]:
N_EPOCHS = 15
path = F"/content/gdrive/My Drive/bilstm_model_task3.pt"
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), path)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 12s
	Train Loss: 1.910 | Train Acc: 27.83%
	 Val. Loss: 1.445 |  Val. Acc: 42.78%
Epoch: 02 | Epoch Time: 0m 12s
	Train Loss: 1.373 | Train Acc: 45.11%
	 Val. Loss: 1.634 |  Val. Acc: 43.07%
Epoch: 03 | Epoch Time: 0m 12s
	Train Loss: 1.174 | Train Acc: 53.11%
	 Val. Loss: 1.010 |  Val. Acc: 59.83%
Epoch: 04 | Epoch Time: 0m 12s
	Train Loss: 1.024 | Train Acc: 59.15%
	 Val. Loss: 0.861 |  Val. Acc: 65.62%
Epoch: 05 | Epoch Time: 0m 12s
	Train Loss: 0.804 | Train Acc: 67.22%
	 Val. Loss: 0.638 |  Val. Acc: 73.35%
Epoch: 06 | Epoch Time: 0m 12s
	Train Loss: 0.763 | Train Acc: 68.50%
	 Val. Loss: 0.795 |  Val. Acc: 66.70%
Epoch: 07 | Epoch Time: 0m 12s
	Train Loss: 0.675 | Train Acc: 71.17%
	 Val. Loss: 0.593 |  Val. Acc: 74.89%
Epoch: 08 | Epoch Time: 0m 12s
	Train Loss: 0.587 | Train Acc: 74.48%
	 Val. Loss: 0.543 |  Val. Acc: 75.85%
Epoch: 09 | Epoch Time: 0m 12s
	Train Loss: 0.543 | Train Acc: 76.27%
	 Val. Loss: 0.525 |  Val. Acc: 76.82%
Epoch: 10 | Epoch T

In [36]:
test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.493 | Test Acc: 78.83%


In [37]:
import pandas as pd


In [38]:
df_reddit = pd.read_csv('/content/gdrive/MyDrive/NLP/reddit_data_gpt3.csv')

FileNotFoundError: ignored

In [None]:
df_reddit.head()

In [None]:
df_reddit['text'] = df_reddit['Title']+df_reddit['Body']+df_reddit['Generated Text']

In [None]:
df_reddit = df_reddit [['text']]
df_reddit['label']=9

In [None]:
ltoi

In [None]:
df_reddit.head()

In [None]:
examples=[]
for i,row in df_reddit.iterrows():
  # print(example)
  examples.append(data.Example.fromlist(row, fields=[('text', TEXT), ('label', LABEL)]))

In [None]:
# examples

In [None]:
examples=[]
for i,row in df_reddit.iterrows():
  # print(example)
  examples.append(data.Example.fromlist(row, fields=[('text', TEXT), ('label', LABEL)]))

dataset = data.Dataset(examples, fields=[('text', TEXT), ('label', LABEL)])
train_dataset, test_dataset, valid_dataset = dataset.split(split_ratio=[0.65, 0.15, 0.2], stratified=True, strata_field='label')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device = 'cpu'
print("We are working with ", device)
train_iter, test_iter, valid_iter = data.BucketIterator.splits(
    datasets=(train_dataset, test_dataset, valid_dataset), 
    batch_size=32,
    sort_within_batch = True,
    sort_key=lambda x: len(x.text),
    device=device
)

In [None]:
len(valid_iter)

In [None]:
def evaluate_reddit(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:   
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            target = batch.label
            target = target.type(torch.LongTensor)
            
            loss = criterion(predictions.to(device), target.to(device))
            
            acc = multiclass_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()



    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
model = model.to(device)
criterion = criterion.to(device)

In [None]:
model.load_state_dict(torch.load(path))


In [None]:
loss,acc = evaluate_reddit(model,train_iter,criterion)

In [None]:
acc*100

In [None]:
loss,acc = evaluate_reddit(model,test_iter,criterion)

In [None]:
acc*100

In [39]:
def evaluate(model, dataloader):
    model.eval()
    device = next(model.parameters()).device
    running_loss = 0.0
    correct_predictions = 0
    class_correct = [0] * 11
    class_total = [0] * 11

    with torch.no_grad():
        for batch in dataloader:   
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            target = batch.label
            target = target.type(torch.LongTensor)
            
            loss = criterion(predictions.to(device), target.to(device))
            
            # acc = multiclass_accuracy(predictions, batch.label)
            _, predicted = torch.max(predictions, dim=1)
            correct_predictions += torch.sum(predicted.to(device) == target.to(device)).item()

            for i in range(len(target)):
                label = target[i]
                class_correct[label] += int(predicted[i] == label)
                class_total[label] += 1

            # epoch_loss += loss.item()
    # epoch_loss = running_loss / len(dataloader)
    # epoch_acc = correct_predictions / len(dataloader)
    classwise_acc = [class_correct[i] / class_total[i] if class_total[i] > 0 else 0 for i in range(11)]
    return classwise_acc

In [40]:
evaluate(model,train_iter)

[0.9494949494949495,
 1.0,
 0.7575757575757576,
 0.6782106782106783,
 0.481962481962482,
 0.9653679653679653,
 0.9581529581529582,
 1.0,
 0.9942279942279942,
 0.5627705627705628,
 0.873015873015873]

In [41]:
evaluate(model,test_iter)

[0.9107981220657277,
 0.9859154929577465,
 0.6009389671361502,
 0.5258215962441315,
 0.45539906103286387,
 0.9530516431924883,
 0.9577464788732394,
 0.9859154929577465,
 0.9906103286384976,
 0.5117370892018779,
 0.784037558685446]

In [42]:
ltoi

{'grover': 0,
 'ctrl': 1,
 'instructgpt': 2,
 'pplm': 3,
 'gpt2': 4,
 'xlnet': 5,
 'gpt': 6,
 'xlm': 7,
 'human': 8,
 'gpt3': 9,
 'fair': 10}