In [1]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

In [2]:
nltk.download('stopwords')

# Load English stopwords
stop_words = set(stopwords.words('english'))

# Load your data
df = pd.read_csv('/kaggle/input/assign4-nlp/train.csv')

def preprocess_text(text):
    text = text.lower()  # Lowercase the text
    # Remove punctuations
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Adding start of sequence and end of sequence tokens
    return f'<sos> {text} <eos>'

# Apply preprocessing
df['Processed'] = df['Description'].apply(preprocess_text)
print(df.head())

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
   Class Index                                        Description  \
0            3  Reuters - Short-sellers, Wall Street's dwindli...   
1            3  Reuters - Private investment firm Carlyle Grou...   
2            3  Reuters - Soaring crude prices plus worries\ab...   
3            3  Reuters - Authorities have halted oil export\f...   
4            3  AFP - Tearaway world oil prices, toppling reco...   

                                           Processed  
0  <sos> reuters shortsellers wall streets dwindl...  
1  <sos> reuters private investment firm carlyle ...  
2  <sos> reuters soaring crude prices plus worrie...  
3  <sos> reuters authorities halted oil exportflo...  
4  <sos> afp tearaway world oil prices toppling r...  


In [3]:
df['Tokens'] = df['Processed'].apply(lambda text: text.split())
vocab = set(token for tokens in df['Tokens'] for token in tokens)
token_to_index = {token: idx + 1 for idx, token in enumerate(vocab)}  # +1 for zero padding
token_to_index['<pad>'] = 0  # Padding token
token_to_index['<unk>'] = len(token_to_index)+1

In [4]:
def load_glove_embeddings(path, embedding_dim=100):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_dict[word] = vector
    return embeddings_dict


glove_embeddings = load_glove_embeddings('/kaggle/input/assign4-nlp/glove.6B.100d.txt')

In [5]:
class TextDataset(Dataset):
    def __init__(self, df, glove_embeddings, token_to_index, embedding_dim=100):
        self.sentences = df['Tokens'].tolist()
        self.glove_embeddings = glove_embeddings
        self.token_to_index = token_to_index
        self.embedding_dim = embedding_dim
        self.unk_idx = token_to_index['<unk>']  # Use this for unknown tokens
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        tokens = self.sentences[idx]
        # Convert tokens to indices
        indices = [self.token_to_index.get(token, self.unk_idx) for token in tokens]
        # Create input and target sequences
        input_indices = indices
        target_indices = indices[1:] + [self.token_to_index['<eos>']]  # Shift by one and append <eos>
        
        # Convert indices to embeddings
        input_embeddings = [self.glove_embeddings.get(token, np.zeros(self.embedding_dim)) for token in tokens]
        
        return torch.tensor(input_embeddings, dtype=torch.float32), torch.tensor(target_indices, dtype=torch.long)

    
def collate_batch(batch):
    inputs, targets = zip(*batch)
    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=0.0)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=token_to_index['<pad>'])
    return padded_inputs, padded_targets

# Create Dataset and DataLoader instances
dataset = TextDataset(df, glove_embeddings,token_to_index)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True,collate_fn=collate_batch)

In [6]:
class ELMoLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, setting=1):
        super(ELMoLanguageModel, self).__init__()
        self.setting = setting
        self.forward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.forward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm1 = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.backward_lstm2 = nn.LSTM(hidden_dim, hidden_dim, num_layers=1, batch_first=True)
        self.forward_pred = nn.Linear(hidden_dim, vocab_size)
        self.backward_pred = nn.Linear(hidden_dim, vocab_size)

        if setting == 1:
            self.gamma = nn.Parameter(torch.ones(3))
        elif setting == 2:
            self.gamma = torch.rand(3, requires_grad=False)
        elif setting == 3:
            self.combiner = nn.Sequential(
                nn.Linear(hidden_dim * 6, hidden_dim * 2),
                nn.ReLU(),
                nn.Linear(hidden_dim * 2, hidden_dim*2)
            )
            self.gamma = nn.Parameter(torch.ones(3))  

    def forward(self, x):
        forward_out1, _ = self.forward_lstm1(x)
        forward_out2, _ = self.forward_lstm2(forward_out1)
        reversed_embeddings = torch.flip(x, [1])
        backward_out1, _ = self.backward_lstm1(reversed_embeddings)
        backward_out2, _ = self.backward_lstm2(backward_out1)

        if self.setting in [1, 2]:
            combined_embeddings = self.gamma[0] * x + self.gamma[1] * torch.cat((forward_out1, backward_out1), dim=-1) + self.gamma[2] * torch.cat((forward_out2, backward_out2), dim=-1)
        elif self.setting == 3:
            combined_embeddings = self.combiner(torch.cat((x, forward_out1, backward_out1, forward_out2, backward_out2), dim=-1))

        forward_predictions = self.forward_pred(forward_out2)
        backward_predictions = self.backward_pred(torch.flip(backward_out2, [1]))
        return forward_predictions, backward_predictions, combined_embeddings

In [7]:
vocab_size = len(token_to_index) 
model_setting_1 = ELMoLanguageModel(vocab_size=vocab_size, embedding_dim=100, hidden_dim=50, setting=1).to("cuda")
model_setting_2 = ELMoLanguageModel(vocab_size=vocab_size, embedding_dim=100, hidden_dim=50, setting=2).to("cuda")
model_setting_3 = ELMoLanguageModel(vocab_size=vocab_size, embedding_dim=100, hidden_dim=50, setting=3).to("cuda")

# First Model

In [8]:
import torch.optim as optim
from tqdm import tqdm

optimizer = optim.Adam(model_setting_1.parameters())
criterion = nn.CrossEntropyLoss()


num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = 0
    
    for inputs, targets in tqdm(dataloader):
        inputs, targets = inputs.to("cuda"), targets.to("cuda")
        optimizer.zero_grad()
        
        forward_pred, backward_pred, _ = model_setting_1(inputs) ## adjust model
        
        loss_f = criterion(forward_pred.transpose(1, 2), targets)
        loss_b = criterion(backward_pred.transpose(1, 2), targets)
        loss = loss_f + loss_b
        
        loss.backward()
        
        optimizer.step()
        
        total_loss += loss.item()
        num_batches += 1
    
    average_loss = total_loss / num_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}')


  return torch.tensor(input_embeddings, dtype=torch.float32), torch.tensor(target_indices, dtype=torch.long)
100%|██████████| 7500/7500 [11:38<00:00, 10.73it/s]


Epoch [1/10], Average Loss: 10.2769


100%|██████████| 7500/7500 [11:34<00:00, 10.79it/s]


Epoch [2/10], Average Loss: 8.2431


100%|██████████| 7500/7500 [11:36<00:00, 10.76it/s]


Epoch [3/10], Average Loss: 6.9575


100%|██████████| 7500/7500 [11:46<00:00, 10.61it/s]


Epoch [4/10], Average Loss: 6.1661


100%|██████████| 7500/7500 [11:47<00:00, 10.60it/s]


Epoch [5/10], Average Loss: 5.6903


100%|██████████| 7500/7500 [11:47<00:00, 10.60it/s]


Epoch [6/10], Average Loss: 5.3772


100%|██████████| 7500/7500 [11:45<00:00, 10.63it/s]


Epoch [7/10], Average Loss: 5.1479


100%|██████████| 7500/7500 [11:41<00:00, 10.69it/s]


Epoch [8/10], Average Loss: 4.9785


100%|██████████| 7500/7500 [11:46<00:00, 10.62it/s]


Epoch [9/10], Average Loss: 4.8303


100%|██████████| 7500/7500 [11:45<00:00, 10.63it/s]

Epoch [10/10], Average Loss: 4.7269





In [9]:
import torch
import pickle

# Save the model's state dictionary
torch.save(model_setting_1.state_dict(), 'elmo_model_1_state_dict.pth')

# Save the token_to_index dictionary
with open('token_to_index.pkl', 'wb') as f:
    pickle.dump(token_to_index, f)


# Second Model

In [10]:
import torch.optim as optim
from tqdm import tqdm

optimizer = optim.Adam(model_setting_2.parameters())
criterion = nn.CrossEntropyLoss()


num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = 0
    
    for inputs, targets in tqdm(dataloader):
        inputs, targets = inputs.to("cuda"), targets.to("cuda")
        optimizer.zero_grad()
        
        forward_pred, backward_pred, _ = model_setting_2(inputs) ## adjust model
        
        loss_f = criterion(forward_pred.transpose(1, 2), targets)
        loss_b = criterion(backward_pred.transpose(1, 2), targets)
        loss = loss_f + loss_b
        
        loss.backward()
        
        optimizer.step()
        
        total_loss += loss.item()
        num_batches += 1
    
    average_loss = total_loss / num_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}')


100%|██████████| 7500/7500 [11:44<00:00, 10.65it/s]


Epoch [1/10], Average Loss: 10.7086


100%|██████████| 7500/7500 [11:48<00:00, 10.59it/s]


Epoch [2/10], Average Loss: 8.5907


100%|██████████| 7500/7500 [11:40<00:00, 10.71it/s]


Epoch [3/10], Average Loss: 7.3084


100%|██████████| 7500/7500 [11:44<00:00, 10.64it/s]


Epoch [4/10], Average Loss: 6.5910


100%|██████████| 7500/7500 [11:46<00:00, 10.62it/s]


Epoch [5/10], Average Loss: 6.1349


100%|██████████| 7500/7500 [11:44<00:00, 10.65it/s]


Epoch [6/10], Average Loss: 5.7191


100%|██████████| 7500/7500 [11:41<00:00, 10.69it/s]


Epoch [7/10], Average Loss: 5.4156


100%|██████████| 7500/7500 [11:42<00:00, 10.68it/s]


Epoch [8/10], Average Loss: 5.2099


100%|██████████| 7500/7500 [11:48<00:00, 10.58it/s]


Epoch [9/10], Average Loss: 5.0348


100%|██████████| 7500/7500 [11:45<00:00, 10.64it/s]

Epoch [10/10], Average Loss: 4.9027





In [11]:
import torch
import pickle

# Save the model's state dictionary
torch.save(model_setting_2.state_dict(), 'elmo_model_2_state_dict.pth')

# Third Model

In [12]:
import torch.optim as optim
from tqdm import tqdm

optimizer = optim.Adam(model_setting_3.parameters())
criterion = nn.CrossEntropyLoss()


num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = 0
    
    for inputs, targets in tqdm(dataloader):
        inputs, targets = inputs.to("cuda"), targets.to("cuda")
        optimizer.zero_grad()
        
        forward_pred, backward_pred, _ = model_setting_3(inputs) ## adjust model
        
        loss_f = criterion(forward_pred.transpose(1, 2), targets)
        loss_b = criterion(backward_pred.transpose(1, 2), targets)
        loss = loss_f + loss_b
        
        loss.backward()
        
        optimizer.step()
        
        total_loss += loss.item()
        num_batches += 1
    
    average_loss = total_loss / num_batches
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}')


100%|██████████| 7500/7500 [11:49<00:00, 10.57it/s]


Epoch [1/10], Average Loss: 10.6031


100%|██████████| 7500/7500 [11:50<00:00, 10.56it/s]


Epoch [2/10], Average Loss: 8.4765


100%|██████████| 7500/7500 [11:48<00:00, 10.58it/s]


Epoch [3/10], Average Loss: 7.5384


100%|██████████| 7500/7500 [11:48<00:00, 10.58it/s]


Epoch [4/10], Average Loss: 6.7869


100%|██████████| 7500/7500 [11:47<00:00, 10.60it/s]


Epoch [5/10], Average Loss: 6.2741


100%|██████████| 7500/7500 [11:41<00:00, 10.69it/s]


Epoch [6/10], Average Loss: 5.9229


100%|██████████| 7500/7500 [11:40<00:00, 10.71it/s]


Epoch [7/10], Average Loss: 5.5972


100%|██████████| 7500/7500 [11:41<00:00, 10.69it/s]


Epoch [8/10], Average Loss: 5.3371


100%|██████████| 7500/7500 [11:37<00:00, 10.75it/s]


Epoch [9/10], Average Loss: 5.1263


100%|██████████| 7500/7500 [11:31<00:00, 10.84it/s]

Epoch [10/10], Average Loss: 4.9684





In [13]:
import torch
import pickle

# Save the model's state dictionary
torch.save(model_setting_3.state_dict(), 'elmo_model_3_state_dict.pth')