In [None]:
# initialization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gensim.downloader
import re
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used further down: 'punkt' for word_tokenize and
# 'stopwords' for stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

# Seed every random number generator the notebook draws from, so that
# Restart & Run All reproduces the same dev split, batch order and weights.
np.random.seed(42)
torch.manual_seed(42)  # torch has its own RNG (weight init, DataLoader shuffling)

## Import data

In [None]:
%%script false --no-raise-error # comment out this line if you do not have the dev_set files yet

# Read the original train/test files.
train_set, test_set = pd.read_csv('train.csv'), pd.read_csv('test.csv')

# Hold out 500 distinct training rows as the development set, then take
# those rows out of the training frame.
dev_set = train_set.sample(n=500, replace=False)
train_set = train_set.drop(index=dev_set.index)

# Sanity check: shapes of the three splits.
print(train_set.shape, dev_set.shape, test_set.shape)

# Persist the reduced training set and the new dev set for the next cell.
train_set.to_csv("train_set_modified.csv", index=False)
dev_set.to_csv("dev_set.csv", index=False)

In [None]:
# Load the three splits; the first two files are the ones written by the
# dev-split cell above.
train_set, dev_set, test_set = (
    pd.read_csv(path)
    for path in ('train_set_modified.csv', 'dev_set.csv', 'test.csv')
)

## Preprocess data (averaging over word representations)

TODO: Try max pooling

TODO: Take representation of last word in LSTM

TODO: Use attention and perform weighted average?



In [None]:
# word2vec

# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API and keep them in `w2v` for the embedding lookups below.
# NOTE(review): presumably this downloads the model on first use and reads a
# local cache afterwards, so the first run can be slow — confirm.
w2v = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Reduce the coarse labels to five classes: 0, 1, 2 and 3 are kept as they
# are, and every label above 4 is rewritten to 4, so class 4 acts as OTHERS
# (the original labels 4 and 5).
for frame in (train_set, dev_set, test_set):
    is_above_four = frame['label-coarse'] > 4
    frame.loc[is_above_four, 'label-coarse'] = 4


In [None]:
# Text cleaning: turn a raw sentence into a list of lower-cased tokens.

# English stop words shared by every call to token(); filled on first use so
# the word list is not reloaded for each row of the dataset.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a set, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def token(sentence):
    """Clean a sentence and split it into tokens.

    Steps: replace every non-ASCII-letter character with a space, lower-case
    the words, drop English stop words, then run NLTK's word_tokenize on the
    re-joined text.

    Args:
        sentence: raw text. Anything that is not a str (e.g. a NaN read from
            an empty CSV cell) is treated as an empty sentence.

    Returns:
        list[str]: the remaining tokens; may be empty.
    """
    if not isinstance(sentence, str):
        return []

    # keep only English letters
    letters_only = re.sub("[^a-zA-Z]", " ", sentence)

    # lower-case, then remove stop words
    stop_words = _english_stop_words()
    lowered = (word.lower() for word in letters_only.split())
    filtered_sentence = ' '.join(word for word in lowered if word not in stop_words)

    return word_tokenize(filtered_sentence)

In [None]:
# Run token() on every entry of the 'text' column and store the resulting
# token list in a new 'cleaned_text' column.
train_set['cleaned_text'] = train_set['text'].apply(token)

# Preview the first rows, including the new column.
train_set.head()

In [None]:
# length (in tokens) of the longest cleaned sentence; not needed by the
# averaging below
max_length = train_set['cleaned_text'].str.len().max()

# get the embedding shape of the model
embed_shape = len(w2v['test'])
average_word_embeddings = []

for sentence in train_set['cleaned_text']:

    # word embedding of each word; words missing from the model contribute
    # an all-zero vector
    word_embeddings = [
        w2v[word] if word in w2v.key_to_index else np.zeros(shape=(embed_shape))
        for word in sentence
    ]

    if word_embeddings:
        # perform averaging of word embeddings
        awe = np.mean(word_embeddings, axis=0)
    else:
        # No token survived cleaning: np.mean([]) would return a scalar NaN
        # (with a RuntimeWarning) instead of a vector, so use zeros instead.
        awe = np.zeros(shape=(embed_shape))

    average_word_embeddings.append(awe)

train_set['vector'] = average_word_embeddings

train_set.head()


In [None]:
def aggregate_representations(dataset):
    """Add tokenised text and averaged word2vec vectors to a dataset.

    Writes two columns into `dataset` (the frame is modified in place and
    also returned):
      * 'cleaned_text': the token list produced by token() for each 'text'.
      * 'vector': the mean of the word2vec embeddings of those tokens, one
        1-D array of length len(w2v['test']) per row.

    Tokens missing from the word2vec vocabulary count as all-zero vectors.
    A row whose token list is empty gets an all-zero vector, because
    np.mean([]) would otherwise produce a scalar NaN instead of a vector.

    Args:
        dataset: DataFrame with a 'text' column.

    Returns:
        The same DataFrame, with 'cleaned_text' and 'vector' columns set.
    """
    dataset['cleaned_text'] = dataset['text'].apply(token)

    # get the embedding shape of the model
    embed_shape = len(w2v['test'])
    average_word_embeddings = []

    for sentence in dataset['cleaned_text']:
        # embedding of each word, zeros for out-of-vocabulary words
        word_embeddings = [
            w2v[word] if word in w2v.key_to_index else np.zeros(shape=(embed_shape))
            for word in sentence
        ]

        if word_embeddings:
            # perform averaging of word embeddings
            awe = np.mean(word_embeddings, axis=0)
        else:
            # nothing left to average: fall back to a zero vector
            awe = np.zeros(shape=(embed_shape))

        average_word_embeddings.append(awe)

    dataset['vector'] = average_word_embeddings
    return dataset

# Tokenise and embed both splits. train_set already received 'cleaned_text'
# and 'vector' columns in the cells above, so the second call recomputes them
# with the same procedure.
dev_set = aggregate_representations(dev_set)
train_set = aggregate_representations(train_set)

### Prepare data for training

In [None]:
# split into X and y
X_train = train_set['vector']
y_train = train_set['label-coarse']

X_dev = dev_set['vector']
y_dev = dev_set['label-coarse']

# SCALING?

# mini-batch strategy
batch_size = 32


def _build_dataset(vectors, labels):
    """Pack a column of sentence vectors and a column of labels into a TensorDataset.

    A DataFrame cannot be handed to DataLoader directly (integer indexing on
    a DataFrame looks up a column name), so the data is converted to tensors.
    """
    rows = [
        np.asarray(vec, dtype=np.float32) if np.ndim(vec) == 1
        # np.mean([]) on a sentence with no tokens yields a scalar NaN rather
        # than a vector; represent such a sentence as an all-zero vector.
        else np.zeros(embed_shape, dtype=np.float32)
        for vec in vectors
    ]
    features = torch.from_numpy(np.stack(rows))                  # (n, embed_shape)
    targets = torch.tensor(labels.to_numpy(), dtype=torch.long)  # class indices
    return torch.utils.data.TensorDataset(features, targets)


def collate_fn(data):
    """Merge a list of (vector, label) samples into an (x_batch, y_batch) pair.

    Args:
        data: list of (feature_tensor, label_tensor) tuples, one per sample.

    Returns:
        (x_batch, y_batch): tensors of shape (batch, embed_shape) and (batch,).
    """
    features, targets = zip(*data)
    return torch.stack(features), torch.stack(targets)


train_loader = DataLoader(_build_dataset(X_train, y_train), batch_size=batch_size,
                          shuffle=True, collate_fn=collate_fn)
# evaluation does not depend on batch order, so the dev set is not shuffled
dev_loader = DataLoader(_build_dataset(X_dev, y_dev), batch_size=batch_size,
                        shuffle=False, collate_fn=collate_fn)

##  LSTM model

In [None]:
# LSTM encoder followed by a linear classification head
class LSTMModel(nn.Module):
    """LSTM classifier that returns one row of class logits per example.

    Args:
        input_d: size of each input vector (the embedding dimension).
        hidden_d: size of the LSTM hidden state.
        layer_d: number of stacked LSTM layers.
        output_d: number of classes.

    The head returns raw logits. nn.CrossEntropyLoss applies log-softmax
    itself, so a Softmax layer here would be applied twice during training;
    call torch.softmax(logits, dim=1) when probabilities are needed
    (argmax predictions are the same either way).
    """

    def __init__(self, input_d, hidden_d, layer_d, output_d):
        super(LSTMModel, self).__init__()

        self.hidden_dim = hidden_d
        self.layer_dim = layer_d

        # LSTM model
        self.lstm = nn.LSTM(input_d, hidden_d, layer_d, batch_first=True)

        # kept as nn.Sequential so parameter names stay 'head.0.*'
        self.head = nn.Sequential(
            nn.Linear(hidden_d, output_d),
        )

    def forward(self, x):
        """Compute class logits.

        Args:
            x: tensor of shape (batch, input_d) — treated as sequences of
               length 1 — or (batch, seq_len, input_d). Any batch size works.

        Returns:
            Tensor of shape (batch, output_d) holding unnormalised logits.
        """
        x = x.float()
        if x.dim() == 2:
            x = x.unsqueeze(1)  # (batch, input_d) -> (batch, 1, input_d)

        # With no initial state given, nn.LSTM starts from zero hidden and
        # cell states on the same device as the input.
        out, _ = self.lstm(x)

        # classify from the output at the last time step
        return self.head(out[:, -1, :])
    
# Model dimensions: one sentence vector in, five coarse classes out.
input_dim = embed_shape
hidden_dim = 128
output_dim = 5
layer_dim = 1

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
# use the named constant so the learning rate is defined in one place only
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Training loop with early stopping on dev-set accuracy.
num_epochs = 100
best_accuracy = 0.0
best_model = None     # snapshot of the weights that gave best_accuracy
max_patience = 5      # consecutive non-improving epochs tolerated
current_patience = 0
log_every = 100       # print the losses on epochs that are multiples of this


def calculate_accuracy(y_true, y_pred):
    """Return the fraction of entries of y_pred equal to y_true (both 1-D tensors)."""
    y_true = y_true.detach().cpu().numpy()
    y_pred = y_pred.detach().cpu().numpy()
    return accuracy_score(y_true, y_pred)


for epoch in range(num_epochs):
    # ---- train on the training set ----
    model.train()
    total_training_loss = 0.0
    total_dev_loss = 0.0

    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()              # clear gradients from the previous step
        outputs = model(x_batch)           # forward pass
        loss = criterion(outputs, y_batch)
        loss.backward()                    # compute gradients of the loss
        optimizer.step()                   # update the weights

        total_training_loss += loss.item()

    if epoch % log_every == 0:
        average_loss = total_training_loss / len(train_loader)
        print("Epoch: %d, training set loss: %1.5f" % (epoch, average_loss))

    # ---- evaluate on the dev set (no gradient computation, no weight updates) ----
    model.eval()
    all_true_labels = []
    all_predicted_labels = []
    with torch.no_grad():
        for x_batch, y_batch in dev_loader:
            y_pred = model(x_batch)
            total_dev_loss += criterion(y_pred, y_batch).item()

            # collect every batch so accuracy covers the whole dev set
            all_true_labels.append(y_batch)
            all_predicted_labels.append(torch.argmax(y_pred, dim=1))

    if epoch % log_every == 0:
        # reported once per epoch, after all dev batches have been summed
        average_dev_loss = total_dev_loss / len(dev_loader)
        print("Epoch: %d, dev set loss: %1.5f" % (epoch, average_dev_loss))

    accuracy = calculate_accuracy(torch.cat(all_true_labels), torch.cat(all_predicted_labels))
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # ---- early stopping ----
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        # state_dict() returns references to the live parameters; clone them
        # so later optimizer steps do not overwrite the snapshot
        best_model = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}
        current_patience = 0
    else:
        current_patience += 1
        if current_patience >= max_patience:
            # epoch is zero-based, so epoch + 1 epochs have been run
            print(f"Early stopping after {epoch + 1} epochs.")
            break
