In [1]:
# initialization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gensim.downloader
import re
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

from sklearn.metrics import accuracy_score

nltk.download('punkt')
nltk.download('stopwords')

np.random.seed(42)

[nltk_data] Downloading package punkt to /Users/fionchai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fionchai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import data

In [2]:
%%script false --no-raise-error # comment this if you dont have the dev_set

# import dataset
train_set = pd.read_csv('train.csv')

test_set = pd.read_csv('test.csv')

# from train_set sample development set
dev_set = train_set.sample(n=500, replace=False)

# remove dev set from train set
train_set = train_set.drop(dev_set.index)

# check
print(train_set.shape, dev_set.shape, test_set.shape)

# save to dataframe
dev_set.to_csv("dev_set.csv", index=False)
train_set.to_csv("train_set_modified.csv", index=False)

In [3]:
train_set = pd.read_csv('train_set_modified.csv')
dev_set = pd.read_csv('dev_set.csv')
test_set = pd.read_csv('test.csv')

## Preprocess data (averaging over word representations)

TODO: Try max pooling

TODO: Take representation of last word in LSTM

TODO: Use attention and perform weighted average?



In [4]:
# word2vec

# download the word2vec-google-news-300
w2v = gensim.downloader.load('word2vec-google-news-300')

In [5]:
# select four classes: 0, 1, 2, 3
# 4 and 5 will be OTHERS (4)

# for train_set
train_set.loc[train_set['label-coarse'] > 4, 'label-coarse'] = 4

# for dev_set
dev_set.loc[dev_set['label-coarse'] > 4, 'label-coarse'] = 4

# for test_set
test_set.loc[test_set['label-coarse'] > 4, 'label-coarse'] = 4


In [6]:
# neural network transforming the input for each word to its final vector representation
def token(sentence):

    # keep only english words
    sentence = re.sub("[^a-zA-Z]"," ",sentence)

    # converting to lower case and splitting

    # stop word removal
    words = sentence.split()
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.lower() not in stop_words]
    filtered_sentence = ' '.join(filtered_words)

    token = word_tokenize(filtered_sentence)
    return token

In [7]:
train_set['cleaned_text'] = train_set['text'].apply(token)

train_set.head()

Unnamed: 0,label-coarse,label-fine,text,cleaned_text
0,0,0,How did serfdom develop in and then leave Russ...,"[serfdom, develop, leave, russia]"
1,1,1,What films featured the character Popeye Doyle ?,"[films, featured, character, popeye, doyle]"
2,0,0,How can I find a list of celebrities ' real na...,"[find, list, celebrities, real, names]"
3,1,2,What fowl grabs the spotlight after the Chines...,"[fowl, grabs, spotlight, chinese, year, monkey]"
4,2,3,What is the full form of .com ?,"[full, form, com]"


In [8]:
max_length = train_set['cleaned_text'].str.len().max()

# get the embedding shape of the model
embed_shape = len(w2v['test'])
average_word_embeddings = []

for index, row in train_set.iterrows():

    sentence = row['cleaned_text']

    # get word embedding of each word
    word_embeddings = []

    for word in sentence:
        # check if the word is present in the model
        if word in w2v.key_to_index:
            word_embeddings.append(w2v[word])
        else:
             word_embeddings.append(np.zeros(shape=(embed_shape)))

    # perform averaging of word embeddings
    awe = np.mean(word_embeddings, axis = 0)
    average_word_embeddings.append(awe)

train_set['vector'] = average_word_embeddings

train_set.head()


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,label-coarse,label-fine,text,cleaned_text,vector
0,0,0,How did serfdom develop in and then leave Russ...,"[serfdom, develop, leave, russia]","[-0.013671875, -0.05543518, 0.05633545, 0.2869..."
1,1,1,What films featured the character Popeye Doyle ?,"[films, featured, character, popeye, doyle]","[-0.00927734375, 0.07685546875, -0.05764770507..."
2,0,0,How can I find a list of celebrities ' real na...,"[find, list, celebrities, real, names]","[0.029272461, 0.13002929, -0.021777343, 0.1398..."
3,1,2,What fowl grabs the spotlight after the Chines...,"[fowl, grabs, spotlight, chinese, year, monkey]","[0.06305949, 0.03805542, -0.08516184, 0.015625..."
4,2,3,What is the full form of .com ?,"[full, form, com]","[-0.036621094, -0.007965088, -0.08154297, 0.04..."


In [9]:
def aggregate_representations(dataset):
  dataset['cleaned_text'] = dataset['text'].apply(token)
  max_length = dataset['cleaned_text'].str.len().max()

  # get the embedding shape of the model
  embed_shape = len(w2v['test'])
  average_word_embeddings = []

  for index, row in dataset.iterrows():

      sentence = row['cleaned_text']

      # get word embedding of each word
      word_embeddings = []

      for word in sentence:
          # check if the word is present in the model
          if word in w2v.key_to_index:
              word_embeddings.append(w2v[word])
          else:
              word_embeddings.append(np.zeros(shape=(embed_shape)))

      # perform averaging of word embeddings
      awe = np.mean(word_embeddings, axis = 0)

      average_word_embeddings.append(awe)

  dataset['vector'] = average_word_embeddings

  dataset = pd.concat([train_set, train_set['vector'].apply(lambda x: pd.Series(x))], axis=1)

  return dataset

dev_set = aggregate_representations(dev_set)
train_set = aggregate_representations(train_set)

train_set

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,label-coarse,label-fine,text,cleaned_text,vector,0,1,2,3,4,...,290,291,292,293,294,295,296,297,298,299
0,0,0,How did serfdom develop in and then leave Russ...,"[serfdom, develop, leave, russia]","[-0.013671875, -0.05543518, 0.05633545, 0.2869...",-0.013672,-0.055435,0.056335,0.286987,-0.012817,...,-0.249939,0.277100,-0.066833,0.108032,-0.099915,-0.030396,-0.114136,0.048099,0.092072,0.074524
1,1,1,What films featured the character Popeye Doyle ?,"[films, featured, character, popeye, doyle]","[-0.00927734375, 0.07685546875, -0.05764770507...",-0.009277,0.076855,-0.057648,0.125146,0.076880,...,-0.095728,0.041797,-0.101270,-0.003284,0.016504,-0.080591,0.035718,-0.049768,-0.037793,0.058252
2,0,0,How can I find a list of celebrities ' real na...,"[find, list, celebrities, real, names]","[0.029272461, 0.13002929, -0.021777343, 0.1398...",0.029272,0.130029,-0.021777,0.139868,-0.089648,...,0.121045,0.068848,-0.173242,-0.005151,0.018604,-0.103085,0.148145,0.016211,-0.042261,-0.036743
3,1,2,What fowl grabs the spotlight after the Chines...,"[fowl, grabs, spotlight, chinese, year, monkey]","[0.06305949, 0.03805542, -0.08516184, 0.015625...",0.063059,0.038055,-0.085162,0.015625,-0.072367,...,-0.067118,0.119939,-0.064402,0.045369,0.052500,-0.031352,0.024129,-0.016764,0.100647,-0.012685
4,2,3,What is the full form of .com ?,"[full, form, com]","[-0.036621094, -0.007965088, -0.08154297, 0.04...",-0.036621,-0.007965,-0.081543,0.040934,-0.125732,...,-0.018311,0.028564,-0.029806,-0.025960,-0.069010,-0.023275,0.061747,0.042867,-0.121297,0.025879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4947,1,14,What 's the shape of a camel 's spine ?,"[shape, camel, spine]","[-0.0081380205, 0.15018718, -0.16389973, -0.10...",-0.008138,0.150187,-0.163900,-0.106496,-0.166667,...,-0.286296,0.071208,-0.016764,-0.040283,-0.005208,0.173991,-0.025716,0.273600,-0.012207,-0.010235
4948,1,46,What type of currency is used in China ?,"[type, currency, used, china]","[-0.08856201, 0.027709961, -0.017578125, 0.199...",-0.088562,0.027710,-0.017578,0.199829,-0.134949,...,-0.118896,0.063400,-0.041016,0.050514,0.052826,-0.041382,0.116051,0.094116,0.016174,0.000977
4949,4,41,What is the temperature today ?,"[temperature, today]","[-0.1574707, -0.010375977, -0.0075683594, -0.0...",-0.157471,-0.010376,-0.007568,-0.028320,-0.036072,...,-0.046936,0.046387,-0.136719,0.133674,0.122314,-0.099609,-0.073608,0.133301,0.076477,-0.032227
4950,4,41,What is the temperature for cooking ?,"[temperature, cooking]","[-0.19091797, 0.1459961, -0.004272461, 0.10058...",-0.190918,0.145996,-0.004272,0.100586,0.017090,...,-0.163574,0.000977,-0.179199,0.211914,0.146484,-0.132812,-0.026001,0.057129,0.082275,0.162109


### Prepare data for training

In [10]:
def encode_y_label(df, num_classes): # one hot encoding
  encoded_labels = df['label-coarse'].apply(lambda label: list(np.eye(num_classes)[label]))
  df['encoded-coarse-label'] = encoded_labels
  return df
  # print(df['encoded_coarse_label'])

train_set = encode_y_label(train_set, 5)
dev_set = encode_y_label(dev_set, 5)

In [11]:
# split into X and y
X_train = train_set.drop(columns=['label-coarse', 'label-fine', 'text', 'cleaned_text', 'vector', 'encoded-coarse-label']).to_numpy()
y_train = train_set['encoded-coarse-label']

X_dev = dev_set.drop(columns=['label-coarse', 'label-fine', 'text', 'cleaned_text', 'vector', 'encoded-coarse-label']).to_numpy()
y_dev = dev_set['encoded-coarse-label']

# without one hot
# X_train = train_set.drop(columns=['label-coarse', 'label-fine', 'text', 'cleaned_text', 'vector']).to_numpy()
# y_train = train_set['label-coarse']

# X_dev = dev_set.drop(columns=['label-coarse', 'label-fine', 'text', 'cleaned_text', 'vector']).to_numpy()
# y_dev = dev_set['label-coarse']

# SCALING?

# mini-batch strategy
batch_size = 32

# def collate_fn(data):
#     x, y = data
#     zipped = zip(x, y)
#     return list(zipped)

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X =torch.tensor(X, dtype=torch.float64)
        self.y =torch.tensor(y)

    def __len__(self):
        return len(self.y)

    def __getitem__(self,idx):
        return self.X[idx], self.y[idx]

train_data = CustomDataset(X_train, y_train)
dev_data = CustomDataset(X_dev, y_dev)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=True)

##  LSTM model

In [12]:
# fed into the softmax classifier to predict the final label
class LSTMModel(nn.Module):
    def __init__(self, input_d, hidden_d, layer_d, output_d):
        super(LSTMModel, self).__init__()

        self.hidden_dim = hidden_d
        self.layer_dim = layer_d

        # LSTM model
        # self.lstm = nn.LSTM(input_d, hidden_d, layer_d, batch_first=True)

        self.lstm = nn.LSTM(input_d, hidden_d, layer_d)

        self.head = nn.Sequential(
            nn.Linear(hidden_d, output_d),
            nn.Softmax(dim = 1) # softmax activation for label prediction
        )
        
    def forward(self, x, require_grad = True):
        '''
        lstm(input,(h_0, c_0)): shape for each param
        input = (batch_size, seq_len, features) (with batch_first = True)
        h_0 = (D * num_layers, batch_size, project_size or hidden size)
        c_0 = (D * num_layers, batch_size, project_size or hidden size)
        D = 2 if is bidirectional
        '''

        # 3D TENSOR
        # x = x.reshape(x.shape[0], 1, 300).float() # convert to 3D tensor

        # if require_grad:
        #   h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        #   c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        # else:
        #   h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, requires_grad=False)
        #   c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, requires_grad=False)

        x = x.float()

        # 2D TENSOR
        if require_grad:
          h0 = torch.zeros(self.layer_dim, self.hidden_dim).requires_grad_()
          c0 = torch.zeros(self.layer_dim, self.hidden_dim).requires_grad_()
        else:
          h0 = torch.zeros(self.layer_dim, self.hidden_dim, requires_grad=False)
          c0 = torch.zeros(self.layer_dim, self.hidden_dim, requires_grad=False)


        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        
        out = self.head(out)
        
        return out

input_dim = embed_shape
hidden_dim = 128
output_dim = 5
layer_dim = 300

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [15]:
# REFERENCE
num_epochs = 100
best_accuracy = 0.0
max_patience = 5
current_patience = 0

for epoch in range(num_epochs):
  model.train()
  total_training_loss = 0.0
  total_dev_loss = 0.0

  for x_batch, y_batch in train_loader:
    outputs = model.forward(x_batch) # forward pass
    optimizer.zero_grad() # calculate the gradient, manually setting to 0

    # obtain the loss function
    print(x_batch)
    print(outputs)
    loss = criterion(outputs, y_batch)

    loss.backward() #calculates the loss of the loss function

    optimizer.step() #improve from loss, i.e backprop

    total_training_loss += loss.item()

  average_loss = total_training_loss / len(train_loader)
  print("Epoch: %d, training set loss: %1.5f" % (epoch, average_loss))

  # evaluate on dev set
  model.eval()
  with torch.no_grad():
    all_true_labels = []
    all_predicted_labels = []
    for inputs in dev_loader:
      x_batch, y_batch = inputs

      # forward pass
      y_pred = model.forward(x_batch, require_grad=False)
      loss = criterion(y_pred, y_batch)

      total_dev_loss += loss.item()

      predicted_labels = torch.argmax(y_pred, dim=1)

      all_true_labels.extend(y_batch.tolist())
      all_predicted_labels.extend(predicted_labels.tolist())

    average_dev_loss = total_dev_loss / len(dev_loader)
    print("Epoch: %d, dev set loss: %1.5f" % (epoch, average_dev_loss))

  def calculate_accuracy(y_true, y_pred):
      y_true = y_true.detach().cpu().numpy()
      y_pred = y_pred.detach().cpu().numpy()
      return accuracy_score(y_true, y_pred)

  accuracy = calculate_accuracy(torch.tensor(all_true_labels), torch.tensor(all_predicted_labels))
  print(f"Accuracy: {accuracy * 100:.2f}%")

  if accuracy > best_accuracy:
    best_accuracy = accuracy
    best_model = model.state_dict()
  else:
    current_patience += 1
    if current_patience >= max_patience:
        print(f"Early stopping after {epoch} epochs.")
        break


tensor([[-6.4682e-02,  2.5004e-02, -2.8341e-02,  ..., -5.6062e-02,
         -2.8844e-02,  1.2207e-04],
        [ 1.4253e-01,  1.5518e-01,  1.1841e-02,  ..., -1.9092e-02,
          7.2412e-02,  1.2817e-01],
        [-4.2114e-03,  9.3506e-02, -3.0334e-02,  ...,  1.1401e-01,
          1.1340e-01,  5.4504e-02],
        ...,
        [ 8.0914e-02, -1.5500e-01,  9.5215e-02,  ..., -4.9149e-02,
         -8.4229e-02, -4.4891e-02],
        [ 1.4307e-01,  1.3721e-01, -1.6455e-01,  ..., -2.6855e-02,
         -6.4697e-02, -6.7215e-03],
        [ 2.3483e-01, -6.5094e-02,  5.0659e-03,  ..., -3.6633e-02,
          1.0474e-01, -3.1433e-02]], dtype=torch.float64)
tensor([[nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan

ValueError: Classification metrics can't handle a mix of multilabel-indicator and binary targets