In [53]:
# initialization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gensim.downloader
import re
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

import torch
import torch.nn as nn

# tokenizer models required by nltk.word_tokenize
nltk.download('punkt')

# Seed every random number generator the notebook relies on: NumPy drives
# pandas sampling, PyTorch drives the model's weight initialisation.
np.random.seed(42)
torch.manual_seed(42)

[nltk_data] Downloading package punkt to /Users/fionchai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Import data

In [37]:
%%script false --no-raise-error # comment this if you dont have teh dev_set

# import dataset
train_set = pd.read_csv('train.csv')

test_set = pd.read_csv('test.csv')

# Hold out 500 training rows as the development set. The explicit
# random_state keeps the split identical no matter which cells ran before
# this one (the global NumPy seed alone only helps on a fresh, in-order run).
dev_set = train_set.sample(n=500, replace=False, random_state=42)

# remove dev set from train set so the two splits never overlap
train_set = train_set.drop(dev_set.index)

# check the resulting split sizes
print(train_set.shape, dev_set.shape, test_set.shape)

# persist both splits so later runs can skip this cell and reload them
dev_set.to_csv("dev_set.csv", index=False)
train_set.to_csv("train_set_modified.csv", index=False)

(4952, 3) (500, 3) (500, 3)


In [68]:
# Reload the three splits from disk: the reduced training set and the held-out
# development set written by the split cell, plus the untouched test set.
train_set, dev_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ('train_set_modified.csv', 'dev_set.csv', 'test.csv')
)

## Preprocess data (averaging over word representations)

In [26]:
# Pretrained word2vec embeddings (Google News, 300 dimensions).
# gensim downloads the model on first use, so this cell can take a while.
embedding_name = 'word2vec-google-news-300'
w2v = gensim.downloader.load(embedding_name)

In [69]:
# Keep the coarse classes 0, 1, 2 and 3 as they are and merge everything
# above 4 into class 4 ("OTHERS"). The same in-place rewrite is applied to
# the train, dev and test frames.
for frame in (train_set, dev_set, test_set):
    above_others = frame['label-coarse'] > 4
    frame.loc[above_others, 'label-coarse'] = 4


In [70]:
def token(sentence):
    """Split a raw sentence into lower-case word tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased, and NLTK's ``word_tokenize`` produces the tokens.

    Args:
        sentence: the raw text to tokenise (a string).

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    # keep only english letters
    letters_only = re.sub("[^a-zA-Z]"," ",sentence)

    # TODO: decide whether stop words should be removed here.
    return word_tokenize(letters_only.lower())

In [71]:
# Tokenise every training question and keep the token lists in a new column.
cleaned_questions = train_set['text'].apply(token)
train_set['cleaned_text'] = cleaned_questions

train_set.head()

Unnamed: 0,label-coarse,label-fine,text,cleaned_text
0,0,0,How did serfdom develop in and then leave Russ...,"[how, did, serfdom, develop, in, and, then, le..."
1,1,1,What films featured the character Popeye Doyle ?,"[what, films, featured, the, character, popeye..."
2,0,0,How can I find a list of celebrities ' real na...,"[how, can, i, find, a, list, of, celebrities, ..."
3,1,2,What fowl grabs the spotlight after the Chines...,"[what, fowl, grabs, the, spotlight, after, the..."
4,2,3,What is the full form of .com ?,"[what, is, the, full, form, of, com]"


In [72]:
# number of tokens in the longest training question (not used further in this cell)
max_length = train_set['cleaned_text'].str.len().max()

# get the embedding shape of the model
embed_shape = len(w2v['test'])
average_word_embeddings = []

# Iterate over the token lists directly; the other columns are not needed,
# so building a full row object with iterrows() is unnecessary work.
for sentence in train_set['cleaned_text']:

    # look up every token; words missing from the model contribute a zero vector
    word_embeddings = [
        w2v[word] if word in w2v.key_to_index else np.zeros(shape=(embed_shape))
        for word in sentence
    ]

    # perform averaging of word embeddings
    if word_embeddings:
        awe = np.mean(word_embeddings, axis = 0)
    else:
        # A sentence with no tokens has nothing to average: np.mean([]) would
        # return a scalar NaN instead of a vector, so use the zero vector.
        awe = np.zeros(shape=(embed_shape))
    average_word_embeddings.append(awe)

train_set['vector'] = average_word_embeddings

train_set.head()


Unnamed: 0,label-coarse,label-fine,text,cleaned_text,vector
0,0,0,How did serfdom develop in and then leave Russ...,"[how, did, serfdom, develop, in, and, then, le...","[0.058892144097222224, 0.010908338758680556, 0..."
1,1,1,What films featured the character Popeye Doyle ?,"[what, films, featured, the, character, popeye...","[0.024762834821428572, 0.06901332310267858, -0..."
2,0,0,How can I find a list of celebrities ' real na...,"[how, can, i, find, a, list, of, celebrities, ...","[0.02879638671875, 0.0680908203125, 0.01679687..."
3,1,2,What fowl grabs the spotlight after the Chines...,"[what, fowl, grabs, the, spotlight, after, the...","[0.0709991455078125, 0.04058837890625, 0.00234..."
4,2,3,What is the full form of .com ?,"[what, is, the, full, form, of, com]","[0.016701834542410716, 0.00023978097098214285,..."


### Prepare data for training

In [74]:
# split into X and y
X_train = train_set['vector']
y_train = train_set['label-coarse']

# The development features and labels must come from dev_set; taking them from
# train_set would evaluate the model on the data it was trained on.
# dev_set has not been embedded yet, so build its averaged word vectors here
# with the same recipe used for the training set.
if 'vector' not in dev_set.columns:
    dev_vectors = []
    for dev_tokens in dev_set['text'].apply(token):
        # words missing from the model contribute a zero vector
        dev_embeddings = [
            w2v[word] if word in w2v.key_to_index else np.zeros(shape=(embed_shape))
            for word in dev_tokens
        ]
        if dev_embeddings:
            dev_vectors.append(np.mean(dev_embeddings, axis = 0))
        else:
            # nothing to average for an empty token list; use the zero vector
            dev_vectors.append(np.zeros(shape=(embed_shape)))
    dev_set['vector'] = dev_vectors

X_dev = dev_set['vector']
y_dev = dev_set['label-coarse']

# SCALING?

# MINI-BATCH

##  LSTM model

In [75]:
# fed into the softmax classifier to predict the final label
class LSTMModel(nn.Module):
    """LSTM encoder followed by a linear classification layer.

    ``forward`` returns raw, unnormalised class scores (logits).
    ``nn.CrossEntropyLoss`` applies log-softmax internally, so no softmax is
    applied inside the model; use ``torch.softmax(logits, dim=1)`` on the
    output when probabilities are needed.

    Args:
        input_d: size of each input feature vector.
        hidden_d: number of hidden units in every LSTM layer.
        layer_d: number of stacked LSTM layers.
        output_d: number of output classes.
    """

    def __init__(self, input_d, hidden_d, layer_d, output_d):
        super(LSTMModel, self).__init__()

        self.hidden_dim = hidden_d
        self.layer_dim = layer_d

        # LSTM model; batch_first=True means inputs are (batch, seq_len, input_d)
        self.lstm = nn.LSTM(input_d, hidden_d, layer_d, batch_first=True)

        # Projects the last hidden state onto one score per class.
        self.fc = nn.Linear(hidden_d, output_d)

    def forward(self, x):
        """Return class logits of shape (batch, output_d) for a batch of sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_d).
        """
        # When no initial state is passed, nn.LSTM starts from zero hidden and
        # cell states created on the same device and dtype as its input.
        out, _ = self.lstm(x)

        # classify from the output at the final time step
        return self.fc(out[:, -1, :])
    
input_dim = embed_shape
hidden_dim = 128
# 'label-coarse' is capped at 4 in the preprocessing cell, so the targets are
# the five classes 0-4 (assuming labels are non-negative) and the model needs
# one output per class.
output_dim = 5
layer_dim = 1

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
# pass the named hyperparameter so changing learning_rate actually takes effect
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# REFERENCE training loop (one full-batch update per epoch).
# NOTE: X_train_tensors_final and y_train_tensors are not defined in any cell
# above; they still have to be built from X_train / y_train before this runs.
num_epochs = 100
log_every = 10  # with 100 epochs, "epoch % 100 == 0" would only ever report epoch 0

model.train()
for epoch in range(num_epochs):
  optimizer.zero_grad() # clear gradients left over from the previous step

  # forward pass; calling the module (not .forward) also runs registered hooks
  outputs = model(X_train_tensors_final)

  # obtain the loss
  loss = criterion(outputs, y_train_tensors)

  loss.backward() # backpropagation: compute gradients of the loss

  optimizer.step() # update the parameters from those gradients
  if epoch % log_every == 0:
    print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))