# Comparing NLP Techniques

This project sets out to compare different applications of NLP techniques through the following libraries:

- TensorFlow
- PyTorch

This notebook focuses on the PyTorch implementation.

In [7]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
from torchtext.legacy import data
import gzip

## Preprocessing

In [2]:
# Load the news dataset.
# Source: https://registry.opendata.aws/fast-ai-nlp/

column_names = ["CATEGORY", "TITLE", "CONTENT"]
# Human-readable names for the integer category codes stored in the CSV.
mapping = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}

# Only train.csv is used. The file has no header row, so the column names are
# supplied explicitly; the category codes are swapped for their names in the
# same expression.
news_df = pd.read_csv(
    "data/train.csv",
    names=column_names,
    header=None,
    delimiter=",",
).replace({'CATEGORY': mapping})
news_df.head()

Unnamed: 0,CATEGORY,TITLE,CONTENT
0,Business,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,Business,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [3]:
# Check the class balance: count how many rows carry each CATEGORY label.
news_df["CATEGORY"].value_counts()

Business    30000
Sci/Tech    30000
World       30000
Sports      30000
Name: CATEGORY, dtype: int64

In [4]:
# Encode the target variable: LabelEncoder maps each category name to an
# integer id. `encoder` is kept so ids can be mapped back via encoder.classes_.

encoder = preprocessing.LabelEncoder()
encoded_y = encoder.fit_transform(news_df["CATEGORY"].values)
num_classes = len(encoder.classes_)

# One-hot encode the integer ids: indexing the identity matrix by class id
# yields one float32 row per sample with a single 1 in that class's column.
dummy_y = np.eye(num_classes, dtype="float32")[encoded_y]

In [5]:
# Tokenize the headline titles and turn them into fixed-length id sequences.
MAX_VECTOR_LEN = 40

t = data.Field(
    lower=True,
    tokenize="basic_english",
    fix_length=MAX_VECTOR_LEN,
)

# Tokenize every title, then let the field bring each token list to the
# configured fixed length.
docs = [t.preprocess(title) for title in news_df["TITLE"].values]
padded_docs = t.pad(docs)

# The vocabulary is built from the padded token lists.
t.build_vocab(padded_docs)
print(f"Vocabulary size: {len(t.vocab)}")

# Replace every token by its vocabulary index.
numericalized_docs = [
    [t.vocab.stoi[token] for token in doc]
    for doc in padded_docs
]

print(f"Number of headlines: {len(numericalized_docs)}")
processed_titles = np.array(numericalized_docs)

Vocabulary size: 39580
Number of headlines: 120000


In [8]:
# Build an index {word: vector} holding only the pre-trained embeddings whose
# word appears in the tokenizer vocabulary, so the full file never has to be
# kept in memory.

embeddings_index = dict()
# Decode explicitly as UTF-8: without `encoding`, text mode falls back to the
# platform's locale encoding, which can fail or mis-decode non-ASCII words.
with gzip.open('./data/cc.en.300.vec.gz', "rt", encoding="utf-8") as zipf:
    # Header line: "<number of vectors> <vector dimension>".
    firstline = zipf.readline()
    emb_vocab_size, emb_d = firstline.split()
    emb_vocab_size = int(emb_vocab_size)
    emb_d = int(emb_d)
    for line in zipf:
        # Fields are separated by single spaces. Splitting on " " only (rather
        # than on any whitespace) keeps a word containing an unusual Unicode
        # whitespace character from being broken into several fields.
        values = line.rstrip().split(" ")
        word = values[0]
        # Only load subset of the embeddings recognised by the tokenizer:
        if word in t.vocab.stoi:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
print("Loaded {} of {} word vectors for tokenizer vocabulary length {}".format(
    len(embeddings_index),
    emb_vocab_size,
    len(t.vocab),
))

Loaded 31687 of 2000000 word vectors for tokenizer vocabulary length 39580


In [18]:
# Build the embedding weight matrix: row i holds the pre-trained vector of the
# vocabulary word with index i. Words without a pre-trained vector keep an
# all-zero row.
vocab_size = len(t.vocab)
embedding_matrix = np.zeros((vocab_size, emb_d))

for token, row in t.vocab.stoi.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation. random_state is fixed so the
# same partition is used on every run.
X_train, X_test, y_train, y_test = train_test_split(
    processed_titles,
    dummy_y,
    test_size=0.2,
    random_state=1
)

## PyTorch Implementation

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

In [12]:
# reset the seeds
seed = 1
np.random.seed(seed)
# Weight initialisation, dropout masks and DataLoader shuffling all draw from
# PyTorch's own RNG, so seeding NumPy alone does not make training repeatable.
torch.manual_seed(seed)

In [13]:
# define and initialize the neural network

class Net(nn.Module):
    """1-D convolutional text classifier over sequences of token ids.

    Pipeline: embedding -> Conv1d (128 filters, kernel 3) -> MaxPool1d(5) ->
    flatten -> dropout -> Linear(128) + ReLU -> Linear(num_classes) -> softmax.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: size of each embedding vector.
        num_classes: number of output classes.
        seq_len: number of tokens in every input sequence. It determines the
            input width of the first fully connected layer; the default of 40
            gives the 896 features that were previously hard-coded.

    Raises:
        ValueError: if ``seq_len`` is too short to yield any pooled position.
    """

    def __init__(self, vocab_size=400000, emb_dim=300, num_classes=4, seq_len=40):
        super(Net, self).__init__()
        conv_channels = 128
        conv_kernel = 3
        pool_kernel = 5

        # Length after an unpadded, stride-1 convolution, then after a
        # non-overlapping max pool (which floors the division).
        conv_out_len = seq_len - conv_kernel + 1
        pooled_len = conv_out_len // pool_kernel
        if pooled_len < 1:
            raise ValueError(
                f"seq_len={seq_len} is too short: need at least "
                f"{conv_kernel + pool_kernel - 1} tokens"
            )

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv1 = nn.Conv1d(emb_dim, conv_channels, kernel_size=conv_kernel)
        self.max_pool1d = nn.MaxPool1d(pool_kernel)
        self.flatten1 = nn.Flatten()
        self.dropout1 = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(conv_channels * pooled_len, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class probabilities for a batch of token-id sequences.

        Args:
            x: integer tensor of shape (batch, seq_len) holding token ids.

        Returns:
            Tensor of shape (batch, num_classes); each row sums to 1.
        """
        x = self.embedding(x)
        # Conv1d expects (batch, channels, length), so move the embedding
        # dimension in front of the sequence dimension.
        x = torch.transpose(x, 1, 2)
        x = self.flatten1(self.max_pool1d(self.conv1(x)))
        x = self.dropout1(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=-1)

In [14]:
# define helper function to train our model

def train(train_loader, embedding_matrix, num_classes=4, epochs=12, learning_rate=0.001):
    """Build a ``Net`` with frozen pre-trained embeddings and train it.

    Args:
        train_loader: iterable yielding ``(inputs, targets)`` batches; targets
            are one-hot float rows, as required by the binary cross-entropy loss.
        embedding_matrix: 2-D array of shape (vocab_size, emb_dim) used as the
            fixed (non-trainable) embedding weights.
        num_classes: number of output classes.
        epochs: number of passes over ``train_loader``.
        learning_rate: RMSprop learning rate.

    Returns:
        ``(model, device)``: the trained model and the device it lives on.
    """
    # initialise model
    model = Net(
        vocab_size=embedding_matrix.shape[0],
        emb_dim=embedding_matrix.shape[1],
        num_classes=num_classes,
    )
    # Second argument is requires_grad=False: the embeddings stay frozen.
    model.embedding.weight = torch.nn.parameter.Parameter(
        torch.FloatTensor(embedding_matrix), False
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

    # train
    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        n_batches = 0
        # Batch names are kept distinct from the notebook-level X_train/y_train.
        for batch_inputs, batch_targets in train_loader:
            inputs = batch_inputs.to(device)
            targets = batch_targets.to(device)
            optimizer.zero_grad()
            output = model(inputs)
            loss = F.binary_cross_entropy(output, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_batches += 1
        # An empty loader would otherwise raise ZeroDivisionError here.
        mean_loss = running_loss / n_batches if n_batches else float("nan")
        print(f"epoch: {epoch}, train_loss: {mean_loss:.6f}")  # (Avg over batches)
    return model, device

In [15]:
# define helper function to test our model

def test(model, test_loader, device):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.binary_cross_entropy(output, target, reduction="sum").item()
            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
            target_index = target.max(1, keepdim=True)[1]
            correct += pred.eq(target_index).sum().item()

    test_loss /= len(test_loader.dataset)  # Average loss over dataset samples
    print(f"val_loss: {test_loss:.4f}, val_acc: {correct/len(test_loader.dataset):.4f}") 

In [16]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each input sequence with its label."""

    def __init__(self, data, labels):
        """Keep references to the samples and their labels (indexed in parallel)."""
        self.data = data
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(X, y)`` for ``index``: X cast to int64, y converted to a tensor."""
        features = torch.as_tensor(self.data[index]).long()
        label = torch.as_tensor(self.labels[index])
        return features, label

In [21]:
%%time
# Fit the model on the training split, then evaluate it on the held-out split.
# The %%time cell magic above reports how long the whole cell takes.
epochs = 5
learning_rate = 0.001
# Wrap the NumPy splits in Dataset objects so DataLoader can batch them.
trainloader = DataLoader(Dataset(X_train, y_train), batch_size=16, shuffle=True)
# NOTE(review): shuffle=True should not affect the aggregated evaluation
# metrics; shuffle=False would make the batch order deterministic. Confirm.
testloader = DataLoader(Dataset(X_test, y_test), batch_size=32, shuffle=True)

print("Training model...")
# train() builds the network from the embedding matrix and returns it together
# with the device it was placed on.
model, device = train(
    trainloader,
    embedding_matrix,
    num_classes=num_classes,
    epochs=epochs,
    learning_rate=learning_rate,
)
print("Evaluating model...")
test(model, testloader, device)

Training model...
epoch: 1, train_loss: 0.237832
epoch: 2, train_loss: 0.201736
epoch: 3, train_loss: 0.186768
epoch: 4, train_loss: 0.176881
epoch: 5, train_loss: 0.168310
Evaluating model...
val_loss: 0.7473, val_acc: 0.8566
CPU times: user 3min 45s, sys: 15.5 s, total: 4min
Wall time: 3min 24s


In [22]:
from IPython import display
import ipywidgets as widgets

def classify(text):
    """Classify a headline and print the results.

    Runs ``text`` through the notebook's tokenizer field ``t`` and the trained
    ``model``, then prints the raw probability tensor and the top class with
    its confidence. As a side effect the model is moved to the CPU and put in
    evaluation mode.
    """
    tokens = t.preprocess(text)
    padded_tokens = t.pad([tokens])[0]
    # Batch of one sequence of vocabulary indices.
    token_ids = torch.tensor([[t.vocab.stoi[token] for token in padded_tokens]])
    model.cpu()
    model.eval()
    with torch.no_grad():
        result = model(token_ids)
    print(result)
    # With a single row, the flattened argmax is the class index.
    ix = result.argmax().item()
    print(f"Predicted class: '{encoder.classes_[ix]}' with confidence {result[0][ix]:.2%}")

In [23]:
# Interactive demo: a text box plus a button that calls classify() on the
# entered headline. interact_manual only runs the function when the button is
# clicked, not on every keystroke.
interaction = widgets.interact_manual(
   classify,
   text=widgets.Text(
       value="The markets were bullish after news of the merger",
       placeholder="Type a news headline...",
       description="Headline:",
       layout=widgets.Layout(width="99%"),
   )
)
# Relabel the run button. Assumes children[1] is the button created by
# interact_manual -- TODO confirm if the ipywidgets version changes.
interaction.widget.children[1].description = "Classify!"

interactive(children=(Text(value='The markets were bullish after news of the merger', description='Headline:',…