# Comparing NLP Techniques

This project sets out to compare different applications of NLP techniques through the following libraries:

- Tensorflow
- PyTorch

This notebook will focus on `tensorflow`

## Pre-processing

In [3]:
import os
import numpy as np
import pandas as pd
import subprocess
import gzip
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
import requests

READING_IN_EMBEDDINGS = False


In [4]:
# lets open the data
# sourced from https://registry.opendata.aws/fast-ai-nlp/

column_names = ["CATEGORY", "TITLE", "CONTENT"]
# we use the train.csv only
news_df = pd.read_csv("data/train.csv", names=column_names, header=None, delimiter=",")

# make the category classes more readable
mapping = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}
news_df = news_df.replace({'CATEGORY': mapping})
news_df.head()

Unnamed: 0,CATEGORY,TITLE,CONTENT
0,Business,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,Business,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,Business,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,Business,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,Business,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [5]:
# lets check the spread of labels
news_df["CATEGORY"].value_counts()

Business    30000
Sports      30000
Sci/Tech    30000
World       30000
Name: CATEGORY, dtype: int64

In [6]:
# lets encode the target variable

encoder = preprocessing.LabelEncoder()
encoded_y = encoder.fit_transform(news_df["CATEGORY"].values)
num_classes = len(encoder.classes_)

# one hot encode integers
dummy_y = np.eye(num_classes, dtype="float32")[encoded_y]



In [7]:
MAX_VECTOR_LEN = 40

# now we want to tokenize the title inputs
titles = news_df["TITLE"].values
t = Tokenizer()
t.fit_on_texts(titles)
vocab_size = len(t.word_index) + 1

# integer encode the documents
encoded_titles = t.texts_to_sequences(titles)

# print the results
print(f"Vocabulary size: {vocab_size}")

# make every sequence the same length
print(f"Padding titles to max_length={MAX_VECTOR_LEN} \
        (truncating {sum(1 for doc in encoded_titles if len(doc) > MAX_VECTOR_LEN)} titles)")
padded_docs = pad_sequences(encoded_titles, maxlen=MAX_VECTOR_LEN, padding="post")
print(f"Number of headlines: {len(padded_docs)}")


Vocabulary size: 36758
Padding titles to max_length=40         (truncating 0 titles)
Number of headlines: 120000


In [8]:
# Tokenizer.num_words is nullable, and there's an OOV token, so:
tokenizer_vocab_size = len(t.word_index) + 1

In [8]:
# we need to represent words as numeric values, so lets use pre trained word embeddings from https://fasttext.cc/docs/en/crawl-vectors.html
if not READING_IN_EMBEDDINGS:
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"
    r = requests.get(url, allow_redirects=True)
    open('./data/cc.en.300.vec.gz', 'wb').write(r.content)


In [9]:
#  now we need to create the embedding matrix with only relevant embeddings

embeddings_index = dict()
with gzip.open('./data/cc.en.300.vec.gz', "rt") as zipf:
    firstline = zipf.readline()
    emb_vocab_size, emb_d = firstline.split(" ")
    emb_vocab_size = int(emb_vocab_size)
    emb_d = int(emb_d)
    for line in zipf:
        values = line.split()
        word = values[0]
        # Only load subset of the embeddings recognised by the tokenizer:
        if word in t.word_index:
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs
print("Loaded {} of {} word vectors for tokenizer vocabulary length {}".format(
    len(embeddings_index),
    emb_vocab_size,
    tokenizer_vocab_size,
))


Loaded 29240 of 2000000 word vectors for tokenizer vocabulary length 36758


In [8]:
# now we can delete the big .gz file
if not READING_IN_EMBEDDINGS:
    os.remove('./data/cc.en.300.vec.gz')

In [10]:
# create a weight matrix for words in training docs

embedding_matrix = np.zeros((tokenizer_vocab_size, emb_d))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [11]:
# save our embeddings for later

np.save(
    file="./data/docs-embedding-matrix",
    arr=embedding_matrix,
    allow_pickle=False,
)
vocab_size=embedding_matrix.shape[0]
print(embedding_matrix.shape)

(36758, 300)


In [12]:
# spliting up our data for training and evaluating

X_train, X_test, y_train, y_test = train_test_split(
    padded_docs,
    dummy_y,
    test_size=0.2,
    random_state=1
)

In [12]:
# save the docs

np.save("./data/train_X.npy", X_train)
np.save("./data/train_Y.npy", y_train)
np.save("./data/test_X.npy", X_test)
np.save("./data/test_Y.npy", y_test)

## Tensorflow Implementation

In [15]:
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Embedding, Flatten, MaxPooling1D
from tensorflow.keras.models import Sequential


from IPython import display
import ipywidgets as widgets

In [16]:
seed = 1
np.random.seed(seed)

In [17]:
model = Sequential()
model.add(Embedding(
    embedding_matrix.shape[0],  # Final vocabulary size
    embedding_matrix.shape[1],  # Word vector dimensions
    weights=[embedding_matrix],
    input_length=40,
    trainable=False,
    name="embed"
))
model.add(Conv1D(filters=128, kernel_size=3, activation="relu", name="conv_1"))
model.add(MaxPooling1D(pool_size=5, name="maxpool_1"))
model.add(Flatten(name="flat_1"))
model.add(Dropout(0.3, name="dropout_1"))
model.add(Dense(128, activation="relu", name="dense_1"))
model.add(Dense(num_classes, activation="softmax", name="out_1"))

# Compile the model
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["acc"])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed (Embedding)            (None, 40, 300)           11027400  
_________________________________________________________________
conv_1 (Conv1D)              (None, 38, 128)           115328    
_________________________________________________________________
maxpool_1 (MaxPooling1D)     (None, 7, 128)            0         
_________________________________________________________________
flat_1 (Flatten)             (None, 896)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 896)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               114816    
_________________________________________________________________
out_1 (Dense)                (None, 4)                 5

In [18]:
# fit the model here in the notebook:
print("Training model")
model.fit(X_train, y_train, batch_size=16, epochs=5, verbose=1)
print("Evaluating model")

scores = model.evaluate(X_test, y_test, verbose=2)
print(
    "Validation results: "
    + "; ".join(map(
        lambda i: f"{model.metrics_names[i]}={scores[i]:.5f}", range(len(model.metrics_names))
    ))
)

Training model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Evaluating model
750/750 - 2s - loss: 0.1999 - acc: 0.8573
Validation results: loss=0.19989; acc=0.85729


In [19]:
def classify(text):
    """Classify a headline and print the results"""
    encoded_example = t.texts_to_sequences([text])
    # Pad documents to a max length of 40 words
    max_length = 40
    padded_example = pad_sequences(encoded_example, maxlen=max_length, padding="post")
    result = model.predict(padded_example)
    print(result)
    ix = np.argmax(result)
    print(f"Predicted class: '{encoder.classes_[ix]}' with confidence {result[0][ix]:.2%}")

In [None]:
interaction = widgets.interact_manual(
    classify,
    text=widgets.Text(
        value="The markets were bullish after news of the merger",
        placeholder="Type a news headline...",
        description="Headline:",
        layout=widgets.Layout(width="99%"),
    )
)

interaction.widget.children[1].description = "Classify!"

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# reset the seed
seed = 1
np.random.seed(seed)

# define and intialize the neural network

class Net(nn.Module):
    def __init__(self, vocab_size=400000, emb_dim=300, num_classes=4):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv1 = nn.Conv1d(emb_dim, 128, kernel_size=3)
        self.max_pool1d = nn.MaxPool1d(5)
        self.flatten1 = nn.Flatten()
        self.dropout1 = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(896, 128)
        self.fc2 = nn.Linear(128, num_classes)
    
    # x is data that will be passed through the network
    def forward(self, x):
        x = self.embedding(x)  
        x = torch.transpose(x,1,2)
        x = self.flatten1(self.max_pool1d(self.conv1(x)))
        x = self.dropout1(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=-1)

# define helper function to train our model

def train(train_loader, embedding_matrix, num_classes=4, epochs=12, learning_rate=0.001):

    # initialise model
    model = Net(
        vocab_size=embedding_matrix.shape[0],
        emb_dim=embedding_matrix.shape[1],
        num_classes=num_classes,
    )
    model.embedding.weight = torch.nn.parameter.Parameter(torch.FloatTensor(embedding_matrix), False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

    # train
    for epoch in range(1, epochs + 1):
        model.train()
        running_loss = 0.0
        n_batches = 0
        for batch_idx, (X_train, y_train) in enumerate(train_loader, 1):
            data, target = X_train.to(device), y_train.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.binary_cross_entropy(output, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_batches += 1
        print(f"epoch: {epoch}, train_loss: {running_loss / n_batches:.6f}")  # (Avg over batches)
    return model, device

# define helper function to test our model

def test(model, test_loader, device):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.binary_cross_entropy(output, target, reduction="sum").item()
            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
            target_index = target.max(1, keepdim=True)[1]
            correct += pred.eq(target_index).sum().item()

    test_loss /= len(test_loader.dataset)  # Average loss over dataset samples
    print(f"val_loss: {test_loss:.4f}, val_acc: {correct/len(test_loader.dataset):.4f}") 


class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        """Initialization"""
        self.labels = labels
        self.data = data

    def __len__(self):
        """Denotes the total number of samples"""
        return len(self.data)

    def __getitem__(self, index):
        # Load data and get label
        X = torch.as_tensor(self.data[index]).long()
        y = torch.as_tensor(self.labels[index])
        return X, y

In [14]:
%%time
# fit the model:
epochs = 5
learning_rate = 0.001
trainloader = DataLoader(Dataset(X_train, y_train), batch_size=16, shuffle=True)
testloader = DataLoader(Dataset(X_test, y_test), batch_size=32, shuffle=True)

print("Training model...")
model, device = train(
    trainloader,
    embedding_matrix,
    num_classes=num_classes,
    epochs=epochs,
    learning_rate=learning_rate,
)
print("Evaluating model...")
test(model, testloader, device)

Training model...
epoch: 1, train_loss: 0.241407
epoch: 2, train_loss: 0.204257
epoch: 3, train_loss: 0.190396
epoch: 4, train_loss: 0.180207


KeyboardInterrupt: 