In [26]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import nltk
nltk.download("all")
import matplotlib.pyplot as plt
import torch

%matplotlib inline

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/arise/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/arise/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/arise/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/arise/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /home/arise/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]

In [27]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


In [28]:
import data
train_df, test_df = data.get_data()

In [29]:
train_sentences = train_df.sentence.values
train_labels = train_df.label.values

test_sentences = test_df.sentence.values
test_labels = test_df.label.values

In [30]:
import gensim
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('../model/GoogleNews-vectors-negative300.bin', binary=True)

In [31]:
from nltk.tokenize import word_tokenize
from collections import defaultdict

def tokenize(texts):
    """Tokenize texts, build vocabulary and find maximum sentence length.
    
    Args:
        texts (List[str]): List of text data
    
    Returns:
        tokenized_texts (List[List[str]]): List of list of tokens
        word2idx (Dict): Vocabulary built from the corpus
        max_len (int): Maximum sentence length
    """

    max_len = 0
    tokenized_texts = []
    word2idx = {}

    # Add <pad> and <unk> tokens to the vocabulary
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1

    # Building our vocab from the corpus starting from index 2
    idx = 2
    for sent in texts:
        tokenized_sent = word_tokenize(sent)

        # Add `tokenized_sent` to `tokenized_texts`
        tokenized_texts.append(tokenized_sent)

        # Add new token to `word2idx`
        for token in tokenized_sent:
            if token not in word2idx:
                word2idx[token] = idx
                idx += 1

        # Update `max_len`
        max_len = max(max_len, len(tokenized_sent))

    return tokenized_texts, word2idx, max_len

def encode(tokenized_texts, word2idx, max_len):
    """Pad each sentence to the maximum sentence length and encode tokens to
    their index in the vocabulary.

    Returns:
        input_ids (np.array): Array of token indexes in the vocabulary with
            shape (N, max_len). It will the input of our CNN model.
    """

    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Pad sentences to max_len
        tokenized_sent += ['<pad>'] * (max_len - len(tokenized_sent))

        # Encode tokens to input_ids
        input_id = [word2idx.get(token) for token in tokenized_sent]
        input_ids.append(input_id)
    
    return np.array(input_ids)

In [32]:
# Tokenize, build vocabulary, encode tokens
print("Tokenizing...\n")
tokenized_texts, word2idx, max_len = tokenize(train_sentences)
input_ids = encode(tokenized_texts, word2idx, max_len)

Tokenizing...



In [33]:
word_list = list(word2idx.keys())
id_list = list(word2idx.values())

In [34]:
def load_pretrained_vectors():
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), 300))
    embeddings[word2idx['<pad>']] = np.zeros((300,))

    word_list = list(word2idx.keys())
    id_list = list(word2idx.values())

    # Load pretrained vectors
    count = 0
    for i in range(len(word2idx)):
        word_position = id_list.index(i)
        word = word_list[word_position]

        if word in word2vec_model:
            count += 1
            embeddings[word2idx[word]] = word2vec_model[word]

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

In [35]:
# Load pretrained vectors
embeddings = load_pretrained_vectors()
embeddings = torch.tensor(embeddings)

There are 8943 / 16904 pretrained vectors found.


In [36]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)

def data_loader(train_inputs, val_inputs, train_labels, val_labels,
                batch_size=50):
    """Convert train and validation sets to torch.Tensors and load them to
    DataLoader.
    """

    # Convert data type to torch.Tensor
    train_inputs, val_inputs, train_labels, val_labels =\
    tuple(torch.tensor(data) for data in
          [train_inputs, val_inputs, train_labels, val_labels])

    # Specify batch_size
    batch_size = 50

    # Create DataLoader for training data
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, drop_last=True)

    # Create DataLoader for validation data
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size, drop_last=True)

    return train_dataloader, val_dataloader

In [37]:
input_ids.shape
train_labels.shape


(14596,)

In [38]:
from sklearn.model_selection import train_test_split

# Train Test Split
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    input_ids, train_labels, test_size=0.1, random_state=42)

# Load data to PyTorch DataLoader
train_dataloader, val_dataloader = \
data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)

In [39]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_NLP(nn.Module):
    """An 1D Convulational Neural Network for Sentence Classification."""
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN_NLP class.

        Args:
            pretrained_embedding (torch.Tensor): Pretrained embeddings with
                shape (vocab_size, embed_dim)
            freeze_embedding (bool): Set to False to fine-tune pretraiend
                vectors. Default: False
            vocab_size (int): Need to be specified when not pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Need to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            n_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5
        """

        super(CNN_NLP, self).__init__()
        # Embedding layer
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                n_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]
        
        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        
        # Compute logits. Output shape: (b, n_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

In [40]:
import torch.optim as optim

def initilize_model(pretrained_embedding=None,
                    freeze_embedding=False,
                    vocab_size=None,
                    embed_dim=300,
                    filter_sizes=[3, 4, 5],
                    num_filters=[100, 100, 100],
                    num_classes=2,
                    dropout=0.5,
                    learning_rate=0.01):
    """Instantiate a CNN model and an optimizer."""

    assert (len(filter_sizes) == len(num_filters)), "filter_sizes and \
    num_filters need to be of the same length."

    # Instantiate CNN model
    cnn_model = CNN_NLP(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=freeze_embedding,
                        vocab_size=vocab_size,
                        embed_dim=embed_dim,
                        filter_sizes=filter_sizes,
                        num_filters=num_filters,
                        num_classes=2,
                        dropout=0.5)
    
    # Send model to `device` (GPU/CPU)
    cnn_model.to(device)

    # Instantiate Adadelta optimizer
    optimizer = optim.Adadelta(cnn_model.parameters(),
                               lr=learning_rate,
                               rho=0.95)

    return cnn_model, optimizer

In [41]:
# PyTorch TensorBoard support
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/usairlineCNN{}'.format(timestamp))

In [42]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility."""

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, optimizer, train_dataloader, title, val_dataloader=None, epochs=10):
    """Train the CNN model."""
    
    # Tracking best validation accuracy
    best_accuracy = 0

    # Start training loop
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*60)

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================

        # Tracking time and loss
        t0_epoch = time.time()
        test_total_loss = 0

        # Put the model into the training mode
        model.train()
        train_accuracy = []
        for step, batch in enumerate(train_dataloader):
            # Load batch to GPU
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            test_total_loss += loss.item()

            # Get the predictions
            preds = torch.argmax(logits, dim=1).flatten()

            # Calculate the accuracy rate
            accuracy = (preds == b_labels).cpu().numpy().mean() * 100
            train_accuracy.append(accuracy)

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Update parameters
            optimizer.step()

        # Calculate the average loss over the entire training data
        avg_train_loss = test_total_loss / len(train_dataloader)
        train_accuracy = np.mean(train_accuracy)

        # =======================================
        #               Evaluation
        # =======================================
        if val_dataloader is not None:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Track the best accuracy
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")

        writer.add_scalars(title + '-CNN Training vs. Testing Loss',
                { 'Train' : avg_train_loss, 'Test' : val_loss },
                epoch_i + 1)

        writer.add_scalars(title + '-CNN Training vs. Testing Accuracy',
                    { 'Train' : train_accuracy, 'Test' : val_accuracy },
                    epoch_i + 1)
            
    print("\n")
    print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's
    performance on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled
    # during the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [43]:
# CNN-rand: Word vectors are randomly initialized.
set_seed(42)
cnn_rand, optimizer = initilize_model(vocab_size=len(word2idx),
                                      embed_dim=300,
                                      learning_rate=0.25,
                                      dropout=0.5)
train(cnn_rand, optimizer, train_dataloader, 'rand_vec', val_dataloader, epochs=20)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.574948   |  0.502261  |   76.34   |   2.30   
   2    |   0.442119   |  0.452030  |   78.97   |   1.87   
   3    |   0.362901   |  0.430744  |   80.21   |   1.88   
   4    |   0.297243   |  0.422141  |   81.45   |   1.89   
   5    |   0.237478   |  0.422592  |   81.66   |   1.89   
   6    |   0.187142   |  0.441144  |   81.86   |   1.88   
   7    |   0.149949   |  0.447847  |   82.83   |   1.97   
   8    |   0.122551   |  0.453780  |   81.79   |   1.98   
   9    |   0.094822   |  0.478669  |   81.72   |   1.94   
  10    |   0.084688   |  0.495676  |   82.07   |   1.91   
  11    |   0.068625   |  0.531014  |   82.55   |   1.95   
  12    |   0.057375   |  0.562605  |   83.10   |   1.95   
  13    |   0.051807   |  0.550617  |   83.03   |   1.94   
  14    |   0.044439   |  0.577957  |   82.90   |   1.90   
  15    |   0.040238

In [44]:
# CNN-static: fastText pretrained word vectors are used and freezed during training.
set_seed(42)
cnn_static, optimizer = initilize_model(pretrained_embedding=embeddings,
                                        freeze_embedding=True,
                                        learning_rate=0.25,
                                        dropout=0.5)
train(cnn_static, optimizer, train_dataloader, 'gg_word2vec', val_dataloader, epochs=20)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.534725   |  0.458683  |   78.76   |   0.91   
   2    |   0.391394   |  0.397782  |   83.45   |   0.89   
   3    |   0.338035   |  0.384080  |   84.00   |   0.86   
   4    |   0.297857   |  0.393524  |   83.10   |   0.86   
   5    |   0.267394   |  0.385969  |   84.90   |   0.87   
   6    |   0.234047   |  0.389440  |   83.10   |   0.86   
   7    |   0.202245   |  0.394273  |   83.52   |   0.86   
   8    |   0.176988   |  0.399694  |   83.24   |   0.86   
   9    |   0.148689   |  0.406335  |   84.14   |   0.86   
  10    |   0.127864   |  0.414780  |   84.55   |   0.87   
  11    |   0.105875   |  0.438780  |   83.93   |   0.86   
  12    |   0.092481   |  0.467248  |   83.79   |   0.87   
  13    |   0.076021   |  0.473932  |   84.55   |   0.87   
  14    |   0.068990   |  0.482593  |   84.07   |   0.86   
  15    |   0.057307

In [45]:
# CNN-non-static: fastText pretrained word vectors are fine-tuned during training.
set_seed(42)
cnn_non_static, optimizer = initilize_model(pretrained_embedding=embeddings,
                                            freeze_embedding=False,
                                            learning_rate=0.25,
                                            dropout=0.5)
train(cnn_non_static, optimizer, train_dataloader, 'tuned_gg_word2vec', val_dataloader, epochs=20)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.529016   |  0.453789  |   78.83   |   2.81   
   2    |   0.383296   |  0.390180  |   83.93   |   2.79   
   3    |   0.325576   |  0.376485  |   85.03   |   2.80   
   4    |   0.282531   |  0.380557  |   84.00   |   2.79   
   5    |   0.248834   |  0.380114  |   85.17   |   2.79   
   6    |   0.212165   |  0.387991  |   83.93   |   2.80   
   7    |   0.179389   |  0.393564  |   84.14   |   2.79   
   8    |   0.153099   |  0.404649  |   84.41   |   2.82   
   9    |   0.124410   |  0.419786  |   83.66   |   2.83   
  10    |   0.102609   |  0.440722  |   84.28   |   2.88   
  11    |   0.081629   |  0.460732  |   84.28   |   2.80   
  12    |   0.068621   |  0.491559  |   84.34   |   2.79   
  13    |   0.056988   |  0.497450  |   83.86   |   2.79   
  14    |   0.047992   |  0.520813  |   83.93   |   2.78   
  15    |   0.040028

In [46]:
# CNN-non-static: fastText pretrained word vectors are fine-tuned during training.
set_seed(42)
cnn_non_static, optimizer = initilize_model(pretrained_embedding=embeddings,
                                            freeze_embedding=False,
                                            filter_sizes=[3],
                                            num_filters=[100],
                                            learning_rate=0.25,
                                            dropout=0.5)
train(cnn_non_static, optimizer, train_dataloader, 'reduced_CNN', val_dataloader, epochs=20)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.540639   |  0.439168  |   81.52   |   2.31   
   2    |   0.396037   |  0.396588  |   84.00   |   2.31   
   3    |   0.354514   |  0.384368  |   85.03   |   2.32   
   4    |   0.321131   |  0.379615  |   85.72   |   2.33   
   5    |   0.296500   |  0.378819  |   85.17   |   2.30   
   6    |   0.271226   |  0.381574  |   85.03   |   2.30   
   7    |   0.250206   |  0.386680  |   84.83   |   2.29   
   8    |   0.229986   |  0.390203  |   84.48   |   2.29   
   9    |   0.210313   |  0.394111  |   84.00   |   2.29   
  10    |   0.190057   |  0.409416  |   84.41   |   2.29   
  11    |   0.173217   |  0.434409  |   84.21   |   2.32   
  12    |   0.157128   |  0.426037  |   84.14   |   2.30   
  13    |   0.139500   |  0.441646  |   84.14   |   2.30   
  14    |   0.129934   |  0.459410  |   84.21   |   2.29   
  15    |   0.113840

In [50]:
# CNN-non-static: fastText pretrained word vectors are fine-tuned during training.
set_seed(42)
cnn_non_static, optimizer = initilize_model(pretrained_embedding=embeddings,
                                            freeze_embedding=False,
                                            filter_sizes=[4],
                                            num_filters=[50],
                                            learning_rate=0.25,
                                            dropout=0.5)
train(cnn_non_static, optimizer, train_dataloader, 'reduced_param', val_dataloader, epochs=20)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |   0.553893   |  0.456331  |   80.07   |   2.32   
   2    |   0.423791   |  0.404349  |   84.28   |   2.26   
   3    |   0.373041   |  0.395393  |   84.28   |   2.28   
   4    |   0.339645   |  0.387735  |   84.07   |   2.26   
   5    |   0.309981   |  0.396676  |   84.62   |   2.28   
   6    |   0.285427   |  0.389761  |   84.55   |   2.27   
   7    |   0.264137   |  0.393454  |   84.21   |   2.27   
   8    |   0.241960   |  0.399940  |   83.86   |   2.26   
   9    |   0.218997   |  0.406640  |   83.72   |   2.26   
  10    |   0.205774   |  0.405724  |   83.66   |   2.26   
  11    |   0.182961   |  0.428484  |   83.86   |   2.27   
  12    |   0.166773   |  0.437645  |   83.38   |   2.26   
  13    |   0.154539   |  0.475401  |   83.79   |   2.26   
  14    |   0.144300   |  0.465849  |   83.10   |   2.25   
  15    |   0.131026

In [47]:
def predict(text, model=cnn_non_static.to("cpu"), max_len=62):
    """Predict probability that a review is positive."""

    # Tokenize, pad and encode text
    tokens = word_tokenize(text.lower())
    padded_tokens = tokens + ['<pad>'] * (max_len - len(tokens))
    input_id = [word2idx.get(token, word2idx['<unk>']) for token in padded_tokens]

    # Convert to PyTorch tensors
    input_id = torch.tensor(input_id).unsqueeze(dim=0)

    # Compute logits
    logits = model.forward(input_id)

    #  Compute probability
    probs = F.softmax(logits, dim=1).squeeze(dim=0)

    print(f"This review is {probs[1] * 100:.2f}% negative.")

In [48]:
predict("USAirways did I mention I'm a silver preferred member and fly every week (not always with you), and not even a meal voucher?")
predict("@USAirways 4469, thank you for asking.")
predict("my goodness your people @love field are amazing under pressure ❤️from Texas #beatstheothers in crisis Please fly me to NY")
predict("what is going on with the flight 2417? It has been delay a lot .")


This review is 99.83% negative.
This review is 77.33% negative.
This review is 0.22% negative.
This review is 99.99% negative.


In [49]:
cnn_non_static.modules

<bound method Module.modules of CNN_NLP(
  (embedding): Embedding(16904, 300)
  (conv1d_list): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
  )
  (fc): Linear(in_features=100, out_features=2, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)>