# DS-GA 1011 Fall 2017 HW 01
# Bags of Words and Document Classification

In [1]:
import numpy as np
import multiprocessing
import os
import torch
import collections
from collections import Counter
from sklearn.feature_extraction import stop_words
from torch.utils.data import Dataset
import torch.nn as nn
import torchvision.transforms as transforms
from torch.autograd import Variable

In [2]:
# (some of the) hyper parameters
learning_rate = 0.001
vocab_size = 20000 # number words in the vocabulary base
emb_dim = 100 # dimension for n-gram embedding
num_epochs = 5 # number epoch to train
batch_size = 32
ngram_n = 2 # the n in n-gram

In [3]:
# I/O Param
data_dir = "./aclImdb/"
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
TRAIN_SIZE = 23000
VALIDATION_SIZE = 2000
TEST_SIZE = 25000
PADDING_IDX = 0

## Part I : Data I/O
Read data from disk and parse them into desired structures

In [4]:
class IMDBDatum():
    """
    Class that represents a train/validation/test datum
    - self.raw_text
    - self.label: 0 neg, 1 pos
    - self.file_name: dir for this datum
    - self.tokens: list of tokens
    - self.token_idx: index of each token in the text
    """
    def __init__(self, raw_text, label, file_name):
        self.raw_text = raw_text
        self.label = label
        self.file_name = file_name
        
    def set_ngram(self, ngram_ctr):
        self.ngram = ngram_ctr
    
    def set_token_idx(self, token_idx):
        self.token_idx = token_idx
        
    def set_tokens(self, tokens):
        self.tokens = tokens


def preprocess_text(text):
    """
    Function that cleans the string
    """
    text = text.lower().replace("<br />", "")
    return text
        
    
def read_file_as_datum(file_name, label):
    """
    Function that reads a file 
    """
    with open(file_name, "r") as f:
        content = f.read()
        content = preprocess_text(content)
    return IMDBDatum(raw_text=content, label=label, file_name=file_name)


def construct_dataset(dataset_dir, dataset_size, offset=0):
    """
    Function that loads a dataset
    @param offset: skip first offset items in this dir
    """
    pos_dir = os.path.join(dataset_dir, "pos")
    neg_dir = os.path.join(dataset_dir, "neg")
    single_label_size = int(dataset_size / 2)
    output = []
    all_pos = os.listdir(pos_dir)
    all_neg = os.listdir(neg_dir)
    for i in range(offset, offset+single_label_size):
        output.append(read_file_as_datum(os.path.join(pos_dir, all_pos[i]), 1))
        output.append(read_file_as_datum(os.path.join(neg_dir, all_neg[i]), 0))
    return output


In [5]:
# Load Dataset - should take less than 1 min
train_set = construct_dataset(train_dir, TRAIN_SIZE)
validation_set = construct_dataset(train_dir, VALIDATION_SIZE, offset=int(TRAIN_SIZE/2))
test_set = construct_dataset(test_dir, TEST_SIZE)

## Part II: Feature Engineering - Bag of N-gram

You will find the Python Counter object very helpful in this part of the assignment.

Please refer to https://docs.python.org/2/library/collections.html#collections.Counter for more info.

In [6]:
def extract_ngram_from_text(text, n, remove_stopwords=True):
    """
    Function that retrieves all n-grams from the input string
    @param text: raw string
    @param n: integer that tells the model to retrieve all k-gram where k<=n
    @return ngram_counter: a counter that maps n-gram to its frequency
    @return tokens: a list of parsed ngrams
    """
    # tokenize words - for simplicity just split by space
    tokens = text.split(" ")
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words.ENGLISH_STOP_WORDS]
        
    all_ngrams = []
    all_ngrams.append(tokens)
    
    for i in np.arange(2, n+1):
        ngram = list(zip(*[tokens[j:] for j in range(i)]))
        all_ngrams.append(ngram)
    
    all_ngrams = [item for sublist in all_ngrams for item in sublist]
    ngram_counter = Counter()
    
    for gram in all_ngrams:
        ngram_counter[gram] += 1

    return ngram_counter, all_ngrams


def construct_ngram_indexer(ngram_counter_list, topk):
    """
    Function that selects the most common topk ngrams
    @param ngram_counter_list: list of counters
    @param topk, int: # of 
    @return ngram2idx: a dictionary that maps ngram to an unique index
    """
    # TODO: fill in your code here
    # find the top k ngram
    # maps the ngram to an unique index
        
    d = []
    ngram_indexer = {}
    idx = np.arange(2, topk+2)
    
    ngram_counter = Counter()
    for n in ngram_counter_list:
        ngram_counter.update(n)

    top = ngram_counter.most_common(topk)
    for i, key in zip(np.arange(topk), top):
        ngram_indexer[key[0]] = idx[i]
    return ngram_indexer


def token_to_index(tokens, ngram_indexer):
    """
    Function that transform a list of tokens to a list of token index.
    @param tokens: list of ngram
    @param ngram_indexer: a dictionary that maps ngram to an unique index
    """
    # TODO: replace with your code
    # Please DO NOT assign any ngram to index 0 which is reserved for PAD token
    index_list = []
    
    for token in tokens:
        if token in ngram_indexer:
            index_list.append(ngram_indexer[token])
        else:
            index_list.append(1)
    return index_list


def process_text_dataset(dataset, n, topk=None, ngram_indexer=None):
    """
    Top level function that encodes each datum into a list of ngram indices
    @param dataset: list of IMDBDatum
    @param n: n in "n-gram"
    @param topk: #
    @param ngram_indexer: a dictionary that maps ngram to an unique index
    """
    # extract n-gram
    for i in range(len(dataset)):
        text_datum = dataset[i].raw_text
        ngrams, tokens = extract_ngram_from_text(text_datum, n)
        dataset[i].set_ngram(ngrams)
        dataset[i].set_tokens(tokens)
    # select top k ngram
    if ngram_indexer is None:
        ngram_indexer = construct_ngram_indexer([datum.ngram for datum in dataset], topk)
    # vectorize each datum
    for i in range(len(dataset)):
        dataset[i].set_token_idx(token_to_index(dataset[i].tokens, ngram_indexer))  
    return dataset, ngram_indexer

In [7]:
dat = train_set[0].raw_text
tokens = dat.split(" ")
tokens = [token for token in tokens if token not in stop_words.ENGLISH_STOP_WORDS]

dat2 = train_set[1].raw_text
tokens2 = dat2.split(" ")
tokens2 = [token for token in tokens2 if token not in stop_words.ENGLISH_STOP_WORDS]

In [8]:
a, b = extract_ngram_from_text(dat, 2)
a2, b2 = extract_ngram_from_text(dat2, 2)

In [9]:
c = construct_ngram_indexer([a, a2], 4)

In [10]:
d = token_to_index(tokens, c)

In [11]:
train_data, train_ngram_indexer = process_text_dataset(train_set, ngram_n, vocab_size)

In [12]:
validation_data, _ = process_text_dataset(validation_set, ngram_n, ngram_indexer=train_ngram_indexer)

In [13]:
test_data, _ = process_text_dataset(test_set, ngram_n, ngram_indexer=train_ngram_indexer)

## Part III: Construct Input Pipeline for PyTorch

In [14]:
class IMDBDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list):
        """
        @param data_list: list of IMDBDatum
        """
        self.data_list = data_list
        
    def __len__(self):
        return len(self.data_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        token_idx, label = self.data_list[key].token_idx, self.data_list[key].label
        return (token_idx, len(token_idx)), label
    

def imdb_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list = []
    label_list = []
    length_list = []
    for datum in batch:
        label_list.append(datum[1])
        length_list.append(datum[0][1])
    max_length = np.max(length_list)
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0][0]), 
                                pad_width=((0,max_length-datum[0][1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]
    

# consturct datasets
imdb_train = IMDBDataset(train_data)
imdb_validation = IMDBDataset(validation_data)
imdb_test = IMDBDataset(test_data)    
    
# construct data loader
train_loader = torch.utils.data.DataLoader(dataset=imdb_train, 
                                           batch_size=batch_size,
                                           collate_fn=imdb_collate_func,
                                           shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset=imdb_validation, 
                                           batch_size=batch_size, 
                                           collate_fn=imdb_collate_func,
                                           shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=imdb_test, 
                                           batch_size=batch_size,
                                           collate_fn=imdb_collate_func,
                                           shuffle=False)

In [15]:
print("This is an training sample: {0}".format(imdb_train[0][0]))
print("This is a label: {0}".format(imdb_train[0][1]))

This is an training sample: ([1, 160, 1053, 1089, 2020, 11, 9731, 292, 547, 1, 7830, 64, 5717, 8921, 293, 104, 1, 1, 2379, 2421, 618, 1, 1, 2452, 1, 8155, 1616, 102, 1427, 1, 1, 1, 5362, 2816, 8320, 404, 9732, 74, 273, 1416, 4537, 534, 4379, 2870, 1098, 1, 1, 1, 8321, 238, 5813, 1, 39, 13320, 1, 1, 3122, 1, 8321, 334, 1816, 555, 16, 1, 160, 89, 1, 2638, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 19298, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 137)
This is a label: 1


## Part IV: Define Model

Please refers to https://arxiv.org/abs/1607.01759 for Fast Text model (Joulin et al.)

In [38]:
class FastText(nn.Module):
    """
    FastText model
    """
    def __init__(self, vocab_size, emb_dim):
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(FastText, self).__init__()
        # TODO: replace with your code
        # Note that the # of inputs dimension for embedding shall be vocab_size+1, why?
        # In the embedding, you need to set the padding_dx argument.
        # Please see http://pytorch.org/docs/master/nn.html
        
        self.embed = nn.Embedding(vocab_size+2, emb_dim, padding_idx=0)
        self.dummy_layer = nn.Linear(emb_dim,1)
        
    
    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        # TODO: replace with your code
        # Note that instead of doing tensorwise mean, you need to calculate the sum and divide by the sentence length
        # in the length tensor.
        # The output of this function should be a Tensor of dimension (batch_size). Each of the dimension contains
        # a float in the range [0,1] that denotes the probability that the sample is positive (1).
        self.out = self.embed(data)
        self.out = torch.sum(self.out, dim=1) / length.unsqueeze(1).float()
        self.out = self.dummy_layer(self.out.float())
        return nn.functional.sigmoid(self.out.view(-1))

model = FastText(vocab_size, emb_dim)

## Part V: Define Loss Function and Optmizer

In [39]:
# Loss and Optimizer
criterion = nn.BCELoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

## Part VI: Train and Test the Model

In [40]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = Variable(data), Variable(lengths), Variable(labels)
        outputs = model(data_batch, length_batch)
        predicted = (outputs.data > 0.5).long().view(-1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    model.train()
    return (100 * correct / total)


def earily_stop(val_acc_history, t=2, required_progress=0.01):
    """
    Stop the training if there is no non-trivial progress in k steps
    @param val_acc_history: a list contains all the historical validation acc
    @param required_progress: the next acc should be higher than the previous by 
        at least required_progress amount to be non-trivial
    @param t: number of training steps 
    @return: a boolean indicates if the model should earily stop
    """
    # TODO: add your code here
    return False



# Training the Model
validation_acc_history = []
stop_training = False
for epoch in range(num_epochs):
    for i, (data, lengths, labels) in enumerate(train_loader):
        data_batch, length_batch, label_batch = Variable(data), Variable(lengths), Variable(labels)
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch.float())
        loss.backward()
        optimizer.step()
        # report performance
        if (i+1) % (batch_size*4) == 0:
            train_acc = test_model(train_loader, model)
            val_acc = test_model(validation_loader, model)
            print('Epoch: [{0}/{1}], Step: [{2}/{3}], Loss: {4}, Train Acc: {5}, Validation Acc:{6}'.format( 
                   epoch+1, num_epochs, i+1, len(imdb_train)//batch_size, loss.data[0], 
                    train_acc, val_acc))
            validation_acc_history.append(val_acc)
            # check if we need to earily stop the model
            stop_training = earily_stop(validation_acc_history)
            if stop_training:
                print("earily stop triggered")
                break
    # because of the the nested loop
    if stop_training:
        break

Epoch: [1/5], Step: [128/718], Loss: 0.6859496235847473, Train Acc: 66.24347826086957, Validation Acc:65.95
Epoch: [1/5], Step: [256/718], Loss: 0.6824873685836792, Train Acc: 73.17391304347827, Validation Acc:70.05
Epoch: [1/5], Step: [384/718], Loss: 0.672297477722168, Train Acc: 59.947826086956525, Validation Acc:57.75
Epoch: [1/5], Step: [512/718], Loss: 0.6432124376296997, Train Acc: 76.1304347826087, Validation Acc:72.35
Epoch: [1/5], Step: [640/718], Loss: 0.6155011057853699, Train Acc: 77.01304347826087, Validation Acc:74.45
Epoch: [2/5], Step: [128/718], Loss: 0.5542322993278503, Train Acc: 83.80869565217391, Validation Acc:80.35
Epoch: [2/5], Step: [256/718], Loss: 0.49113184213638306, Train Acc: 84.20869565217392, Validation Acc:80.3
Epoch: [2/5], Step: [384/718], Loss: 0.577803909778595, Train Acc: 85.6608695652174, Validation Acc:81.95
Epoch: [2/5], Step: [512/718], Loss: 0.4240523874759674, Train Acc: 86.39565217391305, Validation Acc:82.85
Epoch: [2/5], Step: [640/718], 

In [41]:
# Test the Model
print('Accuracy of the model on the test set: %d %%' % test_model(test_loader, model))

# Save the Model
#torch.save(net.state_dict(), 'model.pkl')

Accuracy of the model on the test set: 86 %


## Studying Embeddings

In [None]:
# error analysis

## Part VIII: Parameter Tunning

In [None]:
# param tuning

## Reference

Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). Bag of tricks for efficient text classification. arXiv preprint arXiv:1607.01759.