# Introduction to NLP

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import bcolz
import re
import itertools
from numpy.random import random, permutation, randn, normal, uniform, choice
import torch
import torch.nn as nn
import torch.nn.functional as F

We're going to look at the IMDB dataset, which contains movie reviews from IMDB, along with their sentiment.

See below to download the dataset.

We will compare our results with those of the 2011 ACL [Stanford paper](http://ai.stanford.edu/~amaas/data/sentiment/).

In [None]:
#needs to be done only once (run the next cell first: it defines data_imdb)
#!wget -O $data_imdb https://s3.amazonaws.com/text-datasets/imdb_full.pkl

In [None]:
# Local paths to the dataset files: edit data_folder to match your machine.
data_folder = '/home/lelarge/courses/data/imdb/'
# Pickled IMDB reviews (target of the commented-out wget command).
data_imdb = data_folder+'imdb_full.pkl'
# Pickled word -> id dictionary, loaded later into `idx`.
data_idx = data_folder+'idx.pkl'

# Embeddings

We start with a small recap about [embeddings](https://pytorch.org/docs/master/nn.html#embedding)

In [None]:
arr = np.array([[1,2,4,5],[4,3,2,0]])

In [None]:
arr[0,:]

In [None]:
arr.shape

In [None]:
embedding_dim = 3
# Lookup table with 6 rows (valid ids are 0..5), each row a vector of size embedding_dim.
embedding_user = nn.Embedding(6, embedding_dim)
# Named `input_ids` (not `input`) so that the builtin input() is not shadowed.
input_ids = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
# Every id is replaced by its embedding vector: shape (2, 4) -> (2, 4, embedding_dim).
ex = embedding_user(input_ids)
ex

In [None]:
ex.shape

In [None]:
ex.view(2,12)

In [None]:
ex

In [None]:
ex.permute(0,2,1)

# Sentiment analysis

You will build a simple neural network able to classify texts (reviews of movies) in two classes: positive or negative.

In [None]:
import pickle
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file as soon as the data has been read.
with open(data_imdb, 'rb') as f:
    (x_train, labels_train), (x_test, labels_test) = pickle.load(f)

In [None]:
len(x_train)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file as soon as the dictionary has been read.
with open(data_idx, 'rb') as g:
    idx = pickle.load(g)

In [None]:
idx.get('river')

The words are sorted according to their frequency, i.e. the most frequent word has the lowest value.

This is the word list:

In [None]:
idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]

This is the mapping from id to word

In [None]:
idx2word = {v: k for k, v in idx.items()}

Here's the 1st review. As you see, the words have been replaced by ids.

In [None]:
', '.join(map(str, x_train[0]))

The ids can be looked up in idx2word.

In [None]:
idx2word[23022]

In [None]:
' '.join([idx2word[o] for o in x_train[0]])

Reduce the vocabulary size by mapping every rare word to the maximum index (`vocab_size-1`).

In [None]:
vocab_size = 5000

In [None]:
# Every id >= vocab_size-1 is collapsed onto the single "rare word" id vocab_size-1.
# NOTE: the name `test` is rebound later in the notebook by `def test(...)`.
rare_id = vocab_size - 1
trn = [np.array([min(word_id, rare_id) for word_id in sentence]) for sentence in x_train]
test = [np.array([min(word_id, rare_id) for word_id in sentence]) for sentence in x_test]

Look at distribution of lengths of sentences.

In [None]:
lens = np.array(list(map(len, trn)))
(lens.max(), lens.min(), lens.mean())

Pad (with zeros on the left) or truncate (keeping only the last `seq_len` words) each sentence so that all sentences have the same length.

In [None]:
seq_len = 500

def left_pad(sentences, seq_len):
    """
    Return an int32 array of shape (len(sentences), seq_len).

    Each sentence keeps only its last `seq_len` ids and is right-aligned in
    its row; the remaining leading positions are left at 0.
    """
    padded = np.zeros((len(sentences), seq_len), dtype=np.int32)
    for row, sentence in enumerate(sentences):
        trunc = sentence[-seq_len:]
        # An empty sentence must be skipped: the slice `-0:` selects the whole
        # row, and assigning an empty array to it would raise.
        if len(trunc) > 0:
            padded[row, -len(trunc):] = trunc
    return padded

# The loop index is deliberately not called `idx`, so the vocabulary
# dictionary `idx` loaded earlier is not overwritten by an integer.
trn_pad = left_pad(trn, seq_len)
test_pad = left_pad(test, seq_len)

In [None]:
trn_pad[:5,:]

In [None]:
trn_pad.shape

In [None]:
test_pad.shape

In [None]:
def gpu(tensor, gpu=False):
    """
    Return `tensor.cuda()` when `gpu` is truthy, otherwise `tensor` unchanged.
    """
    return tensor.cuda() if gpu else tensor

use_gpu = torch.cuda.is_available()
print('Using gpu: %s ' % use_gpu)

In [None]:
labels_test[:5]

# Single hidden layer NN

Complete the code below using `nn.Embedding`, `squeeze` and the usual `nn.Linear`, `F.relu`, `torch.sigmoid` (`F.sigmoid` is deprecated) and `F.dropout`.

In [None]:
class FirstModel(nn.Module):
    """
    Exercise skeleton for the single-hidden-layer sentiment classifier.

    The constructor only stores its hyper-parameters; the layers and the
    forward pass are left to be written where marked "your code here".
    """
    
    def __init__(self,
                 embedding_dim=30,vocab_size = 1,seq_len = 1):
        """
        Store the hyper-parameters of the model.

        embedding_dim: stored as `_embedding_dim`.
        vocab_size: stored as `_vocab_size`.
        seq_len: stored as `_seq_len`.
        """
        
        super(FirstModel, self).__init__()
        
        self._seq_len = seq_len
        self._embedding_dim = embedding_dim
        self._vocab_size = vocab_size

        #
        # your code here
        #
        
    def forward(self, words_id):
        """
        Forward pass (to be implemented).

        words_id: presumably a LongTensor of word ids of shape
            (batch, seq_len) - TODO confirm against the training loop.
        NOTE(review): the notebook pairs this model with BCELoss, so the
        output is expected to be probabilities in [0, 1].
        """
        
        #
        # your code here
        #

In [None]:
net1 = FirstModel(embedding_dim = 32, vocab_size = vocab_size, seq_len=seq_len)
learning_rate = 1e-3
# The optimizer must receive the parameters of net1, the model built just above.
optimizer = torch.optim.Adam(net1.parameters(),lr=learning_rate, weight_decay=0)
# Binary cross-entropy computed on probabilities (not on logits).
loss_fn = torch.nn.BCELoss()

Code the training loop and the test.

In [None]:
batch_size = 64
def minibatch_sentences(batch_size, word, sent):
    """
    Yield consecutive `(word_batch, sent_batch)` tuples of at most `batch_size` rows.

    `word` is sliced along its first axis (all columns kept) and `sent` is
    sliced with the same bounds; the last batch may be shorter.
    """
    for lo in range(0, len(sent), batch_size):
        hi = lo + batch_size
        yield (word[lo:hi, :], sent[lo:hi])
        
def shuffle_sentences(word,sent):
    """
    Return `(word, sent)` reordered by one common random permutation of the rows.

    A fresh, unseeded RandomState is created on every call, so the order
    differs from call to call.
    """
    order = np.random.RandomState().permutation(len(sent))
    return (word[order, :], sent[order])

def accuracy_one(x):
    """
    Return an elementwise boolean mask: True where column 0 of `x` is below 0.5.
    """
    first_column = x[:, 0]
    return first_column < 0.5

In [None]:
def test(net, word_ids, sentiment):
    """
    Evaluate `net` on `(word_ids, sentiment)` and return `(epoch_loss, epoch_acc)`.

    Exercise skeleton: the per-batch computation is left to be written, so
    both accumulators are returned with their initial value 0.0 until then.

    net: the model to evaluate; it is switched out of training mode.
    word_ids: numpy array of word ids, cast to int64 before conversion.
    sentiment: labels, converted to a float32 tensor.
    Relies on the notebook globals `batch_size`, `use_gpu` and `gpu`.
    """
    # Evaluation mode; callers must switch back to training mode themselves.
    net.train(False)
    word_ids = word_ids.astype(np.int64)
    word_ids_tensor = gpu(torch.from_numpy(word_ids), use_gpu)
    sent_tensor = gpu(torch.from_numpy(np.asarray(sentiment).astype(np.float32)), use_gpu)
    epoch_loss = 0.0
    epoch_acc = 0.0
    # The data is visited in its given order, batch_size rows at a time.
    for (minibatch_num, (batch_word, batch_sent)) in enumerate(minibatch_sentences(batch_size, word_ids_tensor, sent_tensor)):
        #
        # your code here
        #

    return epoch_loss, epoch_acc

def fit(net,word_ids, sentiment, word_ids_test, sentiment_test, n_iter = 3, verbose=True):
    word_ids = word_ids.astype(np.int64)
    word_ids_test = word_ids_test.astype(np.int64)
    net.train(True)
    
    for epoch_num in range(n_iter):
        words, sents = shuffle_sentences(word_ids,np.asarray(sentiment).astype(np.float32))
        word_ids_tensor = gpu(torch.from_numpy(words), use_gpu)
        sent_tensor = gpu(torch.from_numpy(sents), use_gpu)
        epoch_loss = 0.0
        epoch_acc = 0.0
        for (minibatch_num, (batch_word, batch_sent)) in enumerate(minibatch_sentences(batch_size, word_ids_tensor, sent_tensor)):
            #
            # your code here
            #

        if verbose:
            val_loss, val_acc = test(model,word_ids_test, sentiment_test)
            print('Epoch {}: train loss {}'.format(epoch_num, epoch_loss), 'train acc', epoch_acc,'validation loss', val_loss,'validation acc', val_acc)
            model.train(True)


In [None]:
%%time
fit(net1,trn_pad, labels_train, test_pad, labels_test)

The importance of the right initialization.

Implement a scaled embedding that uses [Xavier initialization](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf).

In [None]:
class ScaledEmbedding(nn.Embedding):
    """
    Embedding layer that initialises its values
    using a normal variable scaled by the inverse
    of the embedding dimension.
    """

    def reset_parameters(self):
        """
        Initialize parameters.

        Exercise placeholder: the initialisation itself is still to be written.
        NOTE(review): presumably overrides nn.Embedding.reset_parameters, which
        is expected to be called at construction - confirm in the PyTorch docs.
        """
        #
        # your code here
        #

In [None]:
%%time
fit(net1,trn_pad, labels_train, test_pad, labels_test)

# Single conv layer with max pooling



Add a first 1d-convolutional layer with maxpooling.

Here is an example of [architecture](https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/)

# GloVe Embedding

[Global Vectors for Word Representation](https://nlp.stanford.edu/projects/glove/) by  Jeffrey Pennington,   Richard Socher,   Christopher D. Manning 

In [None]:
glove_folder = '/home/lelarge/courses/data/glove/'
#glove_folder = '/home/ubuntu/data/glove/'
glove_file = glove_folder + '6B.50d.tgz'

In [None]:
#need to be done only once
#%mkdir -p $glove_folder
#!wget -O $glove_file http://files.fast.ai/models/glove/6B.50d.tgz

In [None]:
import tarfile
# NOTE: extractall trusts the member paths stored in the archive; only use it
# on archives that come from a trusted source.
# The context manager closes the archive even if the extraction fails.
with tarfile.open(glove_file, "r:gz") as tar:
    tar.extractall(glove_folder)

In [None]:
def load_vectors(loc):
    """
    Load the three files that share the path prefix `loc`.

    Returns a tuple `(vectors, words, word_index)`:
    - `vectors`: result of `load_array(loc + '.dat')`,
    - `words`: unpickled content of `loc + '_words.pkl'`,
    - `word_index`: unpickled content of `loc + '_idx.pkl'`.

    Both pickles are decoded with `encoding='latin1'`.
    NOTE: pickle.load can execute arbitrary code; only load trusted files.
    """
    vectors = load_array(loc+'.dat')
    # The context managers close each file as soon as it has been read.
    with open(loc+'_words.pkl','rb') as words_file:
        words = pickle.load(words_file, encoding='latin1')
    with open(loc+'_idx.pkl','rb') as idx_file:
        word_index = pickle.load(idx_file, encoding='latin1')
    return (vectors, words, word_index)

In [None]:
def load_array(fname):
    """
    Open the bcolz container stored at `fname` and return its full content (`[:]`).
    """
    container = bcolz.open(fname)
    return container[:]

In [None]:
glove_loc =glove_folder+'6B.50d'
vecs, words, wordidx = load_vectors(glove_loc)

In [None]:
vecs.shape

In [None]:
from sklearn.manifold import TSNE

# Project the first 500 rows of `vecs` onto 2 dimensions (fixed random_state).
tsne = TSNE(n_components=2, random_state=0)
Y = tsne.fit_transform(vecs[:500])

# Only the points in [start, end) are drawn, each labelled with the matching entry of `words`.
start=0; end=350
dat = Y[start:end]
plt.figure(figsize=(15,15))
plt.scatter(dat[:, 0], dat[:, 1])
for label, x, y in zip(words[start:end], dat[:, 0], dat[:, 1]):
    # Random RGB colour per label, scaled by 0.7 (presumably to keep the text dark enough to read).
    plt.text(x,y,label, color=np.random.rand(3)*0.7,
                 fontsize=14)
plt.show()

In [None]:
def create_emb():
    """
    Build the initial embedding matrix from the GloVe vectors.

    Returns an array of shape (vocab_size, vecs.shape[1]):
    - row 0 stays at zero (id 0 is the padding value);
    - row i holds the GloVe vector of `idx2word[i]` when that word is made
      only of letters, digits and '-' and is present in `wordidx`;
    - every other row, and the last row (the "rare word" id), is drawn from
      a normal distribution with scale 0.6.
    The whole matrix is divided by 3 before being returned.

    Reads the notebook globals `vecs`, `vocab_size`, `idx2word` and `wordidx`.
    """
    n_fact = vecs.shape[1]
    emb = np.zeros((vocab_size, n_fact))

    for i in range(1,len(emb)):
        # .get: an id without a word falls through to the random branch.
        word = idx2word.get(i)
        # The membership test prevents a KeyError for words missing from GloVe.
        if word and re.match(r"^[a-zA-Z0-9\-]*$", word) and word in wordidx:
            src_idx = wordidx[word]
            emb[i] = vecs[src_idx]
        else:
            # If we can't find the word in glove, randomly initialize
            emb[i] = normal(scale=0.6, size=(n_fact,))

    # This is our "rare word" id - we want to randomly initialize
    emb[-1] = normal(scale=0.6, size=(n_fact,))
    emb/=3
    return emb

In [None]:
emb = create_emb()

In [None]:
emb.shape

In [None]:
emb[1,:]