In [1]:
from platform import python_version
print('Python version:', python_version())

Python version: 3.10.6


In [2]:
!nvidia-smi

Mon Mar 20 14:29:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0  On |                  N/A |
| 30%   30C    P0    N/A /  75W |    773MiB /  4096MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import numpy as np
import tensorflow as tf
import pandas as pd
from matplotlib import pylab
import matplotlib
import matplotlib.gridspec as gridspec
%matplotlib inline
from nltk.translate.bleu_score import corpus_bleu
from sklearn.utils import shuffle
import word2vec
import nltk

In [5]:
import tensorflow.python.util.deprecation as deprecation
deprecation._PRINT_DEPRECATION_WARNINGS = False
tf.compat.v1.disable_eager_execution()

## Dataset

[Dowload](https://nlp.stanford.edu/projects/nmt/):

* English vocabulary: [`vocab.50K.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.en)

### Loading the Datasets and Building the Vocabulary

First, we build the vocabulary dictionaries for the source and target (English) language. 
The vocabularies are found in the file `vocab.50K.en`(English).

In [6]:
# Word string -> ID mapping
dictionary = dict()

vocabulary_size = len(dictionary)
with open('data/vocab.50K.en', encoding='utf-8') as f:
    for line in f:
        # disregard the new line aka `\n`
        dictionary[line[:-1]] = len(dictionary)
        
vocabulary_size = len(dictionary)
reverse_dictionary = dict(zip(dictionary.values(),dictionary.keys()))

print('Dictionary:', list(dictionary.items())[:10], end = '\n')
print('Reverse dictionary:', list(reverse_dictionary.items())[:10], end = '\n')
print('Vocabulary size: ', vocabulary_size, end = '\n')


Dictionary: [('<unk>', 0), ('<s>', 1), ('</s>', 2), ('the', 3), (',', 4), ('.', 5), ('of', 6), ('and', 7), ('to', 8), ('in', 9)]
Reverse dictionary: [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, 'the'), (4, ','), (5, '.'), (6, 'of'), (7, 'and'), (8, 'to'), (9, 'in')]
Vocabulary size:  50000


### Loading data
Here we load the data from the dataset.csv file (generated in the other script)

In [7]:
dataset = pd.read_csv('data/dataset.csv')

### Data pre-processing
Transform to lower, remove the new line and the punctuation

In [8]:
wt = nltk.tokenize.WhitespaceTokenizer()

for column in dataset.columns:
    dataset[column] = dataset[column].str.lower() 
    dataset[column] = dataset[column].str.replace(',', ' ,')  \
                                     .str.replace('.',' .', regex=False)   \
                                     .str.replace('?',' ?', regex=False)   \
                                     .str.replace(')','', regex=False)   \
                                     .str.replace('(','', regex=False)   \
                                     .str.replace('"','')   \
                                     .str.replace('\n',' ')
    dataset[column] = dataset[column].apply(wt.tokenize)
dataset = shuffle(dataset)

In [9]:
dataset.head()

Unnamed: 0,question,answer
454381,"[you, can, send, a, 1gb, flash-drive, back, to...","[pictures, of, what, i, look, like, now, ., ma..."
1055609,"[what's, been, your, biggest, flop, reddit, po...","[i, was, really, proud, of, my, last, tattoo, ..."
50746,"[there, are, 10, 000, unarmed, pissed, off, ch...","[they, can't, do, harm, to, me, physically, or..."
698841,"[what, do, you, want, to, see, humanity, inven...","[long, distance, teleportation, as, in, ,, it,..."
825893,"[if, rick, from, pawn, stars, called, you, for...","[guns, or, mental, illnesses]"


### Data analysis
Mean sentence length and standard deviation of sentence length

In [10]:
print('(Questions) Average sentence length: ', dataset['question'].str.len().mean())
print('(Questions) Standard deviation of sentence length: ', dataset['question'].str.len().std())

print('(Answers) Average sentence length: ', dataset['answer'].str.len().mean())
print('(Answers) Standard deviation of sentence length: ', dataset['answer'].str.len().std())

(Questions) Average sentence length:  17.101486059545056
(Questions) Standard deviation of sentence length:  9.122891352194081
(Answers) Average sentence length:  54.367627238247216
(Answers) Standard deviation of sentence length:  843.0636308326157


### Update the sentences to fixed length
Update all sentences with a fixed size, to process the sentences as batches.

In [11]:
max_sent_length = {'question' : 30, 'answer': 70}

def padding_sent(source):
    padded = []
    for tokens in dataset[source]: 
        # adding the start token
        tokens.insert(0, '<s>')  

        if len(tokens) >= max_sent_length[source]:
            tokens = tokens[:(max_sent_length[source] - 1)]
            tokens.append('</s>')

        if len(tokens) < max_sent_length[source]:
            tokens.extend(['</s>' for _ in range(max_sent_length[source] - len(tokens))])  

        padded.append(tokens)
    return padded

In [12]:
questions = padding_sent('question')
answers = padding_sent('answer')

### Create the reverse dataset

In [13]:
def create_reverse_dataset(source):
    reverse_tokens = []
    reverse_dataset = []
    for tokens in source: 
        for token in tokens: 
            if token not in dictionary.keys():
                reverse_tokens.append(dictionary['<unk>'])
            else:
                reverse_tokens.append(dictionary[token])
        reverse_dataset.append(reverse_tokens)
        reverse_tokens = []
    return reverse_dataset

train_inputs =  np.array(create_reverse_dataset(questions), dtype=np.int32)
train_outputs =  np.array(create_reverse_dataset(answers), dtype=np.int32)

### Word Embedding

In [14]:
import word2vec

sentence_cursors = [0 for _ in range(train_inputs.shape[0])]

batch_size = 32
embedding_size = 64
steps = 80000

word2vec.define_data_and_hyperparameters(
        train_inputs.shape[0], 
        max_sent_length['question'], 
        max_sent_length['answer'], 
        dictionary, 
        reverse_dictionary,  
        train_inputs, 
        train_outputs, 
        embedding_size,
        vocabulary_size)

word2vec.print_some_batches()
word2vec.define_word2vec_tensorflow(batch_size)
word2vec.run_word2vec(batch_size, steps)



with window_size = 2:
    batch: [['<s>', 'former', 'of', '<unk>'], ['<s>', '<unk>', 'favourite', 'light'], ['<s>', 'postal', 'of', '<unk>'], ['<s>', 'as', 'kid', ','], ['<s>', 'you', 'throw', '10000'], ['<s>', 'how', 'you', 'feel'], ['<s>', 'where', 'you', 'put'], ['<s>', 'when', 'the', 'absolute']]
    labels: ['smokers', 'your', 'workers', 'a', 'can', 'would', 'do', 'was']
Defining 4 embedding lookups representing each word in the context
Stacked embedding size: [32, 64, 4]
Reduced mean embedding size: [32, 64]


2023-03-20 14:31:47.258978: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-20 14:31:47.306573: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-20 14:31:47.369285: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-20 14:31:47.369851: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

Initialized
Average loss at step 2000: 2.779788
Average loss at step 4000: 1.444825
Average loss at step 6000: 1.281724
Average loss at step 8000: 1.185413
Average loss at step 10000: 1.129574
Nearest to should: did, constituent, would, husbands, Derrida, does, well, greetings,
Nearest to -: ,, women, instead, fellow, cooled, cops, housed, Bible,
Nearest to also: parents, dopo, deceiving, CAM, Elche, S.p.A., walked, leases,
Nearest to The: Alejandro, presses, pioneered, Absolute, Berlaymont, devoid, Excellent, multiple,
Nearest to these: 1821, sparking, satisfying, MacDonald, www.avaaz.org, Jo, collateral, Rhapsody,
Nearest to us: <unk>, worst, for, indemnify, longest, apple, alright, ridiculous,
Nearest to out: sleep, indelible, prayer, fell, live, all, vs, meu,
Nearest to or: Poos, biocidal, parody, Multitude, object, decides, Immanuel, Link,
Nearest to by: Advocates, GfK, Sealed, ERC, 3.7, jour, 6.30, tabling,
Nearest to as: video, movie, person, Procchio, girl, scale, song, weird,


Average loss at step 52000: 1.066826
Average loss at step 54000: 1.067815
Average loss at step 56000: 1.074104
Average loss at step 58000: 1.076034
Average loss at step 60000: 1.071978
Nearest to should: can, would, did, could, will, does, detailled, Vejer,
Nearest to -: ,, :, finances, nightclub, coaches, Maritim, hillside, 80th,
Nearest to also: sirens, oral, involved, arising, commentaries, clic, visited, dopo,
Nearest to The: Alejandro, presses, pioneered, colonized, Absolute, devoid, Berlaymont, Excellent,
Nearest to these: Doctor, F1, Along, 0044, contravention, 50, Ideally, Linguistic,
Nearest to us: usa, insbesondere, handicraft, emotional, jealousy, tumours, Friesland, indemnify,
Nearest to out: Documentation, 8, soir, Exif, Byzantine, indelible, fell, tempted,
Nearest to or: Poos, Valencia, DVI, impatience, Multitude, Payments, Nueva, zero,
Nearest to by: Advocates, on, GfK, dismayed, predefined, data, realistic, daran,
Nearest to as: Orion, Eugen, rebound, Teheran, CHILD, bi

In [20]:
class DataGenerator(object):

    def __init__(self, batch_size, num_unroll, is_input, is_train):
        self._batch_size = batch_size
        self._num_unroll = num_unroll
        self._cursor = [0 for offset in range(self._batch_size)]
        self._word_embeddings = np.load('embeddings.npy')
        self._sent_ids = None
        self._is_input = is_input
        self._is_train = is_train

    def next_batch(self, sent_ids):

        sent_length = max_sent_length['question'] if self._is_input else max_sent_length['answer']

        batch_data = np.zeros((self._batch_size, embedding_size),dtype=np.float32)
        batch_labels = np.zeros((self._batch_size,embedding_size),dtype=np.float32)

        for batch in range(self._batch_size):
            sent_id = sent_ids[batch]
            
            if self._is_input:
                sent_text = train_inputs[sent_id] if self._is_input else test_inputs[sent_id]
            else:
                sent_text = train_outputs[sent_id] if self._is_input else train_outputs[sent_id]
            
            batch_data[batch] = self._word_embeddings[sent_text[self._cursor[batch]],:]
            batch_labels[batch] = np.zeros((vocabulary_size),dtype=np.float32)
            batch_labels[batch,sent_text[self._cursor[batch]+1]] = 1.0

            self._cursor[b] = (self._cursor[b]+1)%(max_sent_length-1)

        return batch_data,batch_labels

    def unroll_batches(self,sent_ids):

        if sent_ids is not None:
            self._sent_ids = sent_ids
            self._cursor = [0 for _ in range(self._batch_size)]
        unroll_data, unroll_labels = [],[]

        for unroll_ids in range(self._num_unroll):
            data, labels = self.next_batch(self._sent_ids)
            unroll_data.append(data)
            unroll_labels.append(labels)
        return unroll_data, unroll_labels, self._sent_ids

    def reset_indices(self):
        self._cursor = [0 for offset in range(self._batch_size)]

dg = DataGenerator(batch_size=5, num_unroll=20, is_input=True, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])

print('Source data')
for _, lbl in zip(u_data,u_labels):
    print([reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])

dg = DataGeneratorMT(batch_size=5, num_unroll=30, is_input=False, is_train=True)
u_data, u_labels, _ = dg.unroll_batches([0,1,2,3,4])
print('\nTarget data batch')
for d_i,(_, lbl) in enumerate(zip(u_data,u_labels)):
    print([tgt_reverse_dictionary[w] for w in np.argmax(lbl,axis=1).tolist()])

AttributeError: 'DataGenerator' object has no attribute 'is_input'