In [1]:
from platform import python_version
print('Python version:', python_version())

Python version: 3.10.6


In [None]:
!nvidia-smi

In [14]:
import numpy as np
import tensorflow as tf
import pandas as pd
from matplotlib import pylab
import matplotlib
import matplotlib.gridspec as gridspec
%matplotlib inline
from nltk.translate.bleu_score import corpus_bleu
from sklearn.utils import shuffle
from gensim.models.word2vec import Word2Vec
import nltk

## Dataset

[Dowload](https://nlp.stanford.edu/projects/nmt/):

* English vocabulary: [`vocab.50K.en`](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/vocab.50K.en)

### Loading the Datasets and Building the Vocabulary

First, we build the vocabulary dictionaries for the source and target (English) language. 
The vocabularies are found in the file `vocab.50K.en`(English).

In [3]:
# Word string -> ID mapping
dictionary = dict()

vocabulary_size = len(dictionary)
with open('data/vocab.50K.en', encoding='utf-8') as f:
    for line in f:
        # disregard the new line aka `\n`
        dictionary[line[:-1]] = len(dictionary)
        
vocabulary_size = len(dictionary)
reverse_dictionary = dict(zip(dictionary.values(),dictionary.keys()))

print('Dictionary:', list(dictionary.items())[:10], end = '\n')
print('Reverse dictionary:', list(reverse_dictionary.items())[:10], end = '\n')
print('Vocabulary size: ', vocabulary_size, end = '\n')


Dictionary: [('<unk>', 0), ('<s>', 1), ('</s>', 2), ('the', 3), (',', 4), ('.', 5), ('of', 6), ('and', 7), ('to', 8), ('in', 9)]
Reverse dictionary: [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, 'the'), (4, ','), (5, '.'), (6, 'of'), (7, 'and'), (8, 'to'), (9, 'in')]
Vocabulary size:  50000


### Loading data
Here we load the data from the dataset.csv file (generated in the other script)

In [4]:
dataset = pd.read_csv('data/dataset.csv')

### Data pre-processing
Transform to lower, remove the new line and the punctuation

In [5]:
wt = nltk.tokenize.WhitespaceTokenizer()

for column in dataset.columns:
    dataset[column] = dataset[column].str.lower() 
    dataset[column] = dataset[column].str.replace(',', ' ,')  \
                                     .str.replace('.',' .')   \
                                     .str.replace('?',' ?')   \
                                     .str.replace('\n',' ')
    dataset[column] = dataset[column].apply(wt.tokenize)
dataset = shuffle(dataset)

  .str.replace('.',' .')   \
  .str.replace('?',' ?')   \


In [6]:
dataset.head()

Unnamed: 0,question,answer
155936,"[men, of, reddit, ,, do, you, trim, your, armp...","[yes, ., it's, less, stinky, and, i, think, it..."
587712,"[redditors, ,, what, is, one, thing, you, are,...","[""i've, never, been, much, for, sports, but, o..."
1113971,"[hikers, and, campers, of, reddit, ,, what, is...","[my, brother, and, i, were, out, in, a, three-..."
222731,"[reddit, parents:, what, do, do, when, a, care...","[""tell, him, you, don't, want, your, kid, span..."
1061648,"[is, there, a, commercial, that, you, hate, so...","[education, connection, ., i, fucking, hate, t..."


### Data analysis
Mean sentence length and standard deviation of sentence length

In [None]:
print('(Questions) Average sentence length: ', dataset['question'].str.len().mean())
print('(Questions) Standard deviation of sentence length: ', dataset['question'].str.len().std())

print('(Answers) Average sentence length: ', dataset['answer'].str.len().mean())
print('(Answers) Standard deviation of sentence length: ', dataset['answer'].str.len().std())

### Update the sentences to fixed length
Update all sentences with a fixed size, to process the sentences as batches.

In [None]:
# maximum sentence length
max_sent_length = {'question' : 30, 'answer': 70}

for column in dataset.columns:
    for tokens in dataset[column]: 
        
        # adding the start token
        tokens.insert(0, '<s>')
        
        if len(tokens) >= max_sent_length[column]:
            tokens = tokens[:max_sent_length[column] - 1]
            tokens.append('</s>')
            
        if len(tokens) < max_sent_length[column]:
            tokens.extend(['</s>' for _ in range(max_sent_length[column] - len(tokens))])

In [15]:
model = Word2Vec(alldata,
                 sg=1,           
                 window=3,       
                 min_count=1,     
                 workers=4,       
                 iter=1)        
model.init_sims(replace=True) 

TypeError: Word2Vec.__init__() got an unexpected keyword argument 'iter'