# Modul Spezielle Anwendungen der Informatik: K.I. in der Robotik

## Projektpräsentation: Sequenzmodelle in PyTorch am Beispiel eines simplen LSTM-Maschinenübersetzers

## 1. Modellarchitektur

In [70]:
### Framework imports
import torch
from torch import optim
import os
import random

### Custom imports 
from model.model import *
from experiment.train_eval import evaluateInput, GreedySearchDecoder, trainIters, eval_batch, plot_training_results
from global_settings import device, FILENAME, SAVE_DIR, PREPRO_DIR, TRAIN_FILE, TEST_FILE, EXPERIMENT_DIR, LOG_FILE
from model.model import EncoderLSTM, DecoderLSTM
from utils.prepro import read_lines, preprocess_pipeline, load_cleaned_data, save_clean_data
from utils.tokenize import build_vocab, batch2TrainData, indexesFromSentence

from global_settings import DATA_DIR
from utils.utils import split_data, filter_pairs, max_length, plot_grad_flow

In [30]:
### Data cleaning
# Load the cleaned sentence pairs from the preprocessing directory if they
# were already produced; otherwise read the raw file and preprocess it.
start_root = "."
exp_contraction = True  # expand contractions, e.g. "don't" --> "do not"
file_to_load = "simple_dataset_praesi.txt"  # input file read from DATA_DIR
file_name = "simple_dataset_praesi.pkl"  # file looked up in PREPRO_DIR

cleaned_path = os.path.join(start_root, PREPRO_DIR, file_name)

if os.path.isfile(cleaned_path):
    print("File exists. Loading cleaned pairs...")
    # Pass `file_name`: the previously used name `cleaned_file` is not
    # defined anywhere in this notebook and raised a NameError on a fresh run.
    pairs = load_cleaned_data(PREPRO_DIR, filename=file_name)
else:
    print("Preprocessing file...")
    # Read the lines of the input file
    pairs = read_lines(os.path.join(start_root, DATA_DIR), file_to_load)
    # Run the preprocessing pipeline on the pairs.
    # NOTE(review): max_len=0 presumably disables length filtering, and
    # `path` is presumably where the cleaned file was stored - confirm
    # against utils.prepro.preprocess_pipeline.
    pairs, path = preprocess_pipeline(pairs, file_name, exp_contraction, max_len=0)

File exists. Loading cleaned pairs...


In [37]:
# Show one arbitrary pair and the size of the dataset.
print(random.choice(pairs))
print("Total pairs in the small dataset:")
print(len(pairs))

# Build the per-language sentence lists straight from `pairs` so that this
# cell does not depend on variables that are only created in a later cell.
max_src_l = max_length([pair[0] for pair in pairs])
max_trg_l = max_length([pair[1] for pair in pairs])

print("Max length in source sentences:", max_src_l)
print("Max length in target sentences:", max_trg_l)

['stop', 'stopp']
Total pairs in the small dataset:
100
Max length in source sentences: [3]
Max length in target sentences: [5]


In [36]:
### Split every pair into its source (index 0) and target (index 1) sentence
src_sents = [pair[0] for pair in pairs]
trg_sents = [pair[1] for pair in pairs]

# Print one arbitrary sentence per language (source first, then target).
for sentences in (src_sents, trg_sents):
    print(random.choice(sentences))

beat it
macht schon


In [48]:
### Build one vocabulary per language
input_lang = build_vocab(src_sents, "eng")
output_lang = build_vocab(trg_sents, "deu")

print("Total source words:", input_lang.num_words)
print("Total target words:", output_lang.num_words)

# Demonstrate both lookup directions on the source vocabulary;
# .get() yields None instead of raising when the key is unknown.
sample_word = 'hello'
sample_index = 20
print("Example of conversion word > index:")
print("Word {} > Index {}".format(sample_word, input_lang.word2index.get(sample_word)))
print("Index {} > Word {}".format(sample_index, input_lang.index2word.get(sample_index)))

Total source words: 55
Total target words: 125
Example of conversion word > index:
Word hello > Index 13
Index 20 > Word cheers


In [68]:
### Simple conversion sentence to tensor:
# Pick a fixed example pair. Index `pairs` directly: `train_pairs` is only
# assigned in a later cell (as an alias of `pairs`), so using it here fails
# when the notebook is executed top to bottom.
random_pair = pairs[40]
print(random_pair)

['i paid', 'ich zahlte']


In [73]:
# Convert both sides of the example pair to index lists using the
# vocabulary of the respective language.
english_sent = indexesFromSentence(input_lang, random_pair[0])
german_sent = indexesFromSentence(output_lang, random_pair[1])

# One list per line: source first, then target.
print(english_sent, german_sent, sep="\n")

[14, 33, 2]
[18, 56, 2]


In [87]:
### No splitting for this short presentation :-)
train_pairs = pairs
mini_batch = 5

# Draw `mini_batch` pairs (with replacement). The batch size variable is
# used here instead of a second hard-coded 5 so both cannot drift apart.
batch_pair = [random.choice(train_pairs) for _ in range(mini_batch)]

# Longest source sentence (by word count) first.
# NOTE(review): presumably required by the packing/padding done in
# batch2TrainData or the encoder - confirm.
batch_pair.sort(key=lambda x: len(x[0].split(" ")), reverse=True)

for pair in batch_pair:
    print("Source:", pair[0], "Target:", pair[1])
    print("Src tensor:", indexesFromSentence(input_lang, pair[0]),
          "Trg tensor:", indexesFromSentence(output_lang, pair[1]))

Source: beat it Target: schwirr ab
Src tensor: [49, 23, 2] Trg tensor: [113, 98, 2]
Source: get tom Target: hol tom
Src tensor: [52, 44, 2] Trg tensor: [123, 84, 2]
Source: beat it Target: scher dich weg
Src tensor: [49, 23, 2] Trg tensor: [116, 89, 96, 2]
Source: really Target: echt
Src tensor: [40, 2] Trg tensor: [77, 2]
Source: wow Target: donnerwetter
Src tensor: [6, 2] Trg tensor: [9, 2]


In [88]:
### Creating a simple batch of 5 sentences --> Shape (seq_len, batch_size)
# Turns the sampled pairs into the batch that is unpacked in the next cell
# (input tensor, input lengths, target tensor, mask, target max length,
# target lengths).
training_batch = batch2TrainData(input_lang, output_lang, batch_pair)

In [89]:
# Split the batch into its six components.
(
    input_tensor,
    input_lengths,
    target_tensor,
    mask,
    target_max_len,
    target_lengths,
) = training_batch

In [90]:
# One length per sentence in the batch; judging by the printed output the
# count includes the trailing <EOS> token.
print("Length of source sentences:", input_lengths)

Length of source sentences: tensor([3, 3, 3, 2, 2])


In [91]:
# Each column is one sentence, each row one time step.
print("Tensorized input:", input_tensor, sep="\n")

Tensorized input:
tensor([[49, 52, 49, 40,  6],
        [23, 44, 23,  2,  2],
        [ 2,  2,  2,  0,  0]])


In [92]:
# Same layout as the input tensor: one column per sentence.
print("Tensorized output:", target_tensor, sep="\n")

Tensorized output:
tensor([[113, 123, 116,  77,   9],
        [ 98,  84,  89,   2,   2],
        [  2,   2,  96,   0,   0],
        [  0,   0,   2,   0,   0]])


## 2. Encoding - Decoding Verfahren:

Sowohl Encoder als auch Decoder greifen auf den ersten Index zu, sprich: Die Eingaben werden nicht entlang der batch_size-Dimension, sondern als Sequenz entlang der Sequenzlänge (Zeitschritt für Zeitschritt) verarbeitet, wie folgt:

In [95]:
### This is what the encoder / decoder receives at every time step t:
# Iterating over the tensor walks along its first dimension, so each
# iteration yields the token ids of all sentences at one time step.
for timestep, token_ids in enumerate(input_tensor):
    print("Timestep:", timestep)
    print("Input:", token_ids)
    words = [input_lang.index2word[token_id.item()] for token_id in token_ids]
    print("Woerter:", words)


Timestep: 0
Input: tensor([49, 52, 49, 40,  6])
Woerter: ['beat', 'get', 'beat', 'really', 'wow']
Timestep: 1
Input: tensor([23, 44, 23,  2,  2])
Woerter: ['it', 'tom', 'it', '<EOS>', '<EOS>']
Timestep: 2
Input: tensor([2, 2, 2, 0, 0])
Woerter: ['<EOS>', '<EOS>', '<EOS>', '<PAD>', '<PAD>']


In [96]:
### Genauso im Decoder


## 3. Übersetzen 