# Neural Machine Translation with Bahdanau Attention Mechanism

You will build a Neural Machine Translation (NMT) model to translate human readable dates ("25th of June, 2009") into machine readable dates ("2009-06-25"). You will do this using an attention model, one of the most sophisticated sequence to sequence models.  

In [0]:
import numpy as np
try:
    from faker import Faker
except ImportError:
    !pip install faker
    from faker import Faker
from babel.dates import format_date
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import random
from tqdm import tqdm
import matplotlib.pyplot as plt

fake = Faker()
Faker.seed(12345)
random.seed(12345)

# Define format of the data we would like to generate
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY', 
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']

# change this if you want it to work with another language
LOCALES = ['en_US']

def load_date():
    """
        Loads some fake dates 
        :returns: tuple containing human readable string, machine readable string, and date object
    """
    dt = fake.date_object()

    try:
        human_readable = format_date(dt, format=random.choice(FORMATS),  locale='en_US') # locale=random.choice(LOCALES))
        human_readable = human_readable.lower()
        human_readable = human_readable.replace(',','')
        machine_readable = dt.isoformat()
        
    except AttributeError as e:
        return None, None, None

    return human_readable, machine_readable, dt

def load_dataset(m, model="bahdanau"):
    """
        Loads a dataset with m examples and vocabularies
        :m: the number of examples to generate
    """
    assert model in ["bahdanau", "luong"], "Either `bahdanau` or `luong`."

    human_vocab = set()
    machine_vocab = set()
    dataset = []
    Tx = 30
    

    for i in tqdm(range(m)):
        h, m, _ = load_date()
        if h is not None:
            dataset.append((h, m if model == "bahdanau" else "#" + m))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(m))
    
    human = dict(zip(['<pad>', '<unk>'] + sorted(human_vocab), 
                     list(range(len(human_vocab) + 2))))
    machine_vocab = sorted(machine_vocab)
    if model == "luong":
        machine_vocab = ["#"] + machine_vocab
    inv_machine = dict(enumerate(machine_vocab))
    machine = {v:k for k,v in inv_machine.items()}
 
    return dataset, human, machine, inv_machine

def load_dataset_v2(m):
    """
        Loads a dataset with m examples and vocabularies
        :m: the number of examples to generate
    """
    
    human_vocab = set()
    machine_vocab = set()
    dataset = []
    Tx = 30
    

    for i in tqdm(range(m)):
        h, m, _ = load_date()
        if h is not None:
            dataset.append((h, "#" + m))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(m))
    
    human = dict(zip(['<pad>', '<unk>'] + sorted(human_vocab), 
                     list(range(len(human_vocab) + 2))))
    inv_machine = dict(enumerate(["#"] + sorted(machine_vocab)))
    machine = {v:k for k,v in inv_machine.items()}
 
    return dataset, human, machine, inv_machine

def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    
    X, Y = zip(*dataset)
    
    X = np.array([string_to_int(i, Tx, human_vocab) for i in X])
    Y = np.array([string_to_int(t, Ty, machine_vocab) for t in Y])

    return X, Y

def string_to_int(string, length, vocab):
    """
    Converts all strings in the vocabulary into a list of integers representing the positions of the
    input string's characters in the "vocab"
    
    Arguments:
    string -- input string, e.g. 'Wed 10 Jul 2007'
    length -- the number of time steps you'd like, determines if the output will be padded or cut
    vocab -- vocabulary, dictionary used to index every character of your "string"
    
    Returns:
    rep -- list of integers (or '<unk>') (size = length) representing the position of the string's character in the vocabulary
    """
    
    #make lower to standardize
    string = string.lower()
    string = string.replace(',','')
    
    if len(string) > length:
        string = string[:length]
        
    rep = list(map(lambda x: vocab.get(x, '<unk>'), string))
    
    if len(string) < length:
        rep += [vocab['<pad>']] * (length - len(string))
    
    #print (rep)
    return rep


def int_to_string(ints, inv_vocab):
    """
    Output a machine readable list of characters based on a list of indexes in the machine's vocabulary
    
    Arguments:
    ints -- list of integers representing indexes in the machine's vocabulary
    inv_vocab -- dictionary mapping machine readable indexes to machine readable characters 
    
    Returns:
    l -- list of characters corresponding to the indexes of ints thanks to the inv_vocab mapping
    """
    
    l = [inv_vocab[i] for i in ints]
    return l


EXAMPLES = ['3 May 1979', '5 Apr 09', '20th February 2016', 'Wed 10 Jul 2007']

def run_example(model, input_vocabulary, inv_output_vocabulary, text):
    encoded = string_to_int(text, TIME_STEPS, input_vocabulary)
    prediction = model.predict(np.array([encoded]))
    prediction = np.argmax(prediction[0], axis=-1)
    return int_to_string(prediction, inv_output_vocabulary)

def run_examples(model, input_vocabulary, inv_output_vocabulary, examples=EXAMPLES):
    predicted = []
    for example in examples:
        predicted.append(''.join(run_example(model, input_vocabulary, inv_output_vocabulary, example)))
        print('input:', example)
        print('output:', predicted[-1])
    return predicted

def to_categorical(y, num_classes):
    """ 1-hot encodes a tensor """
    return np.eye(num_classes, dtype='uint8')[y]

## 1 - Translating human readable dates into machine readable dates

The model you will build here could be used to translate from one language to another, such as translating from English to Hindi. However, language translation requires massive datasets and usually takes days of training on GPUs. To give you a place to experiment with these models even without using massive datasets, we will instead use a simpler "date translation" task. 

The network will input a date written in a variety of possible formats (*e.g. "the 29th of August 1958", "03/30/1968", "24 JUNE 1987"*) and translate them into standardized, machine readable dates (*e.g. "1958-08-29", "1968-03-30", "1987-06-24"*). We will have the network learn to output dates in the common machine-readable format YYYY-MM-DD. 



<!-- 
Take a look at [nmt_utils.py](./nmt_utils.py) to see all the formatting. Count and figure out how the formats work, you will need this knowledge later. !--> 

### 1.1 - Dataset

We will train the model on a dataset of 60000 human readable dates and their equivalent, standardized, machine readable dates. Let's run the following cells to load the dataset and print some examples. 

In [0]:
m = 60000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|██████████| 60000/60000 [00:03<00:00, 16478.85it/s]


In [0]:
dataset[:10]

[('9 may 1998', '1998-05-09'),
 ('10.11.19', '2019-11-10'),
 ('9/10/70', '1970-09-10'),
 ('saturday april 28 1990', '1990-04-28'),
 ('thursday january 26 1995', '1995-01-26'),
 ('monday march 7 1983', '1983-03-07'),
 ('sunday may 22 1988', '1988-05-22'),
 ('08 jul 2008', '2008-07-08'),
 ('8 sep 1999', '1999-09-08'),
 ('thursday january 1 1981', '1981-01-01')]

In [0]:
human_vocab

{' ': 2,
 '.': 3,
 '/': 4,
 '0': 5,
 '1': 6,
 '2': 7,
 '3': 8,
 '4': 9,
 '5': 10,
 '6': 11,
 '7': 12,
 '8': 13,
 '9': 14,
 '<pad>': 0,
 '<unk>': 1,
 'a': 15,
 'b': 16,
 'c': 17,
 'd': 18,
 'e': 19,
 'f': 20,
 'g': 21,
 'h': 22,
 'i': 23,
 'j': 24,
 'l': 25,
 'm': 26,
 'n': 27,
 'o': 28,
 'p': 29,
 'r': 30,
 's': 31,
 't': 32,
 'u': 33,
 'v': 34,
 'w': 35,
 'y': 36}

In [0]:
machine_vocab

{'-': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10}

You've loaded:
- `dataset`: a list of tuples of (human readable date, machine readable date)
- `human_vocab`: a python dictionary mapping all characters used in the human readable dates to an integer-valued index 
- `machine_vocab`: a python dictionary mapping all characters used in machine readable dates to an integer-valued index. These indices are not necessarily consistent with `human_vocab`. 
- `inv_machine_vocab`: the inverse dictionary of `machine_vocab`, mapping from indices back to characters. 

Let's preprocess the data and map the raw text data into the index values. We will also use Tx=30 (which we assume is the maximum length of the human readable date; if we get a longer input, we would have to truncate it) and Ty=10 (since "YYYY-MM-DD" is 10 characters long). 

In [0]:
Tx = 30
Ty = 10

X, Y = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

Y = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))

print("X.shape:", X.shape)
print("Y.shape:", Y.shape)

X.shape: (60000, 30)
Y.shape: (60000, 10, 11)


## 2 - Neural machine translation with Bahdanau Attention

If you had to translate a book's paragraph from French to English, you would not read the whole paragraph, then close the book and translate. Even during the translation process, you would read/re-read and focus on the parts of the French paragraph corresponding to the parts of the English you are writing down. 

The attention mechanism tells a Neural Machine Translation model where it should pay attention to at any step. 


### 2.1 - Bahdanau Attention mechanism

In this part, you will implement the attention mechanism presented in the lecture videos. Here is a figure to remind you how the model works. The diagram on the left shows the attention model. The diagram on the right shows what one "Attention" step does to calculate the attention variables $\alpha^{\langle t, t' \rangle}$, which are used to compute the context variable $context^{\langle t \rangle}$ for each timestep in the output ($t=1, \ldots, T_y$). 

<table>
<td> 
<img src="https://i.imgur.com/fuOZgQl.png" style="width:500;height:500px;"> <br>
</td> 
<td> 
<img src="https://i.imgur.com/CEgMHFc.png" style="width:500;height:500px;"> <br>
</td> 
</table>
<caption><center> **Figure 1**: Neural machine translation with Bahdanau Attention</center></caption>


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
class BahdanauAttention(nn.Module):
    
    def __init__(self, num_embeds, embed_dim, encoder_units, decoder_units, enc_len, dec_len, num_vocab):
        """
        decoder_units = 2*encoder_units because bidirectional of the encoder
        """
        super().__init__()
        self.word_embed = nn.Embedding(num_embeddings=num_embeds, embedding_dim=embed_dim)
        self.encoder_lstm = nn.LSTM(input_size=embed_dim, hidden_size=encoder_units, batch_first=True, bidirectional=True)
        self.ffn1 = nn.Linear(in_features=encoder_units*2 + decoder_units, out_features=decoder_units)
        self.ffn2 = nn.Linear(in_features=decoder_units, out_features=1)
        self.decoder_lstm = nn.LSTM(input_size=decoder_units, hidden_size=decoder_units, batch_first=True)
        self.enc_len = enc_len
        self.dec_len = dec_len
        self.logits = nn.Linear(in_features=decoder_units, out_features=num_vocab)

    def forward(self, x, init_states):
        x = self.word_embed(x)
        h_enc_seq ,_ = self.encoder_lstm(x) # LSTM bidirectional, shape = (batch_size, enc_len, encoder_units)
        h_dec, c_dec = init_states # shape = (batch_size, decoder_units)
        h_dec = h_dec.unsqueeze(0)
        c_dec = c_dec.unsqueeze(0)
        outputs = []
        for i in range(self.dec_len):
            h_dec_seq = torch.cat([h_dec.squeeze(0).unsqueeze(1)]*self.enc_len, dim=1)
            x = torch.cat([h_dec_seq, h_enc_seq], dim=-1) # shape = (batch_size, enc_len, encoder_units + decoder_units)
            x = self.ffn1(x) # shape = (batch_size, enc_len, decoder_units)
            x = torch.tanh(x)
            x = self.ffn2(x).squeeze(-1)  # shape = (batch_size, enc_len)
            x = torch.softmax(x, dim=-1)

            context = torch.einsum("nse,ns->ne", h_enc_seq, x) # shape = (batch_size, encoder_units)
            context = torch.unsqueeze(context, 1)
            out, (h_dec, c_dec) = self.decoder_lstm(context, (h_dec, c_dec)) # shape = (batch_size, 1, decoder_units)
            out = self.logits(out.squeeze(1)) # shape = (batch_size, num_vocab)
            out = torch.softmax(out, dim=-1)
            outputs.append(out.unsqueeze(1))

        outputs = torch.cat(outputs, dim=1)
        return outputs

In [0]:
NUM_EMBEDS = len(human_vocab)
EMBED_DIM = 32
ENCODER_UNITS = 32
DECODER_UNITS = ENCODER_UNITS * 2
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

Load the model

In [0]:
bahdanau = BahdanauAttention(num_embeds=NUM_EMBEDS, embed_dim=EMBED_DIM, encoder_units=ENCODER_UNITS, decoder_units=DECODER_UNITS, enc_len=Tx, dec_len=Ty, num_vocab=len(machine_vocab)).to(DEVICE)

In [0]:
bahdanau.parameters

<bound method Module.parameters of BahdanauAttention(
  (word_embed): Embedding(37, 32)
  (encoder_lstm): LSTM(32, 32, batch_first=True, bidirectional=True)
  (ffn1): Linear(in_features=128, out_features=64, bias=True)
  (ffn2): Linear(in_features=64, out_features=1, bias=True)
  (decoder_lstm): LSTM(64, 64, batch_first=True)
  (logits): Linear(in_features=64, out_features=11, bias=True)
)>

In [0]:
def loss_func(Y_hat, Y):
    res = torch.sum(Y*torch.log(Y_hat), dim=-1)
    res = torch.sum(res, dim=-1)
    return torch.mean(-res)

optimizer = torch.optim.Adam(bahdanau.parameters(), lr=0.001, betas=(0.9, 0.999))

In [0]:
BATCH_SIZE = 64
EPOCHS = 10
num_batches = int(np.ceil(m / 64))

X = torch.Tensor(X).long().to(DEVICE)
Y = torch.Tensor(Y).long().to(DEVICE)

In [0]:
for e in range(EPOCHS):
    indices = torch.randperm(m)
    X = X[indices]
    Y = Y[indices]
    epoch_loss = 0.0

    for b in range(num_batches):
        X_batch = X[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
        h, c = torch.zeros(X_batch.shape[0], DECODER_UNITS).to(DEVICE), torch.zeros(X_batch.shape[0], DECODER_UNITS).to(DEVICE)
        Y_hat = bahdanau(X_batch, (h, c))
        Y_batch = Y[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
        batch_loss = loss_func(Y_hat, Y_batch)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        epoch_loss += batch_loss.item()

    print("Loss at epoch %d: %.3f" % (e, epoch_loss/num_batches))

Loss at epoch 0: 8.398
Loss at epoch 1: 0.335
Loss at epoch 2: 0.051
Loss at epoch 3: 0.031
Loss at epoch 4: 0.099
Loss at epoch 5: 0.031
Loss at epoch 6: 0.022
Loss at epoch 7: 0.019
Loss at epoch 8: 0.017
Loss at epoch 9: 0.017


You can now see the results on new examples.

In [0]:
EXAMPLES = ['3 May 1979', '5 April 09', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']
bahdanau.eval()
for example in EXAMPLES:
    source = string_to_int(example, Tx, human_vocab)
    source = torch.Tensor(np.array([source])).long().to(DEVICE)
    h, c = torch.zeros(1, DECODER_UNITS).to(DEVICE), torch.zeros(1, DECODER_UNITS).to(DEVICE)
    prediction = bahdanau(source, (h, c))
    prediction = torch.argmax(prediction, dim=-1)
    prediction = prediction.cpu().numpy()[-1]
    output = [inv_machine_vocab[int(i)] for i in prediction]
    
    print("source:", example)
    print("output:", ''.join(output))

source: 3 May 1979
output: 1979-05-03
source: 5 April 09
output: 2009-04-05
source: 21th of August 2016
output: 2016-08-12
source: Tue 10 Jul 2007
output: 2007-07-10
source: Saturday May 9 2018
output: 2018-05-09
source: March 3 2001
output: 2001-03-03
source: March 3rd 2001
output: 2001-03-00
source: 1 March 2001
output: 2001-03-01


### 2.2 Luong attention mechanism
<img src="https://i.imgur.com/46R4XQV.png" style="width:500;height:500px;"> <br>

<caption><center> Luong attention mechanism</center></caption>

<img src="https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg">

<img src="https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg">

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
m = 60000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m, "luong")

100%|██████████| 60000/60000 [00:02<00:00, 22962.40it/s]


In [0]:
Tx = 30
Ty = 10

X, Y = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty+1)
Y_input = Y[:, :-1]
Y_target = Y[:, 1:]
Y_target = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y_target)))

print("X.shape:", X.shape)
print("Y_input.shape:", Y_input.shape)
print("Y_target.shape:", Y_target.shape)

X.shape: (60000, 30)
Y_input.shape: (60000, 10)
Y_target.shape: (60000, 10, 12)


In [0]:
class Luong(nn.Module):

    def __init__(self, len_vocab, embed_size, lstm_units):
        super().__init__()
        self.len_vocab = len_vocab
        self.embed_size = embed_size
        self.lstm_units = lstm_units
        self.word_embed = nn.Embedding(num_embeddings=len_vocab, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=lstm_units, batch_first=True)

class LuongEncoder(Luong):
    
    def forward(self, x):
        x = self.word_embed(x)
        out, hc_states = self.lstm(x)
        return out, hc_states

class LuongDecoder(Luong):

    def forward(self, x, hidden_states=None):
        x = self.word_embed(x)
        out, hc_states = self.lstm(x, hidden_states)
        return out, hc_states

class LuongAttention(nn.Module):

    def __init__(self, encoder, decoder, enc_len, dec_len):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.enc_len = enc_len
        self.dec_len = dec_len
        self.ffn1 = nn.Linear(in_features=self.encoder.lstm_units + self.decoder.lstm_units, out_features=self.decoder.lstm_units)
        self.ffn2 = nn.Linear(in_features=self.decoder.lstm_units, out_features=1)
        self.attention = nn.Linear(in_features=self.encoder.lstm_units + self.decoder.lstm_units, out_features=self.decoder.lstm_units)
        self.logits = nn.Linear(in_features=self.decoder.lstm_units, out_features=self.decoder.len_vocab)

    def forward(self, x, y):
        """
        x: input of encoder, shape = (batch_size, enc_len, num_enc_vocab)
        y: input of decoder, shape = (batch_size, dec_len, num_dec_vocab)
        """
        encoder_outputs, hidden_states = self.encoder(x) # encoder_outputs shape = (batch_size, enc_len, enc_units), unidirectional
        decoder_outputs, _ = self.decoder(y, hidden_states) # decoder_outputs shape = (batch_size, dec_len, dec_units)

        x = torch.cat([encoder_outputs.unsqueeze(2)]*self.dec_len, dim=2) # x shape = (batch_size, enc_len, dec_len, enc_units)
        y = torch.cat([decoder_outputs.unsqueeze(1)]*self.enc_len, dim=1) # y shape = (batch_size, enc_len, dec_len, dec_units)

        score = torch.cat([x, y], dim=-1)
        score = self.ffn1(score)
        score = torch.tanh(score)
        score = self.ffn2(score).squeeze(3) # score shape = (batch_size, enc_len, dec_len)
        
        alpha = F.softmax(score, dim=1) # alpha shape = (batch_size, enc_len, dec_len)
        context = torch.einsum("bst,bse->bte", alpha, encoder_outputs) # context shape = (batch_size, dec_len, enc_units)
        concat = torch.cat([context, decoder_outputs], dim=-1) # concat shape = (batch_size, dec_len, enc_units + dec_units)
        
        att = self.attention(concat)
        att = torch.tanh(att) # att shape = (batch_size, dec_len, dec_units)

        out = self.logits(att)
        out = F.softmax(out, dim=-1) # out shape = (batch_size, dec_len, num_dec_vocab)

        return out

In [0]:
NUM_EMBEDS = len(human_vocab)
EMBED_DIM = 32
ENCODER_UNITS = 32
DECODER_UNITS = ENCODER_UNITS
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [0]:
encoder = LuongEncoder(len_vocab=len(human_vocab), embed_size=EMBED_DIM, lstm_units=ENCODER_UNITS)
decoder = LuongDecoder(len_vocab=len(machine_vocab), embed_size=EMBED_DIM, lstm_units=DECODER_UNITS)

luong = LuongAttention(encoder, decoder, Tx, Ty).to(DEVICE)

In [0]:
luong.parameters

<bound method Module.parameters of LuongAttention(
  (encoder): LuongEncoder(
    (word_embed): Embedding(37, 32)
    (lstm): LSTM(32, 32, batch_first=True)
  )
  (decoder): LuongDecoder(
    (word_embed): Embedding(12, 32)
    (lstm): LSTM(32, 32, batch_first=True)
  )
  (ffn1): Linear(in_features=64, out_features=32, bias=True)
  (ffn2): Linear(in_features=32, out_features=1, bias=True)
  (attention): Linear(in_features=64, out_features=32, bias=True)
  (logits): Linear(in_features=32, out_features=12, bias=True)
)>

In [0]:
def loss_func(Y_hat, Y):
    res = torch.sum(Y*torch.log(Y_hat), dim=-1)
    res = torch.sum(res, dim=-1)
    return torch.mean(-res)

optimizer = torch.optim.Adam(luong.parameters(), lr=0.005, betas=(0.9, 0.999))

In [0]:
BATCH_SIZE = 64
EPOCHS = 10
num_batches = int(np.ceil(m / 64))

X = torch.Tensor(X).long().to(DEVICE)
Y_input = torch.Tensor(Y_input).long().to(DEVICE)
Y_target = torch.Tensor(Y_target).long().to(DEVICE)

In [0]:
for e in range(EPOCHS):
    indices = torch.randperm(m)
    X = X[indices]
    Y = Y[indices]
    epoch_loss = 0.0

    for b in range(num_batches):
        X_batch = X[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
        Y_input_batch = Y_input[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
        Y_target_batch = Y_target[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
        Y_hat = luong(X_batch, Y_input_batch).to(DEVICE)
        batch_loss = loss_func(Y_hat, Y_target_batch)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        epoch_loss += batch_loss.item()

    print("Loss at epoch %d: %.3f" % (e, epoch_loss/num_batches))

Loss at epoch 0: 10.255
Loss at epoch 1: 9.874
Loss at epoch 2: 9.862
Loss at epoch 3: 9.857
Loss at epoch 4: 9.856
Loss at epoch 5: 9.853
Loss at epoch 6: 9.851
Loss at epoch 7: 9.851
Loss at epoch 8: 9.851
Loss at epoch 9: 9.850
