# TP 4: Training a Feedforward neural network
Master LiTL - 2021-2022

## Requirements
In this section, we will investigate variations of the setting and the hyper-parameter values of a feedforward NN, still on sentiment analysis on a French dataset of reviews.
Our goal is to find the best model for the task, so this time we will make use of the development set!

We will explore:    
* varied architectures
* varied optimizers
* varied activation functions
* varied values for the hyper-parameters

And in part 2:
* varied representations: sparse and continuous bag-of-words
* add plots of the cost function
* add plots for the number of training examples

In [None]:
import torch

# On Colab, a GPU can be requested via Edit > Notebook Settings.
# This cell only reports whether CUDA is visible; it does not move any tensor.
print("GPU ok" if torch.cuda.is_available() else "no gpu")

GPU ok



## 1. Code for running a FFNN

### 1.1 Read the data

The code below is the same as last time: the input is a BoW representation.

In [None]:
import pandas as pd
import numpy as np
import re
import sklearn
import torch
from torch.utils.data import TensorDataset, DataLoader

from sklearn.feature_extraction.text import CountVectorizer

train_path = "allocine_train.tsv"
dev_path = "allocine_dev.tsv"
test_path = "allocine_test.tsv"

# Upper bound on the vocabulary kept by CountVectorizer; it is reused later
# as the input dimension of the model.
MAX_FEATURES = 5000


def _read_split(path):
    """Read one tab-separated split whose first row is the header.

    quoting=3 is csv.QUOTE_NONE: quote characters are kept as plain text.
    """
    return pd.read_csv(path, header=0, delimiter="\t", quoting=3)


# -- TRAIN: the vocabulary is fitted on this split only
train_df = _read_split(train_path)
print("Creating features from bag of words...")
vectorizer = CountVectorizer(analyzer="word", max_features=MAX_FEATURES)
train_data_features = vectorizer.fit_transform(train_df["review"])
x_train = train_data_features.toarray()  # sparse counts -> dense array
y_train = np.asarray(train_df["sentiment"])
print("TRAIN:", x_train.shape)

# -- DEV: reuse the fitted vocabulary (transform, not fit_transform)
dev_df = _read_split(dev_path)
dev_data_features = vectorizer.transform(dev_df["review"])
x_dev = dev_data_features.toarray()
y_dev = np.asarray(dev_df["sentiment"])
print("DEV:", x_dev.shape)

# -- TEST: same treatment as dev
test_df = _read_split(test_path)
test_data_features = vectorizer.transform(test_df["review"])
x_test = test_data_features.toarray()
y_test = np.asarray(test_df["sentiment"])
print("TEST:", x_test.shape)

# Number of training examples
count_train = len(x_train)

Creating features from bag of words...
TRAIN: (5027, 5000)
DEV: (549, 5000)
TEST: (544, 5000)


### 1.2 Load the data

Note that batch size is chosen here.

In [None]:
# Load data into TENSORS

def load_data( x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=1 ):
    """Wrap the NumPy splits into PyTorch DataLoaders.

    Features are converted to float tensors; labels keep the dtype of the
    incoming arrays.

    Args:
        x_train, x_dev, x_test: feature arrays, one row per example.
        y_train, y_dev, y_test: label arrays aligned with the feature rows.
        batch_size: mini-batch size of the training loader only.

    Returns:
        (train_loader, dev_loader, test_loader). The training loader is
        shuffled. The dev and test loaders yield one example at a time in
        the original row order, so predictions can be aligned with the data.
    """
    def _as_dataset(features, labels):
        # One helper so the three splits are converted in exactly the same way
        return TensorDataset(torch.from_numpy(features).to(torch.float), torch.from_numpy(labels))

    # Shuffling matters for SGD, so it is applied to the training data only
    train_loader = DataLoader(_as_dataset(x_train, y_train), shuffle=True, batch_size=batch_size)

    # Evaluation does not benefit from shuffling; a fixed order keeps the
    # returned gold/predicted labels aligned with the input rows.
    dev_loader = DataLoader(_as_dataset(x_dev, y_dev), shuffle=False, batch_size=1)
    test_loader = DataLoader(_as_dataset(x_test, y_test), shuffle=False, batch_size=1)

    print('BATCH SIZE:', batch_size)

    return train_loader, dev_loader, test_loader

### 1.3 Neural Network Definition

Now we can build our learning model.

▶▶**What are the elements that can be changed here?**

### SOLUTION
Note that here you can change: hidden_dim, number of hidden layers, activation function.

In [None]:
import torch
import torch.nn as nn

class FeedforwardNeuralNetModel(nn.Module):
    """Feedforward classifier with a single sigmoid hidden layer.

    Computes fc2(sigmoid(fc1(x))) and returns the raw output scores; no
    softmax is applied inside this class.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # nn.Module has to be initialised before sub-modules are attached
        super().__init__()

        # input -> hidden projection
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # element-wise non-linearity applied to the hidden layer
        self.sigmoid = nn.Sigmoid()
        # hidden -> output projection (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

        print( 'INPUT DIM:', input_dim, 'HIDDEN DIM:', hidden_dim)

    def forward(self, x):
        """Return the output scores for the input batch `x`."""
        hidden = self.sigmoid(self.fc1(x))
        return self.fc2(hidden)

### 1.4 Training function

Below, the code for a function that trains a model.

▶▶**What are the elements that can be changed here?**

### SOLUTION

Here you can change: the total number of epochs, the criterion / loss, the optimizer which also includes the learning rate.

In [None]:
# TRAINING
def train( model, train_loader, optimizer, num_epochs=5, loss_fn=None ):
    """Train `model` and report loss and accuracy on the training data per epoch.

    Args:
        model: nn.Module mapping a batch of inputs to class scores.
        train_loader: iterable of (inputs, labels) mini-batches.
        optimizer: optimizer already bound to the parameters of `model`.
        num_epochs: number of passes over `train_loader`.
        loss_fn: loss called as loss_fn(outputs, labels). When None, the
            module-level `criterion` is used, as in the previous version.

    Returns:
        A list with one (mean_loss, accuracy) tuple per epoch, both computed
        over the examples seen during that epoch.
    """
    if loss_fn is None:
        # Backward compatibility: fall back to the global set by the calling cell
        loss_fn = criterion

    history = []
    model.train()
    for epoch in range(num_epochs):
        loss_sum, correct, seen = 0.0, 0, 0
        for inputs, labels in train_loader:
            # Step 1. Clear the gradients accumulated by the previous batch
            optimizer.zero_grad()

            # Step 2. Forward pass to get the output scores (logits)
            outputs = model( inputs )

            # Step 3. Loss, gradients w.r.t. the parameters, parameter update
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_len = labels.size(0)
            # Assumes loss_fn returns the batch mean (the nn.CrossEntropyLoss
            # default): weighting by the batch length makes the epoch value a
            # per-example mean that is comparable across batch sizes.
            loss_sum += loss.item() * batch_len
            correct += (outputs.argmax(1) == labels).sum().item()
            seen += batch_len

        # Normalise by the examples actually seen, not by a global counter
        mean_loss = loss_sum / seen if seen else 0.0
        accuracy = correct / seen if seen else 0.0
        print('Epoch: {}. Loss: {}. ACC {} '.format(epoch, mean_loss, accuracy))
        history.append((mean_loss, accuracy))

    return history

### 1.5 Evaluation

Below you have the code for a function that can be used to evaluate the model: it prints the classification report and returns the gold and predicted labels.

In [None]:
from sklearn.metrics import classification_report, accuracy_score


def evaluate( model, dev_loader ):
    """Predict a label for every example of `dev_loader` and print a report.

    Works for any batch size: every example of every batch is collected.

    Args:
        model: nn.Module mapping a batch of inputs to class scores.
        dev_loader: iterable of (inputs, labels) batches.

    Returns:
        (gold, predictions): two lists of Python ints, in the order in which
        the loader yields the examples.
    """
    predictions = []
    gold = []

    # Switch to evaluation mode for the predictions and restore the previous
    # mode afterwards, so a later training call is not affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in dev_loader:
                scores = model(inputs)
                # argmax over the class dimension: one predicted label per example
                predictions.extend( torch.argmax(scores, dim=1).cpu().tolist() )
                gold.extend( labels.reshape(-1).cpu().tolist() )
    finally:
        model.train(was_training)

    print(classification_report(gold, predictions))
    return gold, predictions



## 2. Running an experiment

Below is a function that can be used to save the results; don't hesitate to write your own or to modify it.

In [None]:
import os
# Results log: one tab-separated line of settings and score per experiment
my_expe = 'scores_ffnn_bow.txt'

def write_expe_settings( my_file, batch=1, hidden=1, hsize=4, act='sigmoid', lr=0.1, opt='sgd', epochs=5, score=0. ):
  """Append one experiment record to `my_file`.

  The record is a single line of tab-separated `name:value` pairs, written
  in the order batch, hidden, hsize, act, lr, opt, epochs, score.
  """
  fields = (
      ('batch', batch),
      ('hidden', hidden),
      ('hsize', hsize),
      ('act', act),
      ('lr', lr),
      ('opt', opt),
      ('epochs', epochs),
      ('score', score),
  )
  record = '\t'.join( '{}:{}'.format(key, value) for key, value in fields )
  # Append mode: earlier records are kept
  with open( my_file, 'a' ) as f:
    f.write( record + '\n' )

### TEST #1

Start testing! The first test is the one with the 'default' parameters used in the previous practical session.

▶▶**Describe the setting of the 'default' experiment**

### SOLUTION

**BoW, 5000 features, Batch size: 1, 1 hidden layer, hidden size: 4, activation: sigmoid, learning rate: 0.1, optimizer: SGD, max epochs: 5**

▶▶ **Run the model and evaluate on dev. Save the score for future comparison.**

In [None]:
# Build the train/dev/test loaders for the default setting (batch size 1)
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train,
    x_dev, y_dev,
    x_test, y_test,
    batch_size=1,
)

BATCH SIZE: 1


In [None]:
# Many choices here!
VOCAB_SIZE = MAX_FEATURES
input_dim = VOCAB_SIZE  # one input unit per vocabulary entry
hidden_dim = 4
output_dim = 2          # two classes

learning_rate = 0.1
num_epochs = 5

criterion = nn.CrossEntropyLoss()

# Initialization of the model
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)

optimizer = torch.optim.SGD( model_bow.parameters(), lr=learning_rate )

# Train and evaluate
# num_epochs is passed explicitly so that the value set above is the one
# actually used (it was previously assigned but ignored).
train( model_bow, train_loader, optimizer, num_epochs=num_epochs )

gold, pred = evaluate( model_bow, dev_loader )
accuracy = accuracy_score( gold, pred )

INPUT DIM: 5000 HIDDEN DIM: 4
Epoch: 0. Loss: 0.5356872780121616. ACC 0.7266759498706983 
Epoch: 1. Loss: 0.3817175804624497. ACC 0.8283270340163119 
Epoch: 2. Loss: 0.3384615813505414. ACC 0.8597573105231748 
Epoch: 3. Loss: 0.2843738552404072. ACC 0.8828327034016312 
Epoch: 4. Loss: 0.26506259036045465. ACC 0.8905908096280087 
              precision    recall  f1-score   support

           0       0.71      0.88      0.79       230
           1       0.89      0.74      0.81       319

    accuracy                           0.80       549
   macro avg       0.80      0.81      0.80       549
weighted avg       0.82      0.80      0.80       549



In [None]:
# Record the default setting together with its dev accuracy
write_expe_settings(
    my_expe,
    batch=1,
    hidden=1,
    hsize=4,
    act='sigmoid',
    lr=0.1,
    opt='sgd',
    epochs=5,
    score=accuracy,
)

## 3. Exercises

▶▶ **Now, try to change:**
1. Batch size 
2. Max number of epochs (with best batch size)
3. Size of the hidden layer
4. Activation function
5. Optimizer
6. Learning rate
7. Try with one additional hidden layer
 

How does this affect the loss and the performance of the model?

---
### 3.1 Batch size
---

Let's try with: 1, 10, 100, 1000

#### TEST #2

▶▶**Describe the setting of the second experiment**

#### SOLUTION

BoW, 5000 features, **Batch size: 10**, 1 hidden layer, hidden size: 4, activation: sigmoid, learning rate: 0.1, optimizer: SGD, max epochs: 5

In [None]:
batch_size = 10

# Load data
train_loader, dev_loader, test_loader = load_data( x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size )

# Hyper-parameters (input_dim is reused from the TEST #1 cell)
hidden_dim = 4
output_dim = 2
learning_rate = 0.1
num_epochs = 5
criterion = nn.CrossEntropyLoss()

# Initialization of the model
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD( model_bow.parameters(), lr=learning_rate )

# Train and evaluate; num_epochs is passed explicitly so the value set above
# is the one actually used (it was previously assigned but ignored)
train( model_bow, train_loader, optimizer, num_epochs=num_epochs )
gold, pred = evaluate( model_bow, dev_loader )
accuracy = accuracy_score( gold, pred )

# Log the variables that were used for the run rather than re-typed literals
write_expe_settings( my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid', lr=learning_rate, opt='sgd', epochs=num_epochs, score=accuracy )

BATCH SIZE: 10
INPUT DIM: 5000 HIDDEN DIM: 4
Epoch: 0. Loss: 0.0595094999511507. ACC 0.6942510443604536 
Epoch: 1. Loss: 0.04239837287291395. ACC 0.821563556793316 
Epoch: 2. Loss: 0.032071483498785636. ACC 0.8703003779590213 
Epoch: 3. Loss: 0.027037093746738024. ACC 0.8903918838273324 
Epoch: 4. Loss: 0.023645152846399065. ACC 0.9063059478814403 
              precision    recall  f1-score   support

           0       0.74      0.91      0.81       230
           1       0.92      0.76      0.84       319

    accuracy                           0.83       549
   macro avg       0.83      0.84      0.82       549
weighted avg       0.84      0.83      0.83       549



#### TEST #3

BoW, 5000 features, **Batch size: 100**, 1 hidden layer, hidden size: 4, activation: sigmoid, learning rate: 0.1, optimizer: SGD, max epochs: 5

In [None]:
batch_size = 100

# Load data
train_loader, dev_loader, test_loader = load_data( x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size )

# Hyper-parameters (input_dim is reused from the TEST #1 cell)
hidden_dim = 4
output_dim = 2
learning_rate = 0.1
num_epochs = 5
criterion = nn.CrossEntropyLoss()

# Initialization of the model
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD( model_bow.parameters(), lr=learning_rate )

# Train and evaluate; num_epochs is passed explicitly so the value set above
# is the one actually used (it was previously assigned but ignored)
train( model_bow, train_loader, optimizer, num_epochs=num_epochs )
gold, pred = evaluate( model_bow, dev_loader )
accuracy = accuracy_score( gold, pred )

# Log the variables that were used for the run rather than re-typed literals
write_expe_settings( my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid', lr=learning_rate, opt='sgd', epochs=num_epochs, score=accuracy )

BATCH SIZE: 100
INPUT DIM: 5000 HIDDEN DIM: 4
Epoch: 0. Loss: 0.006877064195048247. ACC 0.5866321861945495 
Epoch: 1. Loss: 0.006576847776352636. ACC 0.7012134473841257 
Epoch: 2. Loss: 0.006268150068453441. ACC 0.7370200915058683 
Epoch: 3. Loss: 0.0059588611042947735. ACC 0.7525363039586235 
Epoch: 4. Loss: 0.005636463612051259. ACC 0.7718321066242292 
              precision    recall  f1-score   support

           0       0.63      0.83      0.72       230
           1       0.85      0.65      0.74       319

    accuracy                           0.73       549
   macro avg       0.74      0.74      0.73       549
weighted avg       0.76      0.73      0.73       549



#### TEST #4

BoW, 5000 features, **Batch size: 1000**, 1 hidden layer, hidden size: 4, activation: sigmoid, learning rate: 0.1, optimizer: SGD, max epochs: 5

In [None]:
batch_size = 1000

# Load data
train_loader, dev_loader, test_loader = load_data( x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size )

# Hyper-parameters (input_dim is reused from the TEST #1 cell)
hidden_dim = 4
output_dim = 2
learning_rate = 0.1
num_epochs = 5
criterion = nn.CrossEntropyLoss()

# Initialization of the model
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD( model_bow.parameters(), lr=learning_rate )

# Train and evaluate; num_epochs is passed explicitly so the value set above
# is the one actually used (it was previously assigned but ignored)
train( model_bow, train_loader, optimizer, num_epochs=num_epochs )
gold, pred = evaluate( model_bow, dev_loader )
accuracy = accuracy_score( gold, pred )

# Log the variables that were used for the run rather than re-typed literals
write_expe_settings( my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid', lr=learning_rate, opt='sgd', epochs=num_epochs, score=accuracy )

BATCH SIZE: 1000
INPUT DIM: 5000 HIDDEN DIM: 4
Epoch: 0. Loss: 0.0008264508640076072. ACC 0.5090511239307738 
Epoch: 1. Loss: 0.0008226839856264817. ACC 0.5090511239307738 
Epoch: 2. Loss: 0.0008219510275349599. ACC 0.533717923214641 
Epoch: 3. Loss: 0.0008196555313685554. ACC 0.6329818977521384 
Epoch: 4. Loss: 0.0008168000100217189. ACC 0.6461110005967774 
              precision    recall  f1-score   support

           0       0.64      0.41      0.50       230
           1       0.66      0.83      0.74       319

    accuracy                           0.66       549
   macro avg       0.65      0.62      0.62       549
weighted avg       0.65      0.66      0.64       549



#### Answers: Batch size

Increasing the batch size leads to faster training but to a degradation in terms of performance.

see e.g. (Keskar et al. 2016) https://arxiv.org/abs/1609.04836 :
* *It has been observed in practice that when using a larger batch there is a degradation in the quality of the model, as measured by its ability to generalize ... large-batch methods tend to converge to sharp minimizers of the training and testing functions-and as is well known, sharp minima lead to poorer generalization*

Trade-off between faster training and generalization ability.

People often test typical values of 32, 64, 128, 256, 512 and 1024 (but it depends also on the size of the training data, here we have a very small dataset).


---
### 3.2 Number of epochs
---

Let's try with: 5, 50

#### TEST #5

BoW, 5000 features, Batch size: 10, 1 hidden layer, hidden size: 4, activation: sigmoid, learning rate: 0.1, optimizer: SGD, **max epochs: 50**

In [None]:
# Setting of this run: batch size 10, trained for 50 epochs
batch_size, max_epochs = 10, 50

# Data loaders
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Remaining hyper-parameters (input_dim comes from the TEST #1 cell)
hidden_dim, output_dim = 4, 2
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()

# Fresh model and its optimizer
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model_bow.parameters(), lr=learning_rate)

# Training followed by evaluation on the dev set
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Keep a trace of the setting and of its score
write_expe_settings(
    my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid',
    lr=learning_rate, opt='sgd', epochs=max_epochs, score=accuracy,
)

Epoch: 0. Loss: 0.06248715755264589. ACC 0.6560572906305948 
Epoch: 1. Loss: 0.043954694392028044. ACC 0.8187785955838472 
Epoch: 2. Loss: 0.03317398302700213. ACC 0.8671175651481997 
Epoch: 3. Loss: 0.027742600293360578. ACC 0.8931768450368013 
Epoch: 4. Loss: 0.023334740119928248. ACC 0.9098866122936144 
Epoch: 5. Loss: 0.02063481365438897. ACC 0.922816789337577 
Epoch: 6. Loss: 0.01796321026048628. ACC 0.9321663019693655 
Epoch: 7. Loss: 0.01650080053342878. ACC 0.9355480405808634 
Epoch: 8. Loss: 0.014617698005509348. ACC 0.9478814402227969 
Epoch: 9. Loss: 0.013585968076462463. ACC 0.9486771434255022 
Epoch: 10. Loss: 0.011394190632573103. ACC 0.9610105430674358 
Epoch: 11. Loss: 0.01055024223769336. ACC 0.9626019494728466 
Epoch: 12. Loss: 0.009543743463070452. ACC 0.9689675750944897 
Epoch: 13. Loss: 0.007826015936159433. ACC 0.9747364233141038 
Epoch: 14. Loss: 0.007118049319019281. ACC 0.9763278297195146 
Epoch: 15. Loss: 0.006425816581140528. ACC 0.9811020489357469 
Epoch: 16

#### ANSWER

Not much change after 20 iterations on train: convergence.

In [None]:
# Setting of this run: batch size 10, trained for 10 epochs
batch_size, max_epochs = 10, 10

# Data loaders
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Remaining hyper-parameters (input_dim comes from the TEST #1 cell)
hidden_dim, output_dim = 4, 2
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()

# Fresh model and its optimizer
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model_bow.parameters(), lr=learning_rate)

# Training followed by evaluation on the dev set
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Keep a trace of the setting and of its score
write_expe_settings(
    my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid',
    lr=learning_rate, opt='sgd', epochs=max_epochs, score=accuracy,
)

Epoch: 0. Loss: 0.06291212701456006. ACC 0.6528744778197733 
Epoch: 1. Loss: 0.044857184989376006. ACC 0.8082355281480008 
Epoch: 2. Loss: 0.03335049673944552. ACC 0.8677143425502287 
Epoch: 3. Loss: 0.02790208174216242. ACC 0.8897951064253034 
Epoch: 4. Loss: 0.023848686240182167. ACC 0.9076984284861747 
Epoch: 5. Loss: 0.020894395210284623. ACC 0.92102645713149 
Epoch: 6. Loss: 0.018487477743628012. ACC 0.9313705987666601 
Epoch: 7. Loss: 0.016292144128643694. ACC 0.9409190371991247 
Epoch: 8. Loss: 0.014915780355308671. ACC 0.9413168888004774 
Epoch: 9. Loss: 0.013037947227960511. ACC 0.9508653272329421 
              precision    recall  f1-score   support

           0       0.89      0.74      0.81       230
           1       0.83      0.93      0.88       319

    accuracy                           0.85       549
   macro avg       0.86      0.84      0.84       549
weighted avg       0.85      0.85      0.85       549



---
### 3.3 Size of the hidden layer
---

Let's try with: 64, 128, 5000 (the previous experiments used a hidden size of 4)

#### TEST #6

BoW, 5000 features, Batch size: 10, 1 hidden layer, **hidden size: 64**, activation: sigmoid, learning rate: 0.1, optimizer: SGD, max epochs: 5

In [None]:
# Setting of this run: hidden layer of 64 units
batch_size, max_epochs = 10, 5
hidden_dim = 64

# Data loaders
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Remaining hyper-parameters (input_dim comes from the TEST #1 cell)
output_dim = 2
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()

# Fresh model and its optimizer
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model_bow.parameters(), lr=learning_rate)

# Training followed by evaluation on the dev set
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Keep a trace of the setting and of its score
write_expe_settings(
    my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid',
    lr=learning_rate, opt='sgd', epochs=max_epochs, score=accuracy,
)

BATCH SIZE: 10
INPUT DIM: 5000 HIDDEN DIM: 64
Epoch: 0. Loss: 0.06269910253965288. ACC 0.635567933160931 
Epoch: 1. Loss: 0.04317359960806021. ACC 0.8040580863337975 
Epoch: 2. Loss: 0.03290526890804499. ACC 0.8603540879252038 
Epoch: 3. Loss: 0.02682727060934908. ACC 0.8925800676347723 
Epoch: 4. Loss: 0.023164639555685547. ACC 0.9067037994827929 
              precision    recall  f1-score   support

           0       0.79      0.84      0.81       230
           1       0.88      0.84      0.86       319

    accuracy                           0.84       549
   macro avg       0.83      0.84      0.83       549
weighted avg       0.84      0.84      0.84       549



#### TEST #6

BoW, 5000 features, Batch size: 10, 1 hidden layer, **hidden size: 128**, activation: sigmoid, learning rate: 0.1, optimizer: SGD, max epochs: 5

In [None]:
# Setting of this run: hidden layer of 128 units
batch_size, max_epochs = 10, 5
hidden_dim = 128

# Data loaders
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Remaining hyper-parameters (input_dim comes from the TEST #1 cell)
output_dim = 2
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()

# Fresh model and its optimizer
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model_bow.parameters(), lr=learning_rate)

# Training followed by evaluation on the dev set
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Keep a trace of the setting and of its score
write_expe_settings(
    my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid',
    lr=learning_rate, opt='sgd', epochs=max_epochs, score=accuracy,
)

BATCH SIZE: 10
INPUT DIM: 5000 HIDDEN DIM: 128
Epoch: 0. Loss: 0.06637925877114853. ACC 0.612691466083151 
Epoch: 1. Loss: 0.04425935451514161. ACC 0.7970956833101254 
Epoch: 2. Loss: 0.03306083544356604. ACC 0.8677143425502287 
Epoch: 3. Loss: 0.028170021697330457. ACC 0.884424109807042 
Epoch: 4. Loss: 0.023333700381086737. ACC 0.9031231350706187 
              precision    recall  f1-score   support

           0       0.70      0.93      0.80       230
           1       0.93      0.71      0.81       319

    accuracy                           0.80       549
   macro avg       0.82      0.82      0.80       549
weighted avg       0.84      0.80      0.80       549



#### TEST #7

BoW, 5000 features, Batch size: 10, 1 hidden layer, **hidden size: 5000**, activation: sigmoid, learning rate: 0.1, optimizer: SGD, max epochs: 5

In [None]:
# Setting of this run: hidden layer of 5000 units
batch_size, max_epochs = 10, 5
hidden_dim = 5000

# Data loaders
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Remaining hyper-parameters (input_dim comes from the TEST #1 cell)
output_dim = 2
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()

# Fresh model and its optimizer
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model_bow.parameters(), lr=learning_rate)

# Training followed by evaluation on the dev set
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Keep a trace of the setting and of its score
write_expe_settings(
    my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid',
    lr=learning_rate, opt='sgd', epochs=max_epochs, score=accuracy,
)

Epoch: 0. Loss: 0.23846807259557165. ACC 0.5100457529341555 
Epoch: 1. Loss: 0.15589830538423582. ACC 0.5187984881639148 
Epoch: 2. Loss: 0.13582247489722793. ACC 0.4941316888800477 
Epoch: 3. Loss: 0.10262338762281428. ACC 0.502287646707778 
Epoch: 4. Loss: 0.09239863767191361. ACC 0.4955241694847822 
              precision    recall  f1-score   support

           0       0.42      1.00      0.59       230
           1       0.00      0.00      0.00       319

    accuracy                           0.42       549
   macro avg       0.21      0.50      0.30       549
weighted avg       0.18      0.42      0.25       549



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Answer: Hidden layer size

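In these runs, a moderately larger hidden layer (64 units) gives a small gain in macro F1 over 4 units, 128 units do not help, and 5000 units fail to learn (the model predicts a single class). Larger layers also mean a longer training time.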

No magic number, needs to be optimized. In general, a few typical values are tested:  
* *Using too few neurons in the hidden layers will result in something called underfitting. Underfitting occurs when there are too few neurons in the hidden layers to adequately detect the signals in a complicated data set.* 
* *too many neurons in the hidden layers may result in overfitting. Overfitting occurs when the neural network has so much information processing capacity that the limited amount of information contained in the training set is not enough to train all of the neurons in the hidden layers. A second problem [is increasing] the time it takes to train the network.*
* "Rules"
  * *The number of hidden neurons should be between the size of the input layer and the size of the output layer.* 
  * *The number of hidden neurons should be 2/3 the size of the input layer, plus the size of the output layer.* 
  * *The number of hidden neurons should be less than twice the size of the input layer.*

---
### 3.4 Activation function
---
Try with Sigmoid, Tanh, ReLU

https://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity

In [None]:
## WITH SIGMOID
batch_size = 200
max_epochs = 10
hidden_dim = 16

# Load data
train_loader, dev_loader, test_loader = load_data( x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size )

# Hyper-parameters (input_dim is reused from the TEST #1 cell)
output_dim = 2
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()

# Initialization of the model: FeedforwardNeuralNetModel uses a sigmoid activation
model_bow = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD( model_bow.parameters(), lr=learning_rate )

# Train and evaluate
train( model_bow, train_loader, optimizer, num_epochs=max_epochs )
gold, pred = evaluate( model_bow, dev_loader )
accuracy = accuracy_score( gold, pred )

# The activation is recorded as 'sigmoid': this run was previously logged as
# 'tanh', which made it indistinguishable from the tanh run in the results file.
write_expe_settings( my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='sigmoid', lr=learning_rate, opt='sgd', epochs=max_epochs, score=accuracy )

BATCH SIZE: 200
INPUT DIM: 5000 HIDDEN DIM: 16
Epoch: 0. Loss: 0.0035867044483190126. ACC 0.5174060075591804 
Epoch: 1. Loss: 0.003538248943054157. ACC 0.585239705589815 
Epoch: 2. Loss: 0.003510344151784678. ACC 0.6097075790730058 
Epoch: 3. Loss: 0.0034701669692803454. ACC 0.6421324845832505 
Epoch: 4. Loss: 0.0034234431407929224. ACC 0.6624229162522379 
Epoch: 5. Loss: 0.003360011870708811. ACC 0.6954445991645116 
Epoch: 6. Loss: 0.0032856720257924714. ACC 0.6980306345733042 
Epoch: 7. Loss: 0.0032111859141369333. ACC 0.7256813208673165 
Epoch: 8. Loss: 0.0031274266619556153. ACC 0.7332405012930177 
Epoch: 9. Loss: 0.0030350037258520786. ACC 0.7475631589417148 
              precision    recall  f1-score   support

           0       0.72      0.56      0.63       230
           1       0.73      0.85      0.78       319

    accuracy                           0.73       549
   macro avg       0.73      0.70      0.71       549
weighted avg       0.73      0.73      0.72       549



#### TEST #7

BoW, 5000 features, Batch size: 200, 1 hidden layer, hidden size: 16, **activation: tanh**, learning rate: 0.1, optimizer: SGD, max epochs: 10

In [None]:
class FeedforwardNeuralNetModel2(nn.Module):
    """Feedforward classifier with a single tanh hidden layer.

    Computes fc2(tanh(fc1(x))) and returns the raw output scores; no softmax
    is applied inside this class.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(FeedforwardNeuralNetModel2, self).__init__()

        # Define the parameters that you will need.
        # Linear function
        self.fc1 = nn.Linear(input_dim, hidden_dim)

        # Non-linearity, stored under a neutral name since it is not a sigmoid
        self.activation = nn.Tanh()

        # Linear function (readout)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    @property
    def sigmoid(self):
        """Deprecated alias of `activation`, kept for code using the old name."""
        return self.activation

    def forward(self, x):
        """Return the output scores for the input batch `x`."""
        # Linear function  # LINEAR
        out = self.fc1(x)

        # Non-linearity  # NON-LINEAR
        out = self.activation(out)

        # Linear function (readout)  # LINEAR
        out = self.fc2(out)
        return out

In [None]:
# Setting of this run: tanh activation (FeedforwardNeuralNetModel2)
batch_size, max_epochs = 200, 10
hidden_dim = 16

# Data loaders
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Remaining hyper-parameters (input_dim comes from the TEST #1 cell)
output_dim = 2
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()

# Fresh model and its optimizer
model_bow = FeedforwardNeuralNetModel2(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model_bow.parameters(), lr=learning_rate)

# Training followed by evaluation on the dev set
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Keep a trace of the setting and of its score
write_expe_settings(
    my_expe, batch=batch_size, hidden=1, hsize=hidden_dim, act='tanh',
    lr=learning_rate, opt='sgd', epochs=max_epochs, score=accuracy,
)

BATCH SIZE: 200
Epoch: 0. Loss: 0.0033632819330523924. ACC 0.6409389297791923 
Epoch: 1. Loss: 0.003020698030117802. ACC 0.7280684304754327 
Epoch: 2. Loss: 0.002754686407334339. ACC 0.7660632584046151 
Epoch: 3. Loss: 0.002551118700789343. ACC 0.7907300576884821 
Epoch: 4. Loss: 0.0023040566241406815. ACC 0.8136065247662622 
Epoch: 5. Loss: 0.0022498833355335444. ACC 0.8110204893574696 
Epoch: 6. Loss: 0.0020608256072819717. ACC 0.843843246469067 
Epoch: 7. Loss: 0.0019477623308632136. ACC 0.8444400238710961 
Epoch: 8. Loss: 0.001923756575001052. ACC 0.8408593594589219 
Epoch: 9. Loss: 0.0017911832411340912. ACC 0.8539884623035607 
              precision    recall  f1-score   support

           0       0.90      0.58      0.70       230
           1       0.76      0.95      0.84       319

    accuracy                           0.80       549
   macro avg       0.83      0.77      0.77       549
weighted avg       0.82      0.80      0.79       549



#### TEST #8

BoW, 5000 features, Batch size: 200, 1 hidden layer, hidden size: 16, **activation: relu**, learning rate: 0.1, optimizer: SGD, max epochs: 10

In [None]:
class FeedforwardNeuralNetModel3(nn.Module):
    """Feedforward network with a single ReLU hidden layer.

    The forward pass computes ``fc2(relu(fc1(x)))`` and returns the output
    of the last linear layer as is (no softmax is applied in this class).

    Args:
        input_dim: size of each input vector (in_features of ``fc1``).
        hidden_dim: number of units in the hidden layer.
        output_dim: size of the output vector (out_features of ``fc2``).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        # Always initialise nn.Module first so that sub-modules get registered.
        super().__init__()

        # Linear function: input -> hidden
        self.fc1 = nn.Linear(input_dim, hidden_dim)

        # Non-linearity. The attribute is named after what it really is
        # (it used to be called `sigmoid` while holding a ReLU).
        self.relu = nn.ReLU()

        # Linear function (readout): hidden -> output
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the output of the readout layer for the input batch ``x``."""
        hidden = self.relu(self.fc1(x))  # LINEAR then NON-LINEAR
        return self.fc2(hidden)          # LINEAR (readout)

In [None]:
# Experiment: 1 ReLU hidden layer of 16 units, mini-batches of 200, SGD.

# Hyper-parameters (defined once, reused for training AND for logging)
batch_size = 200
max_epochs = 10
hidden_dim = 16
output_dim = 2
learning_rate = 0.1

# Load data
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Loss function. NOTE(review): it is not passed to train() here, so it is
# presumably read as a global by the training code -- confirm.
criterion = nn.CrossEntropyLoss()

# Initialization of the model
model_bow = FeedforwardNeuralNetModel3(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model_bow.parameters(), lr=learning_rate)

# Train, then score the predictions obtained on the dev loader
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Pass the settings actually used (lr comes from learning_rate, not from a
# duplicated literal) together with the dev accuracy.
write_expe_settings(
    my_expe,
    batch=batch_size,
    hidden=1,
    hsize=hidden_dim,
    act='relu',
    lr=learning_rate,
    opt='sgd',
    epochs=max_epochs,
    score=accuracy,
)

BATCH SIZE: 200
Epoch: 0. Loss: 0.0035081197851714874. ACC 0.5742987865526159 
Epoch: 1. Loss: 0.003222808499731789. ACC 0.6904714541476029 
Epoch: 2. Loss: 0.0029210167315790233. ACC 0.7390093495126318 
Epoch: 3. Loss: 0.002751473495256884. ACC 0.755719116769445 
Epoch: 4. Loss: 0.0027018428168952262. ACC 0.7732245872289636 
Epoch: 5. Loss: 0.002733380817764674. ACC 0.7692460712154366 
Epoch: 6. Loss: 0.002285557752910513. ACC 0.8207678535906107 
Epoch: 7. Loss: 0.0023491217906512537. ACC 0.8132086731649095 
Epoch: 8. Loss: 0.002281326107697101. ACC 0.8122140441615278 
Epoch: 9. Loss: 0.0021961465469050746. ACC 0.8360851402426894 
              precision    recall  f1-score   support

           0       0.83      0.71      0.77       230
           1       0.81      0.90      0.85       319

    accuracy                           0.82       549
   macro avg       0.82      0.80      0.81       549
weighted avg       0.82      0.82      0.82       549



---
### 3.5 Optimizer
---
Try SGD, Adam

https://pytorch.org/docs/stable/optim.html#module-torch.optim

For a detailed description and comparison of the different optimizers:
https://ruder.io/optimizing-gradient-descent/

In [None]:
### WITH SGD (and ReLU): 1 hidden layer of 16 units, mini-batches of 10.

# Hyper-parameters (defined once, reused for training AND for logging)
batch_size = 10
max_epochs = 10
hidden_dim = 16
output_dim = 2
learning_rate = 0.1

# Load data
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Loss function. NOTE(review): it is not passed to train() here, so it is
# presumably read as a global by the training code -- confirm.
criterion = nn.CrossEntropyLoss()

# Initialization of the model
model_bow = FeedforwardNeuralNetModel3(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.SGD(model_bow.parameters(), lr=learning_rate)

# Train, then score the predictions obtained on the dev loader
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Pass the settings actually used (lr comes from learning_rate, not from a
# duplicated literal) together with the dev accuracy.
write_expe_settings(
    my_expe,
    batch=batch_size,
    hidden=1,
    hsize=hidden_dim,
    act='relu',
    lr=learning_rate,
    opt='sgd',
    epochs=max_epochs,
    score=accuracy,
)

Epoch: 0. Loss: 0.05626993505563939. ACC 0.7242888402625821 
Epoch: 1. Loss: 0.03875441925931053. ACC 0.8303162920230754 
Epoch: 2. Loss: 0.0302670874747944. ACC 0.8754724487766063 
Epoch: 3. Loss: 0.02549069301249885. ACC 0.8947682514422121 
Epoch: 4. Loss: 0.020774694624361927. ACC 0.9190371991247265 
Epoch: 5. Loss: 0.018724975738246812. ACC 0.9289834891585439 
Epoch: 6. Loss: 0.01547806462362299. ACC 0.9466878854187388 
Epoch: 7. Loss: 0.009750108654659074. ACC 0.9647901332802864 
Epoch: 8. Loss: 0.010688928858555264. ACC 0.9651879848816391 
Epoch: 9. Loss: 0.014519969835326583. ACC 0.9600159140640541 
              precision    recall  f1-score   support

           0       0.82      0.81      0.81       230
           1       0.87      0.87      0.87       319

    accuracy                           0.85       549
   macro avg       0.84      0.84      0.84       549
weighted avg       0.85      0.85      0.85       549



#### TEST #9

BoW, 5000 features, Batch size: 10, 1 hidden layer, hidden size: 16, activation: relu, learning rate: 0.1, optimizer: Adam, max epochs: 10

In [None]:
# Experiment: 1 ReLU hidden layer of 16 units, mini-batches of 10, Adam.

# Hyper-parameters (defined once, reused for training AND for logging)
batch_size = 10
max_epochs = 10
hidden_dim = 16
output_dim = 2
learning_rate = 0.1

# Load data
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Loss function. NOTE(review): it is not passed to train() here, so it is
# presumably read as a global by the training code -- confirm.
criterion = nn.CrossEntropyLoss()

# Initialization of the model
model_bow = FeedforwardNeuralNetModel3(input_dim, hidden_dim, output_dim)
# The optimizer is what changes in this experiment: Adam instead of SGD.
optimizer = torch.optim.Adam(model_bow.parameters(), lr=learning_rate)

# Train, then score the predictions obtained on the dev loader
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Pass the settings actually used (lr comes from learning_rate, not from a
# duplicated literal) together with the dev accuracy.
write_expe_settings(
    my_expe,
    batch=batch_size,
    hidden=1,
    hsize=hidden_dim,
    act='relu',
    lr=learning_rate,
    opt='adam',
    epochs=max_epochs,
    score=accuracy,
)

BATCH SIZE: 10
Epoch: 0. Loss: 0.07213442693630269. ACC 0.51143823353889 
Epoch: 1. Loss: 0.07198922398053843. ACC 0.5390889198329023 
Epoch: 2. Loss: 0.06635210800175641. ACC 0.5595782773025662 
Epoch: 3. Loss: 0.06072889886439285. ACC 0.594987069822956 
Epoch: 4. Loss: 0.061436555608262215. ACC 0.584046150785757 
Epoch: 5. Loss: 0.061342544040007406. ACC 0.6210463497115576 
Epoch: 6. Loss: 0.05394889997301299. ACC 0.6377561169683708 
Epoch: 7. Loss: 0.05296870667108703. ACC 0.6628207678535906 
Epoch: 8. Loss: 0.05464682751914154. ACC 0.6574497712353292 
Epoch: 9. Loss: 0.051503366374794. ACC 0.6520787746170679 
              precision    recall  f1-score   support

           0       0.98      0.23      0.37       230
           1       0.64      1.00      0.78       319

    accuracy                           0.67       549
   macro avg       0.81      0.61      0.57       549
weighted avg       0.78      0.67      0.61       549



#### More about optimizers

Stochastic gradient descent maintains a single learning rate for all weight updates and the learning rate does not change during training.

The Adam algorithm leverages adaptive learning-rate methods to find an individual learning rate for each parameter.

Adaptive Gradient Algorithm (AdaGrad): maintains a per-parameter learning rate that improves performance on problems with sparse gradients (e.g. natural language and computer vision problems).

Root Mean Square Propagation (RMSProp): also maintains per-parameter learning rates, adapted based on the average of recent magnitudes of the gradients for each weight (i.e. how quickly it is changing). This means the algorithm does well on online and non-stationary problems (e.g. noisy ones).

Adam realizes the benefits of both AdaGrad and RMSProp.

from: https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/ and https://towardsdatascience.com/adam-latest-trends-in-deep-learning-optimization-6be9a291375c

---
### 3.6 Learning rate
---

#### TEST #10

BoW, 5000 features, Batch size: 10, 1 hidden layer, hidden size: 16, activation: relu, **learning rate: 0.5**, optimizer: Adam, max epochs: 10

In [None]:
# Experiment: same ReLU network trained with Adam, learning rate raised to 0.5.

# Settings of the run
learning_rate = 0.5
hidden_dim = 16
output_dim = 2
max_epochs = 10
batch_size = 10

# Build the train / dev / test loaders
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test,
    batch_size=batch_size,
)

# Loss function
criterion = nn.CrossEntropyLoss()

# Fresh model and its optimizer
model_bow = FeedforwardNeuralNetModel3(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model_bow.parameters(), lr=learning_rate)

# Fit on the training loader, then predict on the dev loader
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Hand the settings and the dev accuracy to the experiment log helper
write_expe_settings(
    my_expe,
    batch=batch_size,
    hidden=1,
    hsize=hidden_dim,
    act='relu',
    lr=learning_rate,
    opt='adam',
    epochs=max_epochs,
    score=accuracy,
)

Epoch: 0. Loss: 0.11304628426330995. ACC 0.5046747563158942 
Epoch: 1. Loss: 0.08354000300495056. ACC 0.5040779789138651 
Epoch: 2. Loss: 0.07644790185714351. ACC 0.5016908693057489 
Epoch: 3. Loss: 0.0714663850258045. ACC 0.5203898945693256 
Epoch: 4. Loss: 0.07242015330815661. ACC 0.516610304356475 
Epoch: 5. Loss: 0.07200485677777926. ACC 0.5122339367415953 
Epoch: 6. Loss: 0.07188668697094239. ACC 0.5020887209071017 
Epoch: 7. Loss: 0.07396689143312696. ACC 0.4981102048935747 
Epoch: 8. Loss: 0.07325427189413017. ACC 0.49492739208275316 
Epoch: 9. Loss: 0.07357010994679368. ACC 0.49074995026854984 
              precision    recall  f1-score   support

           0       0.42      1.00      0.59       230
           1       0.00      0.00      0.00       319

    accuracy                           0.42       549
   macro avg       0.21      0.50      0.30       549
weighted avg       0.18      0.42      0.25       549



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### TEST #11

BoW, 5000 features, Batch size: 10, 1 hidden layer, hidden size: 16, activation: relu, **learning rate: 0.001**, optimizer: Adam, max epochs: 10

In [None]:
# Experiment: same ReLU network trained with Adam, learning rate lowered to 0.001.

# Settings of the run
learning_rate = 0.001
hidden_dim = 16
output_dim = 2
max_epochs = 10
batch_size = 10

# Build the train / dev / test loaders
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test,
    batch_size=batch_size,
)

# Loss function
criterion = nn.CrossEntropyLoss()

# Fresh model and its optimizer
model_bow = FeedforwardNeuralNetModel3(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model_bow.parameters(), lr=learning_rate)

# Fit on the training loader, then predict on the dev loader
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Hand the settings and the dev accuracy to the experiment log helper
write_expe_settings(
    my_expe,
    batch=batch_size,
    hidden=1,
    hsize=hidden_dim,
    act='relu',
    lr=learning_rate,
    opt='adam',
    epochs=max_epochs,
    score=accuracy,
)

Epoch: 0. Loss: 0.04085205458378801. ACC 0.8344937338372787 
Epoch: 1. Loss: 0.01722472314045696. ACC 0.9476825144221206 
Epoch: 2. Loss: 0.010114406593955768. ACC 0.9677740202904317 
Epoch: 3. Loss: 0.006535387269435412. ACC 0.9820966779391287 
Epoch: 4. Loss: 0.004442139458302994. ACC 0.9898547841655063 
Epoch: 5. Loss: 0.002959595974119527. ACC 0.9930375969763279 
Epoch: 6. Loss: 0.0021469107372873615. ACC 0.9958225581857967 
Epoch: 7. Loss: 0.0013920834114515715. ACC 0.9980107419932365 
Epoch: 8. Loss: 0.0011005863557361237. ACC 0.9986075193952656 
Epoch: 9. Loss: 0.0014661112178039036. ACC 0.9964193355878257 
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       230
           1       0.86      0.85      0.85       319

    accuracy                           0.83       549
   macro avg       0.83      0.83      0.83       549
weighted avg       0.83      0.83      0.83       549



---
### 3.7 Number of hidden layers
---

#### TEST #12

BoW, 5000 features, Batch size: 10, 2 hidden layers, hidden sizes: 16 and 32, activation: relu, learning rate: 0.001, optimizer: adam, max epochs: 10

In [None]:
import torch
import torch.nn as nn

class FeedforwardNeuralNetModel2(nn.Module):
    """Feedforward network with two ReLU hidden layers.

    The forward pass computes ``fc2(relu(fc3(relu(fc1(x)))))`` and returns
    the output of the last linear layer as is (no softmax is applied in
    this class).

    Args:
        input_dim: size of each input vector (in_features of ``fc1``).
        hidden_dim: number of units in the first hidden layer.
        hidden_dim2: number of units in the second hidden layer.
        output_dim: size of the output vector (out_features of ``fc2``).
    """

    def __init__(self, input_dim, hidden_dim, hidden_dim2, output_dim):
        # Always initialise nn.Module first so that sub-modules get registered.
        super().__init__()

        # -- LAYER 1: input -> first hidden layer
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # -- LAYER 2: first hidden -> second hidden layer.
        # It is called fc3 because fc2 is kept as the name of the readout.
        self.fc3 = nn.Linear(hidden_dim, hidden_dim2)

        # Non-linearity shared by both hidden layers. The attribute is named
        # after what it really is (it used to be called `sigmoid` while
        # holding a ReLU).
        self.relu = nn.ReLU()

        # Linear function (readout): second hidden -> output
        self.fc2 = nn.Linear(hidden_dim2, output_dim)

    def forward(self, x):
        """Return the output of the readout layer for the input batch ``x``."""
        # -- LAYER 1: LINEAR then NON-LINEAR
        out = self.relu(self.fc1(x))
        # -- LAYER 2: LINEAR then NON-LINEAR
        out = self.relu(self.fc3(out))
        # Readout: LINEAR
        return self.fc2(out)

In [None]:
# Experiment: 2 ReLU hidden layers (16 then 32 units), mini-batches of 10, Adam.

# Hyper-parameters (defined once, reused for training AND for logging)
batch_size = 10
max_epochs = 10
hidden_dim = 16
hidden_dim2 = 32
output_dim = 2
learning_rate = 0.001

# Load data
train_loader, dev_loader, test_loader = load_data(
    x_train, y_train, x_dev, y_dev, x_test, y_test, batch_size=batch_size
)

# Loss function. NOTE(review): it is not passed to train() here, so it is
# presumably read as a global by the training code -- confirm.
criterion = nn.CrossEntropyLoss()

# Initialization of the model
model_bow = FeedforwardNeuralNetModel2(input_dim, hidden_dim, hidden_dim2, output_dim)
optimizer = torch.optim.Adam(model_bow.parameters(), lr=learning_rate)

# Train, then score the predictions obtained on the dev loader
train(model_bow, train_loader, optimizer, num_epochs=max_epochs)
gold, pred = evaluate(model_bow, dev_loader)
accuracy = accuracy_score(gold, pred)

# Log the learning rate that was really used: this call used to record
# lr=0.1 although the optimizer above runs with learning_rate = 0.001.
write_expe_settings(
    my_expe,
    batch=batch_size,
    hidden=2,
    hsize=f"{hidden_dim};{hidden_dim2}",  # both hidden sizes, ';'-separated
    act='relu',
    lr=learning_rate,
    opt='adam',
    epochs=max_epochs,
    score=accuracy,
)

Epoch: 0. Loss: 0.04112395134785789. ACC 0.8191764471851999 
Epoch: 1. Loss: 0.015822856690344087. ACC 0.9435050726079173 
Epoch: 2. Loss: 0.009069929567643376. ACC 0.9681718718917843 
Epoch: 3. Loss: 0.004802363096739822. ACC 0.9858762681519793 
Epoch: 4. Loss: 0.002587109142221498. ACC 0.9914461905709171 
Epoch: 5. Loss: 0.004202911742800717. ACC 0.9884623035607718 
Epoch: 6. Loss: 0.0019092409496743834. ACC 0.9942311517803859 
Epoch: 7. Loss: 0.0007101426964987467. ACC 0.9978118161925602 
Epoch: 8. Loss: 0.0005763179586664687. ACC 0.9984085935945892 
Epoch: 9. Loss: 0.0005077990493367739. ACC 0.9986075193952656 
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       230
           1       0.89      0.84      0.86       319

    accuracy                           0.85       549
   macro avg       0.84      0.85      0.84       549
weighted avg       0.85      0.85      0.85       549

