This notebook collects some trials with Weights and Biases. It is not really cleaned.

# Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import time
import string
import re
from collections import Counter
import random
from typing import *
import csv

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

SEED = 10

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import from Preprocessing

In [4]:
def read_dataset(data_path: str) -> (List[List[str]], List[List[str]]):
    """
    Returns
    -------
        Two lists of lists containing sentences and labels respectively.
        
    Parameters
    ----------
    data_path: str
        Data path of the dataset.
    """
    
    sentences_list = []
    labels_list = []
    
    with open(data_path, "r", encoding='utf-8') as f:
        for line in f:
            row = line.strip()
            
            if row.startswith("#\tid"): # New sentence
                sentence = []
                labels = []
                
            elif row == "": # End of the sentence
                sentences_list.append(sentence)
                labels_list.append(labels)
            
            else: # Words of a sentence
                word, label = row.split('\t')
                
                sentence.append(word)
                labels.append(label)
   
    return sentences_list, labels_list

In [5]:
# Utility function taken from the 'evaluate.py' script
def flat_list(l: List[List[Any]]) -> List[Any]:
    """
    Returns
    -------
        A single list containing all elements that
        were in the input list.
        
    Parameters
    ----------
    l: List[List[Any]]
        A list of lists of any type
    """
    return [_e for e in l for _e in e]

In [6]:
def freq_most_common_tokens(dataset_text: List[List[str]], n: int = 20) -> dict:
    """
    Returns
    -------
        The first n common tokens and their frequencies, where the tokens are
        retrieved from the list 'dataset_text'.
        
    Parameters
    ----------
    dataset_text: List[List[str]]
        A list of lists of strings. 
        In this case each nested list is a sentence.
    
    n: int
        Indicates how many tokens to consider.
        If it is a negative number, 
        the function returns the frequencies of all the tokens in the dataset.
    
    """
    # The input is flattened
    tokens = flat_list(dataset_text)  

    # If negative number, return the frequency of all the tokens
    if n <= -1:
        return dict(Counter(tokens).most_common(len(Counter(tokens))))
    else:
        return dict(Counter(tokens).most_common(n))

In [7]:
train_pos_dep_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/data/train_pos_dep.tsv"
valid_pos_dep_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/data/valid_pos_dep.tsv"

In [8]:
train_pos_dep_path = "../../data/train_pos_dep.tsv"
valid_pos_dep_path = "../../data/valid_pos_dep.tsv"

In [9]:
def read_dataset_pos_dep(data_path: str) -> (List[List[str]], List[List[str]], List[List[str]], List[List[str]]):
    """
    Returns
    -------
        Four lists of lists containing sentences and POS labels,
        headwords labels and dependencies labels respectively.
        
    Parameters
    ----------
    data_path: str
        Data path of the dataset.
    """

    sentences_list = []
    labels_list = []
    heads_list = []
    dependencies_list = []
    
    with open(data_path, "r", encoding='utf-8') as f:
        for line in f:
            row = line.strip()
            
            if row.startswith("#\tid"): # New sentence
                sentence = []
                labels = []
                dependencies = []
                heads = []
                
            elif row == "": # End of the sentence
                sentences_list.append(sentence)
                labels_list.append(labels)
                heads_list.append(heads)
                dependencies_list.append(dependencies)
            
            else: # Words of a sentence
                word, label, head, dep = row.split('\t')
                
                sentence.append(word)
                labels.append(label)
                heads.append(head)
                dependencies.append(dep)
   
    return sentences_list, labels_list, heads_list, dependencies_list

In [10]:
_, train_pos, train_heads, train_dep = read_dataset_pos_dep(train_pos_dep_path)
_, valid_pos, valid_heads, valid_dep = read_dataset_pos_dep(valid_pos_dep_path)

In [11]:
train_clean_path = "../../data/train_clean.tsv"
valid_clean_path = "../../data/valid_clean.tsv"

In [10]:
train_clean_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/data/train_clean.tsv"
valid_clean_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/data/valid_clean.tsv"

In [12]:
train_sentences, train_labels = read_dataset(train_clean_path)
valid_sentences, valid_labels = read_dataset(valid_clean_path)

In [13]:
pad_token = "<PAD>"
unk_token = "<UNK>"

In [14]:
vocab_path = "../../data/vocab.tsv"

In [13]:
vocab_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/data/vocab.tsv"

In [15]:
def read_vocab(path: str) -> dict:
    """
    Returns
    -------
        A dictionary that maps tokens to integers.
        
    Parameters
    ----------
    path: str
        Data path of the dictionary.
    """
    vocab = {}
    with open(path, 'r', newline="", encoding='utf-8') as f:
        for line in f:
            line = line.strip().split('\t')
            vocab[line[0]] = int(line[1])
    return vocab

In [16]:
vocab = read_vocab(vocab_path)

In [17]:
len(vocab)

20000

In [20]:
dep_vocab_path = "../../data/dep_vocab.tsv"

In [19]:
dep_vocab_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/data/dep_vocab.tsv"

In [21]:
dep_vocab = read_vocab(dep_vocab_path)

In [22]:
len(dep_vocab)

47

In [23]:
def assign_unique_idx(labels_list: List[List[str]], pad_token: str) -> (dict, dict):
    """
    Returns
    -------
        Two dictionaries. The first one is a map
        from labels to integers; the second one
        is the reversed map.
        
    Parameters
    ----------
    labels_list: List[List[str]]
        A list of lists of strings. 
        In this case each nested list is a sentence,
        containing labels.
        
    pad_token: str
         String which identifies the padding token.
    """
    labels = freq_most_common_tokens(labels_list, n=-1)
    lab2idx = {label: idx+1 for idx, label in enumerate(labels)}
    lab2idx[pad_token] = 0 # The padding token is associated to the first position
    idx2lab = {idx: label for label, idx in lab2idx.items()} 
    
    return lab2idx, idx2lab

In [24]:
lab2idx, idx2lab = assign_unique_idx(train_labels, pad_token)

In [25]:
pos2idx, pos2lab = assign_unique_idx(train_pos, pad_token)

# Pretrained Word Embedding

## Word2Vec

In [None]:
from gensim.models import *
from gensim.models.word2vec import *

In [None]:
# ! pip install gensim==4.1.2

In [None]:
pretrained_vocab_path = "../../model/vocab.txt"

In [None]:
w2v_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/model/w2v_clean.pth"

In [None]:
w2v_path = '../../model/w2v_clean.pth'

In [None]:
pretrained_embeddings = torch.load(w2v_path)

In [None]:
pretrained_embeddings.shape

torch.Size([10000, 300])

## GloVe

In [26]:
pretrained_glove_path = "../../model/pretrained/load_embeddings/pre_glove.pth"

In [24]:
pretrained_glove_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/model/pretrained/load_embeddings/pre_glove.pth"

In [27]:
glove_pretrained_embeddings = torch.load(pretrained_glove_path)

In [28]:
glove_pretrained_embeddings.shape

torch.Size([20000, 100])

## Fastext

In [29]:
pretrained_fast_path = "../../model/pretrained/load_embeddings/pre_fast.pth"

In [28]:
pretrained_fast_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/model/pretrained/load_embeddings/pre_fast.pth"

In [30]:
fast_pretrained_embeddings = torch.load(pretrained_fast_path)

In [31]:
fast_pretrained_embeddings.shape

torch.Size([20000, 300])

# Dataset

In [32]:
class NER_Dataset(Dataset):
    """
    Parameters
    ----------
    sentences: List[List[str]]
        A list of lists of strings where each nested list represents a sentence.
        
    sentences_labels: List[List[str]]
        A list of lists of strings where each nested list represents a sentence,
        containing the labels of the tokens.
    """
    def __init__(self, sentences: List[List[str]], sentences_labels: List[List[str]]):

        assert len(sentences) == len(sentences_labels), \
                "Inputs must be of the same length"
        
        self.sentences = sentences
        self.labels = sentences_labels
        
        self.sentences_lengths = [len(s) for s in sentences]
  
        self.Y = self._from_sequence_to_idx(sentences_labels, lab2idx)
        self.X = self._from_sequence_to_idx(sentences, vocab, unk_token)
     
    
    def _from_sequence_to_idx(self, sequences_list: List[List[str]],
                              vocab: dict, unk_token: str = None) -> List[List[int]]:
        """
        Returns
        -------
            A list of lists of int built by replacing 
            each token with its corresponding id in the vocabulary.
            This is a general function so it works also for labels.
            
        Parameters
        ----------
        sequences_list: List[List[str]]
            A list of lists of strings where each nested list represents a sentence.
            
        vocab: dict
            The map that associates to each token an unique number.

        unk_token: str
            The OOV token.
        """
        
        sequences_idx = []
        
        if unk_token is not None: # For words
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token, vocab[unk_token]) for token in sentence])
        else: # For labels
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token) for token in sentence])
        
        return sequences_idx

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.Y[idx], self.X[idx], self.sentences_lengths[idx]

In [33]:
class NER_POS_Dataset(Dataset):
    def __init__(self, sentences, sentences_labels, sentences_pos):

        assert len(sentences) == len(sentences_labels) == len(sentences_pos), \
                "Inputs must be of the same length"
        
        self.sentences = sentences
        self.labels = sentences_labels
        self.pos = sentences_pos
        
        
        self.sentences_lengths = [len(s) for s in sentences]
#         self.tokens_lengths = [([len(token) for token in sentence]) for sentence in sentences]
        
        self.Y = self._from_sequence_to_idx(sentences_labels, lab2idx, pad_token)
        self.X = self._from_sequence_to_idx(sentences, vocab, pad_token, unk_token)
        # POS
        self.X_pos = self._from_sequence_to_idx(sentences_pos, pos2idx, pad_token)
#         self.X_chars = self._from_tokens_to_char_idx(char_vocab, pad_token, unk_token)
        
    
    def _from_sequence_to_idx(self, sequences_list, vocab, pad_token, unk_token = None):
        sequences_idx = []
        
        if unk_token is not None: # For words
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token, vocab[unk_token]) for token in sentence])
        else: # For labels
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token) for token in sentence])
        
        return sequences_idx
    
    def _from_tokens_to_char_idx(self, vocab, pad_token, unk_token):
        sequences_idx = []
        
        for sentence in self.sentences:
            sequences_idx.append([[char_vocab.get(c, char_vocab[unk_token]) for c in token] 
                                     for token in sentence])
            
        return sequences_idx
    
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.Y[idx], self.X[idx], self.sentences_lengths[idx], self.X_pos[idx]#, self.X_chars[idx], self.tokens_lengths[idx]

In [34]:
class NER_Dataset(Dataset):
    def __init__(self, sentences, sentences_labels):

        assert len(sentences) == len(sentences_labels), \
                "Inputs must be of the same length"
        
        self.sentences = sentences
        self.labels = sentences_labels
        
        self.sentences_lengths = [len(s) for s in sentences]
#         self.tokens_lengths = [([len(token) for token in sentence]) for sentence in sentences]
        
        self.Y = self._from_sequence_to_idx(sentences_labels, lab2idx, pad_token)
        self.X = self._from_sequence_to_idx(sentences, vocab, pad_token, unk_token)
#         self.X_chars = self._from_tokens_to_char_idx(char_vocab, pad_token, unk_token)
        
    
    def _from_sequence_to_idx(self, sequences_list, vocab, pad_token, unk_token = None):
        sequences_idx = []
        
        if unk_token is not None: # For words
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token, vocab[unk_token]) for token in sentence])
        else: # For labels
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token) for token in sentence])
        
        return sequences_idx
    
#     def _from_tokens_to_char_idx(self, vocab, pad_token, unk_token):
#         sequences_idx = []
        
#         for sentence in self.sentences:
#             sequences_idx.append([[char_vocab.get(c, char_vocab[unk_token]) for c in token] 
#                                      for token in sentence])
            
#         return sequences_idx
    
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.Y[idx], self.X[idx], self.sentences_lengths[idx]#, self.X_chars[idx], self.tokens_lengths[idx]

In [35]:
train_dataset = NER_Dataset(train_sentences, train_labels)
valid_dataset = NER_Dataset(valid_sentences, valid_labels)

In [36]:
def pad_sequence(sequence: List[Any], max_length: int, pad_token: str) -> List[Any]:
    padded_sequence = [pad_token] * max_length

    for i, token in enumerate(sequence):
        padded_sequence[i] = token

    return padded_sequence

In [37]:
def collate_batch(batch):
    labels_list = []
    features_list = []

    labels, features, sentences_lengths = zip(*batch)
    
    sorted_batch = sorted(zip(labels, features, sentences_lengths), 
                          key=lambda p: len(p[0]), reverse=True)
    labels, features, sentence_lengths = zip(*sorted_batch)
    

        
    max_length_in_batch = np.max(sentence_lengths)
    
    # Pad sentences and labels to the length of the longest sequence in the batch
    for idx, feature in enumerate(features):
        features_list.append(pad_sequence(feature, max_length_in_batch, vocab[pad_token]))
        labels_list.append(pad_sequence(labels[idx], max_length_in_batch, vocab[pad_token]))
      

    labels_tensor = torch.LongTensor(labels_list).to(device)
    features_tensor = torch.LongTensor(features_list).to(device)

    return labels_tensor, features_tensor, sentence_lengths


In [32]:
! pip install TorchCRF

Collecting TorchCRF
  Downloading TorchCRF-1.1.0-py3-none-any.whl (5.2 kB)
Installing collected packages: TorchCRF
Successfully installed TorchCRF-1.1.0


In [38]:
from TorchCRF import CRF
class NER_Classifier(nn.Module):
    def __init__(self, h_params):
        super().__init__()

        # Fasttext
        self.fast_embeddings = self._from_pretrained_embeddings(h_params['fast_embeddings'],
                                                               h_params['vocab_size'],
                                                               h_params['fast_embed_dim'],
                                                               freeze=h_params['freeze_fast'])
            
            

        # Glove
        self.glove_embeddings = self._from_pretrained_embeddings(h_params['glove_embeddings'],
                                                            h_params['vocab_size'],
                                                            h_params['glove_embed_dim'],
                                                            freeze=h_params['freeze_glove'], 
                                                            )
        
        lstm_input_dim = h_params['fast_embed_dim'] + h_params['glove_embed_dim']
        
        # Word embeddings
        self.lstm = nn.LSTM(lstm_input_dim, 
                            h_params['lstm_hidden_dim'], 
                            bidirectional=h_params['bidirectional'],
                            num_layers=h_params['num_layers'],
                            dropout=h_params['dropout'] if h_params['num_layers'] > 1 else 0,
                            batch_first=True)
        
        
        lstm_output_dim = h_params['lstm_hidden_dim'] if h_params['bidirectional'] is False \
                            else h_params['lstm_hidden_dim'] * 2
        
   
        self.dropout = nn.Dropout(h_params['dropout'])  

        self.concat = nn.Linear(lstm_output_dim, lstm_output_dim)

        self.classifier = nn.Linear(lstm_output_dim, h_params['num_classes'])
        
        self.relu = nn.LeakyReLU()
        
        if h_params['use_crf']:
            self.crf = CRF(h_params['num_classes'])

        self._init_linear_weights()
        
        
    def forward(self, x, x_lengths):
        x_fast = self.fast_embeddings(x)
        x_glove = self.glove_embeddings(x)
        
        x = torch.cat((x_fast, x_glove), dim=2)  
        x = self.dropout(x)
        
        x, _ = self.lstm(x)
        x = self.relu(x)
        
        x = self.concat(x)
        x = self.relu(x)

        output = self.classifier(x)

        return output


    def _from_pretrained_embeddings(self, pretrained_embeddings, vocab_size, embed_dim, freeze: bool):
        embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # Get emebeddings from pretrained ones
        embeddings.weight.data.copy_(pretrained_embeddings)
        
        # Freeze embeddings
        embeddings.weight.requires_grad = not freeze 
        
        return embeddings
    
    def _init_linear_weights(self):
        initrange = 0.5

        self.concat.weight.data.uniform_(-initrange, initrange)
        self.concat.bias.data.zero_()

        self.classifier.weight.data.uniform_(-initrange, initrange)
        self.classifier.bias.data.zero_()

# Training

In [33]:
! pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 132 kB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=56e00243999780315db64c26322ebe01e7487e0768d09192e5fb5d3422d67a40
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [39]:
from seqeval.metrics import f1_score
def compute_score(predictions, labels, viterbi_pred = None):
    y_true_list = []
    y_pred_list = []
    
    if pad_token != None:
        mask = labels != lab2idx[pad_token]
        
    else:
        mask = labels != -1
    
   
    labels = labels[mask].tolist() 
    y_true = [idx2lab[l] for l in labels]
    y_true_list.append(y_true)
    
    
    if viterbi_pred is not None:
        y_pred = [idx2lab[l] for l in viterbi_pred]
        y_pred_list.append(y_pred)
        
    else:  
        predictions = predictions.argmax(1)
        predictions = predictions[mask].tolist()
    
        y_pred = [idx2lab[l] for l in predictions]
        y_pred_list.append(y_pred)

    
    return f1_score(y_true_list, y_pred_list, average='macro')

In [40]:
def train(model, dataloader, h_params, optimizer, criterion, grad_clipping):
    model.train()    
    running_loss = 0.0
    f1_score = 0.0
    viterbi_pred = None
    
    for idx, (labels, features, sentences_lengths) in enumerate(dataloader): 
        # Empty gradients
        optimizer.zero_grad()

        # Forward
        predicted_labels = model(features, sentences_lengths)

        
        if h_params['use_crf']:
            mask = (labels != lab2idx[pad_token])
            
            log_likelihood = model.crf(predicted_labels, labels, mask=mask)
            # Predictions
            viterbi_pred = flat_list(model.crf.viterbi_decode(predicted_labels, mask=mask))        
                
                
            # The log likelihood is not normalized 
            # (It is not divided by the batch size and it is negative)
            loss = torch.mean(log_likelihood) * -1

            predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
            labels = labels.view(-1)
            
        else:
            predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
            labels = labels.view(-1)
            loss = criterion(predicted_labels, labels)
            

         # Backward  
        loss.backward()
        
        # Gradient Clipping to prevent exploding gradients
        if grad_clipping is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clipping)
        # Update weights 
        optimizer.step()
        
        running_loss += loss.item()
#         f1_score += compute_score(predicted_labels, labels, viterbi_pred)

        if idx > 0 and idx % 50 == 0:
            metrics = {"train/batch_loss": running_loss/idx}
            wandb.log(metrics)
            
    # Loss at the end of the epoch 
    return running_loss/len(dataloader), f1_score/len(dataloader)

In [41]:
def evaluate(model, dataloader, h_params, criterion):
    model.eval()
    valid_loss = 0.0
    f1_score = 0.0
    viterbi_pred = None
    
    with torch.no_grad():
        for idx, (labels, features,sentences_lengths) in enumerate(dataloader):
            predicted_labels = model(features, sentences_lengths)
            
            if h_params['use_crf']:
                mask = (labels != lab2idx[pad_token])
                log_likelihood = model.crf(predicted_labels, labels, mask=mask) 
                
                viterbi_pred = flat_list(model.crf.viterbi_decode(predicted_labels, mask=mask))        
                
                
                # The log likelihood is not normalized 
                # (It is not divided by the batch size and it is negative)
                loss = torch.mean(log_likelihood) * -1

                predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
                labels = labels.view(-1)
                

            else:
                predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
                labels = labels.view(-1)
                loss = criterion(predicted_labels, labels)
                
               
            valid_loss += loss.item()
            f1_score += compute_score(predicted_labels, labels, viterbi_pred)

            if idx > 0 and idx % 10 == 0:
                metrics = {"valid/batch_loss": valid_loss/idx, 
                            "valid/batch_f1": f1_score/idx}
                wandb.log(metrics)
            
    return valid_loss/len(dataloader), f1_score/len(dataloader)

In [42]:
def train_model(
    model: nn.Module,
    train_dataloader,
    valid_dataloader,
    h_params,
    optimizer: torch.optim.Optimizer,
    criterion,
    scheduler,
    grad_clipping,
    epochs: int,
    early_stopping: bool = False,
    early_stopping_mode: str = 'max',
    early_stopping_patience: int = 0,
):
    train_losses = []
    valid_losses = []
    valid_f1_scores = []
    patience_counter = 0
    
    # epoch_time = 0.0
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        
        # Train
        train_loss, _ = train(model, train_dataloader, h_params,
                              optimizer, criterion, grad_clipping)
        train_losses.append(train_loss)
        
        # Evaluate
        valid_loss, valid_f1_score = evaluate(model, valid_dataloader,
                                             h_params, criterion)
        valid_losses.append(valid_loss)
        valid_f1_scores.append(valid_f1_score)
        
        
        
        # If the model starts overfitting, then the learning rate is decreased
        # if valid_loss is not None and train_loss < valid_loss:
        if scheduler is not None:
            print(f"LR: {scheduler.get_last_lr()[0]:.6f}")
            scheduler.step()


        metrics = {"train/epoch_loss": train_loss, 
                    "valid/epoch_loss": valid_loss,
                    "valid/f1_score": valid_f1_score}

        wandb.log(metrics)
        
        print('-' * 100)
        epoch_time = time.time() - epoch_start_time
        print(f'| epoch {epoch:3d}/{epochs:d} | time: {epoch_time:5.2f}s | ' \
            f'train_loss: {train_loss:.3f} | valid_loss: {valid_loss:.3f} | valid_f1_score: {valid_f1_score:.3f}')
            
        print('-' * 100)
        
        if valid_f1_score < 0.1:
            print("Too Bad...")
            return
        
        if early_stopping and len(valid_f1_scores) >= 2:

#             stop = early_stopping_mode == 'min' and epoch > 0 and valid_f1_scores[-1] > valid_f1_scores[-2]
            stop = early_stopping_mode == 'max' and epoch > 0 and valid_f1_scores[-1] < valid_f1_scores[-2]
            if stop:
                if patience_counter >= early_stopping_patience:
                    print('Early stop.')
                    break
                else:
                    print('-- Patience.\n')
                    patience_counter += 1
        
        histories = {
            "train_losses": train_losses,
            "valid_losses": valid_losses,
            "valid_f1_scores": valid_f1_scores
        }
    return histories

In [None]:
! pip install wandb

In [43]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mflorin-ml[0m (use `wandb login --relogin` to force relogin)


True

In [None]:
sweep_config = {
    'method': 'random'
    }

metric = {
    'name': 'valid/f1_score',
    'goal': 'maximize'   
    }

sweep_config['metric'] = metric

parameters_dict = {
    # 'optimizer': {
    #     'values': ['sgd'],
    #     },
    'lstm_hidden_dim': {
        'values': [64, 128, 256, 512]
        },

#     'dropout': {
#         'distribution': 'uniform',
#         'min': 0.4,
#         'max': 0.6,
#     },
    'dropout': {
        "values": [.4, .5, .6, .7]
    },
    
    'lr': {
        # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0.0008,
        'max': 0.02,
      },
    # 'momentum': {
    #     'distribution': 'uniform',
    #     'min': 0.1,
    #     'max': 0.99,
    # },
    
    'grad_clipping': {
         # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 1,
        'max': 5
    },
    
    "batch_size": {
            'values': [64, 128, 256]
        },

    "epochs": {
        'values': [15]
    },

    # "freeze_fast": {
    #     "values": [True, False]
    # },
    # "freeze_glove": {
    #     "values": [True, False]
    # },
    "freeze_fast": {
        "values": [True, False]
    },
    "freeze_glove": {
        "values": [True, False]
    },

    "sceduler_step": {
        'distribution': 'int_uniform',
        'min': 5,
        'max': 7
    },
    "sceduler_gamma": {
        'distribution': 'uniform',
        'min': 0.1,
        'max': 0.6
    },
    
    "num_layers": {
        "values": [2,3]
    },    
}

sweep_config['parameters'] = parameters_dict

In [44]:
sweep_config = {
    'method': 'grid'
    }

metric = {
    'name': 'valid/f1_score',
    'goal': 'maximize'   
    }

sweep_config['metric'] = metric

parameters_dict = {
    'lstm_hidden_dim': {
        'values': [128, 256, 512]
        },

    'dropout': {
        "values": [.4, .5,]
    },
    
    'lr': {
        'values': [0.003, 0.004]
      },

    'grad_clipping': {
         "values": [0.7, 1, 2, 2.5]
    },
    
    "batch_size": {
            'values': [64, 128]
        },

    "epochs": {
        'values': [12]
    },

    "freeze_fast": {
        "values": [True]
    },
    "freeze_glove": {
        "values": [False]
    },

    "sceduler_step": {
        "values": [4,5,6]
    },
    "sceduler_gamma": {
        "values": [0.2, 0.3]
    },
    
    "num_layers": {
        "values": [3]
    },   
    
    
}

sweep_config['parameters'] = parameters_dict

In [45]:
sweep_id = wandb.sweep(sweep_config, project="NLP_HW01_last")

Create sweep with ID: k1ep9l3e
Sweep URL: https://wandb.ai/florin-ml/NLP_HW01_last/sweeps/k1ep9l3e


In [46]:
import multiprocessing
def start():

    wandb.init(project="NLP_HW01_last")
    config = wandb.config

    h_params = {
        'vocab_size': len(vocab),
        'fast_embed_dim': 300,
        'freeze_fast': config.freeze_fast,
        'glove_embed_dim': 100,
        'freeze_glove': config.freeze_glove,
        'lstm_hidden_dim': config.lstm_hidden_dim, 
        'num_classes': len(lab2idx),
        'fast_embeddings': fast_pretrained_embeddings,
        'glove_embeddings': glove_pretrained_embeddings,
        'bidirectional': True,
        'num_layers': config.num_layers,
        'dropout': config.dropout,
        'use_crf': True,  # set to true to test with the Conditional Random Field
    }

    model = NER_Classifier(h_params).to(device)

    # Hyperparameters
    epochs = config.epochs # number of epochs
    lr = config.lr # learning rate .005
    bacth_size = config.batch_size # batch size for training 64
    grad_clipping = config.grad_clipping # for clipping gradients         
        
                                                # ignore the padding class
    criterion = torch.nn.CrossEntropyLoss(ignore_index=lab2idx[pad_token]).to(device)
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.95)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     scheduler = None
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config['sceduler_step'], gamma=config['sceduler_gamma'])
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[6, 9, 13, 20, 22], gamma=0.1)
    # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.002, max_lr=0.01)

    train_dataloader = DataLoader(train_dataset, batch_size=bacth_size, 
                                collate_fn=collate_batch, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=bacth_size, 
                                collate_fn=collate_batch, shuffle=False)

    histories = train_model(model, train_dataloader, valid_dataloader, h_params, 
                            optimizer, criterion, scheduler, grad_clipping, epochs,
                            early_stopping=True, early_stopping_mode="max", 
                            early_stopping_patience=2)

In [48]:
wandb.agent(sweep_id, start, count=15)

[34m[1mwandb[0m: Agent Starting Run: tali5fso with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 128
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.2
[34m[1mwandb[0m: 	sceduler_step: 5


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 26.97s | train_loss: 6.579 | valid_loss: 4.032 | valid_f1_score: 0.543
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 28.46s | train_loss: 3.262 | valid_loss: 2.873 | valid_f1_score: 0.623
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 28.29s | train_loss: 2.265 | valid_loss: 2.613 | valid_f1_score: 0.674
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▅▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▁▁▁▁
valid/batch_f1,▁▄▇▇▇████
valid/batch_loss,█▄▃▂▁▁▁▁▁
valid/epoch_loss,█▄▃▂▁▁▁▁▁
valid/f1_score,▁▄▆▇▇████

0,1
train/batch_loss,0.81441
train/epoch_loss,0.8054
valid/batch_f1,0.78479
valid/batch_loss,2.50398
valid/epoch_loss,2.30437
valid/f1_score,0.70963


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ty5jwkj6 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 128
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.2
[34m[1mwandb[0m: 	sceduler_step: 6


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 27.81s | train_loss: 6.575 | valid_loss: 4.129 | valid_f1_score: 0.544
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 28.43s | train_loss: 3.209 | valid_loss: 2.975 | valid_f1_score: 0.615
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 28.29s | train_loss: 2.240 | valid_loss: 2.470 | valid_f1_score: 0.652
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▆▅▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▂▁▁▁▁▁
valid/batch_f1,▁▄▅▇▇▇▇█▇█▇
valid/batch_loss,█▄▂▁▁▁▁▁▁▂▂
valid/epoch_loss,█▄▂▁▁▁▁▁▁▂▂
valid/f1_score,▁▄▅▆▇▇▇█▇██

0,1
train/batch_loss,0.64932
train/epoch_loss,0.65253
valid/batch_f1,0.78439
valid/batch_loss,2.55786
valid/epoch_loss,2.33372
valid/f1_score,0.70693


[34m[1mwandb[0m: Agent Starting Run: q0m8kkqd with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 128
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.3
[34m[1mwandb[0m: 	sceduler_step: 4


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 30.68s | train_loss: 6.477 | valid_loss: 4.095 | valid_f1_score: 0.533
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 31.82s | train_loss: 3.205 | valid_loss: 2.822 | valid_f1_score: 0.626
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 31.96s | train_loss: 2.242 | valid_loss: 2.482 | valid_f1_score: 0.648
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▅▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▁▁▁▁▁▁▁
valid/batch_f1,▁▅▆▇████████
valid/batch_loss,█▃▂▁▁▁▁▁▁▁▂▂
valid/epoch_loss,█▃▂▂▁▁▁▁▁▁▂▂
valid/f1_score,▁▅▆▇▇███████

0,1
train/batch_loss,0.71926
train/epoch_loss,0.71846
valid/batch_f1,0.77994
valid/batch_loss,2.56982
valid/epoch_loss,2.33266
valid/f1_score,0.7077


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: s5xajp0t with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 128
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.3
[34m[1mwandb[0m: 	sceduler_step: 5


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 27.14s | train_loss: 6.622 | valid_loss: 3.962 | valid_f1_score: 0.527
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 28.90s | train_loss: 3.226 | valid_loss: 2.830 | valid_f1_score: 0.613
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 28.80s | train_loss: 2.267 | valid_loss: 2.544 | valid_f1_score: 0.654
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▆▅▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▂▁▁▁▁▁▁
valid/batch_f1,▁▄▆▇▆██▇████
valid/batch_loss,█▃▂▁▁▁▁▁▂▂▂▂
valid/epoch_loss,█▃▂▁▁▁▁▁▂▂▂▂
valid/f1_score,▁▄▆▇▇███████

0,1
train/batch_loss,0.60975
train/epoch_loss,0.60787
valid/batch_f1,0.78862
valid/batch_loss,2.69122
valid/epoch_loss,2.4627
valid/f1_score,0.71025


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: qgsqxcks with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 128
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.3
[34m[1mwandb[0m: 	sceduler_step: 6


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 27.07s | train_loss: 6.678 | valid_loss: 4.118 | valid_f1_score: 0.516
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 28.38s | train_loss: 3.215 | valid_loss: 2.837 | valid_f1_score: 0.623
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 28.96s | train_loss: 2.286 | valid_loss: 2.577 | valid_f1_score: 0.642
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▆▅▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▂▁▁▁▁▁▁
valid/batch_f1,▁▅▅▇▇▇██████
valid/batch_loss,█▃▂▁▁▁▁▂▂▂▂▂
valid/epoch_loss,█▃▂▁▁▁▁▂▂▂▂▂
valid/f1_score,▁▅▅▇▇▇██████

0,1
train/batch_loss,0.5866
train/epoch_loss,0.58236
valid/batch_f1,0.79097
valid/batch_loss,2.81084
valid/epoch_loss,2.5736
valid/f1_score,0.71359


[34m[1mwandb[0m: Agent Starting Run: l8j5frmu with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.2
[34m[1mwandb[0m: 	sceduler_step: 4


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 33.82s | train_loss: 7.095 | valid_loss: 3.902 | valid_f1_score: 0.543
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 33.40s | train_loss: 3.127 | valid_loss: 2.941 | valid_f1_score: 0.612
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 31.30s | train_loss: 2.132 | valid_loss: 2.472 | valid_f1_score: 0.668
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▅▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▁▁▁▁▁▁
valid/batch_f1,▁▄▆▇▇██████
valid/batch_loss,█▄▂▁▁▁▁▁▁▂▂
valid/epoch_loss,█▄▂▁▁▁▁▁▂▂▂
valid/f1_score,▁▄▆▇▇██████

0,1
train/batch_loss,0.61036
train/epoch_loss,0.60651
valid/batch_f1,0.78866
valid/batch_loss,2.6266
valid/epoch_loss,2.417
valid/f1_score,0.7159


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: q5v315lv with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.2
[34m[1mwandb[0m: 	sceduler_step: 5


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 27.73s | train_loss: 7.261 | valid_loss: 3.684 | valid_f1_score: 0.558
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 29.44s | train_loss: 3.062 | valid_loss: 2.741 | valid_f1_score: 0.636
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 29.32s | train_loss: 2.129 | valid_loss: 2.294 | valid_f1_score: 0.671
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▅▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▁▁▁▁▁▁▁
valid/batch_f1,▁▄▆▆▇███████
valid/batch_loss,█▄▂▂▁▁▁▁▂▂▂▂
valid/epoch_loss,█▄▂▂▁▁▁▁▂▂▂▂
valid/f1_score,▁▄▆▆▇███████

0,1
train/batch_loss,0.47182
train/epoch_loss,0.47019
valid/batch_f1,0.80422
valid/batch_loss,2.66585
valid/epoch_loss,2.44002
valid/f1_score,0.7285


[34m[1mwandb[0m: Agent Starting Run: shwuocf1 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.2
[34m[1mwandb[0m: 	sceduler_step: 6


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 32.56s | train_loss: 7.024 | valid_loss: 3.964 | valid_f1_score: 0.540
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 32.24s | train_loss: 3.096 | valid_loss: 2.731 | valid_f1_score: 0.613
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 32.23s | train_loss: 2.131 | valid_loss: 2.375 | valid_f1_score: 0.654
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▅▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▂▁▁▁▁▁▁
valid/batch_f1,▁▄▅▆▆▇██████
valid/batch_loss,█▃▁▁▁▁▁▁▂▂▂▂
valid/epoch_loss,█▃▁▁▁▁▁▁▂▂▂▂
valid/f1_score,▁▄▅▆▇▇██████

0,1
train/batch_loss,0.42927
train/epoch_loss,0.43321
valid/batch_f1,0.80378
valid/batch_loss,2.84156
valid/epoch_loss,2.60851
valid/f1_score,0.72233


[34m[1mwandb[0m: Agent Starting Run: jd9vykc9 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.3
[34m[1mwandb[0m: 	sceduler_step: 4


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 38.30s | train_loss: 6.921 | valid_loss: 4.014 | valid_f1_score: 0.501
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 42.77s | train_loss: 3.108 | valid_loss: 2.830 | valid_f1_score: 0.641
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 33.49s | train_loss: 2.107 | valid_loss: 2.524 | valid_f1_score: 0.655
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▅▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▁▁▁▁▁▁▁
valid/batch_f1,▁▅▆▇████████
valid/batch_loss,█▃▂▁▁▁▂▂▂▂▂▂
valid/epoch_loss,█▄▂▁▁▁▂▂▂▂▂▂
valid/f1_score,▁▆▆▇████████

0,1
train/batch_loss,0.51111
train/epoch_loss,0.50751
valid/batch_f1,0.78181
valid/batch_loss,2.76737
valid/epoch_loss,2.5251
valid/f1_score,0.7062


[34m[1mwandb[0m: Agent Starting Run: oyn0bqae with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.3
[34m[1mwandb[0m: 	sceduler_step: 5


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 32.84s | train_loss: 6.810 | valid_loss: 4.107 | valid_f1_score: 0.555
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 32.51s | train_loss: 3.080 | valid_loss: 2.748 | valid_f1_score: 0.652
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 32.92s | train_loss: 2.127 | valid_loss: 2.384 | valid_f1_score: 0.654
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▅▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▁▁▁▁▁
valid/batch_f1,▁▅▅▇▇█████
valid/batch_loss,█▃▂▁▁▁▂▂▂▂
valid/epoch_loss,█▃▂▁▁▁▂▂▂▂
valid/f1_score,▁▅▅▇▇█████

0,1
train/batch_loss,0.53757
train/epoch_loss,0.54249
valid/batch_f1,0.79874
valid/batch_loss,2.62397
valid/epoch_loss,2.39894
valid/f1_score,0.72204


[34m[1mwandb[0m: Agent Starting Run: at1eo3nn with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.3
[34m[1mwandb[0m: 	sceduler_step: 6


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 32.42s | train_loss: 6.702 | valid_loss: 4.250 | valid_f1_score: 0.505
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 31.93s | train_loss: 3.110 | valid_loss: 2.803 | valid_f1_score: 0.636
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 31.14s | train_loss: 2.136 | valid_loss: 2.377 | valid_f1_score: 0.681
----------------------------------------------------------------------------------------------------
LR: 0.003000
------------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▅▃▃▃▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▂▂▂▁▁▁▁▁
valid/batch_f1,▁▅▇▇▇██████
valid/batch_loss,█▃▁▁▁▁▁▂▂▂▂
valid/epoch_loss,█▃▁▁▁▁▁▁▂▂▂
valid/f1_score,▁▅▇▇▇██████

0,1
train/batch_loss,0.44678
train/epoch_loss,0.45074
valid/batch_f1,0.7963
valid/batch_loss,2.83575
valid/epoch_loss,2.58505
valid/f1_score,0.71758


[34m[1mwandb[0m: Agent Starting Run: 5h9y1403 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 512
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.2
[34m[1mwandb[0m: 	sceduler_step: 4


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 43.28s | train_loss: 13.574 | valid_loss: 3.938 | valid_f1_score: 0.513
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 43.40s | train_loss: 3.066 | valid_loss: 2.952 | valid_f1_score: 0.629
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 44.81s | train_loss: 2.077 | valid_loss: 2.434 | valid_f1_score: 0.690
----------------------------------------------------------------------------------------------------
LR: 0.003000
-----------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▅▄▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▂▂▂▁▁▁▁
valid/batch_f1,▁▅▇▇████
valid/batch_loss,█▄▂▁▁▁▁▂
valid/epoch_loss,█▄▂▁▁▁▁▂
valid/f1_score,▁▅▇▇████

0,1
train/batch_loss,0.6282
train/epoch_loss,0.6334
valid/batch_f1,0.79486
valid/batch_loss,2.54668
valid/epoch_loss,2.32027
valid/f1_score,0.72105


[34m[1mwandb[0m: Agent Starting Run: ro79lgid with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 512
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.2
[34m[1mwandb[0m: 	sceduler_step: 5


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 45.29s | train_loss: 12.166 | valid_loss: 3.960 | valid_f1_score: 0.564
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 44.65s | train_loss: 3.131 | valid_loss: 2.714 | valid_f1_score: 0.625
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 44.96s | train_loss: 2.106 | valid_loss: 2.428 | valid_f1_score: 0.690
----------------------------------------------------------------------------------------------------
LR: 0.003000
-----------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▅▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▃▂▂▁▁▁▁▁
valid/batch_f1,▁▃▆▆▆▇██▇
valid/batch_loss,█▃▂▁▁▁▂▂▂
valid/epoch_loss,█▃▂▁▁▁▁▁▂
valid/f1_score,▁▃▆▆▆▇██▇

0,1
train/batch_loss,0.50144
train/epoch_loss,0.49592
valid/batch_f1,0.78965
valid/batch_loss,2.65416
valid/epoch_loss,2.40749
valid/f1_score,0.71702


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: eywhqzwr with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 512
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.2
[34m[1mwandb[0m: 	sceduler_step: 6


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 42.67s | train_loss: 12.111 | valid_loss: 4.433 | valid_f1_score: 0.505
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 44.54s | train_loss: 3.127 | valid_loss: 2.904 | valid_f1_score: 0.621
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 45.37s | train_loss: 2.088 | valid_loss: 2.370 | valid_f1_score: 0.677
----------------------------------------------------------------------------------------------------
LR: 0.003000
-----------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▅▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▃▂▂▁▁▁▁▁
valid/batch_f1,▁▅▇▇█▇███
valid/batch_loss,█▃▂▁▁▂▁▂▂
valid/epoch_loss,█▃▂▁▁▂▁▂▂
valid/f1_score,▁▅▇▇█▇███

0,1
train/batch_loss,0.467
train/epoch_loss,0.45877
valid/batch_f1,0.78666
valid/batch_loss,2.86782
valid/epoch_loss,2.5983
valid/f1_score,0.71604


[34m[1mwandb[0m: Agent Starting Run: 910lgtxi with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 12
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 0.7
[34m[1mwandb[0m: 	lr: 0.003
[34m[1mwandb[0m: 	lstm_hidden_dim: 512
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.3
[34m[1mwandb[0m: 	sceduler_step: 4


LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   1/12 | time: 46.77s | train_loss: 12.236 | valid_loss: 4.042 | valid_f1_score: 0.567
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   2/12 | time: 49.72s | train_loss: 3.038 | valid_loss: 2.716 | valid_f1_score: 0.633
----------------------------------------------------------------------------------------------------
LR: 0.003000
----------------------------------------------------------------------------------------------------
| epoch   3/12 | time: 48.69s | train_loss: 2.106 | valid_loss: 2.339 | valid_f1_score: 0.677
----------------------------------------------------------------------------------------------------
LR: 0.003000
-----------------------------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▅▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▃▂▂▁▁▁▁▁▁
valid/batch_f1,▁▄▆▆▇▇▇▇█▇
valid/batch_loss,█▃▂▁▁▁▁▂▃▃
valid/epoch_loss,█▃▂▁▁▁▁▂▃▃
valid/f1_score,▁▄▆▆█▇▇▇█▇

0,1
train/batch_loss,0.42795
train/epoch_loss,0.41979
valid/batch_f1,0.79373
valid/batch_loss,2.77466
valid/epoch_loss,2.55026
valid/f1_score,0.72038


## Save the Model

In [None]:
model_save_path = "../../model/my_model_515.pth"

In [None]:
model_save_path = "/content/drive/MyDrive/Colab Notebooks/NLP/nlp2022-hw1/model/my_model_497.pth"

In [None]:
torch.save(model.state_dict(), model_save_path)

```Python
POS

h_params = {
    'vocab_size': len(vocab),
    'vocab_size_pos': len(pos2idx),
    'embed_dim': 300,
    'embed_dim_pos': 50,
    'hidden_dim': 16,
    'num_classes': len(lab2idx),
    'embeddings': pretrained_embeddings,
    'bidirectional': True,
    'num_layers': 3,
    'dropout': 0.5,
    'use_crf': False,  
}

epochs = 10 # number of epochs
lr = 0.001  # learning rate
bacth_size = 32 # batch size for training
clipping = 0.5 

optimizer = Adam

F1 = 0.441
```

### Load the Model

In [None]:
# model.load_state_dict(torch.load(model_save_path, map_location=device))

In [None]:
model

NER_POS_Classifier(
  (embeddings): Embedding(10000, 300, padding_idx=0)
  (lstm): LSTM(300, 300, num_layers=3, batch_first=True, dropout=0.5, bidirectional=True)
  (embeddings_pos): Embedding(18, 100, padding_idx=0)
  (lstm_pos): LSTM(100, 300, num_layers=3, batch_first=True, dropout=0.5, bidirectional=True)
  (linear_pos): Linear(in_features=600, out_features=600, bias=True)
  (linear_word): Linear(in_features=600, out_features=600, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (concat): Linear(in_features=1200, out_features=600, bias=True)
  (concat2): Linear(in_features=600, out_features=600, bias=True)
  (concat3): Linear(in_features=600, out_features=600, bias=True)
  (fc1): Linear(in_features=600, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=150, bias=True)
  (fc3): Linear(in_features=150, out_features=150, bias=True)
  (classifier): Linear(in_features=150, out_features=14, bias=True)
  (relu): ReLU()
)

In [None]:
from seqeval.metrics import f1_score, accuracy_score, classification_report, precision_score, recall_score, performance_measure

valid_eval = DataLoader(valid_dataset, batch_size=32, shuffle=False)

y_true_list = []
y_pred_list = []
viterbi_list = []


for (labels, features, pos_features) in valid_eval:
    predictions = model(features, pos_features)
    
    # This happens in training
    viterbi_mask = labels != lab2idx[pad_token]
    viterbi_pred = flat_list(model.crf.viterbi_decode(predictions, mask=viterbi_mask))
    
    labels = labels.view(-1)
    predictions = predictions.view(-1, predictions.shape[-1])
    
    # The following happend in compute loss
    mask = labels != lab2idx[pad_token]
    labels = labels[mask].tolist()

    y_true = [idx2lab[l] for l in labels]
    y_true_list.append(y_true)
    
    viterbi_pred_labels = [idx2lab[l] for l in viterbi_pred]
    viterbi_list.append(viterbi_pred_labels)
        
    predictions = predictions.argmax(1)
    predictions = predictions[mask].tolist()
    
    y_pred = [idx2lab[l] for l in predictions]
    y_pred_list.append(y_pred)
        
print(f"Viterbi: {f1_score(y_true_list, viterbi_list, average='macro')}")
print(f"Argamax: {f1_score(y_true_list, y_pred_list, average='macro')}")
print(f"Accuracy: {accuracy_score(y_true_list, viterbi_list)}")

0.4595238095238095

In [None]:
for idx, sentence in enumerate(valid_sentences):
    print("Sentence True Pred Viterbi")
    for i_token, token in enumerate(sentence):
        print(token, valid_labels[idx][i_token], y_pred_list[idx][i_token], viterbi_list[idx][i_token])
    print()

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from pretty_confusion_matrix import pp_matrix

cf_matrix = confusion_matrix(flat_list(y_true_list), flat_list(viterbi_list), labels=labels)

# get pandas dataframe
df_cm = pd.DataFrame(cf_matrix, index = [i for i in labels],
                     columns = [i for i in labels])
# colormap: see this and choose your more dear
cmap = 'PuRd'
# GnBu, YlGnBu
pp_matrix(df_cm, cmap="cividis", figsize=(13,8), annot=True)

# With POS

In [None]:
class NER_Dataset(Dataset):
    def __init__(self, sentences, sentences_labels, sentences_pos=None, sentences_dep=None):
        
        assert len(sentences) == len(sentences_labels), \
                    "Inputs must be of the same length"
        
#         self.data = []
        
        self.sentences = sentences
        self.labels = sentences_labels
        
        self.sentences_lengths = [len(s) for s in sentences]
            
        self.Y = self._from_sequence_to_idx(sentences_labels, lab2idx, pad_token)
        self.X = self._from_sequence_to_idx(sentences, vocab, pad_token, unk_token)
        
#         self.data = list(zip(self.Y, self.X, self.sentences_lengths))
        
        # POS
        if sentences_pos is not None:

            assert len(sentences) == len(sentences_pos), \
                    "Inputs must be of the same length"
            
            
            self.pos = sentences_pos
            self.X_pos = self._from_sequence_to_idx(sentences_pos, pos2idx, pad_token)
            
#             self.data = list(zip(self.Y, self.X, self.sentences_lengths, self.X_pos))
            
            # Dependencies
            if sentences_dep is not None:
                assert len(sentences) == len(sentences_dep), \
                    "Inputs must be of the same length"
                
                self.dep = sentences_dep
                self.X_dep = self._from_sequence_to_idx(sentences_dep, dep_vocab, pad_token, unk_token)
#                 self.data = list(zip(self.Y, self.X, self.sentences_lengths, self.X_pos, self.X_dep))

    def _from_sequence_to_idx(self, sequences_list, vocab, pad_token, unk_token = None):
        sequences_idx = []
        
        if unk_token is not None: # For words
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token, vocab[unk_token]) for token in sentence])
        else: # For labels
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token) for token in sentence])
        
        return sequences_idx    
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.Y[idx], self.X[idx], self.sentences_lengths[idx], self.X_pos[idx]
#         return self.data[idx]

In [None]:
train_dataset = NER_Dataset(train_sentences, train_labels, train_pos)
valid_dataset = NER_Dataset(valid_sentences, valid_labels, valid_pos)

In [None]:
train_dataset[0]

([1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 6, 1],
 [18, 220, 205, 102, 113, 6, 11206, 4, 8, 144, 6, 2, 257, 212, 2947, 3],
 16,
 [9, 6, 12, 1, 1, 2, 1, 3, 11, 1, 2, 5, 7, 1, 1, 3])

In [None]:
def collate_batch(batch):
    labels_list = []
    features_list = []
    pos_features_list = []

    labels, features, sentences_lengths, pos_features = zip(*batch)
    
    sorted_batch = sorted(zip(labels, features, sentences_lengths, pos_features), 
                          key=lambda p: len(p[0]), reverse=True)
    labels, features, sentence_lengths, pos_features = zip(*sorted_batch)
        
    max_length_in_batch = np.max(sentence_lengths)
    
    # Pad sentences and labels to the length of the longest sequence in the batch
    for idx, feature in enumerate(features):
        features_list.append(pad_sequence(feature, max_length_in_batch, vocab[pad_token]))
        labels_list.append(pad_sequence(labels[idx], max_length_in_batch, lab2idx[pad_token]))
        pos_features_list.append(pad_sequence(pos_features[idx], max_length_in_batch, pos2idx[pad_token]))
    

    labels_tensor = torch.LongTensor(labels_list).to(device)
    features_tensor = torch.LongTensor(features_list).to(device)
    pos_features_tensor = torch.LongTensor(pos_features_list).to(device)


    return labels_tensor, features_tensor, sentence_lengths, pos_features_tensor

In [None]:
from TorchCRF import CRF
class NER_POS_Classifier(nn.Module):
    def __init__(self, h_params):
        super().__init__()
        
        # POS embeddings
        self.embeddings_pos = nn.Embedding(h_params['pos_vocab_size'], 
                                            h_params['pos_embed_dim'],
                                            padding_idx=0)

        
        
        self.lstm_pos = nn.LSTM(h_params['pos_embed_dim'], 
                            h_params['pos_lstm_hidden_dim'], 
                            bidirectional=h_params['bidirectional'],
#                             num_layers=h_params['num_layers'],
                            num_layers=2,
                            dropout=h_params['dropout'] if h_params['num_layers'] > 1 else 0,
                            batch_first=True)
        
        
        pos_lstm_output_dim = h_params['pos_lstm_hidden_dim'] if h_params['bidirectional'] is False \
                                else h_params['pos_lstm_hidden_dim'] * 2
        
        # Fasttext
        self.fast_embeddings = self._from_pretrained_embeddings(h_params['fast_embeddings'],
                                                               h_params['vocab_size'],
                                                               h_params['fast_embed_dim'],
                                                               freeze=h_params['freeze_fast'])
            
            

        # Glove
        self.glove_embeddings = self._from_pretrained_embeddings(h_params['glove_embeddings'],
                                                            h_params['vocab_size'],
                                                            h_params['glove_embed_dim'],
                                                            freeze=h_params['freeze_glove'])
        
        lstm_input_dim = h_params['fast_embed_dim'] + h_params['glove_embed_dim'] + pos_lstm_output_dim
        
        # LSTM Word embeddings
        self.lstm = nn.LSTM(lstm_input_dim, 
                            h_params['lstm_hidden_dim'], 
                            bidirectional=h_params['bidirectional'],
                            num_layers=h_params['num_layers'],
                            dropout=h_params['dropout'] if h_params['num_layers'] > 1 else 0,
                            batch_first=True)
        
        
        lstm_output_dim = h_params['lstm_hidden_dim'] if h_params['bidirectional'] is False \
                            else h_params['lstm_hidden_dim'] * 2
        
        
#         self.linear_word = nn.Linear(lstm_output_dim, lstm_output_dim)
        
        self.dropout = nn.Dropout(h_params['dropout'])  

        self.concat = nn.Linear(lstm_output_dim, lstm_output_dim)
#         self.fc1 = nn.Linear(lstm_output_dim, lstm_output_dim)

        self.classifier = nn.Linear(lstm_output_dim, h_params['num_classes'])
        
        self.relu = nn.LeakyReLU()
        
        if h_params['use_crf']:
            self.crf = CRF(h_params['num_classes'])

        self._init_linear_weights()
        
        
    def forward(self, x, x_lengths, x_pos):
        
        x_pos = self.embeddings_pos(x_pos)
        x_pos = self.dropout(x_pos)

        x_pos_dep, _ = self.lstm_pos(x_pos)
        
        
        x_fast = self.fast_embeddings(x)
        x_glove = self.glove_embeddings(x)
        
        x = torch.cat((x_fast, x_glove, x_pos), dim=2)  
        x = self.dropout(x)
        
        x, _ = self.lstm(x)
        x = self.relu(x)
        
        x = self.concat(x)
        x = self.relu(x)

        output = self.classifier(x)

        return output


    def _from_pretrained_embeddings(self, pretrained_embeddings, vocab_size, embed_dim, freeze: bool):
        embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # Get emebeddings from pretrained ones
        embeddings.weight.data.copy_(pretrained_embeddings)
        
        # Freeze embeddings
        embeddings.weight.requires_grad = not freeze 
        
        return embeddings
    
    def _init_linear_weights(self):
        initrange = 0.5

        self.concat.weight.data.uniform_(-initrange, initrange)
        self.concat.bias.data.zero_()

        
        self.classifier.weight.data.uniform_(-initrange, initrange)
        self.classifier.bias.data.zero_()

In [None]:
def train(model, dataloader, h_params, optimizer, criterion, grad_clipping):
    model.train()    
    running_loss = 0.0
    f1_score = 0.0
    viterbi_pred = None
    
    for idx, (labels, features, sentences_lengths, pos_features) in enumerate(dataloader): 
        # Empty gradients
        optimizer.zero_grad()

        # Forward
        predicted_labels = model(features, sentences_lengths, pos_features)

        
        if h_params['use_crf']:
            mask = (labels != lab2idx[pad_token])
            
            log_likelihood = model.crf(predicted_labels, labels, mask=mask)
            # Predictions
            viterbi_pred = flat_list(model.crf.viterbi_decode(predicted_labels, mask=mask))        
                
                
            # The log likelihood is not normalized 
            # (It is not divided by the batch size and it is negative)
            loss = torch.mean(log_likelihood) * -1

            predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
            labels = labels.view(-1)
            
        else:
            predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
            labels = labels.view(-1)
            loss = criterion(predicted_labels, labels)
            

         # Backward  
        loss.backward()
        
        # Gradient Clipping to prevent exploding gradients
        if grad_clipping is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clipping)
        # Update weights 
        optimizer.step()
        
        
        running_loss += loss.item()
#         f1_score += compute_score(predicted_labels, labels, viterbi_pred)
            
    # Loss at the end of the epoch 
    return running_loss/len(dataloader), f1_score/len(dataloader)

In [None]:
def evaluate(model, dataloader, h_params, criterion):
    model.eval()
    valid_loss = 0.0
    f1_score = 0.0
    viterbi_pred = None
    
    with torch.no_grad():
        for idx, (labels, features, sentences_lengths, pos_features) in enumerate(dataloader):
            predicted_labels = model(features, sentences_lengths, pos_features)
            
            if h_params['use_crf']:
                mask = (labels != lab2idx[pad_token])
                log_likelihood = model.crf(predicted_labels, labels, mask=mask) 
                
                viterbi_pred = flat_list(model.crf.viterbi_decode(predicted_labels, mask=mask))        
                
                
                # The log likelihood is not normalized 
                # (It is not divided by the batch size and it is negative)
                loss = torch.mean(log_likelihood) * -1

                predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
                labels = labels.view(-1)
                

            else:
                predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
                labels = labels.view(-1)
                loss = criterion(predicted_labels, labels)
                
               
            valid_loss += loss.item()
            f1_score += compute_score(predicted_labels, labels, viterbi_pred)
            
    return valid_loss/len(dataloader), f1_score/len(dataloader)

In [None]:
sweep_config = {
    'method': 'random'
    }

metric = {
    'name': 'valid/f1_score',
    'goal': 'maximize'   
    }

sweep_config['metric'] = metric

parameters_dict = {
    'lstm_hidden_dim': {
        'values': [128, 256, 512]
        },

#     'dropout': {
#         'distribution': 'uniform',
#         'min': 0.4,
#         'max': 0.6,
#     },
    'dropout': {
        "values": [.3, .4, .5]
    },
    
    'lr': {
        # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0.0008,
        'max': 0.009,
      },

    
    'grad_clipping': {
         # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0.5,
        'max': 2.5
    },
    
    "batch_size": {
            'values': [64, 128, 256]
        },

    "epochs": {
        'values': [15]
    },

    # "freeze_fast": {
    #     "values": [True, False]
    # },
    # "freeze_glove": {
    #     "values": [True, False]
    # },
    "freeze_fast": {
        "values": [True, False]
    },
    "freeze_glove": {
        "values": [True, False]
    },

    "sceduler_step": {
        'distribution': 'int_uniform',
        'min': 5,
        'max': 7
    },
    "sceduler_gamma": {
        'distribution': 'uniform',
        'min': 0.1,
        'max': 0.6
    },
    
    "num_layers": {
        "values": [2,3, 4]
    },
    'pos_lstm_hidden_dim': {
        'values': [128, 256, 512]
    },
    'pos_embed_dim': {
        'values': [50, 100, 150]
    },

    
}

sweep_config['parameters'] = parameters_dict

In [None]:
sweep_id = wandb.sweep(sweep_config, project="NLP_HW01_pos")

Create sweep with ID: jrfqny0i
Sweep URL: https://wandb.ai/florin-ml/NLP_HW01_pos/sweeps/jrfqny0i


In [None]:
import multiprocessing
def start():

    wandb.init(project="NLP_HW01_dep")
    config = wandb.config

    h_params = {
        'pos_vocab_size': len(pos2idx),
        'pos_embed_dim': config.pos_embed_dim,
        'pos_lstm_hidden_dim': config.pos_lstm_hidden_dim,
        'vocab_size': len(vocab),
        'fast_embed_dim': 300,
        'freeze_fast': config.freeze_fast,
        'glove_embed_dim': 100,
        'freeze_glove': config.freeze_glove,
        'lstm_hidden_dim': config.lstm_hidden_dim, 
        'num_classes': len(lab2idx),
        'fast_embeddings': fast_pretrained_embeddings,
        'glove_embeddings': glove_pretrained_embeddings,
        'bidirectional': True,
        'num_layers': config.num_layers,
        'dropout': config.dropout,
        'use_crf': True,  # set to true to test with the Conditional Random Field
    }

    model = NER_POS_Classifier(h_params).to(device)

    # Hyperparameters
    epochs = config.epochs # number of epochs
    lr = config.lr # learning rate .005
    bacth_size = config.batch_size # batch size for training 64
    grad_clipping = config.grad_clipping # for clipping gradients         
        
                                                # ignore the padding class
    criterion = torch.nn.CrossEntropyLoss(ignore_index=lab2idx[pad_token]).to(device)
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.95)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     scheduler = None
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config['sceduler_step'], gamma=config['sceduler_gamma'])
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[6, 9, 13, 20, 22], gamma=0.1)
    # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.002, max_lr=0.01)

    train_dataloader = DataLoader(train_dataset, batch_size=bacth_size, 
                                collate_fn=collate_batch, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=bacth_size, 
                                collate_fn=collate_batch, shuffle=False)

    histories = train_model(model, train_dataloader, valid_dataloader, h_params, 
                            optimizer, criterion, scheduler, grad_clipping, epochs,
                            early_stopping=True, early_stopping_mode="max", 
                            early_stopping_patience=2)

In [None]:
wandb.agent(sweep_id, start, count=1)

[34m[1mwandb[0m: Agent Starting Run: 7bk7q85s with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	freeze_fast: False
[34m[1mwandb[0m: 	freeze_glove: False
[34m[1mwandb[0m: 	grad_clipping: 1.2277695981149055
[34m[1mwandb[0m: 	lr: 0.0012255320716480463
[34m[1mwandb[0m: 	lstm_hidden_dim: 512
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	pos_embed_dim: 50
[34m[1mwandb[0m: 	pos_lstm_hidden_dim: 256
[34m[1mwandb[0m: 	sceduler_gamma: 0.5071660951645122
[34m[1mwandb[0m: 	sceduler_step: 7





VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[34m[1mwandb[0m: [32m[41mERROR[0m Run 7bk7q85s errored: RuntimeError('input.size(-1) must be equal to input_size. Expected 912, got 450')


# With DEP

In [38]:
class NER_DEP_Dataset(Dataset):
    def __init__(self, sentences, sentences_labels, sentences_heads,  sentences_dep):

        assert len(sentences) == len(sentences_labels) == len(sentences_dep), \
                "Inputs must be of the same length"
        
        self.sentences = sentences
        self.labels = sentences_labels
        self.heads = sentences_heads
        self.dep = sentences_dep
        self.sentences_lengths = [len(s) for s in sentences]

        
        self.Y = self._from_sequence_to_idx(sentences_labels, lab2idx, pad_token)
        self.X = self._from_sequence_to_idx(sentences, vocab, pad_token, unk_token)
        
        # DEP
        self.X_heads = self._from_sequence_to_idx(sentences_heads, vocab, pad_token, unk_token)
        self.X_dep = self._from_sequence_to_idx(sentences_dep, dep_vocab, pad_token, unk_token)
    
    def _from_sequence_to_idx(self, sequences_list, vocab, pad_token, unk_token = None):
        sequences_idx = []
        
        if unk_token is not None: # For words
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token, vocab[unk_token]) for token in sentence])
        else: # For labels
            for sentence in sequences_list:
                sequences_idx.append([vocab.get(token) for token in sentence])
        
        return sequences_idx    
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        return self.Y[idx], self.X[idx], self.sentences_lengths[idx], self.X_heads[idx], self.X_dep[idx]

In [39]:
train_dataset = NER_DEP_Dataset(train_sentences, train_labels, train_heads, train_dep)
valid_dataset = NER_DEP_Dataset(valid_sentences, valid_labels, valid_heads, valid_dep)

In [40]:
def collate_batch(batch):
    labels_list = []
    features_list = []
    heads_features_list = []
    dep_features_list = []

    labels, features, sentences_lengths, heads_features, dep_features = zip(*batch)
    
    sorted_batch = sorted(zip(labels, features, sentences_lengths, heads_features, dep_features), 
                          key=lambda p: len(p[0]), reverse=True)
    labels, features, sentence_lengths, heads_features, dep_features = zip(*sorted_batch)
        
    max_length_in_batch = np.max(sentence_lengths)
    
    # Pad sentences and labels to the length of the longest sequence in the batch
    for idx, feature in enumerate(features):
        features_list.append(pad_sequence(feature, max_length_in_batch, vocab[pad_token]))
        labels_list.append(pad_sequence(labels[idx], max_length_in_batch, lab2idx[pad_token]))
        heads_features_list.append(pad_sequence(heads_features[idx], max_length_in_batch, vocab[pad_token]))
        dep_features_list.append(pad_sequence(dep_features[idx], max_length_in_batch, dep_vocab[pad_token]))
        

    labels_tensor = torch.LongTensor(labels_list).to(device)
    features_tensor = torch.LongTensor(features_list).to(device)
    dep_features_tensor = torch.LongTensor(dep_features_list).to(device)
    heads_features_tensor = torch.LongTensor(heads_features_list).to(device)
   

    return labels_tensor, features_tensor, sentence_lengths, heads_features_tensor, dep_features_tensor

In [74]:
from TorchCRF import CRF
class NER_DEP_Classifier(nn.Module):
    def __init__(self, h_params):
        super().__init__()

        # Dependency embeddings
        self.embeddings_dep = nn.Embedding(h_params['dep_vocab_size'], 
                                            h_params['dep_embed_dim'],
                                            padding_idx=0)
        
        
        
#         self.lstm_pos_dep = nn.LSTM(h_params['pos_embed_dim'] + h_params['dep_embed_dim'], 
#                             h_params['pos_dep_lstm_hidden_dim'], 
#                             bidirectional=h_params['bidirectional'],
# #                             num_layers=h_params['num_layers'],
#                             num_layers=2,
#                             dropout=h_params['dropout'] if h_params['num_layers'] > 1 else 0,
#                             batch_first=True)
        
        
#         pos_dep_lstm_output_dim = h_params['pos_dep_lstm_hidden_dim'] if h_params['bidirectional'] is False \
#                                 else h_params['pos_dep_lstm_hidden_dim'] * 2
        
        # Fasttext
        # self.fast_embeddings = self._from_pretrained_embeddings(h_params['fast_embeddings'],
        #                                                        h_params['vocab_size'],
        #                                                        h_params['fast_embed_dim'],
        #                                                        freeze=h_params['freeze_fast'])
            
            

        # Glove
        # self.glove_embeddings = self._from_pretrained_embeddings(h_params['glove_embeddings'],
        #                                                     h_params['vocab_size'],
        #                                                     h_params['glove_embed_dim'],
        #                                                     freeze=h_params['freeze_glove'])

        self.word_embeddings = nn.Embedding(h_params['vocab_size'], 
                                            h_params['word_embed_dim'],
                                            padding_idx=0)
        
#         lstm_input_dim = h_params['fast_embed_dim'] + h_params['glove_embed_dim'] + h_params['dep_embed_dim']

        # lstm_input_dim = h_params['fast_embed_dim'] * 2 + h_params['glove_embed_dim'] * 2 + h_params['dep_embed_dim']
        lstm_input_dim = h_params['word_embed_dim'] * 2 + h_params['dep_embed_dim']
        
        # LSTM Word embeddings
        self.lstm = nn.LSTM(lstm_input_dim, 
                            h_params['lstm_hidden_dim'], 
                            bidirectional=h_params['bidirectional'],
                            num_layers=h_params['num_layers'],
                            dropout=h_params['dropout'] if h_params['num_layers'] > 1 else 0,
                            batch_first=True)
        
        
        lstm_output_dim = h_params['lstm_hidden_dim'] if h_params['bidirectional'] is False \
                            else h_params['lstm_hidden_dim'] * 2
        
        
#         self.linear_word = nn.Linear(lstm_output_dim, lstm_output_dim)
        
        self.dropout = nn.Dropout(h_params['dropout'])  

        self.concat = nn.Linear(lstm_output_dim, lstm_output_dim)
        # self.fc1 = nn.Linear(lstm_output_dim, lstm_output_dim)

        self.classifier = nn.Linear(lstm_output_dim, h_params['num_classes'])
        
        self.relu = nn.LeakyReLU()
        
        if h_params['use_crf']:
            self.crf = CRF(h_params['num_classes'])

        self._init_linear_weights()
        
        
    def forward(self, x, x_lengths, x_heads, x_dep):
        
        # x_fast = self.fast_embeddings(x)
        # x_glove = self.glove_embeddings(x)
        
        # x_fast_heads = self.fast_embeddings(x_heads)
        # x_glove_heads = self.glove_embeddings(x_heads)
        
        x_word = self.word_embeddings(x)
        x_heads = self.word_embeddings(x_heads)
        x_dep = self.embeddings_dep(x_dep)
        
        
        x = torch.cat((x_word, x_heads, x_dep), dim=2)  
        x = self.dropout(x)
        
        x, _ = self.lstm(x)
        x = self.relu(x)
        
        x = self.concat(x)
        x = self.relu(x)
        # x = self.fc1(x)
        # x = self.relu(x)

        output = self.classifier(x)

        return output


    def _from_pretrained_embeddings(self, pretrained_embeddings, vocab_size, embed_dim, freeze: bool):
        embeddings = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        
        # Get emebeddings from pretrained ones
        embeddings.weight.data.copy_(pretrained_embeddings)
        
        # Freeze embeddings
        embeddings.weight.requires_grad = not freeze 
        
        return embeddings
    
    def _init_linear_weights(self):
        initrange = 0.5

        self.concat.weight.data.uniform_(-initrange, initrange)
        self.concat.bias.data.zero_()
        # self.fc1.weight.data.uniform_(-initrange, initrange)
        # self.fc1.bias.data.zero_()

        
        self.classifier.weight.data.uniform_(-initrange, initrange)
        self.classifier.bias.data.zero_()

In [42]:
def train(model, dataloader, h_params, optimizer, criterion, grad_clipping):
    model.train()    
    running_loss = 0.0
    f1_score = 0.0
    viterbi_pred = None
    
    for idx, (labels, features, sentences_lengths, heads_features, dep_features) in enumerate(dataloader): 
        # Empty gradients
        optimizer.zero_grad()

        # Forward
        predicted_labels = model(features, sentences_lengths, heads_features, dep_features)

        
        if h_params['use_crf']:
            mask = (labels != lab2idx[pad_token])
            
            log_likelihood = model.crf(predicted_labels, labels, mask=mask)
            # Predictions
            viterbi_pred = flat_list(model.crf.viterbi_decode(predicted_labels, mask=mask))        
                
                
            # The log likelihood is not normalized 
            # (It is not divided by the batch size and it is negative)
            loss = torch.mean(log_likelihood) * -1

            predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
            labels = labels.view(-1)
            
        else:
            predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
            labels = labels.view(-1)
            loss = criterion(predicted_labels, labels)
            

         # Backward  
        loss.backward()
        
        # Gradient Clipping to prevent exploding gradients
        if grad_clipping is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clipping)
        # Update weights 
        optimizer.step()
        
        
        running_loss += loss.item()
#         f1_score += compute_score(predicted_labels, labels, viterbi_pred)
            
        if idx > 0 and idx % 50 == 0:
            metrics = {"train/batch_loss": running_loss/idx}
            wandb.log(metrics)
            
    # Loss at the end of the epoch 
    return running_loss/len(dataloader), f1_score/len(dataloader)

In [43]:
def evaluate(model, dataloader, h_params, criterion):
    model.eval()
    valid_loss = 0.0
    f1_score = 0.0
    viterbi_pred = None
    
    with torch.no_grad():
        for idx, (labels, features, sentences_lengths, heads_features, dep_features) in enumerate(dataloader):
            predicted_labels = model(features, sentences_lengths, heads_features, dep_features)
            
            if h_params['use_crf']:
                mask = (labels != lab2idx[pad_token])
                log_likelihood = model.crf(predicted_labels, labels, mask=mask) 
                
                viterbi_pred = flat_list(model.crf.viterbi_decode(predicted_labels, mask=mask))        
                
                
                # The log likelihood is not normalized 
                # (It is not divided by the batch size and it is negative)
                loss = torch.mean(log_likelihood) * -1

                predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
                labels = labels.view(-1)
                

            else:
                predicted_labels = predicted_labels.view(-1, predicted_labels.shape[-1])
                labels = labels.view(-1)
                loss = criterion(predicted_labels, labels)
                
               
            valid_loss += loss.item()
            f1_score += compute_score(predicted_labels, labels, viterbi_pred)
            
            if idx > 0 and idx % 10 == 0:
                metrics = {"valid/batch_loss": valid_loss/idx, 
                            "valid/batch_f1": f1_score/idx}
                wandb.log(metrics)
            
    return valid_loss/len(dataloader), f1_score/len(dataloader)

In [77]:
sweep_config = {
    'method': 'random'
    }

metric = {
    'name': 'valid/f1_score',
    'goal': 'maximize'   
    }

sweep_config['metric'] = metric

parameters_dict = {
    'lstm_hidden_dim': {
        'values': [128, 256, 512]
        },

#     'dropout': {
#         'distribution': 'uniform',
#         'min': 0.4,
#         'max': 0.6,
#     },
    'dropout': {
        "values": [.4, .5]
    },
    
    'lr': {
        # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0.003,
        'max': 0.05,
      },

    
    'grad_clipping': {
         # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0.5,
        'max': 2.5
    },
    
    "batch_size": {
            'values': [64, 128, 256]
        },

    "epochs": {
        'values': [15]
    },

    # "freeze_fast": {
    #     "values": [True, False]
    # },
    # "freeze_glove": {
    #     "values": [True, False]
    # },
    "freeze_fast": {
        "values": [True, False]
    },
    "freeze_glove": {
        "values": [True]
    },

    "sceduler_step": {
        'distribution': 'int_uniform',
        'min': 4,
        'max': 7
    },
    "sceduler_gamma": {
        'distribution': 'uniform',
        'min': 0.1,
        'max': 0.6
    },
    
    "num_layers": {
        "values": [2,3]
    },
    # 'pos_dep_lstm_hidden_dim': {
    #     'values': [128, 256, 512]
    # },
    # 'pos_embed_dim': {
    #     'values': [50, 100, 150]
    # },
    'dep_embed_dim': {
        'values': [50, 80, 100]
    }
    
}

sweep_config['parameters'] = parameters_dict

In [78]:
sweep_id = wandb.sweep(sweep_config, project="NLP_HW01_heads")

Create sweep with ID: xxxaknqt
Sweep URL: https://wandb.ai/florin-ml/NLP_HW01_heads/sweeps/xxxaknqt


In [79]:
import multiprocessing
def start():

    wandb.init(project="NLP_HW01_heads")
    config = wandb.config

    h_params = {
        # 'pos_vocab_size': len(pos2idx),
        # 'pos_embed_dim': config.pos_embed_dim,
        'word_embed_dim': 300,
        'dep_vocab_size': len(dep_vocab),
        'dep_embed_dim': config.dep_embed_dim,
        # 'pos_dep_lstm_hidden_dim': 200,
        'vocab_size': len(vocab),
        'fast_embed_dim': 300,
        'freeze_fast': config.freeze_fast,
        'glove_embed_dim': 100,
        'freeze_glove': config.freeze_glove,
        'lstm_hidden_dim': config.lstm_hidden_dim, 
        'num_classes': len(lab2idx),
        'fast_embeddings': fast_pretrained_embeddings,
        'glove_embeddings': glove_pretrained_embeddings,
        'bidirectional': True,
        'num_layers': config.num_layers,
        'dropout': config.dropout,
        'use_crf': True,  # set to true to test with the Conditional Random Field
    }

    model = NER_DEP_Classifier(h_params).to(device)

    # Hyperparameters
    epochs = config.epochs # number of epochs
    lr = config.lr # learning rate .005
    bacth_size = config.batch_size # batch size for training 64
    grad_clipping = config.grad_clipping # for clipping gradients         
        
                                                # ignore the padding class
    criterion = torch.nn.CrossEntropyLoss(ignore_index=lab2idx[pad_token]).to(device)
    # optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.95)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     scheduler = None
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config['sceduler_step'], gamma=config['sceduler_gamma'])
    # scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[6, 9, 13, 20, 22], gamma=0.1)
    # scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.002, max_lr=0.01)

    train_dataloader = DataLoader(train_dataset, batch_size=bacth_size, 
                                collate_fn=collate_batch, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=bacth_size, 
                                collate_fn=collate_batch, shuffle=False)

    histories = train_model(model, train_dataloader, valid_dataloader, h_params, 
                            optimizer, criterion, scheduler, grad_clipping, epochs,
                            early_stopping=True, early_stopping_mode="max", 
                            early_stopping_patience=2)

In [None]:
wandb.agent(sweep_id, start, count=10)

[34m[1mwandb[0m: Agent Starting Run: 4hob6eiq with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dep_embed_dim: 50
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: True
[34m[1mwandb[0m: 	grad_clipping: 2.2733687257998825
[34m[1mwandb[0m: 	lr: 0.02197182725552028
[34m[1mwandb[0m: 	lstm_hidden_dim: 512
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	sceduler_gamma: 0.24717601569619063
[34m[1mwandb[0m: 	sceduler_step: 5


Before: [0.02197182725552028]
After: [0.02197182725552028]
----------------------------------------------------------------------------------------------------
| epoch   1/15 | time: 46.87s | train_loss: 56.061 | valid_loss: 5.026 | valid_f1_score: 0.230
----------------------------------------------------------------------------------------------------
Before: [0.02197182725552028]
After: [0.02197182725552028]
----------------------------------------------------------------------------------------------------
| epoch   2/15 | time: 47.06s | train_loss: 4.771 | valid_loss: 4.876 | valid_f1_score: 0.290
----------------------------------------------------------------------------------------------------
Before: [0.02197182725552028]
After: [0.02197182725552028]
----------------------------------------------------------------------------------------------------
| epoch   3/15 | time: 46.38s | train_loss: 4.273 | valid_loss: 4.168 | valid_f1_score: 0.323
-----------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▅▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid/batch_f1,▁▃▄▅▅▇▇▇▇██████
valid/batch_loss,█▇▃▄▂▂▃▂▂▁▁▁▁▁▁
valid/epoch_loss,█▇▃▄▂▂▃▂▂▁▁▁▁▁▁
valid/f1_score,▁▃▄▅▅▇▇▇▇██████

0,1
train/batch_loss,3.02769
train/epoch_loss,3.00277
valid/batch_f1,0.48461
valid/batch_loss,4.05628
valid/epoch_loss,3.7394
valid/f1_score,0.4339


[34m[1mwandb[0m: Agent Starting Run: 9obzjvle with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dep_embed_dim: 80
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: True
[34m[1mwandb[0m: 	grad_clipping: 1.6456588506393135
[34m[1mwandb[0m: 	lr: 0.0384939920519963
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	sceduler_gamma: 0.1791358344785207
[34m[1mwandb[0m: 	sceduler_step: 4


Before: [0.0384939920519963]
After: [0.0384939920519963]
----------------------------------------------------------------------------------------------------
| epoch   1/15 | time: 24.47s | train_loss: 52.246 | valid_loss: 4.903 | valid_f1_score: 0.243
----------------------------------------------------------------------------------------------------
Before: [0.0384939920519963]
After: [0.0384939920519963]
----------------------------------------------------------------------------------------------------
| epoch   2/15 | time: 24.32s | train_loss: 4.868 | valid_loss: 4.663 | valid_f1_score: 0.274
----------------------------------------------------------------------------------------------------
Before: [0.0384939920519963]
After: [0.0384939920519963]
----------------------------------------------------------------------------------------------------
| epoch   3/15 | time: 24.69s | train_loss: 4.522 | valid_loss: 4.632 | valid_f1_score: 0.267
-----------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▁▁▁▁▁▁▁▁▁
valid/epoch_loss,█▆▆▃▃▃▂▂▁▁
valid/f1_score,▁▃▂▄▆▇▇▇██

0,1
train/batch_loss,3.59477
train/epoch_loss,3.5546
valid/epoch_loss,4.05756
valid/f1_score,0.36651


[34m[1mwandb[0m: Agent Starting Run: nlp7bii6 with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	dep_embed_dim: 80
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	freeze_fast: False
[34m[1mwandb[0m: 	freeze_glove: True
[34m[1mwandb[0m: 	grad_clipping: 1.674387667085155
[34m[1mwandb[0m: 	lr: 0.008020175887470397
[34m[1mwandb[0m: 	lstm_hidden_dim: 512
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.22413838116415755
[34m[1mwandb[0m: 	sceduler_step: 7


Before: [0.008020175887470397]
After: [0.008020175887470397]
----------------------------------------------------------------------------------------------------
| epoch   1/15 | time: 46.82s | train_loss: 107.963 | valid_loss: 8.961 | valid_f1_score: 0.150
----------------------------------------------------------------------------------------------------
Before: [0.008020175887470397]
After: [0.008020175887470397]
----------------------------------------------------------------------------------------------------
| epoch   2/15 | time: 46.38s | train_loss: 7.698 | valid_loss: 5.678 | valid_f1_score: 0.325
----------------------------------------------------------------------------------------------------
Before: [0.008020175887470397]
After: [0.008020175887470397]
----------------------------------------------------------------------------------------------------
| epoch   3/15 | time: 46.82s | train_loss: 5.053 | valid_loss: 4.440 | valid_f1_score: 0.411
----------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid/epoch_loss,█▄▃▂▁▁▁▁▁▁▁▁▁▁▁
valid/f1_score,▁▄▅▆▇▇▇▇███████

0,1
train/batch_loss,1.45594
train/epoch_loss,1.42387
valid/epoch_loss,3.12092
valid/f1_score,0.62


[34m[1mwandb[0m: Agent Starting Run: op1l9ozv with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dep_embed_dim: 50
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	freeze_fast: False
[34m[1mwandb[0m: 	freeze_glove: True
[34m[1mwandb[0m: 	grad_clipping: 0.5797313267306425
[34m[1mwandb[0m: 	lr: 0.02031017708460951
[34m[1mwandb[0m: 	lstm_hidden_dim: 128
[34m[1mwandb[0m: 	num_layers: 2
[34m[1mwandb[0m: 	sceduler_gamma: 0.39576837427723854
[34m[1mwandb[0m: 	sceduler_step: 4


Before: [0.02031017708460951]
After: [0.02031017708460951]
----------------------------------------------------------------------------------------------------
| epoch   1/15 | time: 32.47s | train_loss: 7.875 | valid_loss: 4.679 | valid_f1_score: 0.313
----------------------------------------------------------------------------------------------------
Before: [0.02031017708460951]
After: [0.02031017708460951]
----------------------------------------------------------------------------------------------------
| epoch   2/15 | time: 32.62s | train_loss: 4.508 | valid_loss: 4.136 | valid_f1_score: 0.365
----------------------------------------------------------------------------------------------------
Before: [0.02031017708460951]
After: [0.02031017708460951]
----------------------------------------------------------------------------------------------------
| epoch   3/15 | time: 32.75s | train_loss: 4.029 | valid_loss: 4.042 | valid_f1_score: 0.385
------------------------------------

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▆▅▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch_loss,█▄▃▃▂▂▂▂▁▁▁▁▁▁
valid/batch_f1,▁▃▄▄▅▆▆▆▇▇████
valid/batch_loss,█▅▄▄▃▃▂▂▂▁▁▁▁▁
valid/epoch_loss,█▅▄▄▃▃▂▂▁▁▁▁▁▁
valid/f1_score,▁▃▄▄▅▆▆▆▇▇█▇██

0,1
train/batch_loss,2.54662
train/epoch_loss,2.51388
valid/batch_f1,0.55165
valid/batch_loss,3.76796
valid/epoch_loss,3.46737
valid/f1_score,0.49388


[34m[1mwandb[0m: Agent Starting Run: 0c3awfis with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dep_embed_dim: 80
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	freeze_fast: False
[34m[1mwandb[0m: 	freeze_glove: True
[34m[1mwandb[0m: 	grad_clipping: 1.4947224804989194
[34m[1mwandb[0m: 	lr: 0.030992680535721875
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.5861788463667293
[34m[1mwandb[0m: 	sceduler_step: 4


Before: [0.030992680535721875]
After: [0.030992680535721875]
----------------------------------------------------------------------------------------------------
| epoch   1/15 | time: 28.85s | train_loss: 65.885 | valid_loss: 9.236 | valid_f1_score: 0.029
----------------------------------------------------------------------------------------------------
Non ci siamo...



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/batch_loss,█▁
train/epoch_loss,▁
valid/epoch_loss,▁
valid/f1_score,▁

0,1
train/batch_loss,73.82886
train/epoch_loss,65.88549
valid/epoch_loss,9.23599
valid/f1_score,0.0295


[34m[1mwandb[0m: Agent Starting Run: gdt9amfw with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dep_embed_dim: 80
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 15
[34m[1mwandb[0m: 	freeze_fast: True
[34m[1mwandb[0m: 	freeze_glove: True
[34m[1mwandb[0m: 	grad_clipping: 0.9751372204077626
[34m[1mwandb[0m: 	lr: 0.012336724404986217
[34m[1mwandb[0m: 	lstm_hidden_dim: 256
[34m[1mwandb[0m: 	num_layers: 3
[34m[1mwandb[0m: 	sceduler_gamma: 0.3054407573284337
[34m[1mwandb[0m: 	sceduler_step: 4


Before: [0.012336724404986217]
After: [0.012336724404986217]
----------------------------------------------------------------------------------------------------
| epoch   1/15 | time: 41.59s | train_loss: 20.296 | valid_loss: 5.657 | valid_f1_score: 0.200
----------------------------------------------------------------------------------------------------
Before: [0.012336724404986217]
After: [0.012336724404986217]
----------------------------------------------------------------------------------------------------
| epoch   2/15 | time: 40.48s | train_loss: 4.883 | valid_loss: 4.645 | valid_f1_score: 0.273
----------------------------------------------------------------------------------------------------
Before: [0.012336724404986217]
After: [0.012336724404986217]
----------------------------------------------------------------------------------------------------
| epoch   3/15 | time: 40.64s | train_loss: 4.128 | valid_loss: 4.327 | valid_f1_score: 0.349
-----------------------------