In [0]:
import ast
import pickle
import random
import pandas as pd
import numpy as np
from copy import deepcopy
from datetime import datetime

import flair
from flair.data import Sentence
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import BertEmbeddings
from flair.embeddings import XLMEmbeddings

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from tqdm import tnrange, tqdm_notebook, tqdm

from sklearn.metrics import f1_score, precision_score, recall_score

In [0]:
# tools for the extraction of pretrained embeddings and linguistic features for each recipe, loading data, and calculation of metrics

def initialize_embeddings(embeddings):
    """Build the stacked pretrained embedder for the requested embedding names.

    Args:
        embeddings: iterable of embedding names. Recognised names are
            'fasttext', 'flair', 'xlm', 'xlm-multi', 'bert' and 'camembert';
            any other name is skipped.

    Returns:
        A ``(stacked_embeddings, emb_dim)`` pair. As soon as 'camembert' is
        encountered the function returns ``(None, 768)`` without building a
        stack. Otherwise ``stacked_embeddings`` is a ``StackedEmbeddings`` in
        eval mode and ``emb_dim`` is the length of the vector it produces for
        the first token of a probe sentence.
    """

    # Constructors are wrapped in lambdas so that a model is only loaded when
    # its name is actually requested.
    builders = {
        'fasttext': lambda: [WordEmbeddings('fr')],
        'flair': lambda: [FlairEmbeddings('fr-forward'), FlairEmbeddings('fr-backward')],
        'xlm': lambda: [XLMEmbeddings('xlm-mlm-enfr-1024')],
        'xlm-multi': lambda: [XLMEmbeddings('xlm-mlm-tlm-xnli15-1024', pooling_operation='last')],
        'bert': lambda: [BertEmbeddings('bert-base-multilingual-cased', layers='-1')],
    }

    emb_list = []
    for name in embeddings:
        if name == 'camembert':
            # CamemBERT is not part of the stack; only its dimension is reported.
            return None, 768
        if name in builders:
            emb_list.extend(builders[name]())

    stacked_embeddings = StackedEmbeddings(embeddings=emb_list).eval()

    # Embed a throw-away sentence to read the resulting vector size.
    probe = Sentence('this is to extract embedding dimension!')
    stacked_embeddings.embed(probe)
    emb_dim = len(probe[0].embedding)

    return stacked_embeddings, emb_dim
  
  
  
def dataset_creator(data, stacked_embeddings, emb_dim, maxlen):
    """Turn every recipe of ``data`` into a tuple holding its token embeddings.

    The text of a recipe is its title followed by ', ' and its preparation
    (each part is used only when it is a string).

    Args:
        data: frame-like object exposing ``titre``, ``preparation``, ``niveau``
            and ``plat``, each indexable by the positions ``0..len(data)-1``.
        stacked_embeddings: embedder returned by ``initialize_embeddings``
            (unused on the CamemBERT path).
        emb_dim: width of one token embedding.
        maxlen: maximum number of whitespace-separated words (CamemBERT path)
            or tokens (flair path) kept per recipe.

    Returns:
        A list of ``(i, embeddings, n_rows, niveau, plat)`` tuples, where
        ``embeddings`` has one row per token and ``n_rows`` is its first
        dimension.

    Note:
        Reads the module-level names ``embeddings`` and ``camembert``, and
        allocates the accumulator with ``.cuda()`` unconditionally.
    """

    dataset = []
    for i in tqdm_notebook(range(len(data))):
        # empty (0 x emb_dim) accumulator for the token vectors
        sample = torch.zeros(0, emb_dim).cuda()

        title = data.titre[i]
        preparation = data.preparation[i]

        text = ''
        if type(title) == str:
            text = title + ', '
        if type(preparation) == str:
            text = text + preparation

        if embeddings == ['camembert']:
            limited = ' '.join(text.split(' ')[:maxlen])
            tokens = camembert.encode(limited)
            emb = camembert.extract_features(tokens).squeeze().cpu()
            dataset.append((i, emb, emb.shape[0], data.niveau[i], data.plat[i]))
            continue

        # Re-build a sentence from the surface form of the first `maxlen` tokens.
        kept_tokens = Sentence(text)[:maxlen]
        sentence = Sentence(' '.join(str(tok).split(' ')[-1] for tok in kept_tokens))
        stacked_embeddings.embed(sentence)

        # Stack the embedding of every token as one row of the sample.
        rows = [token.embedding.view(-1, emb_dim) for token in sentence]
        sample = torch.cat([sample] + rows, 0)
        dataset.append((i, sample, sample.shape[0], data.niveau[i], data.plat[i]))

    return dataset
  
  
  
def add_features(df, data, features):
    """Prefix every embedded sample with its recipe ID and linguistic features.

    Args:
        df: frame-like object whose ``ID`` is indexable by ``0..len(data)-1``.
        data: list of sample tuples; the first element of each tuple is dropped.
        features: mapping from recipe ID to a sequence of feature values.

    Returns:
        A list of tuples ``(recipe_id, feature_tensor, *sample[1:])``, one per
        entry of ``data`` and in the same order.
    """

    dataset = []
    for row in tqdm_notebook(range(len(data))):
        remainder = data[row][1:]
        recipe_id = df.ID[row]
        feature_vec = torch.FloatTensor(features[recipe_id])
        dataset.append((recipe_id, feature_vec) + remainder)

    return dataset
  
  
  
def dataloader(data, batch_size, maxlen, emb_dim):
    """Shuffle ``data`` and collate it into padded mini-batches.

    Args:
        data: list of samples laid out as
            ``(id, features, embeddings, length, label_1, label_2)`` where
            ``embeddings`` is a ``(length, emb_dim)`` tensor.
        batch_size: maximum number of samples per batch.
        maxlen: sequences are truncated to this many time steps.
        emb_dim: width of one token embedding.

    Returns:
        A list of batches. Each batch is the list
        ``[ids, features, embeddings, lengths, labels_1, labels_2]`` with
        ``embeddings`` shaped ``(time, batch, emb_dim)`` and the samples sorted
        by decreasing length (the order ``pack_padded_sequence`` expects).
        An empty ``data`` yields an empty list.
    """

    # A shallow copy is enough: only the order is shuffled and the samples are
    # never modified, so the input list is left untouched without paying for a
    # deep copy of every tensor on each epoch.
    temp_data = list(data)
    random.shuffle(temp_data)

    dl = []
    for start in range(0, len(temp_data), batch_size):
        batch = sorted(temp_data[start:start + batch_size], key=lambda x: x[3], reverse=True)
        n_samples = len(batch)

        # IDs
        ids = torch.tensor([sample[0] for sample in batch])

        # linguistic features; the width is read from the data instead of
        # being fixed to 77 so that other feature sets can be batched too
        feats = torch.zeros(n_samples, len(batch[0][1]))
        for j, sample in enumerate(batch):
            feats[j, :] = sample[1]

        # Embeddings, zero-padded up to the (truncated) longest sample
        embs = torch.zeros(n_samples, min(maxlen, batch[0][3]), emb_dim)
        for j, sample in enumerate(batch):
            steps = min(maxlen, sample[3])
            embs[j, :steps, :] = sample[2][:steps, :]
        embs = embs.permute(1, 0, 2)  # -> (time, batch, emb_dim)

        # Lengths
        lengths = torch.tensor([min(sample[3], maxlen) for sample in batch])

        # Labels
        labels_1 = torch.tensor([sample[4] for sample in batch])
        labels_2 = torch.tensor([sample[5] for sample in batch])

        dl.append([ids, feats, embs, lengths, labels_1, labels_2])

    return dl


def average_metrics(true, pred):
    """Compute micro/macro precision, recall and F1 plus per-label F1.

    Args:
        true: gold labels.
        pred: predicted labels.

    Returns:
        The tuple ``(micro_precision, micro_recall, micro_f1, macro_precision,
        macro_recall, macro_f1, new_macro_f1, label_wise)``. ``new_macro_f1``
        is the harmonic mean of macro precision and macro recall, and
        ``label_wise`` holds one F1 value per label.
    """

    micro_precision = precision_score(true, pred, average='micro')
    micro_recall = recall_score(true, pred, average='micro')
    micro_f1 = f1_score(true, pred, average='micro')
    macro_precision = precision_score(true, pred, average='macro')
    macro_recall = recall_score(true, pred, average='macro')
    macro_f1 = f1_score(true, pred, average='macro')

    # Guard the harmonic mean: when no prediction is correct both macro scores
    # are 0 and the unguarded division yields NaN or raises.
    pr_sum = macro_precision + macro_recall
    new_macro_f1 = (2 * macro_precision * macro_recall) / pr_sum if pr_sum > 0 else 0.0

    label_wise = f1_score(true, pred, average=None)

    return (micro_precision, micro_recall, micro_f1, macro_precision, macro_recall, macro_f1, new_macro_f1, label_wise)

In [0]:
# the joint model, composed of a neural and a linguistic sub-model

class Model(nn.Module):
    """Joint classifier: a neural text encoder plus an optional linguistic-feature branch.

    The encoder is chosen with ``hidden_type``:

    * ``'RNN'`` / ``'GRU'`` / ``'LSTM'`` - recurrent encoder, uni- or
      bidirectional according to ``direction``; ``n_hidden`` and ``n_layers``
      are integers.
    * ``'CNN'`` - bank of 2-D convolutions over the embeddings; ``n_hidden`` is
      the list of filter counts and ``n_layers`` the list of filter heights.
    * ``'GRU-CNN'`` - one bidirectional GRU layer (64 units) followed by the
      same convolution bank.
    * ``'transformer'`` - ``nn.TransformerEncoder`` with ``d_model=n_hidden``.

    The encoder output is pooled with every strategy listed in ``pool_type``
    ('average', 'max', 'attention', and for recurrent encoders also
    'np_attention' and 'last'). 'max' contributes twice the encoder width
    because it pools ``relu([h, -h])``.

    The module-level globals ``use_features`` and ``num_features`` are read
    once, at construction time: when ``use_features`` is true the feature
    vector goes through a linear layer and is concatenated to the pooled
    representation before the output layer.

    NOTE(review): ``emb_dropout`` only configures ``self.embedding_dropout``;
    ``forward`` never applies that layer to the input.
    """

    _RECURRENT_TYPES = ('RNN', 'GRU', 'LSTM')
    _SUPPORTED_TYPES = ('GRU-CNN', 'transformer', 'CNN') + _RECURRENT_TYPES

    def __init__(self, emb_dim, hidden_type, n_hidden, n_layers, pool_type, n_out, direction, dropout, emb_dropout):
        super().__init__()

        # Fail early instead of building a model without encoder/output layer.
        if hidden_type not in self._SUPPORTED_TYPES:
            raise ValueError('unsupported hidden_type: %r' % (hidden_type,))
        if hidden_type in self._RECURRENT_TYPES and direction not in ('unidirectional', 'bidirectional'):
            raise ValueError('unsupported direction: %r' % (direction,))

        self.embedding_dim, self.hidden_type, self.n_hidden, self.n_layers = emb_dim, hidden_type, n_hidden, n_layers
        self.n_out, self.direction, self.dropout, self.emb_dropout = n_out, direction, dropout, emb_dropout
        # A single pooling name is accepted and wrapped into a list.
        self.pool_type = list(pool_type) if isinstance(pool_type, (list, tuple)) else [pool_type]

        self.embedding_dropout = nn.Dropout(p=self.emb_dropout)
        self.relu = nn.ReLU()

        # Snapshot the notebook-level switches so that forward() stays
        # consistent with the layers built here.
        self.use_features = bool(use_features)
        self.n_features = num_features if self.use_features else 0
        if self.use_features:
            self.linear = nn.Linear(num_features, num_features)
        self.dropout1 = nn.Dropout(dropout)

        if self.hidden_type == 'GRU-CNN':
            self.number_of_hidden = 64
            self.number_of_layers = 1
            self.hidden1 = nn.GRU(input_size=self.embedding_dim, hidden_size=self.number_of_hidden, num_layers=self.number_of_layers, dropout=self.dropout, bidirectional=True)
            self._build_conv_bank(2 * self.number_of_hidden)
            pooled_width = sum(self.n_hidden)

        elif self.hidden_type == 'transformer':
            self.hidden = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=n_hidden, nhead=6), num_layers=n_layers)
            self.attn = nn.Linear(self.n_hidden, 1)
            pooled_width = self.n_hidden

        elif self.hidden_type == 'CNN':
            self._build_conv_bank(self.embedding_dim)
            pooled_width = sum(self.n_hidden)

        else:
            bidirectional = self.direction == 'bidirectional'
            rnn_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[self.hidden_type]
            self.hidden = rnn_cls(input_size=self.embedding_dim, hidden_size=self.n_hidden, num_layers=self.n_layers, dropout=self.dropout, bidirectional=bidirectional)
            pooled_width = (2 if bidirectional else 1) * self.n_hidden
            self.attn = nn.Linear(pooled_width, 1)

        # 'max' pooling is twice as wide as the others, hence the extra slot.
        n_slots = len(self.pool_type) + (1 if 'max' in self.pool_type else 0)
        self.out = nn.Linear(n_slots * pooled_width + self.n_features, self.n_out)

    def _build_conv_bank(self, in_width):
        """Create one convolution and one attention scorer per filter size."""
        convs = []
        for i in range(len(self.n_hidden)):
            convs.append(nn.Conv2d(in_channels=1, out_channels=self.n_hidden[i], kernel_size=(self.n_layers[i], in_width), padding=(self.n_layers[i] // 2, 0)))
        self.hidden = nn.ModuleList(convs)
        self.attn = nn.ModuleList([nn.Linear(self.n_hidden[i], 1) for i in range(len(self.n_hidden))])

    @staticmethod
    def _padding_mask(lengths, max_len, device):
        """Return a (batch, max_len) boolean mask that is True on padded steps."""
        lengths = torch.as_tensor(lengths, device=device)
        return torch.arange(max_len, device=device)[None, :] >= lengths[:, None]

    def _avg_max_pool(self, hidden_out):
        """Average- and max-pool a (time, batch, width) tensor over time."""
        batch = hidden_out.size(1)
        avg_pool = F.adaptive_avg_pool1d(hidden_out.permute(1, 2, 0), 1).view(batch, -1)
        max_pool = F.adaptive_max_pool1d(self.relu(torch.cat([hidden_out, -hidden_out], dim=-1)).permute(1, 2, 0), 1).view(batch, -1)
        return avg_pool, max_pool

    def deactivate_dropout(self):
        """Replace the embedding dropout layer with a no-op (p=0)."""
        self.embedding_dropout = nn.Dropout(p=0)

    def activate_dropout(self):
        """Restore the embedding dropout layer to the configured probability."""
        self.embedding_dropout = nn.Dropout(p=self.emb_dropout)

    def init_hidden(self, batch_size, gpu = False):
        """Return a zero initial state for the recurrent encoder.

        The tensor is placed on the GPU when ``gpu`` is true. ``None`` is
        returned when the model is neither 'GRU-CNN' nor configured with a
        known ``direction``.
        """
        if self.hidden_type == 'GRU-CNN':
            shape = (2 * self.number_of_layers, batch_size, self.number_of_hidden)
        elif self.direction == 'unidirectional':
            shape = (self.n_layers, batch_size, self.n_hidden)
        elif self.direction == 'bidirectional':
            shape = (2 * self.n_layers, batch_size, self.n_hidden)
        else:
            return None

        state = torch.zeros(*shape)
        return state.cuda() if gpu else state

    def attention(self, hidden_out, lengths=None, n=None):
        """Learned attention pooling over time.

        Args:
            hidden_out: tensor of shape (time, batch, width).
            lengths: optional true lengths of the sequences; padded steps get
                a score of -inf so that they receive zero weight.
            n: index of the scorer to use when ``self.attn`` is a module list
                (convolutional encoders); ``None`` for a single scorer.

        Returns:
            Tensor of shape (batch, width).
        """
        scorer = self.attn if n is None else self.attn[n]
        attn_out = scorer(hidden_out)
        if lengths is not None:
            # Built on the scores' device and sized from the tensor itself, so
            # it works for numpy or tensor lengths on CPU and GPU alike.
            mask = self._padding_mask(lengths, hidden_out.size(0), attn_out.device)
            attn_out = attn_out.masked_fill(mask.t().unsqueeze(-1), float('-inf'))
        attn_weights = F.softmax(attn_out, 0)
        new_hidden_state = torch.sum(hidden_out * attn_weights, 0)
        return new_hidden_state

    def np_attention(self, hidden_out, h):
        """Parameter-free dot-product attention.

        Args:
            hidden_out: tensor of shape (batch, time, width).
            h: query of shape (1, batch, width).

        Returns:
            Tensor of shape (batch, width).
        """
        hidden = h.squeeze(0)
        attn_weights = torch.bmm(hidden_out, hidden.unsqueeze(2)).squeeze(2)
        soft_attn_weights = F.softmax(attn_weights, 1)
        new_hidden_state = torch.bmm(hidden_out.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return new_hidden_state

    def conv_and_pool(self, x, conv):
        """Apply ``conv`` followed by ReLU and max-pooling over the last axis."""
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, feats, seq, lengths, gpu = False):
        """Encode a batch and return class log-probabilities.

        Args:
            feats: (batch, num_features) linguistic features; ignored when the
                model was built without features.
            seq: (time, batch, emb_dim) padded embeddings, sorted by
                decreasing length.
            lengths: true length of every sequence in the batch.
            gpu: whether the initial recurrent states are created on the GPU.

        Returns:
            ``(log_probs, pooled)`` where ``log_probs`` has shape
            (batch, n_out) and ``pooled`` is the representation fed to the
            output layer (after dropout).
        """
        bs = seq.size(1)  # batch size
        embs = seq

        if self.hidden_type != 'CNN' and self.hidden_type != 'transformer':
            embs = pack_padded_sequence(embs, lengths)
            self.h = self.init_hidden(bs, gpu)
            self.c = self.init_hidden(bs, gpu)

        if 'CNN' in self.hidden_type:
            if self.hidden_type == 'GRU-CNN':
                conv_in, self.h = self.hidden1(embs, self.h)
                conv_in, lengths = pad_packed_sequence(conv_in)
            else:
                conv_in = embs
            conv_in = conv_in.permute(1, 0, 2).unsqueeze(1)  # (batch, 1, time, width)
            hidden_out = [cnn(conv_in).squeeze(-1).permute(2, 0, 1) for cnn in self.hidden]

            avg_pool, max_pool, attn_pool = [], [], []
            for i, conv_out in enumerate(hidden_out):
                avg_i, max_i = self._avg_max_pool(conv_out)
                avg_pool.append(avg_i)
                max_pool.append(max_i)
                attn_pool.append(self.attention(conv_out, n=i))
            pools = {'average': avg_pool, 'max': max_pool, 'attention': attn_pool}

            pool_output = []
            for p in self.pool_type:
                pool_output += pools.get(p, [])

        else:
            if self.hidden_type == 'transformer':
                padding_mask = self._padding_mask(lengths, embs.size(0), embs.device)
                hidden_out = self.hidden(embs, src_key_padding_mask=padding_mask)
                avg_pool, max_pool = self._avg_max_pool(hidden_out)
                pools = {'average': avg_pool, 'max': max_pool, 'attention': self.attention(hidden_out, lengths=lengths)}
            else:
                if self.hidden_type == 'LSTM':
                    hidden_out, (self.h, self.c) = self.hidden(embs, (self.h, self.c))
                else:
                    hidden_out, self.h = self.hidden(embs, self.h)
                hidden_out, lengths = pad_packed_sequence(hidden_out)

                avg_pool, max_pool = self._avg_max_pool(hidden_out)
                attn_pool = self.attention(hidden_out, lengths=lengths)

                # After unpacking, steps past a sequence's own length are zero
                # padding, so the last real output of sample b sits at index
                # lengths[b] - 1 rather than at hidden_out[-1].
                last_idx = torch.as_tensor(lengths, device=hidden_out.device) - 1
                batch_idx = torch.arange(hidden_out.size(1), device=hidden_out.device)

                if self.direction == 'unidirectional':
                    last_pool = hidden_out[last_idx, batch_idx]
                    np_attn_pool = self.np_attention(hidden_out.permute(1, 0, 2), last_pool.unsqueeze(0))
                else:
                    # The backward direction finishes on the first time step.
                    fwd_last = hidden_out[last_idx, batch_idx, :self.n_hidden]
                    bwd_last = hidden_out[0, :, self.n_hidden:]
                    np_attn_pool = torch.cat((
                        self.np_attention(hidden_out[:, :, :self.n_hidden].permute(1, 0, 2), fwd_last.unsqueeze(0)),
                        self.np_attention(hidden_out[:, :, self.n_hidden:].permute(1, 0, 2), bwd_last.unsqueeze(0)),
                    ), dim=1)
                    last_pool = torch.cat((fwd_last, bwd_last), dim=1)

                pools = {'average': avg_pool, 'max': max_pool, 'np_attention': np_attn_pool,
                         'attention': attn_pool, 'last': last_pool}

            pool_output = [pools[p] for p in self.pool_type if p in pools]

        pool_output = torch.cat(pool_output, dim=1)

        if self.use_features:
            feats_linear = self.linear(feats)
            pool_output = torch.cat((pool_output, feats_linear), 1)

        pool_output1 = self.dropout1(pool_output)
        outp = self.out(pool_output1)

        return F.log_softmax(outp, dim=-1), pool_output1

In [0]:
def adjust_lr(learning_rate, opt, epoch):
    """Set the learning rate of every parameter group of ``opt`` for ``epoch``.

    The first three epochs (``epoch`` 0, 1, 2) use ``learning_rate``; every
    later epoch uses 1e-4.
    """
    lr = learning_rate if epoch < 3 else 1e-4

    for group in opt.param_groups:
        group['lr'] = lr


def _batch_to_device(feats, X, y1):
    """Cast the labels to int64 and move the batch to the GPU when the global ``gpu`` is set."""
    y = y1.long()
    if gpu:
        return feats.cuda(), X.cuda(), y.cuda()
    return feats, X, y


def fit(specs, model, train_data, dev_data, loss_fn, opt, epochs=3, batch_size = 1):
    """Train ``model`` and checkpoint it whenever the dev micro-F1 improves.

    Args:
        specs: dictionary of hyper-parameters; pickled next to the weights and
            used to build the checkpoint file name ('embeddings',
            'hidden_type', 'pool_type' and 'use_features' keys).
        model: the network to train.
        train_data: training samples, in the layout expected by ``dataloader``.
        dev_data: development samples, same layout.
        loss_fn: callable ``loss_fn(log_probs, targets, weight=...)``.
        opt: optimizer; its learning rate is reset every epoch by ``adjust_lr``.
        epochs: number of passes over ``train_data``.
        batch_size: mini-batch size.

    Besides its arguments the function reads the notebook-level globals
    ``learning_rate``, ``maxlen``, ``emb_dim``, ``gpu``, ``clipping`` and
    ``train_weights``. It prints a report per epoch and returns ``None``.
    """

    name = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
#     with open('./path/to/results/folder/' + name + '.txt', 'w') as text_file:
#                 for i in specs:
#                     text_file.write(str(i)+' = '+str(specs[i])+'\n')
#                 text_file.write("\n")

    max_f1 = 0

    for epoch in tqdm_notebook(range(epochs)):

        adjust_lr(learning_rate, opt, epoch)

        train_dl = dataloader(train_data, batch_size, maxlen, emb_dim)
        num_batch = len(train_dl)
        y_true_train = list()
        y_pred_train = list()
        total_loss_train = 0

        y_true_dev = list()
        y_pred_dev = list()
        total_loss_dev = 0

        model = model.train()

        t = tqdm_notebook(iter(train_dl), leave=False, total=num_batch)
        for ids, feats, X, lengths, y1, y2 in t:
            t.set_description(f'Epoch {epoch}')

            feats, X, y = _batch_to_device(feats, X, y1)
            lengths = lengths.numpy()

            opt.zero_grad()
            pred, _ = model(feats, X, lengths, gpu=gpu)
            loss = loss_fn(pred, y, weight=train_weights)
            loss.backward()
            if clipping:
                clip_grad_norm_(model.parameters(), max_norm=5)

            opt.step()

            # .item() yields a plain float, so neither the progress bar nor
            # the running total keeps a tensor alive.
            batch_loss = loss.item()
            t.set_postfix(loss=batch_loss)
            pred_idx = torch.max(pred, dim=1)[1]
            y_true_train.append(y.cpu().numpy())
            y_pred_train.append(pred_idx.cpu().numpy())
            total_loss_train += batch_loss

        y_true_train = np.concatenate(y_true_train, axis=0)
        y_pred_train = np.concatenate(y_pred_train, axis=0)

        train_micro_precision, train_micro_recall, train_micro_f1, train_macro_precision, train_macro_recall, train_macro_f1, train_new_macro_f1, train_list = average_metrics(y_true_train, y_pred_train)
        train_loss = total_loss_train/len(train_dl)

        #dev phase begins
        model.eval()
        dev_dl = dataloader(dev_data, batch_size, maxlen, emb_dim)
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for ids, feats, X, lengths, y1, y2 in tqdm_notebook(dev_dl, leave = False):

                feats, X, y = _batch_to_device(feats, X, y1)

                pred, _ = model(feats, X, lengths.numpy(), gpu = gpu)
                loss = loss_fn(pred, y, weight=train_weights)
                pred_idx = torch.max(pred, dim=1)[1]
                y_true_dev.append(y.cpu().numpy())
                y_pred_dev.append(pred_idx.cpu().numpy())
                total_loss_dev += loss.item()

        y_true_dev = np.concatenate(y_true_dev, axis=0)
        y_pred_dev = np.concatenate(y_pred_dev, axis=0)

        dev_micro_precision, dev_micro_recall, dev_micro_f1, dev_macro_precision, dev_macro_recall, dev_macro_f1, dev_new_macro_f1, dev_list = average_metrics(y_true_dev, y_pred_dev)
        dev_loss = total_loss_dev/len(dev_dl)

        report_string = '-----------------------------------------------------------\n'
        report_string += 'Epoch = %d\n\nTrain:\tLoss = %.3f - Mic_Precision = %.3f - Mic_Recall = %.3f - Mic_F1 = %.3f - Mac_Precision = %.3f - Mac_Recall = %.3f - Mac_F1 = %.3f - New_Mac_F1 = %.3f\n\n' % \
            (epoch+1, train_loss*100, train_micro_precision*100, train_micro_recall*100, train_micro_f1*100, train_macro_precision*100, train_macro_recall*100, train_macro_f1*100, train_new_macro_f1*100)

        report_string += 'Dev:\tLoss = %.3f - Mic_Precision = %.3f - Mic_Recall = %.3f - Mic_F1 = %.3f - Mac_Precision = %.3f - Mac_Recall = %.3f - Mac_F1 = %.3f - New_Mac_F1 = %.3f\n\n' % \
            (dev_loss*100, dev_micro_precision*100, dev_micro_recall*100, dev_micro_f1*100, dev_macro_precision*100, dev_macro_recall*100, dev_macro_f1*100, dev_new_macro_f1*100)

        print(report_string, end='')
        print('Per Class F1 on Dev:\n')
        print(dev_list)
#         with open('./path/to/results/folder/' + name + ".txt", "a") as text_file:
#             text_file.write(report_string)

        # Keep only the best checkpoint (by dev micro-F1).
        if dev_micro_f1 > max_f1:
            max_f1 = dev_micro_f1
            with open('../saved_models/%s-%s-%s-%s.pkl' % (str(specs['embeddings']), str(specs['hidden_type']), str(specs['pool_type']), str(specs['use_features'])), 'wb') as file:
                pickle.dump((model.state_dict(), specs), file)
            print('\nmax_f1 overwritten (%.3f)!' % (max_f1*100))
        else:
            print('\nmax_f1 kept the same (%.3f)!' % (max_f1*100))

In [0]:
# load datasets
# The train/dev CSV files are read relative to the working directory.
# Later cells access the columns ID, titre, preparation, ingredients, cout and niveau.

train = pd.read_csv('./data/train.csv')
dev = pd.read_csv('./data/dev.csv')

In [0]:
# define hyper-parameters
# if you choose CNN as hidden_type, n_hidden is number of filters and n_layers is filter size. Both should be input as list.

# Seed every random number generator used below (batch shuffling, weight
# initialisation, dropout) so that a fresh "Restart & Run All" is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

embeddings = ['bert']
hidden_type = 'CNN'
n_hidden = [250, 50]
n_layers = [2, 3]
pool_type = ['max']
direction = 'bidirectional'
embedding_dropout = 0
dropout = 0.2
clipping = True
learning_rate = 1e-3
batch_size = 32
weight_decay = 0
maxlen = 100
requires_grad = False
n_out = 4

use_features = False
# Train on the GPU when one is available; fall back to the CPU otherwise
# instead of failing on the first .cuda() call.
gpu = torch.cuda.is_available()

# CamemBERT is not part of the flair stack: it is loaded separately, put in
# eval mode and frozen.
if 'camembert' in embeddings:
    camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0')
    camembert.eval()
    for param in camembert.parameters():
        param.requires_grad = False

In [0]:
# initialize embeddings
# Builds the embedder for the names listed in `embeddings` and prints the size of one token vector.
# `stacked_embeddings` is None when 'camembert' is among the requested names.

stacked_embeddings, emb_dim = initialize_embeddings(embeddings)
print(emb_dim)

In [0]:
# load already saved embeddings
# presumably these pickles hold the lists produced by dataset_creator for the
# train and dev splits -- TODO confirm against the code that wrote them.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.

with open('./embeddings/train_emb.pickle', 'rb') as handle:
    train_data = pickle.load(handle)

with open('./embeddings/dev_emb.pickle', 'rb') as handle:
    dev_data = pickle.load(handle)

In [0]:
# extract linguistic features
# Three word lists are read from ./features/ and cleaned up; featurize() uses
# them as the globals `vocab`, `tri` and `verb_groups`.
# NOTE(review): the files are opened without an explicit encoding, so decoding
# depends on the platform default.

with open('./features/distrib_features_from_vocab_bestfirst_greedystepwise_moydiff.txt') as f:
    vocab = f.readlines()

# Drop the 19th entry of the vocabulary list (reason not visible here -- TODO document).
del vocab[18]

# Strip the last three characters of every line
# (presumably a '=<digit>\n' suffix like the one on the n-gram lines -- TODO confirm).
for i in range(len(vocab)):
    vocab[i] = vocab[i][:-3]

    
with open('./features/features_ngrams_bestfirst_greedystepwise.txt') as f:
    tri = f.readlines()

# The fifth entry from the end is overwritten with a hard-coded value
# (presumably to repair a line that does not load correctly -- TODO confirm).
tri[-5] = 'à-l\'eau-bouillante=1\n'

# Strip the 3-character suffix, then turn the '_' and '-' separators into spaces.
for i in range(len(tri)):
    trigram = tri[i][:-3].replace('_',' ').replace('-', ' ')
    tri[i] = trigram

# Drop the 20th n-gram (reason not visible here -- TODO document).
del tri[19]



with open('./features/features_verbs.txt') as f:
    verbs = f.readlines()

# The second index refers to the list *after* the first deletion,
# i.e. entries 4 and 14 of the original file are removed.
del verbs[4]
del verbs[13]

# Sort the verbs into three groups. The group number is the second-to-last
# character of the line (1..3) and the verb is the line without its last three
# characters; this assumes every line ends with a newline.
verb_groups = [[], [], []]
for i in range(len(verbs)):
    idx = int(verbs[i][-2])-1
    verb_groups[idx].append(verbs[i][:-3])
      

def featurize(dataset):
    """Compute the 77 hand-crafted linguistic features of every recipe.

    Layout of one feature vector:

    * ``[0]``     number of whitespace-separated words in the title
    * ``[1]``     number of whitespace-separated words in the preparation
    * ``[2]``     number of ingredients (length of the parsed ``ingredients`` literal)
    * ``[3]``     price level: 1 'Bon marché', 2 'Moyen', 3 'Assez Cher', 0 otherwise
    * ``[4:26]``  1 when the j-th entry of the global ``vocab`` occurs in the preparation
    * ``[26:74]`` 1 when the j-th entry of the global ``tri`` occurs in the preparation
    * ``[74:77]`` number of verbs of each group of the global ``verb_groups``
      that occur in the preparation

    An entry "occurs" when its words appear as consecutive whole tokens of the
    preparation. Missing (non-string) titles, preparations and ingredient
    lists count as empty.

    Args:
        dataset: frame-like object exposing ``ID``, ``titre``, ``preparation``,
            ``ingredients`` and ``cout``, each indexable by ``0..len(dataset)-1``.

    Returns:
        A dictionary mapping every recipe ID to its feature vector (numpy array).
    """

    price_levels = {'Bon marché': 1, 'Moyen': 2, 'Assez Cher': 3}

    def _mentions(padded_text, phrase):
        # Whole-token match: both sides are single-space separated and
        # space-padded, so 'sel' cannot match inside 'selon'.
        return ' ' + ' '.join(phrase.split()) + ' ' in padded_text

    x = {}

    for i in range(len(dataset)):
        ID = dataset.ID[i]
        features = np.zeros((4))

        # A missing title is read back as NaN (a float), not as a string.
        title = dataset.titre[i]
        if isinstance(title, str):
            features[0] = len(title.split())

        recipe = dataset.preparation[i]
        tokens = recipe.split() if isinstance(recipe, str) else []
        features[1] = len(tokens)

        ingredients = dataset.ingredients[i]
        if isinstance(ingredients, str):
            features[2] = len(ast.literal_eval(ingredients))

        features[3] = price_levels.get(dataset.cout[i], 0)

        # Multi-word n-grams can never equal a single token, so entries are
        # matched against the re-joined text rather than the token list.
        padded_text = ' ' + ' '.join(tokens) + ' '

        v = np.zeros((22))
        t = np.zeros((48))
        vg = np.zeros((3))
        if tokens:
            for j in range(len(vocab)):
                if _mentions(padded_text, vocab[j]):
                    v[j] = 1

            for j in range(len(tri)):
                if _mentions(padded_text, tri[j]):
                    t[j] = 1

            for j in range(len(verb_groups)):
                vg[j] = sum(1 for verb in verb_groups[j] if _mentions(padded_text, verb))

        features = np.append(features, v)
        features = np.append(features, t)
        features = np.append(features, vg)

        x[ID] = features

    return x
  

train_features = featurize(train)
dev_features = featurize(dev)

features = {**train_features, **dev_features}
# Every feature vector has the same width, so read it from any entry rather
# than from one hard-coded recipe ID that may not exist in another dataset.
num_features = len(next(iter(features.values())))

# add the linguistic features to the pretrained embeddings that were loaded
# NOTE(review): this rebinds train_data/dev_data, so running the cell twice
# prepends the features twice; reload the pickled embeddings before re-running.
train_data = add_features(train, train_data, features)
dev_data = add_features(dev, dev_data, features)

In [0]:
# define weights for different classes to counter class imbalance
# With linguistic features a fixed weighting is used; otherwise each class is
# weighted by (count of class 0 / count of the class), normalised to sum to 1.

if use_features:
    train_weights = torch.FloatTensor([0.1, 0.1, 0.2, 0.6])
else:
    class_counts = np.bincount(list(train.niveau))
    inverse_freq = class_counts[0] / class_counts
    train_weights = torch.FloatTensor(inverse_freq / inverse_freq.sum())

if gpu == True:
    train_weights = train_weights.cuda()
train_weights

In [0]:
# start training using the hyper-parameters defined above

# Everything needed to rebuild the model later; stored next to the weights by fit().
specs = {
    'embeddings': embeddings,
    'embedding_dim': emb_dim,
    'hidden_type': hidden_type,
    'n_hidden': n_hidden,
    'n_layers': n_layers,
    'pool_type': pool_type,
    'n_out': n_out,
    'direction': direction,
    'dropout': dropout,
    'embedding_dropout': embedding_dropout,
    'maxlen': maxlen,
    'clipping': clipping,
    'batch_size': batch_size,
    'learning_rate': learning_rate,
    'weight_decay': weight_decay,
    'requires_grad': requires_grad,
    'train_weights': train_weights,
    'num_features': num_features,
    'use_features': use_features,
}

m = Model(emb_dim, hidden_type, n_hidden, n_layers, pool_type, n_out, direction, dropout, embedding_dropout)
if gpu:
    m = m.cuda()

# Only parameters that require gradients are handed to the optimizer.
trainable_params = [p for p in m.parameters() if p.requires_grad]
opt = optim.AdamW(trainable_params, lr=learning_rate, weight_decay=weight_decay)
fit(specs, model=m, train_data=train_data, dev_data=dev_data, loss_fn=F.nll_loss, opt=opt, epochs=80, batch_size=batch_size)