In [0]:
import pickle
import pandas as pd
import os

import flair
from flair.data import Sentence
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import BertEmbeddings
from flair.embeddings import XLMEmbeddings

import torch

from tqdm import tnrange, tqdm_notebook, tqdm

In [0]:
# tools for the extraction of pretrained embeddings for each recipe

def initialize_embeddings(embeddings):
    """Initialize the requested pretrained embeddings, stack them, and measure their dimension.

    Parameters
    ----------
    embeddings : list of str
        Embedding names. Recognised values are 'fasttext', 'flair', 'xlm',
        'xlm-multi', 'bert' and 'camembert'.

    Returns
    -------
    tuple
        ``(stacked_embeddings, emb_dim)`` where ``stacked_embeddings`` is a
        ``StackedEmbeddings`` object switched to eval mode and ``emb_dim`` is
        the length of one token vector, measured by embedding a probe sentence.
        When 'camembert' is requested, ``(None, 768)`` is returned and no flair
        embedding is built; any other names in the list are ignored.

    Raises
    ------
    ValueError
        If a name is not one of the recognised values. Unknown names used to be
        skipped silently, which hid typos behind a wrong embedding dimension.
    """
    # CamemBERT is not built here at all: the caller only gets the fixed
    # dimension 768 (presumably the model's hidden size - confirm against the
    # loaded model). Checking first avoids loading flair models that would be
    # thrown away by the early return.
    if 'camembert' in embeddings:
        return None, 768

    # Lazy builders: nothing is downloaded or loaded until a name is requested.
    # 'flair' contributes two models (forward and backward language models).
    builders = {
        'fasttext': lambda: [WordEmbeddings('fr')],
        'flair': lambda: [FlairEmbeddings('fr-forward'),
                          FlairEmbeddings('fr-backward')],
        'xlm': lambda: [XLMEmbeddings('xlm-mlm-enfr-1024')],
        'xlm-multi': lambda: [XLMEmbeddings('xlm-mlm-tlm-xnli15-1024',
                                            pooling_operation='last')],
        'bert': lambda: [BertEmbeddings('bert-base-multilingual-cased',
                                        layers='-1')],
    }

    # Validate every name before any model is loaded.
    unknown = [e for e in embeddings if e not in builders]
    if unknown:
        raise ValueError(
            'unknown embedding name(s): {}; expected one of {}'.format(
                unknown, sorted(list(builders) + ['camembert'])))

    emb_list = []
    for e in embeddings:
        emb_list.extend(builders[e]())

    stacked_embeddings = StackedEmbeddings(embeddings=emb_list).eval()

    # Embed a throwaway sentence and read the dimension off its first token.
    s = Sentence('this is to extract embedding dimension!')
    stacked_embeddings.embed(s)
    emb_dim = len(s[0].embedding)

    return stacked_embeddings, emb_dim
  
  
  
def dataset_creator(data, stacked_embeddings, emb_dim, maxlen):
    """Embed the text of every row of ``data`` and collect the resulting tensors.

    The text of row ``i`` is ``data.titre[i]`` followed by ', ' and
    ``data.preparation[i]``; either part is skipped when it is not a string
    (e.g. a missing value). Only the first ``maxlen`` words/tokens are embedded.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose the columns ``titre``, ``preparation``, ``niveau`` and
        ``plat`` and be addressable by the labels ``0 .. len(data) - 1``.
    stacked_embeddings : StackedEmbeddings or None
        Embedder returned by ``initialize_embeddings``. ``None`` selects the
        CamemBERT path, which uses the notebook-level ``camembert`` model (it
        must already be loaded, otherwise a NameError is raised).
    emb_dim : int
        Length of one token vector produced by ``stacked_embeddings``.
    maxlen : int
        Maximum number of whitespace-separated words (CamemBERT path) or flair
        tokens (flair path) kept per row.

    Returns
    -------
    list of tuple
        One ``(i, embedding_matrix, embedding_matrix.shape[0], data.niveau[i],
        data.plat[i])`` tuple per row.
    """
    # initialize_embeddings returns None as the embedder only for CamemBERT, so
    # the argument itself selects the branch; the function no longer depends on
    # the notebook-global `embeddings` list.
    use_camembert = stacked_embeddings is None

    dataset = []
    for i in tqdm_notebook(range(len(data))):
        text = ''
        if isinstance(data.titre[i], str):
            text += data.titre[i]
            text += ', '
        if isinstance(data.preparation[i], str):
            text += data.preparation[i]

        if use_camembert:
            limited = ' '.join(text.split(' ')[:maxlen])
            tokens = camembert.encode(limited)
            emb = camembert.extract_features(tokens).squeeze().cpu()
            dataset.append((i, emb, emb.shape[0], data.niveau[i], data.plat[i]))

        else:
            # Tokenize once to truncate to `maxlen` tokens, then rebuild the
            # sentence from the token surface forms (`.text`) instead of
            # parsing the Token repr.
            sentence = Sentence(' '.join(tok.text for tok in Sentence(text)[:maxlen]))
            stacked_embeddings.embed(sentence)

            # One row per token, concatenated in a single call instead of
            # growing the tensor token by token.
            rows = [token.embedding.view(-1, emb_dim) for token in sentence]
            if rows:
                sample = torch.cat(rows, 0)
            else:
                # No tokens: keep the (0, emb_dim) shape on flair's device
                # rather than a hard-coded .cuda(), so CPU-only runs work.
                sample = torch.zeros(0, emb_dim, device=flair.device)
            dataset.append((i, sample, sample.shape[0], data.niveau[i], data.plat[i]))

    return dataset

In [0]:
# Read the test set CSV into a DataFrame.

test = pd.read_csv(filepath_or_buffer='/content/gdrive/My Drive/LREC/data/test1.csv')

In [0]:
# Configuration: which pretrained embeddings to use and the maximum number of
# words kept per text.
embeddings = ['bert']
maxlen = 100

# CamemBERT is fetched through torch.hub only when it is requested; it is put
# in eval mode and gradient tracking is switched off on every parameter.
if 'camembert' in embeddings:
    camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0')
    camembert.eval()
    for param in camembert.parameters():
        param.requires_grad_(False)

In [0]:
# Build the embedder for the names listed in `embeddings` and show the length
# of one token vector.

stacked_embeddings, emb_dim = initialize_embeddings(embeddings=embeddings)
print(emb_dim)

In [0]:
# vectorize data and save embeddings in 'embeddings' folder

# exist_ok=True makes directory creation idempotent across re-runs and removes
# the separate existence check before creating it.
os.makedirs('./embeddings', exist_ok=True)

test_data = dataset_creator(test, stacked_embeddings, emb_dim, maxlen)

# The pickle holds the list of per-row tuples returned by dataset_creator.
with open('./embeddings/test_emb.pickle', 'wb') as handle:
    pickle.dump(test_data, handle)