In [None]:
import os

import more_itertools
import numpy as np
import pandas
import requests
import torch
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer
from sentence_transformers.models import WordEmbeddings
from tqdm.notebook import tqdm
import os
import util

In [None]:
def download_file(url, output):
    """Stream ``url`` to the local file ``output`` while showing a progress bar.

    The payload is first written to ``<output>.part`` and only moved to
    ``output`` once the transfer finished, so an interrupted download never
    leaves a truncated file at the final location (callers that guard the
    download with ``os.path.exists(output)`` would otherwise never retry).

    Args:
        url: Address of the file to fetch.
        output: Destination path; missing parent directories are created.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    output = os.fspath(output)
    parent_dir = os.path.dirname(output)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    block_size = 1024
    partial_path = output + '.part'
    try:
        with requests.get(url, stream=True) as response:
            # Fail loudly instead of saving an HTML error page as the model file.
            response.raise_for_status()
            # Servers may omit the header; tqdm then runs with total=0.
            total_size_in_bytes = int(response.headers.get('content-length', 0))
            with open(partial_path, 'wb') as file, \
                    tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    file.write(data)
        # Publish the file only after it has been written completely.
        os.replace(partial_path, output)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the partial file.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

In [None]:
# Directory holding one '<ID>.txt' file per poem, plus the tab-separated
# metadata table whose 'ID' column becomes the row index.
poems_dir = './corpus'
poems_metadata = pandas.read_csv(
    './lyrik_metadata.tsv',
    index_col='ID',
    sep='\t',
)

In [None]:
def tokenized_chunks_gen(model, tqdmargs=None):
    """Yield ``(poem_id, text)`` for each poem that fits the model's input length.

    Walks the index of the module-level ``poems_metadata`` frame, reads
    ``<poems_dir>/<poem_id>.txt``, prepends the poem's ``Titel`` value and
    tokenizes the result with ``model.tokenizer``. Poems whose token count
    exceeds ``model.get_max_seq_length()`` are skipped without notice.

    Args:
        model: Object exposing ``tokenizer.tokenize(text)`` and
            ``get_max_seq_length()``.
        tqdmargs: Optional dict of keyword arguments forwarded to ``tqdm``.

    Yields:
        Tuples ``(poem_id, title + ' ' + stripped file content)``.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    if tqdmargs is None:
        tqdmargs = {}
    max_tokens = model.get_max_seq_length()
    for poem_id in tqdm(poems_metadata.index, **tqdmargs):
        # NOTE(review): no explicit encoding is passed, so the platform default
        # applies - confirm the encoding of the corpus files.
        with open(f'{poems_dir}/{poem_id}.txt') as poem_file:
            content = poem_file.read().strip()
        content = poems_metadata.loc[poem_id, 'Titel'] + ' ' + content
        tokenized = model.tokenizer.tokenize(content)
        if len(tokenized) > max_tokens:
            continue
        yield poem_id, content

def make_embedding_from_sentence_transformer(model, name, device='cpu'):
    """Encode every usable poem with ``model`` and save ``./embeddings/<name>.kv``.

    Poems come from ``tokenized_chunks_gen`` and are encoded in chunks of 16.
    The resulting vectors are stored in a gensim ``KeyedVectors`` object keyed
    by poem id.

    Args:
        model: Sentence-transformer style model providing ``to``, ``encode``,
            ``tokenizer`` and ``get_max_seq_length``.
        name: Label used for the progress bar and the output file name.
        device: Device the model is moved to before encoding.

    Raises:
        ValueError: If no poem was yielded, so there is nothing to save.
    """
    model = model.to(device)
    chunksize = 16
    filenames = []
    vectors = []
    poems = tokenized_chunks_gen(model, tqdmargs={'desc': name})
    for chunk in more_itertools.chunked(poems, n=chunksize):
        # No per-item length check is needed here: tokenized_chunks_gen only
        # yields poems within the model's maximum sequence length. (Checking
        # len() of the (id, text) pair itself would always give 2.)
        filenames.extend(poem_id for poem_id, _ in chunk)
        texts = [text for _, text in chunk]
        # Extending with the 2-D tensor appends one row tensor per poem.
        vectors.extend(model.encode(texts, convert_to_tensor=True))

    if not vectors:
        raise ValueError(f'No poems could be encoded for {name!r}; nothing to save.')

    out_path = f'./embeddings/{name}.kv'
    # Create the output directory so saving works on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    emb = KeyedVectors(vectors[0].shape[0])
    emb.add_vectors(filenames, np.array(torch.stack(vectors).cpu()))
    emb.save(out_path)

In [None]:
# Pretrained paraphrase models, each paired with the name of the embedding
# file it produces. They are processed in this order on the GPU.
paraphrase_models = [
    ('sentence-transformers/paraphrase-xlm-r-multilingual-v1', 'paraphrase-XLM-R'),
    ('sentence-transformers/paraphrase-multilingual-mpnet-base-v2', 'paraphrase-mpnet'),
    ('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', 'paraphrase-MiniLM'),
]
for model_id, emb_name in paraphrase_models:
    # The model is passed as a temporary so it is not kept alive after the call.
    make_embedding_from_sentence_transformer(SentenceTransformer(model_id), emb_name, 'cuda')

# This model gets an explicit max_seq_length of 128 on its first module
# before encoding, so it is handled separately from the loop above.
cross_en_de_roberta = SentenceTransformer('T-Systems-onsite/cross-en-de-roberta-sentence-transformer')
cross_en_de_roberta._first_module().max_seq_length = 128
make_embedding_from_sentence_transformer(cross_en_de_roberta, 'cross-en-de-roberta', 'cuda')
del cross_en_de_roberta

In [None]:

# gbert-base is loaded with hidden-state output enabled so the pooling module
# can read the per-layer states.
gbert_hidden = Transformer('deepset/gbert-base', model_args={'output_hidden_states': True})

# Pooling variants shared by both passes below: (name suffix, pooling function).
gbert_poolings = [
    ('mean', util.pooling_mean),
    ('median', util.pooling_median),
    ('meannorm', util.pooling_meannorm),
]

# Pass 1: pool over the last hidden layer only.
for pname, pfn in gbert_poolings:
    pooling = util.BERTHiddenPooling(
        gbert_hidden.get_word_embedding_dimension(),
        layers=[-1],
        pooling_method=pfn,
    )
    model = SentenceTransformer(modules=[gbert_hidden, pooling], device='cuda').cuda()
    make_embedding_from_sentence_transformer(model, f'gbert-base-{pname}', 'cuda')

# Pass 2: pool over hidden-state indices 0..12.
for pname, pfn in gbert_poolings:
    pooling = util.BERTHiddenPooling(
        gbert_hidden.get_word_embedding_dimension(),
        layers=list(range(13)),
        pooling_method=pfn,
    )
    model = SentenceTransformer(modules=[gbert_hidden, pooling], device='cuda').cuda()
    make_embedding_from_sentence_transformer(model, f'gbert-base-alllayers-{pname}', 'cuda')

del gbert_hidden

In [None]:
# Fetch the German GloVe vectors once; later runs reuse the local copy.
glove_path = './models/vectors-deepset-glove.txt'
glove_url = 'https://int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com/vectors.txt'
if not os.path.exists(glove_path):
    download_file(glove_url, glove_path)

# Word-level tokenizer configured with NLTK's German stop-word list.
glove_tokenizer = util.WordTokenizer(stop_words=stopwords.words('german'))
glove = WordEmbeddings.from_text_file(
    embeddings_file_path=glove_path,
    tokenizer=glove_tokenizer,
)

In [None]:
# One poem embedding per pooling strategy over the GloVe word vectors (CPU only).
glove_poolings = {
    'mean': util.pooling_mean,
    'median': util.pooling_median,
    'meannorm': util.pooling_meannorm,
}
for pname, pfn in glove_poolings.items():
    pooling = util.CustomPooling(
        glove.get_word_embedding_dimension(),
        pooling_method=pfn,
    )
    model = SentenceTransformer(modules=[glove, pooling], device='cpu')
    make_embedding_from_sentence_transformer(model, f'glove-{pname}', 'cpu')

# Drop the reference to the word-embedding module once all variants are written.
del glove

In [None]:
# NOTICE: Poem embeddings for the Fasttext language model are already provided in the repository.

# def load_fasttext(filepath):
#     fIn = open(filepath)
#     fIn.readline()
#     iterator = tqdm(fIn, desc="Load Word Embeddings", unit="Embeddings")
#     embeddings_dimension = None
#     vocab = []
#     embeddings = []
#     for line in iterator:
#         split = line.rstrip().split(' ')
#         word = split[0]
#
#         if embeddings_dimension == None:
#             embeddings_dimension = len(split) - 1
#             vocab.append("PADDING_TOKEN")
#             embeddings.append(np.zeros(embeddings_dimension))
#
#         if (len(split) - 1) != embeddings_dimension:
#             print("ERROR: A line in the embeddings file had more or less  dimensions than expected. Skip token.")
#             continue
#
#         vector = np.array([float(num) for num in split[1:]])
#         embeddings.append(vector)
#         vocab.append(word)
#
#     tokenizer = util.WordTokenizer(stop_words=stopwords.words('german'))
#     embeddings = np.asarray(embeddings)
#     tokenizer.set_vocab(vocab)
#     return WordEmbeddings(tokenizer=tokenizer, embedding_weights=embeddings)
#
#
# if not os.path.exists('./models/vectors-cohure-fasttext.txt'):
#     download_file('TODO', './models/vectors-cohure-fasttext.txt')
#
# fasttext = load_fasttext('./models/vectors-cohure-fasttext.txt')
#
# for pname, pfn in zip(['mean', 'median', 'meannorm'], [util.pooling_mean, util.pooling_median, util.pooling_meannorm]):
#     pooling = util.CustomPooling(fasttext.get_word_embedding_dimension(), pooling_method=pfn)
#     model = SentenceTransformer(modules=[fasttext, pooling], device='cpu')
#     make_embedding_from_sentence_transformer(model, f'fasttext-{pname}', 'cpu')
#
# del fasttext

In [None]:
# TODO TFIDF, MFW