In [159]:
import spacy
import wikipediaapi
import os
import pandas as pd
import sys
import spacy
import re
import numpy as np

In [145]:
# Load one spaCy pipeline per language, English first and German second.
# NOTE(review): 'en' / 'de' are shortcut model names; newer spaCy releases
# expect full package names -- confirm against the installed spaCy version.
nlp_en, nlp_de = (spacy.load(model_name) for model_name in ('en', 'de'))

In [146]:
# Read both summary tables, using the first CSV column as the index.
summaries_en, summaries_de = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('summaries_en.csv', 'summaries_de.csv')
)
# Only the German table has rows with missing values removed.
summaries_de = summaries_de.dropna()

In [147]:
def remove_everything_in_parenth_or_brackets (text):
    """Strip every ``(...)`` and ``[...]`` span, including nested ones, from ``text``.

    The substitution is applied repeatedly: each pass removes the innermost
    balanced groups, so enclosing groups become removable on the next pass.
    Surrounding whitespace is left untouched, so a removed group may leave a
    double space behind.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` without parenthesised or square-bracketed content.
    """
    # First alternative: a parenthesis pair with no nested parenthesis inside.
    # Second alternative: a square-bracket pair with no nested bracket inside.
    innermost_group = r'\([^()]*\)|\[[^\[\]]*\]'
    while True:
        text, n_removed = re.subn(innermost_group, '', text)
        if not n_removed:
            # Nothing matched on this pass, so no balanced group is left.
            return text

In [148]:
# Clean the raw English summary text before tokenisation.
summaries_en['summary_en'] = summaries_en['summary_en'].apply(remove_everything_in_parenth_or_brackets)

# NOTE(review): this cell used to read 'summary_en' from the German frame
# unconditionally, which looks like a copy-paste slip. Prefer the German
# column when the CSV provides one; fall back to 'summary_en' in case the
# German CSV really stores its text under that header -- confirm against
# summaries_de.csv.
_de_text_column = 'summary_de' if 'summary_de' in summaries_de.columns else 'summary_en'
summaries_de['summary_de'] = summaries_de[_de_text_column].apply(remove_everything_in_parenth_or_brackets)

In [149]:
def _content_lemmas(doc):
    """Return the lower-cased lemmas of the tokens in ``doc`` that are not
    flagged as stop word, punctuation, whitespace or digit."""
    lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct or token.is_space or token.is_digit:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas


# Replace each summary string with its list of filtered lemmas, using the
# pipeline that matches the column's language.
summaries_en['summary_en'] = summaries_en['summary_en'].apply(
    lambda text: _content_lemmas(nlp_en(text)))

summaries_de['summary_de'] = summaries_de['summary_de'].apply(
    lambda text: _content_lemmas(nlp_de(text)))

In [155]:
# Spot-check one randomly chosen German token list.
print(summaries_de['summary_de'].sample())

534    [t-12, cloudmaker, 20.000, kg, groß, konventio...
Name: summary_de, dtype: object


In [156]:
def load_words_to_id(language, nmax=50000):
    """Load word vectors for ``language`` from a text ``.vec`` file.

    The file is looked up in the current working directory as
    ``wiki.multi.<language>.vec``. Its first line is skipped (presumably a
    header line -- confirm against the files); every following line is parsed
    as ``<word> <v1> <v2> ...`` with single-space separators.

    Parameters
    ----------
    language : str
        ``'en'`` or ``'de'``. Any other value raises ``KeyError``.
    nmax : int or None, optional
        Maximum number of words to read, counted from the top of the file.
        Defaults to 50000. ``None`` reads the whole file.

    Returns
    -------
    embeddings : numpy.ndarray
        The parsed vectors stacked row-wise, one row per word.
    word2id : dict
        Maps each word to the row index of its vector in ``embeddings``.

    Raises
    ------
    KeyError
        If ``language`` has no configured embeddings file.
    AssertionError
        If the same word occurs twice among the lines that were read.
    """
    language_to_embeddings_path = {'en': os.path.join(os.getcwd(), 'wiki.multi.en.vec'),
                                   'de': os.path.join(os.getcwd(), 'wiki.multi.de.vec')}
    path = language_to_embeddings_path[language]
    vectors = []
    word2id = {}
    with open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)  # skip the first line; it is not parsed as a word vector
        for line in f:
            # Split off the word only; the remainder is the vector text.
            word, vect = line.rstrip().split(' ', 1)
            assert word not in word2id, 'word found twice'
            vectors.append(np.fromstring(vect, sep=' '))
            # Ids are assigned in file order, so they match the row order.
            word2id[word] = len(word2id)
            if nmax is not None and len(word2id) >= nmax:
                break
    embeddings = np.vstack(vectors)
    return embeddings, word2id

In [191]:
def encode_tokens(tokens, word2id, embeddings):
    """Average the embedding rows of the tokens that appear in ``word2id``.

    Tokens missing from ``word2id`` are skipped; a token that occurs several
    times is counted each time.

    Returns a 1-D float array of length ``embeddings.shape[1]``, or ``None``
    when none of the tokens is in ``word2id``.
    """
    total = np.zeros(embeddings.shape[1], dtype='float')
    known_rows = [word2id[tok] for tok in tokens if tok in word2id]
    if not known_rows:
        return None

    for row in known_rows:
        total += embeddings[row]
    return (1.0 / len(known_rows)) * total

In [192]:
# Load the embedding matrix and word-to-row lookup for each language.
(embeddings_en, word2id_en) = load_words_to_id(language='en')
(embeddings_de, word2id_de) = load_words_to_id(language='de')

In [193]:
def _embed_token_lists(token_lists, word2id, embeddings):
    """Apply ``encode_tokens`` to every entry of ``token_lists`` and drop the
    entries that come back missing."""
    encoded = token_lists.apply(lambda tokens: encode_tokens(tokens, word2id, embeddings))
    return encoded.dropna()


# One averaged embedding per article, per language.
article_embeddings_en = _embed_token_lists(summaries_en['summary_en'], word2id_en, embeddings_en)
article_embeddings_de = _embed_token_lists(summaries_de['summary_de'], word2id_de, embeddings_de)