In [1]:
import tarfile
import urllib.request
url = "http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_wikipedia_2021_10K.tar.gz"
# Stream the gzipped tarball straight from a single HTTP response into the
# current working directory. Both the response and the tar handle are
# context-managed, so they are closed even if extraction fails part-way.
# NOTE(review): the archive is fetched over plain HTTP and unpacked with
# extractall(); consider passing an extraction filter (available in newer
# Python versions) if the source is not fully trusted.
with urllib.request.urlopen(url) as stream:
    with tarfile.open(fileobj=stream, mode="r|gz") as archive:
        archive.extractall()


In [2]:
import random
# Fixed seed so the shuffled sentence order is reproducible.
random.seed(45394)
# The file is tab-separated; the sentence text is taken from the second column.
# The encoding is given explicitly so that umlauts and "ß" decode the same way
# on every platform instead of depending on the locale default.
with open("deu_wikipedia_2021_10K/deu_wikipedia_2021_10K-sentences.txt", "r", encoding="utf-8") as f:
    # Drop empty lines (normally just the one after the final newline) rather
    # than blindly cutting the last element, which would lose a sentence when
    # the file does not end with a newline.
    lines = [line for line in f.read().split("\n") if line]
sents = [line.split("\t")[1] for line in lines]
random.shuffle(sents)
sents[:5]

['Gemeinsam mit Inspektor Kemp, einem ehemaligen Schüler des aus anderen Christie-Romanen bekannten Inspektor Battle, untersucht er nun die Todesfälle.',
 'Im Juni 2020 wurde die modernisierte Version M-84AS1 vorgestellt.',
 'In Großbritannien erfreute sich Metternich ungeachtet des deutsch-kritischen Klimas großen Ansehens.',
 'Gertrud von le Fort teilt weder irgendeine Jahreszahl noch den Namen des Königs mit.',
 'In Rio unterstützt EBX Sport-, Bewirtungs-, Gastronomie-, Gesundheits- und Schönheitsinitiativen.']

In [3]:
import de_dep_news_trf
from itertools import chain
# Load the `de_dep_news_trf` pipeline once at module level; `get_nouns` below
# calls it for every sentence.
nlp = de_dep_news_trf.load()
def get_nouns(sent):
    """Parse `sent` with the module-level `nlp` pipeline and collect its nouns.

    Returns a list of `(doc, index)` pairs, one for each token whose `pos_`
    is "NOUN" and whose text is longer than four characters. `doc` is the
    parsed document and `index` is the token's position (`token.i`) in it.
    """
    doc = nlp(sent)
    found = []
    for token in doc:
        if token.pos_ != "NOUN":
            continue
        if len(token.text) <= 4:
            continue
        found.append((doc, token.i))
    return found
# Flatten the per-sentence results for the first 100 shuffled sentences into
# one list of (doc, token index) pairs.
nouns = list(chain.from_iterable(get_nouns(s) for s in sents[:100]))




In [4]:
import re

def is_range(l):
    """Return True if the numbers in `l` are consecutive, each one greater by 1.

    `l` must be a sliceable sequence of integers (e.g. a list of token
    indices). Empty and single-element sequences contain no gap and therefore
    count as a range; the empty case no longer raises IndexError.
    """
    # Compare every element with its successor.
    return all(b - a == 1 for a, b in zip(l, l[1:]))

def is_indefinite_nom(t):
    """Decide whether token `t` is a nominative noun without a definite article.

    Returns False unless the token's morphological "Case" is exactly ["Nom"].
    For nominative tokens the result is True when at least one direct child
    has the lemma "ein"/"einen", or when no direct child has the lemma "der".
    Note that grammatical number is not inspected.
    """
    if t.morph.get("Case") != ["Nom"]:
        return False
    child_lemmas = [child.lemma_ for child in t.children]
    has_indefinite_article = any(lemma in ("ein", "einen") for lemma in child_lemmas)
    has_definite_article = "der" in child_lemmas
    return has_indefinite_article or not has_definite_article

def is_article(parent, t):
    """Return True if `t` is attached directly to `parent` and is an article.

    "Article" here means the token's lemma is one of "der", "ein" or "einen".
    """
    attached_to_parent = t.head == parent
    if not attached_to_parent:
        return False
    return t.lemma_ in ("der", "ein", "einen")

def get_phrase(doc_and_i):
    """Extract the noun phrase around a noun as text, without its article.

    Parameters
    ----------
    doc_and_i : tuple
        A `(doc, i)` pair as produced by `get_nouns`; `doc[i]` is the noun.

    Returns
    -------
    str or None
        The text of the noun's subtree with articles attached directly to the
        noun removed and trailing non-letter characters stripped. None is
        returned unless all of the following hold: every subtree token is
        alphabetic, the subtree does not start at token 0 of the document, it
        has at most five tokens, `is_indefinite_nom(noun)` is true, and the
        subtree tokens form one contiguous span.
    """
    doc, i = doc_and_i
    noun = doc[i]
    # Materialise the subtree once; it is needed for every check below.
    subtree = list(noun.subtree)
    qualifies = (
        all(t.is_alpha for t in subtree)
        and subtree[0].i > 0
        and len(subtree) <= 5
        and is_indefinite_nom(noun)
        and is_range(sorted(t.i for t in subtree))
    )
    if not qualifies:
        return None
    ts = [t.text_with_ws for t in subtree if not is_article(noun, t)]
    # Strip trailing characters that are not letters (in practice the
    # whitespace contributed by text_with_ws). A Unicode-aware class is used
    # so that phrases ending in "ß", "ü" or accented letters are not truncated,
    # which the previous hand-written ASCII/umlaut class did.
    return re.sub(r"[\W\d_]+$", "", "".join(ts))

# Run every collected noun through get_phrase and keep only the nouns for
# which a phrase could be extracted (get_phrase returns None otherwise).
phrases = []
for noun in nouns:
    phrase = get_phrase(noun)
    if phrase is not None:
        phrases.append(phrase)

In [5]:
# Preview the first ten extracted phrases.
phrases[:10]

['neue Straßenerschließung',
 'jahrhundertealte Gravuren',
 'Bühne',
 'Ausflugslokal',
 'dieser Standort',
 'mehrere vergebliche Versuche einer Rettung',
 'Prinz Carl',
 'strukturierter Promotionsstudiengang an der Universitätsmedizin',
 'Taschenbuch',
 'ihr Kürzel MPEG']

In [6]:
from joblib import Memory
# joblib cache stored in the ".cache" directory; functions decorated with
# @memory.cache below reuse results from it instead of being re-executed.
memory = Memory(".cache", verbose=0)

In [7]:
%load_ext dotenv
%dotenv
import os
import openai

# Read the key from the environment rather than hard-coding it. os.getenv
# returns None when OPENAI_API_KEY is not set, so a missing key is not
# detected on this line.
openai.api_key = os.getenv("OPENAI_API_KEY")

@memory.cache
def replace(a, b, sent):
    """Ask the OpenAI edit endpoint to replace `a` with `b` inside `sent`.

    The German instruction tells the model to substitute the quoted string
    `a` by `b` and to adjust the grammar accordingly. Sampling is pinned with
    temperature=0 and top_p=1. Results are memoized via `memory.cache`.

    Returns the text of the first choice in the response.
    """
    instruction = f"Ersetze \"{a}\" durch \"{b}\". Passe die Grammatik entsprechend an."
    request = dict(
        engine="text-davinci-edit-001",
        input=sent,
        instruction=instruction,
        temperature=0,
        top_p=1,
    )
    response = openai.Edit.create(**request)
    first_choice = response["choices"][0]
    return first_choice["text"]

In [8]:
@memory.cache
def replace_batch(l):
    """Run several noun replacements through a single OpenAI edit request.

    Parameters
    ----------
    l : sequence of ((doc, i), phrase)
        `doc[i]` is the noun token to replace and `phrase` the replacement
        text. Each pair becomes one numbered item in the request.

    Returns
    -------
    list of dict
        One dict per parsed result, with the keys `x` (original sentence),
        `replace` (noun text), `replace_with` (phrase) and `y` (the model's
        rewritten sentence). Results are matched to inputs by position, so the
        list is shorter than `l` when fewer result lines are found; a warning
        is emitted in that case because the pairing may then be off.

    NOTE(review): OpenAI has announced deprecation of the Edits endpoint and
    the "text-davinci-edit-001" model - confirm they are still available.
    """
    import warnings

    inputs = [
        f"({k})\nOriginal: {doc[i].doc.text}\nMit Ersetzung: ___\n"
        for k, ((doc, i), _) in enumerate(l)
    ]
    instructions = [
        f"({k}) Ersetze {doc[i].text} durch {phrase}. Passe die Grammatik entsprechend an.\n"
        for k, ((doc, i), phrase) in enumerate(l)
    ]
    response = openai.Edit.create(
        engine="text-davinci-edit-001",
        input="\n".join(inputs),
        instruction="\n".join(instructions),
        temperature=0,
        top_p=1
    )
    # "." never matches a newline, so the group already stops at the end of
    # the line. Not requiring a trailing "\n" keeps the last result when the
    # response text does not end with a newline.
    replaceds = re.findall(r"Mit Ersetzung: (.*)", response["choices"][0]["text"])
    if len(replaceds) != len(l):
        warnings.warn(
            f"replace_batch: expected {len(l)} results but parsed {len(replaceds)}; "
            "outputs are paired with inputs by position and may be misaligned."
        )
    return [
        dict(x=doc[i].doc.text, replace=doc[i].text, replace_with=phrase, y=replaced)
        for replaced, ((doc, i), phrase) in zip(replaceds, l)
    ]

In [9]:
import json
from tqdm.notebook import tqdm

# Re-seed so the pairing of nouns and phrases below is reproducible.
random.seed(659342)
# Both lists are shuffled independently and in place, so zip(nouns, phrases)
# in the loop below pairs each noun with an unrelated phrase.
# NOTE: because the shuffles mutate the lists, re-running this cell without
# first rebuilding `nouns` and `phrases` yields a different ordering.
random.shuffle(nouns)
random.shuffle(phrases)
data = []  # not used by any of the cells shown here

def chunks(l, n):
    """
    Lazily split l into consecutive slices of length n; the final slice is
    shorter when len(l) is not a multiple of n.
    Adapted from https://stackoverflow.com/q/312443/10190810
    """
    offsets = range(0, len(l), n)
    yield from (l[offset:offset + n] for offset in offsets)

# Send the (noun, phrase) pairs to the model in batches of five and collect
# the resulting examples.
training_data = []
for chunk in tqdm(list(chunks(list(zip(nouns, phrases)), 5))):
    # A tuple is passed, as in the original call to the cached function.
    res = replace_batch(tuple(chunk))
    training_data.extend(res)
    # Rewrite the file after every batch so partial progress survives an
    # interruption. ensure_ascii=False emits raw umlauts/"ß", so the file is
    # opened as UTF-8 explicitly instead of relying on the locale default.
    with open("training_data.json", "w", encoding="utf-8") as f:
        json.dump(training_data, f, indent=2, ensure_ascii=False)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/5 [00:00<?, ?it/s]

In [10]:
# Number of examples collected. replace_batch pairs results with inputs via
# zip, so this can be lower than the number of (noun, phrase) pairs sent.
len(training_data)

23