In [None]:
import random
# Seed Python's global `random` generator so that runs are repeatable.
random.seed(45394)

In [None]:
from helpers import read_wiki_sents

# Load the sentence collection through the project helper.
# NOTE(review): presumably a list of sentence strings (it is sliced here and
# its items are later passed to `nlp`) — confirm against helpers.read_wiki_sents.
sents = read_wiki_sents()
# Preview the first five entries.
sents[:5]

In [None]:
from itertools import chain
import spacy

# Load spaCy's "de_dep_news_trf" pipeline; `get_nouns` calls it on every sentence.
nlp = spacy.load("de_dep_news_trf")


def get_nouns(sent):
    """Parse ``sent`` with ``nlp`` and collect its nouns longer than four characters.

    Returns:
        A list of ``(doc, index)`` tuples: the parsed document together with
        the position of each matching token inside that document.
    """
    doc = nlp(sent)
    hits = []
    for token in doc:
        # Only tokens tagged as common nouns are of interest.
        if token.pos_ != "NOUN":
            continue
        # Skip short nouns (four characters or fewer).
        if len(token.text) <= 4:
            continue
        hits.append((doc, token.i))
    return hits


# Flatten the per-sentence noun lists of the first 100 sentences into one list.
nouns = list(chain.from_iterable(get_nouns(s) for s in sents[:100]))
len(nouns)


In [None]:
import re

def is_range(l):
    """Return True if ``l`` is a run of consecutive, ascending integers.

    Args:
        l: Sequence of integers that supports slicing (e.g. sorted token indices).

    Returns:
        True when every element is exactly one greater than its predecessor.
        Empty and single-element sequences are trivially consecutive and
        therefore also yield True.
    """
    # Pair each element with its successor. For an empty or single-element
    # sequence there are no pairs, so all() is vacuously True instead of
    # failing with an IndexError on ``l[0]``.
    return all(b - a == 1 for a, b in zip(l, l[1:]))

def is_indefinite_nom(t):
    """Tell whether token ``t`` is a nominative noun that is not marked definite.

    A token qualifies when its ``Case`` morphology is exactly ``["Nom"]`` and
    either one of its children has the lemma "ein"/"einen", or none of its
    children has the lemma "der".

    NOTE(review): the "no definite article" branch does not look at
    grammatical number, so it also accepts singular nouns without "der".
    """
    if t.morph.get("Case") != ["Nom"]:
        return False
    child_lemmas = [child.lemma_ for child in t.children]
    has_indefinite_article = any(lemma in ("ein", "einen") for lemma in child_lemmas)
    has_definite_article = "der" in child_lemmas
    return has_indefinite_article or not has_definite_article

def is_article(parent, t):
    """Return True if ``t`` is a determiner attached directly to ``parent``.

    The ``DET`` part-of-speech tag covers articles and also pronouns used as
    determiners.
    """
    attached_to_parent = t.head == parent
    if not attached_to_parent:
        return False
    return t.pos_ == "DET"

def get_phrase(doc_and_i):
    """Extract the noun phrase around a noun, without its determiner.

    Args:
        doc_and_i: ``(doc, i)`` tuple as produced by ``get_nouns``; ``doc[i]``
            is the head noun.

    Returns:
        The text of the noun's subtree with determiners attached to the noun
        removed and trailing non-letter characters stripped, or ``None`` when
        the subtree is not usable. A subtree is usable only if
          * every token is alphabetic,
          * it does not start at the first token of the document,
          * it has at most five tokens,
          * the noun passes ``is_indefinite_nom``, and
          * its token indices form one contiguous span.
    """
    doc, i = doc_and_i
    noun = doc[i]
    # Walk the subtree once instead of re-evaluating ``noun.subtree`` per check.
    subtree = list(noun.subtree)
    usable = (
        all(t.is_alpha for t in subtree)
        and subtree[0].i > 0
        and len(subtree) <= 5
        and is_indefinite_nom(noun)
        and is_range(sorted(t.i for t in subtree))
    )
    if not usable:
        return None
    ts = [t.text_with_ws for t in subtree if not is_article(noun, t)]
    # Strip trailing whitespace/non-letters. The class must contain lowercase
    # "ü" and "ß"; otherwise phrases ending in those letters (e.g. "Menü",
    # "Fuß") lose their last character.
    return re.sub(r"[^a-zäöüßA-ZÄÖÜ]+$", "", "".join(ts))

# Collect the phrase for every noun that yields one; get_phrase returns None
# for nouns whose subtree is rejected.
phrases = []
for noun in nouns:
    phrase = get_phrase(noun)
    if phrase is not None:
        phrases.append(phrase)
len(phrases)

In [None]:
# Preview the first ten extracted phrases.
phrases[:10]

In [None]:
from joblib import Memory
# joblib cache object; its `cache` decorator wraps `replace` and `replace_batch`,
# with "~/.cache" passed as the storage location.
memory = Memory("~/.cache", verbose=0)

In [None]:
# Load the `dotenv` IPython extension and run its magic.
# NOTE(review): presumably this reads a local .env file into the process
# environment so that OPENAI_API_KEY is available below — confirm.
%load_ext dotenv
%dotenv
import os
import openai

# os.getenv returns None when the variable is not set; no check is made here.
openai.api_key = os.getenv("OPENAI_API_KEY")

@memory.cache
def replace(a, b, sent):
    """Ask the OpenAI edit endpoint to replace ``a`` with ``b`` inside ``sent``.

    The (German) instruction also asks the model to adapt the grammar.
    Calls are memoised through ``memory.cache``.

    Returns:
        The ``text`` field of the first choice in the response.
    """
    instruction = f"Ersetze \"{a}\" durch \"{b}\". Passe die Grammatik entsprechend an."
    request = dict(
        engine="text-davinci-edit-001",
        input=sent,
        instruction=instruction,
        temperature=0,
        top_p=1,
    )
    response = openai.Edit.create(**request)
    first_choice = response["choices"][0]
    return first_choice["text"]

In [None]:
@memory.cache
def replace_batch(l):
    """Run a batch of noun-by-phrase replacements through the OpenAI edit endpoint.

    Args:
        l: Sequence of ``((doc, i), phrase)`` pairs. ``doc[i]`` is the token
            whose capitalised lemma is to be replaced, ``phrase`` is the
            replacement text.

    Returns:
        A list of dicts with the keys ``x`` (original sentence text),
        ``replace`` (capitalised lemma), ``replace_with`` (the phrase) and
        ``y`` (the text the model put after "Text mit Ersetzung: ").
        Answers are matched to inputs by position; if the response contains
        fewer answers than inputs, the surplus inputs are dropped.
    """
    # One fill-in-the-blank form per input; the model is asked to replace "___".
    inputs = [
        f"""
        Ersetze "{doc[i].lemma_.capitalize()}" durch "{phrase}".
        Originaltext: {doc.text}
        Text mit Ersetzung: ___
        """
        for (doc, i), phrase in l]
    # The original instruction ended in `ein".` with an unbalanced quote.
    instruction = """Führe die Ersetzungen durch. Nimm grammatische Anpassungen vor, sodass der entstehende Satz mit der Ersetzung grammatisch einwandfrei ist. Wenn eine grammatisch einwandfreie Ersetzung nicht möglich ist, dann trage "FEHLER" ein."""
    response = openai.Edit.create(
        engine="text-davinci-edit-001",
        input="\n".join(inputs),
        instruction=instruction,
        temperature=0,
        top_p=1
    )
    edited_text = response["choices"][0]["text"]
    # `.` never matches a newline, so `(.*)` captures up to the end of the
    # line. Not requiring a literal "\n" keeps the last answer when the
    # response does not end with a newline.
    replaceds = re.findall(r"Text mit Ersetzung: (.*)", edited_text)
    return [
        dict(
            x=doc.text,
            replace=doc[i].lemma_.capitalize(),
            replace_with=phrase,
            y=replaced,
        )
        for replaced, ((doc, i), phrase) in zip(replaceds, l)
    ]

In [None]:
import json
from tqdm.notebook import tqdm

from helpers import chunks

# Re-seed so that the two shuffles below, and therefore the noun/phrase
# pairing, are repeatable.
random.seed(659342)
# Both lists are shuffled in place and independently of each other.
random.shuffle(nouns)
random.shuffle(phrases)
data = []  # not used in the visible cells; kept in case a later cell reads it

training_data = []
# zip() pairs the i-th noun with the i-th phrase and stops at the shorter list.
# NOTE(review): `chunks` is a project helper; presumably it splits the pairs
# into groups of the given size (1 here) — confirm against helpers.chunks.
for chunk in tqdm(list(chunks(list(zip(nouns, phrases)), 1))):
    res = replace_batch(tuple(chunk))
    # Append in place instead of rebuilding the whole list on every iteration.
    training_data.extend(res)
    # The full file is rewritten after every chunk. The encoding is explicit
    # because ensure_ascii=False writes non-ASCII characters verbatim and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open("../data/training_data_general.json", "w", encoding="utf-8") as f:
        json.dump(training_data, f, indent=2, ensure_ascii=False)
