In [None]:
from helpers import read_wiki_sents

# Load the sentence collection through the project helper (presumably Wikipedia
# sentences, going by the helper's name — confirm in `helpers`).
# `sents` is later sliced and passed to the spaCy pipeline.
sents = read_wiki_sents()
# Show the first five entries as the cell output.
sents[:5]

In [None]:
from datasets import load_dataset
# Fetch the "diversifix/inclusive_words" dataset and materialise its "train"
# split as a plain list, so the rows can be iterated and sliced freely.
inclusive_words = load_dataset("diversifix/inclusive_words")
train = list(inclusive_words["train"])
# Show the first three rows as the cell output.
train[:3]

In [None]:
# Lookup table: value of a row's "exclusive" field -> list of all rows sharing it.
data = {}
for row in train:
    data.setdefault(row["exclusive"], []).append(row)

In [None]:
import spacy
# Components passed to `disable=` when loading the German model.
disabled_components = ["ner", "attribute_ruler", "parser"]
nlp = spacy.load("de_core_news_sm", disable=disabled_components)
# Display the resulting pipeline as the cell output.
nlp.pipeline

In [None]:
from joblib import Memory
# joblib memoization store located at "~/.cache"; used as `@memory.cache` below
# and by later cells.
memory = Memory("~/.cache", verbose=0)


@memory.cache
def get_docs(n):
    """Run the spaCy pipeline over the first ``n`` entries of ``sents``.

    Reads the module-level ``sents`` and ``nlp``. Processing uses batches of
    1000 texts and 4 worker processes.

    NOTE(review): ``n`` is the only argument, so the memoized result presumably
    does not reflect later changes to ``sents`` or ``nlp`` — confirm.

    Args:
        n: number of leading entries of ``sents`` to process.

    Returns:
        A list with one spaCy ``Doc`` per processed entry.
    """
    selected = sents[:n]
    doc_stream = nlp.pipe(selected, batch_size=1000, n_process=4)
    return list(doc_stream)


docs = get_docs(50_000)
# Show how many documents were produced.
len(docs)

In [None]:
import random
# Fixed seed so the random token / alternative choices below are repeatable.
random.seed(93020)

# One (sentence text, exclusive lemma, inclusive replacement) triple per usable sentence.
people_sents = []
for doc in docs:
    # Nouns whose lemma has an entry in the `data` lookup.
    matches = [t for t in doc if t.pos_ == "NOUN" and t.lemma_ in data]
    if not matches:
        continue
    # Pick one matching noun at random.
    random.shuffle(matches)
    t = matches[0]
    # Work on a copy: shuffling `data[t.lemma_]` itself would reorder the shared
    # lookup in place, so re-running this cell (even with the seed reset above)
    # would start from a different order and produce different samples.
    alternatives = list(data[t.lemma_])
    if t.morph.get("Number") == ["Sing"]:
        alternatives = [a for a in alternatives if a["applicable"] in ["in_singular", "always"]]
    # NOTE(review): tokens that are not singular keep every alternative,
    # whatever its "applicable" value says — confirm this is intended.
    if not alternatives:
        continue
    random.shuffle(alternatives)
    alt = alternatives[0]
    # For alternatives marked "female", append " oder <lemma>" when the random
    # draw exceeds 0.5; the draw only happens for "female" alternatives.
    post = " oder " + t.lemma_ if alt["gender_of_inclusive"] == "female" and random.random() > 0.5 else ""
    inclusive = alt["inclusive"] + post
    people_sents.append((doc.text, t.lemma_, inclusive))
# Show the number of collected triples and the first three of them.
(len(people_sents), people_sents[:3])

In [None]:
from helpers import chunks

# Two worked examples, numbered (1) and (2), that open every prompt.
# The real items of each chunk are numbered from 3 onwards.
FEW_SHOT_INPUT = """(1)\nOriginal: "Die Schüler kamen zu spät."\nMit Ersetzung: "Die Schülerinnen und Schüler kamen zu spät."\n\n(2)\nOriginal: "Sie werden dem neuen Kanzler gratulieren."\nMit Ersetzung: "Sie werden der neuen Kanzlerin oder dem neuen Kanzler gratulieren."\n"""
FEW_SHOT_INSTRUCTION = """Führe die folgenden Ersetzungen durch. Verändere den Satz und die Ersatzwörter dazu gegebenenfalls grammatisch, sodass ein grammatisch korrekter und flüssiger Satz entsteht.\n\n(1) Ersetze "Schüler" durch eine entsprechend angepasste Form von "Schülerin oder Schüler".\n(2) Ersetze "Kanzler" durch eine entsprechend angepasste Form von "Kanzlerin oder Kanzler"."""

# Each batch is (chunk, input text, instruction text) for up to 8 sentences.
batches = []
for chunk in chunks(people_sents, 8):
    # Number the chunk's items once; both prompt parts share the numbering.
    numbered = list(enumerate(chunk, 3))
    input_text = "\n".join(
        [FEW_SHOT_INPUT]
        + [
            f"""({number})\nOriginal: "{sentence}"\nMit Ersetzung: ___\n"""
            for number, (sentence, _, _) in numbered
        ]
    )
    instruction_text = "\n".join(
        [FEW_SHOT_INSTRUCTION]
        + [
            f"""({number}) Ersetze "{source_word}" durch eine entsprechend angepasste Form von "{target_form}"."""
            for number, (_, source_word, target_form) in numbered
        ]
    )
    batches.append((chunk, input_text, instruction_text))
# Show how many batches were built.
len(batches)

In [None]:
# Load the `dotenv` IPython extension and run its `%dotenv` magic
# (presumably reads variables from a local .env file — confirm).
%load_ext dotenv
%dotenv

import os
import openai

# Take the API key from the environment rather than hard-coding it.
# `os.getenv` returns None when the variable is unset, so a missing key is not
# reported here.
openai.api_key = os.getenv("OPENAI_API_KEY")


In [None]:
@memory.cache
def get_replacements(input, instruction):
    """Send one edit request and return the text of its first choice.

    The call is wrapped in ``memory.cache``.

    NOTE(review): joblib documents that a memoized function's cache is tied to
    its source code — confirm before editing this function, since cached
    responses may be discarded.

    Args:
        input: text forwarded as the request's ``input``.
        instruction: text forwarded as the request's ``instruction``.

    Returns:
        The ``"text"`` field of the first entry in the response's ``"choices"``.
    """
    request = dict(
        engine="text-davinci-edit-001",
        input=input,
        instruction=instruction,
        temperature=0,
        top_p=1,
    )
    first_choice = openai.Edit.create(**request)["choices"][0]
    return first_choice["text"]

In [None]:
from itertools import chain
import re

from joblib import Parallel, delayed
from tqdm.notebook import tqdm


def get_unfiltered_training_data(batch, i):
    """Fetch the replacements for one batch and pair them with its items.

    Args:
        batch: ``(chunk, input, instruction)`` where ``chunk`` holds
            ``(sent, a, b)`` triples and the two strings form the prompt.
        i: batch index, printed for progress and used in diagnostics.

    Returns:
        A list of ``(sent, a, b, rep)`` tuples, one per item of ``chunk``.
        An empty list is returned when the request fails or when the number of
        parsed replacements does not match the number of items.
    """
    print(i)
    chunk, input, instruction = batch
    try:
        output = get_replacements(input, instruction)
    except Exception as exc:
        # Best effort: a failed request only drops this batch. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # still stop the run.
        print(f"batch {i} failed: {exc!r}")
        print(batch)
        return []
    # The first two matches belong to the worked examples (1) and (2).
    replacements = re.findall(r"Mit Ersetzung: \"(.*)\"", output)[2:]
    items = list(chunk)
    if len(replacements) != len(items):
        # A missing or extra line would shift every later pairing, attaching
        # sentences to the wrong replacement; drop the batch instead.
        print(f"batch {i}: expected {len(items)} replacements, found {len(replacements)}; skipping")
        return []
    return [(sent, a, b, rep) for (sent, a, b), rep in zip(items, replacements)]


# One task per batch for the first 1200 batches, run by two parallel workers.
# Each task yields a list of tuples for its batch.
jobs = (
    delayed(get_unfiltered_training_data)(batch, i)
    for i, batch in enumerate(batches[:1200])
)
utd = Parallel(n_jobs=2)(jobs)
# Flatten the per-batch lists into a single list.
unfiltered_training_data = list(chain.from_iterable(utd))


In [None]:
import requests

@memory.cache
def filter_data(data, i):
    """Keep the items whose replacement sentence the checker does not flag.

    Each replacement is posted to the check endpoint at
    ``http://localhost:8081/v2/check`` (a LanguageTool-style API — confirm)
    with the listed categories enabled and ``TYPOS,TYPOGRAPHY`` disabled.
    Replacements with at least one reported match are printed and dropped.

    Args:
        data: iterable of ``(sent, a, b, rep)`` tuples.
        i: chunk index, printed for progress.

    Returns:
        A list of dicts ``{"x": sent, "a": a, "b": b, "y": rep}`` for the
        replacements without matches.

    Raises:
        requests.HTTPError: if the checker answers with an error status.
        requests.Timeout: if the checker does not answer within 60 seconds.
    """
    print(i)
    out_data = []
    for sent, a, b, rep in data:
        r = requests.post(
            "http://localhost:8081/v2/check",
            data={"text": rep, "language": "de-DE", "enabledCategories": "PUNCTUATION,CASING,COLLOCATIONS,CONFUSED_WORDS,CREATIVE_WRITING,GRAMMAR,MISC,MISUSED_TERMS_EU_PUBLICATIONS,NONSTANDARD_PHRASES,REDUNDANCY,SEMANTICS,TEXT_ANALYSIS,STYLE", "disabledCategories": "TYPOS,TYPOGRAPHY"},
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            # Without a timeout a hung server would block the notebook forever.
            timeout=60,
        )
        # Report server-side failures as HTTP errors instead of an opaque
        # JSON/KeyError failure on the next line.
        r.raise_for_status()
        matches = r.json()["matches"]
        if len(matches) == 0:
            out_data.append(dict(x=sent, a=a, b=b, y=rep))
        else:
            print(rep)
    return out_data

# Filter in chunks of 100 items, run sequentially (n_jobs=1).
parts = list(chunks(unfiltered_training_data, 100))
filtered_parts = Parallel(n_jobs=1)(
    delayed(filter_data)(part, i) for i, part in enumerate(parts)
)
# Flatten the per-chunk results into the final list.
training_data = list(chain.from_iterable(filtered_parts))
# Show the item counts before and after filtering.
(len(unfiltered_training_data), len(training_data))

In [None]:
import json

# Write the filtered training data as pretty-printed JSON.
# `ensure_ascii=False` emits non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform default
# encoding, which may not be able to represent them.
with open("../data/training_data_gender.json", "w", encoding="utf-8") as f:
    json.dump(training_data, f, ensure_ascii=False, indent=2)

In [None]:
import jsonlines
# Write the same `training_data` records in JSON Lines form via `jsonlines`.
with jsonlines.open("../data/training_data_gender.jsonl", mode="w") as writer:
    writer.write_all(training_data)