In [None]:
!pip install spacy -U
!python -m spacy download en_core_web_lg
!pip install lemminflect
import spacy
import lemminflect
from tqdm import tqdm

# Load the large English pipeline with the parser and NER components disabled.
try:
    nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])
except OSError:
    # The model was installed in this very session and cannot be found yet:
    # tell the user and stop the kernel so it can be restarted.
    print("The runtime needs to be restarted otherwise COLAB is unable to find the just installed model.")
    exit()

In [None]:
if "google.colab" in str(get_ipython()):
    COLAB = True
    !git clone https://github.com/exc4l/eigolingo
    eigodir = "eigolingo/"
    drivedir = "eigolingo/lists/"
    from google.colab import files
else:
    COLAB = False
    eigodir = "./"
    drivedir = "lists/"

In [None]:
def get_testcases(input):
    """Return four test phrases for a word.

    The phrases are, in order: the word alone, the word preceded by the
    filler "abc", the word followed by "abc", and the word between two "abc".
    """
    templates = ("{}", "abc {}", "{} abc", "abc {} abc")
    return [template.format(input) for template in templates]

def read_dict(textlines):
    """Parse ``key:value`` lines into a dictionary.

    Args:
        textlines: Iterable of strings, one ``key:value`` entry per line.

    Returns:
        dict mapping each key to its value. When a key occurs more than once
        the last occurrence wins.

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    rdict = dict()
    for line in textlines:
        # Blank lines (e.g. a trailing empty line in a hand-edited file)
        # carry no entry and are skipped instead of crashing the unpacking.
        if not line.strip():
            continue
        # Split on the first colon only, so a value may itself contain ":".
        key, sep, value = line.partition(":")
        if not sep:
            raise ValueError(f"missing ':' separator in dictionary line: {line!r}")
        rdict[key] = value
    return rdict

In [None]:
# supplements
def _lowercase_words(filename):
    """Read a word list from drivedir, keeping only all-lowercase alphabetic lines."""
    with open(drivedir + filename, "r", encoding="utf-8") as wordfile:
        return {
            entry
            for entry in wordfile.read().splitlines()
            if entry.islower() and entry.isalpha()
        }


# The two bible word lists are merged into one set.
bible1set = _lowercase_words("en-drv-1610-douay-rheims-bible-1-0.dic")
bible2set = _lowercase_words("en-kjv-1611-king-james-bible-1-0.dic")
bibleset = bible1set | bible2set

chemset = _lowercase_words("chemistry.dic")
medset = _lowercase_words("medterms.txt")
engset = _lowercase_words("engterms.txt")
marcoset = _lowercase_words("wordlist_marcoagpinto_20210301_252214w.txt")

# All supplementary sources combined.
supset = bibleset | chemset | medset | engset | marcoset
print(len(supset))

# Names are read lowercased, one per line.
with open(drivedir + "names.txt", "r", encoding="utf-8") as names_file:
    nameset = set(names_file.read().lower().splitlines())

# remove names
for name in nameset:
    supset.discard(name.lower())

# user defined list of additional entries (taken as-is, without filtering)
with open(drivedir + "add_entries.txt", "r", encoding="utf-8") as extras_file:
    extraset = set(extras_file.read().splitlines())

supset = supset | extraset

print(len(supset))

# Lowercased countries, cities and additional removal entries; their union
# (geoset) is subtracted from the candidate words later in the notebook.
with open(drivedir + "countries.txt", "r", encoding="utf-8") as geo_file:
    countries = set(map(str.lower, geo_file.read().splitlines()))
with open(drivedir + "cities.txt", "r", encoding="utf-8") as geo_file:
    cities = set(map(str.lower, geo_file.read().splitlines()))
with open(drivedir + "additional_removal.txt", "r", encoding="utf-8") as geo_file:
    addiset = set(map(str.lower, geo_file.read().splitlines()))
geoset = countries | cities | addiset
print(len(geoset))


# Build a translation table that deletes every character listed in
# letters_to_filter.txt (line breaks in that file are ignored). Words that
# change under this table are dropped before processing in spacy.
with open(drivedir + "letters_to_filter.txt", "r", encoding="utf-8") as letters_file:
    letters_to_filter = "".join(letters_file.read().split("\n"))
filtertrans = str.maketrans("", "", letters_to_filter)

# Manual overrides applied to dictionary entries before saving.
with open(drivedir + "change_dictionary.txt", "r", encoding="utf-8") as overrides_file:
    change_dict = read_dict(overrides_file.read().splitlines())

In [None]:
# main corpus
def _corpus_words(filename):
    """Read a corpus list from drivedir and return its lowercase alphabetic words.

    The first 44 lines are skipped (presumably a file header -- TODO confirm).
    """
    with open(drivedir + filename, "r", encoding="utf-8") as corpus_file:
        body = corpus_file.read().splitlines()[44:]
    return {word for word in body if word.islower() and word.isalpha()}


c95 = _corpus_words("create95hacknoroman.txt")
c80 = _corpus_words("create80hacknoroman.txt")
c70 = _corpus_words("create70hacknoroman.txt")

In [None]:
# Candidate words: the 95 corpus plus supplements, minus the geo entries.
prepset = (c95 | supset) - geoset
# Report and drop every word that contains one of the filtered letters.
for word in tuple(prepset):
    if word.translate(filtertrans) != word:
        print(word)
        prepset.remove(word)

In [None]:
# Merge the dictionaries already stored in the repository; files read later
# override entries from files read earlier.
prev_dict = dict()
for dict_name in ("dict70.txt", "dict80.txt", "dict95.txt"):
    with open(eigodir + dict_name, "r", encoding="utf-8") as dict_file:
        prev_dict.update(read_dict(dict_file.read().splitlines()))

In [None]:
def _lemma_rank(lemma):
    """Sort key: shortest lemma first, ties broken alphabetically.

    The alphabetical tie-break keeps the choice reproducible; picking by
    length alone from a set depends on the set's iteration order, which can
    change between kernel sessions.
    """
    return (len(lemma), lemma)


# Build the word -> lemma dictionary for every candidate word in prepset.
resulting_dict = dict()
for ele in tqdm(prepset):
    # Entries already present in the repository dictionaries are kept as-is.
    if ele in prev_dict:
        resulting_dict[ele] = prev_dict[ele]
        continue
    # Lemmatize the word alone and next to the filler token "abc", collecting
    # every lemma proposed for the non-filler tokens.
    results = set()
    for w in get_testcases(ele):
        for token in nlp(w):
            if token.text != "abc":
                results.add(token._.lemma())
    if not results:
        continue
    res = min(results, key=_lemma_rank)
    # When the best candidate is the word itself and alternatives exist,
    # take the best of the alternatives instead.
    if res == ele and len(results) > 1:
        results.remove(res)
        res = min(results, key=_lemma_rank)
    resulting_dict[ele] = res

# Every non-empty lemma that is not a geo entry becomes a key mapping to itself.
for ele in tqdm(list(resulting_dict.values())):
    if ele and ele not in resulting_dict and ele not in geoset:
        resulting_dict[ele] = ele
# An empty key is never a valid entry.
resulting_dict.pop("", None)

# Apply the manual overrides to keys that made it into the dictionary.
for key, val in change_dict.items():
    if key in resulting_dict:
        resulting_dict[key] = val

In [None]:
# Keys that were not in the repository dictionaries.
print("New:\n")
for word in (k for k in resulting_dict if k not in prev_dict):
    print(word)

# Keys from the repository dictionaries that are no longer produced.
print("\nRemoved:\n")
for word in (k for k in prev_dict if k not in resulting_dict):
    print(word)

In [None]:
# generate dict70, 80
def _restrict_dict(corpus):
    """Return the part of resulting_dict that belongs to one corpus level.

    Args:
        corpus: Set of words of the level (e.g. c70 or c80).

    Returns:
        dict with the resulting_dict entry of every word in the corpus or the
        supplements (minus geo entries and words containing filtered
        letters), plus a self-mapping for every non-empty, non-geo lemma that
        is not already a key, with the change_dict overrides applied.
    """
    words = {
        word
        for word in corpus.union(supset).difference(geoset)
        if word == word.translate(filtertrans)
    }
    level_dict = dict()
    for word in tqdm(words):
        if word in resulting_dict:
            level_dict[word] = resulting_dict[word]
    # Iterate over a snapshot of the values because keys are added in the loop.
    for lemma in tqdm(list(level_dict.values())):
        if lemma and lemma not in level_dict and lemma not in geoset:
            level_dict[lemma] = lemma
    for key, val in change_dict.items():
        if key in level_dict:
            level_dict[key] = val
    return level_dict


resulting_dict80 = _restrict_dict(c80)
resulting_dict70 = _restrict_dict(c70)

# Each file only holds the entries not already covered by a lower level.
# Values are taken from the dictionary of the level being written: a
# self-mapped lemma is not guaranteed to be a key of resulting_dict, so
# looking it up there could raise KeyError.
with open("dict70.txt", "w", encoding="utf-8") as wr:
    for k in sorted(resulting_dict70):
        wr.write(f"{k}:{resulting_dict70[k]}\n")
with open("dict80.txt", "w", encoding="utf-8") as wr:
    for k in sorted(resulting_dict80):
        if k in resulting_dict70:
            continue
        wr.write(f"{k}:{resulting_dict80[k]}\n")
with open("dict95.txt", "w", encoding="utf-8") as wr:
    for k in sorted(resulting_dict):
        if k in resulting_dict70 or k in resulting_dict80:
            continue
        wr.write(f"{k}:{resulting_dict[k]}\n")
if COLAB:
    files.download("dict80.txt")
    files.download("dict70.txt")
    files.download("dict95.txt")

In [None]:
# generate wordlists: the alphabetically sorted keys of each dictionary
wl70 = sorted(resulting_dict70)
wl80 = sorted(resulting_dict80)
wl95 = sorted(resulting_dict)


def write_list(filename, datalist):
    """Write the strings in datalist to filename, one per line.

    Entries are joined with a newline; no newline is written after the last one.
    The file is opened in write mode with UTF-8 encoding, replacing any
    existing content.
    """
    with open(filename, mode="w", encoding="utf-8") as sink:
        joined = "\n".join(datalist)
        sink.write(joined)


# Write the three word lists to the working directory.
for list_name, words in (
    ("wordlist70.txt", wl70),
    ("wordlist80.txt", wl80),
    ("wordlist95.txt", wl95),
):
    write_list(list_name, words)

# On Colab, offer the written files for download.
if COLAB:
    for list_name in ("wordlist70.txt", "wordlist80.txt", "wordlist95.txt"):
        files.download(list_name)

In [None]:
import shutil

In [None]:
# The "eigolingo/" directory is only created by the git clone in the Colab
# branch of the setup cell; outside Colab there is nothing to remove and an
# unconditional rmtree would raise FileNotFoundError.
if COLAB:
    shutil.rmtree("eigolingo/")