# Describe: Creating 'cloze' exercises with Cicero (with vector semantics)

In [None]:
from natsort import natsorted
from pprint import pprint
from latintools import preprocess
import random

In [None]:
from cltkreaders.lat import LatinTesseraeCorpusReader

# Reader over the Tesserae Latin corpus.
T = LatinTesseraeCorpusReader()

# Keep only the file ids that mention 'de_finibus', in natural sort order,
# and show the first ten as a sanity check.
cicero = natsorted(fileid for fileid in T.fileids() if 'de_finibus' in fileid)
pprint(cicero[:10])

In [None]:
def custom_preprocess(text):
    """Run ``latintools.preprocess`` on ``text`` with ``punctuation=True``.

    Args:
        text: Raw text to preprocess.

    Returns:
        Whatever ``preprocess`` returns for this input.
        NOTE(review): the exact effect of ``punctuation=True`` is defined by
        ``latintools`` and is not visible here -- confirm against that helper.
    """
    return preprocess(text, punctuation=True)

# Load the 'la_core_web_md' spaCy pipeline.
import spacy
nlp = spacy.load('la_core_web_md')
# Take the first text the reader yields for the selected files (run through
# custom_preprocess) and parse it with the pipeline.
text = next(T.texts(cicero, preprocess=custom_preprocess))
doc = nlp(text)

In [None]:
import numpy as np

# Two arrays built from separate passes over nlp.vocab: the text of each
# vocabulary item and its vector. They are later indexed by the same position,
# which presumes both passes visit the items in the same order.
data = {
    'name': np.array([item.text for item in nlp.vocab]),
    'vector': np.array([item.vector for item in nlp.vocab]),
}

In [None]:
# Build a searchable index from the vocabulary vectors and their names.
# NOTE(review): presumably an approximate nearest-neighbour index (the
# annoytools wrapper is not visible here) -- confirm its semantics.
from annoytools import AnnoyIndex
index = AnnoyIndex(data["vector"], data["name"])
index.build()

In [None]:
def get_word_idx(word):
    """Return the position of ``word`` in ``data["name"]``, or ``None``.

    Args:
        word: Vocabulary string to look up (exact match).

    Returns:
        The index of the first entry equal to ``word``, or ``None`` when the
        word is not present.
    """
    try:
        return data["name"].tolist().index(word)
    except ValueError:
        # list.index raises ValueError for a missing item. Any other failure
        # (for example ``data`` not having been built yet) is a genuine error
        # and must surface instead of being reported as "word not found".
        return None

# Example lookup: find where 'nam' sits in the vocabulary arrays
# (get_word_idx gives None if the word is absent).
word = 'nam'
word_idx = get_word_idx(word)
print(word_idx)

In [None]:
# Query the index with the vector of `word`, asking for k=25 results.
# NOTE(review): if word_idx is None (word not in the vocabulary), indexing the
# NumPy array with None adds an axis instead of failing -- check word_idx first.
query = index.query(data["vector"][word_idx], k=25)

# Materialise the query output as a list.
results = list(query)

print(results)


In [None]:
# Keep sentences with more than 10 and fewer than 25 tokens.
sents = [s for s in doc.sents if 10 < len(s) < 25]

In [None]:
# Use the third filtered sentence as the working example.
test_sent = sents[2]
print(test_sent)

In [None]:
def create_cloze_(sent, seed=3):
    """Blank out one randomly chosen alphabetic token of a sentence.

    Args:
        sent: A sentence object exposing ``as_doc()`` (e.g. a spaCy ``Span``).
            The tokens of the resulting doc must provide ``i``, ``is_alpha``
            and ``text``.
        seed: Seed for the token choice, so the same sentence and seed always
            produce the same exercise.

    Returns:
        A ``(doc, cloze, answer)`` tuple: the doc obtained from
        ``sent.as_doc()``, the token texts joined by single spaces with the
        chosen token replaced by ``'_____'``, and the text of the removed
        token.

    Raises:
        IndexError: If the sentence contains no alphabetic token.
    """
    # A private generator yields the same pick as seeding the module-level
    # generator with ``seed``, but leaves the caller's global random state
    # untouched.
    rng = random.Random(seed)
    sent = sent.as_doc()
    # token.i is used below as an index into this doc.
    remove_options = [token.i for token in sent if token.is_alpha]
    remove_choice = rng.choice(remove_options)
    cloze = ' '.join(
        '_____' if token.i == remove_choice else token.text for token in sent
    )
    return sent, cloze, sent[remove_choice].text

# Show the (doc, cloze text, removed word) tuple for the example sentence.
pprint(create_cloze_(test_sent))

In [None]:
# Unpack the exercise: the sentence doc, the text with a blank, and the
# removed word.
original, cloze, mask = create_cloze_(test_sent)

In [None]:
# Locate the removed word in the vocabulary arrays and inspect its vector.
mask_idx = get_word_idx(mask)
print(mask)
print(mask_idx)
# NOTE(review): when the word is missing, mask_idx is None, and indexing a
# NumPy array with None adds an axis rather than raising -- verify mask_idx.
print(data["vector"][mask_idx])

In [None]:
# Query the index with the removed word's vector, asking for k=1000 results.
# NOTE(review): if mask_idx is None (word not in the vocabulary), indexing the
# NumPy array with None adds an axis instead of failing -- check mask_idx first.
query = index.query(data["vector"][mask_idx], k=1000)

# Materialise the query output as a list.
results = list(query)

In [None]:
# Print the first ten results with their zero-based position.
for i, result in enumerate(results[:10]):
    print(i, result)

In [None]:
# The correct option is the removed word itself.
answer = mask
# Fixed seed so the same distractors are drawn on every run.
random.seed(1)
# One distractor from each of three bands of the result list: positions 1-2,
# positions 10-19, and the last ten entries. Position 0 is skipped --
# presumably because the top hit is the query word itself (confirm).
close_wrong_answer = random.sample(results[1:3],1)
closish_wrong_answer = random.sample(results[10:20],1)
far_wrong_answer = random.sample(results[-10:],1)
# Correct answer first, followed by the three distractors.
answer_list = [answer] + close_wrong_answer + closish_wrong_answer + far_wrong_answer
print(answer_list)

In [None]:
# Show the unmodified sentence.
print(original)

In [None]:
# Show the sentence with the chosen word blanked out.
print(cloze)

In [None]:
# Shuffle the options in place so the correct answer is no longer always
# first in the list.
random.shuffle(answer_list)
print(answer_list)