# Cylleneus
#### Next-gen corpus search for electronic corpora of ancient languages

In [None]:
# Get everything set up
import codecs, re
from itertools import chain
from textwrap import wrap

import multiwordnet

from cylleneus.corpus import Corpus
from cylleneus.search import Collection, Searcher

# Make sure every MultiWordNet database used below has been compiled locally;
# any database that does not exist yet is compiled quietly.
WORDNET_LANGUAGES = (
    "common",
    "english",
    "french",
    "hebrew",
    "italian",
    "latin",
    "spanish",
    "portuguese",
)
for language in WORDNET_LANGUAGES:
    if multiwordnet.db.exists(language):
        continue
    multiwordnet.db.compile(language, verbose=False)

# Convenience method to remove formatting tags from highlighted text
def display_text(text: str) -> str:
    """Strip highlight markup from ``text`` and wrap it for terminal display.

    The ``<pre>``, ``<match>`` and ``<post>`` tags (opening and closing) are
    removed outright. Every ``<em>...</em>`` span is replaced by its content
    surrounded by ANSI escape sequences so that it stands out in a terminal.

    Args:
        text: The highlighted text containing the markup tags.

    Returns:
        The cleaned text, wrapped with the default settings of
        ``textwrap.wrap`` and joined into a single newline-separated string.
        An empty input yields an empty string.
    """
    subs = [
        ("<pre>", ""),
        ("</pre>", ""),
        ("<match>", ""),
        ("</match>", ""),
        ("<post>", ""),
        ("</post>", ""),
        (r"<em>(.+?)</em>", r"\033[1m\033[36m\1\033[21m\033[0m"),
    ]
    for pat, sub in subs:
        # `flags` must be passed by keyword: the fourth positional parameter of
        # re.sub is `count`, so a positional re.DOTALL would silently cap the
        # number of replacements and leave the flag unset.
        text = re.sub(pat, sub, text, flags=re.DOTALL)
    return "\n".join(wrap(text))

The process of executing a query and retrieving search results with Cylleneus comprises several steps.

##### Build a collection

Searches are conducted over an abstraction called a ``Collection``, which is a group of works from one or more corpora.
A ``Collection`` may contain all the works of a corpus, only some of its works, or even works from different corpora
at the same time. (The types of queries you can then perform over the ``Collection`` will depend on the characteristics
of the included corpora.) This means that the very first step is to select and load a corpus (or multiple
corpora) and use it to create a ``Collection`` object.

In [None]:
# Load each corpus in turn, downloading it first if it is not yet searchable
loaded = []
for corpus_name in ("proiel", "agldt"):
    corpus = Corpus(corpus_name)
    if not corpus.searchable:
        corpus.download()
    loaded.append(corpus)
proiel, agldt = loaded

# Combine the works of both corpora into a single collection
collection = Collection(works=chain(proiel.works, agldt.works))

# To keep the collection around for later sessions, save it under a name
# collection.save("my_collection")

##### Perform the search

Once you have built a collection, you can execute queries over it.

In [None]:
# The query specification to run
query = "<virtus>"

# Bind a searcher to the collection built above
searcher = Searcher(collection)

# Execute the query over the collection (surrounding whitespace is ignored)
results = searcher.search(query.strip())

##### Working with results

The results of a search are encapsulated in a ``Search`` object, which exposes several useful properties and methods for
working with results.

In [None]:
# Summarise the search that was just run

# `spec` is the query string exactly as the user supplied it; `query` is the
# resolved internal representation that is matched against the index
print(f"{results.spec} (= {results.query})")
# Underline the heading: the two values plus the 5 literal characters " (= )"
underline_width = len(results.spec) + len(str(results.query)) + 5
print("=" * underline_width)

# `start` and `end` are human-readable start and end times of the search;
# `start_dt` and `end_dt` hold the corresponding datetime objects
print(f"{results.start}\n")

# `count` is a 3-tuple: total matches, number of documents with results,
# and number of corpora with results
match_count, doc_count, corpus_count = results.count
print(f"Found {match_count} matches in {doc_count} documents from {corpus_count} corpora")

# `duration` is the total elapsed time of the search in hours, minutes, and seconds
print(f"in total elapsed time of {results.duration} secs.\n")

# `collection` is the `Collection` object the search was executed over
for work in results.collection:
    print(f"{work.author}, {work.title} [{work.corpus}]")

# Print each result as nicely formatted plaintext
for _corpus, author, title, _urn, reference, text in results.to_text():
    print(f"{author}, {title}: {reference}\n{display_text(text)}\n")

# Alternatively, show the raw highlighted results via the `HitRef` objects
# for hlite in results.highlights:
#     print(f"{hlite.author}, {hlite.title}: {hlite.reference}\n{hlite.text}\n")

# Alternatively, save the results as JSON; they can be re-loaded later with `from_json()`
# with codecs.open("my_results.json", "w", "utf8") as fp:
#     fp.write(results.to_json())

# Alternatively, save the results as a Microsoft Word document, optionally giving
# a filename; without one, the original search specification is used
# results.to_docx("my_results.docx")