Cylleneus + NLP
===============

Once installed, the Cylleneus engine can be used to perform queries programmatically via the search API. In this way, the engine can be used to build NLP analysis pipelines. One very simple and straightforward use of Cylleneus' search functionality would be to try to find 'intertexts' -- passages of a target text that are lexically similar to some source text -- programmatically. Here's a very basic implementation.

In [None]:
# Get everything set up
import codecs
import json
from copy import copy
from itertools import chain
from pathlib import Path
from textwrap import wrap

import multiwordnet
from tqdm import tqdm

from cylleneus.corpus import Corpus, Work
from cylleneus.search import Collection, Searcher
from cylleneus.settings import CORPUS_DIR

# Because we are going to be processing a text to serve as the
# source for constructing queries programmatically, we need some
# nuts-and-bolts text-handling tools.
from cylleneus.engine.analysis.filters import CachedLemmaFilter, CachedSynsetFilter

# The source text will come from the Perseus Digital Library (in
# JSON format), so we can use the bespoke tokenizer for this corpus.
from cylleneus.corpus.lat.perseus import Tokenizer

# Check MultiWordNet installation: compile every language database
# that does not exist yet.
for language in ("common", "english", "french", "hebrew", "italian", "latin", "spanish"):
    if multiwordnet.db.exists(language):
        continue
    multiwordnet.db.compile(language, verbose=False)

#### Ngram extraction

To find possible intertexts, we are going to process a source text line by line, using some code adapted from NLTK to extract a series of n-grams (of specifiable length) to search for in the target texts.

In [None]:
# Returns a padded sequence of items before ngram extraction.
def pad_sequence(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """Return an iterator over ``sequence`` with optional n-gram padding.

    Args:
        sequence: Iterable of items to pad.
        n: Order of the n-grams to be extracted; each requested side is
            padded with ``n - 1`` symbols.
        pad_left: Whether to prepend ``left_pad_symbol``.
        pad_right: Whether to append ``right_pad_symbol``.
        left_pad_symbol: Symbol used for left padding.
        right_pad_symbol: Symbol used for right padding.

    Returns:
        An iterator over the (possibly padded) items. When no padding is
        requested this is simply ``iter(sequence)``.
    """
    parts = [iter(sequence)]
    if pad_left:
        parts.insert(0, (left_pad_symbol,) * (n - 1))
    if pad_right:
        parts.append((right_pad_symbol,) * (n - 1))

    # Only wrap in a chain when there is actually something to add.
    if len(parts) == 1:
        return parts[0]
    return chain(*parts)

# Return the ngrams generated from a sequence of items, as an iterator.
def ngrams(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """Lazily yield the n-grams of ``sequence`` as tuples.

    The arguments are forwarded unchanged to ``pad_sequence``. Each item
    is shallow-copied (``copy``) before being placed in a window, so the
    yielded tuples do not share item objects with one another.

    Yields:
        Tuples of ``n`` consecutive (copied) items. Nothing is yielded
        when the padded sequence has fewer than ``n`` items.
    """
    padded = pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    # Prime the sliding window with the first n - 1 items.
    window = []
    for _ in range(n - 1):
        try:
            first = next(padded)
        except StopIteration:
            # Sequence too short to form a single n-gram.
            return
        window.append(copy(first))

    # Each further item completes one n-gram; then the window slides by one.
    for element in padded:
        window.append(copy(element))
        yield tuple(window)
        window.pop(0)

#### Find similar constructions

Once the text has been processed into a series of n-grams, we use Cylleneus' query-building functionality to automatically construct complex phrasal searches. Where possible, each word within a given n-gram is lemmatized in order to abstract away from the specific morphological features of the source text. In the first instance, only the sequential ordering of the n-gram is preserved (as a Sequence query). However, it is possible to loosen this restriction and to look for intertexts where the word order is somehow varied, by specifying a ``slop`` value greater than one (1). Alternatively, Cylleneus can search for 'semantic' intertexts by abstracting away further from the lexicon to query the meanings of words, using ``synsets=True``.

In [None]:
# Find lexically or semantically similar phrases in the reference corpus
# for phrases of length n in a tokenized source text.
def find_similar(tokens, collection, n=3, slop=None, synsets=False):
    """Yield passages of ``collection`` that resemble n-grams of ``tokens``.

    Every window of ``n`` consecutive tokens becomes one query. Each token
    is replaced by the lemma(s) produced by ``CachedLemmaFilter`` (written
    ``<lemma>``) or, with ``synsets=True``, by the synset(s) that
    ``CachedSynsetFilter`` produces from those lemmas (written
    ``[synset]``). Several alternatives for one token are OR-ed together;
    a token for which nothing is produced is searched by its literal text.

    Args:
        tokens: Iterable of token objects; each must expose ``text`` and
            be accepted by the lemma filter.
        collection: Collection passed to ``Searcher``.
        n: Number of tokens per n-gram.
        slop: If truthy, terms are joined with ``AND`` inside a quoted
            phrase followed by ``~slop``; otherwise they are joined with
            ``THEN``.
        synsets: Query synsets instead of lemmas.

    Yields:
        ``(ngram, result)`` pairs, one for every item returned by
        ``to_text()`` on a non-empty search.
    """
    searcher = Searcher(collection=collection)

    # Delimiters marking a query term as a lemma (<...>) or a synset ([...]).
    before, after = ('[', ']') if synsets else ('<', '>')

    # Generate and process n-grams
    for ngram in tqdm(ngrams(tokens, n), desc="Searching"):
        queries = []

        # Lemmatize, and optionally synsetize, every word in the ngram
        for gram in ngram:
            subqueries = set()

            # The lemmatizer is needed on BOTH paths. It used to be created
            # only when ``synsets`` was false, so ``synsets=True`` failed
            # with an unbound local at the synsetizer call below.
            # NOTE(review): filters are still built afresh for every token;
            # the Cached* filters may keep state between calls -- confirm
            # before hoisting these constructors out of the loop.
            lemmatizer = CachedLemmaFilter()
            if not synsets:
                for lemma in lemmatizer([copy(gram)], cached=False, mode='query'):
                    # Keep only the part of the lemma text before any '='.
                    subqueries.add(lemma.text.split('=')[0])
            else:
                synsetizer = CachedSynsetFilter()
                lemmas = lemmatizer([copy(gram)], mode='query')
                for synset in synsetizer(lemmas, mode='query'):
                    if synset.text:
                        subqueries.add(synset.text)

            # Construct subquery
            terms = [f"{before}{subquery}{after}" for subquery in subqueries]
            if not terms:
                # Nothing found: fall back to the literal word form.
                queries.append(f"{gram.text}")
            elif len(terms) == 1:
                queries.append(terms[0])
            else:
                queries.append(f"({' OR '.join(terms)})")

        # Join all subqueries into a single complex proximity or adjacency query
        if slop:
            query = f'''"{' AND '.join(queries)}"~{slop}'''
        else:
            query = ' THEN '.join(queries)

        # Execute the query against the given collection
        search = searcher.search(query)

        if search:
            for result in search.to_text():
                yield ngram, result

#### Source text

First, we need to specify the source text.

##### Corpus

In [None]:
# List available corpora
# print("\n".join([f'{name} [{meta.language}] {meta.description}' for name, meta in manifest.items() if meta.repo["location"] == "remote"]))

# Open the Perseus corpus, downloading it when it is not yet searchable.
corpus = Corpus("perseus")
needs_download = not corpus.searchable
if needs_download:
    corpus.download()

##### Work

In [None]:
# Print one "[docix] author, title" line per manifest entry, ordered by
# document index.
for docix, entry in sorted(corpus.manifest.items(), key=lambda pair: pair[0]):
    author_name = entry['author']
    work_title = entry['title']
    print(f"[{docix}] {author_name}, {work_title}")

# The source text is picked by its document index number.
work = corpus.work_by_docix(0)  # Vergil's Aeneid

#### Target texts

Next, we create a ``Collection`` of target texts (which may include works from different corpora).

In [None]:
# Build the collection of texts to be searched.
collection = Collection()
eclogues = corpus.work_by_docix(1)  # Vergil's Eclogues
collection.add(eclogues)

#### Run the search

In [None]:
# For very long texts, this might take a while!
results = []
for filename in work.filename:
    # Load one file of the SOURCE work (``work`` is the text the n-grams
    # are taken from; the targets live in ``collection``).
    with open(work.corpus.text_dir / filename, encoding='utf8') as fp:
        doc = json.load(fp)
    meta = work.meta

    # Tokenize the source text
    tokenizer = Tokenizer()
    tokens = tokenizer({"text": doc['text'], "meta": meta}, mode='index')

    # Build n-grams of length n from the source tokens and search for each
    # of them in the target collection. find_similar is a generator, so
    # extending the list is what actually runs the searches.
    hlites = find_similar(tokens, collection, n=3)
    results.extend(hlites)

#### Results

In [None]:
# Display the results...
print(f"{work.author}, {work.title}")

# Markup found in the result text: structural tags are dropped and emphasis
# tags become asterisks. Built once, since it never changes.
subs = [("<pre>", ""), ("</pre>", ""), ("<match>", ""), ("</match>", ""), ("<post>", ""), ("</post>", ""), ("<em>", "*"), ("</em>", "*")]

# The first field of every hit is unpacked as ``hit_corpus`` rather than
# ``corpus`` so that the loop does not overwrite the notebook's ``corpus``
# object, which earlier cells rely on when they are re-run.
for ngram, (hit_corpus, author, title, urn, reference, text) in results:
    # Location of each source token, built from the metadata fields named
    # in its hyphen-separated 'meta' entry. Duplicates are dropped in
    # first-seen order so the output is the same on every run.
    locations = []
    for token in ngram:
        fields = token.meta['meta'].split('-')
        location = ', '.join([f'{k}: {v}' for k, v in token.meta.items() if k in fields])
        if location not in locations:
            locations.append(location)
    words = ", ".join([f"'{token.text}'" for token in ngram])

    print(f"\n{'-'.join(locations)}:", words)
    print(f"{author}, {title} {reference}")
    for pat, sub in subs:
        text = text.replace(pat, sub)
    print("\n".join(wrap(text)))

# ... or save the results locally
# with codecs.open("", "w", "utf8") as fp:  # Add file name here
#     fp.write(f"{work.author.upper()}, {work.title.upper()}\n")
#     for ngram, (corpus, author, title, urn, reference, text) in results:
#         fp.write(f"\n\n{'-'.join(set([', '.join([f'{k}: {v}' for k, v in token.meta.items() if k in token.meta['meta'].split('-')]) for token in ngram]))}: " + ", ".join([f"'{token.text}'" for token in ngram]))
#         fp.write(f"\n{author}, {title} {reference}")
#         subs = [("<pre>", ""), ("</pre>", ""), ("<match>", ""), ("</match>", ""), ("<post>", ""), ("</post>", ""), ("<em>", "*"), ("</em>", "*")]
#         for pat, sub in subs:
#             text = text.replace(pat, sub)
#         fp.write("\n" + "\n".join(wrap(text)))