In [7]:
import os
import regex
import sys
import numpy as np
import pandas as pd

# And of course we need the text_extensions_for_pandas library itself.
try:
    import text_extensions_for_pandas as tp
except ModuleNotFoundError:
    # If we're running from within the project source tree and the parent Python
    # environment doesn't have the text_extensions_for_pandas package, use the
    # version in the local source tree.
    if not os.getcwd().endswith("notebooks"):
        # Not inside a "notebooks" directory, so there is no local fallback:
        # re-raise the active exception unchanged.
        raise
    # Put the parent directory first on the module search path (only once, so
    # re-running this cell does not add duplicate entries).
    if ".." not in sys.path:
        sys.path.insert(0, "..")
    import text_extensions_for_pandas as tp

In [8]:
# Sample text input, assembled from adjacent string literals so that the
# resulting value is a single line with no embedded newline characters.
text = (
    "In AD 932, King Arthur and his squire, Patsy, travel throughout Britain "
    "searching for men to join the Knights of the Round Table. Along the way, "
    "he recruits Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Galahad "
    "the Pure, Sir Robin the Not-Quite-So-Brave-as-Sir-Lancelot, and Sir "
    "Not-Appearing-in-this-Film, along with their squires and Robin's troubadours."
)


In [9]:
# Echo the string to confirm the line continuations produced a single line.
text

"In AD 932, King Arthur and his squire, Patsy, travel throughout Britain searching for men to join the Knights of the Round Table. Along the way, he recruits Sir Bedevere the Wise, Sir Lancelot the Brave, Sir Galahad the Pure, Sir Robin the Not-Quite-So-Brave-as-Sir-Lancelot, and Sir Not-Appearing-in-this-Film, along with their squires and Robin's troubadours."

In [10]:
# Define a crude tokenizer to split by words, for example use only.
def tokenize_with_offsets(text):
    """Return character offsets of the space-delimited tokens in `text`.

    The text is split on single space characters. Each token begins at the
    first character of its piece and ends before any trailing commas or
    periods, so ``text[begin:end]`` is the piece without trailing punctuation.

    :param text: String to tokenize.
    :returns: Tuple ``(begins, ends)`` of NumPy integer arrays holding one
        begin offset (inclusive) and one end offset (exclusive) per piece.
        Consecutive spaces produce empty pieces, which yield empty spans.
    """
    pieces = text.split(" ")
    # Each piece starts one past the end of the previous piece plus its
    # separating space; the first piece starts at offset 0.
    begins = np.cumsum([0] + [len(piece) + 1 for piece in pieces[:-1]])
    # Only strip punctuation on the right: the span starts at `begin`, so
    # removing leading characters from the length would truncate the token
    # from the wrong side (e.g. ".5" would be reported as ".").
    lengths = np.array([len(piece.rstrip(",.")) for piece in pieces])
    ends = begins + lengths
    return begins, ends

In [11]:
# Tokenize the text to get begin and end offsets, then wrap them together with
# the target text in a `SpanArray`.
begins, ends = tokenize_with_offsets(text)
tokens = tp.SpanArray(text, begins, ends)

# Ending the cell with `tokens` displays the array. It renders nicely in HTML,
# showing each span's offsets and covered text along with the highlighted
# target text.
tokens

Unnamed: 0,begin,end,covered_text
0,0,2,In
1,3,5,AD
2,6,9,932
3,11,15,King
4,16,22,Arthur
5,23,26,and
6,27,30,his
7,31,37,squire
8,39,44,Patsy
9,46,52,travel
