<a href="https://colab.research.google.com/github/edponce/DoyleInvestigators/blob/master/Project1_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import os
import re
import urllib.request
import urllib.parse


### $\color{brown}{\rm Corpus~Selection}$

In [19]:
# Maps a display title to the URL of its plain-text file on Project Gutenberg.
# get_corpus() accepts these titles as lookup keys.
CORPUS_URL = {
    'The Valley of Fear': "http://www.gutenberg.org/files/3289/3289.txt",
    # NOTE(review): the novel is titled "A Study in Scarlet"; this key is a
    # runtime lookup value, so confirm no caller depends on it before renaming.
    'A Study of Scarlet': "http://www.gutenberg.org/files/244/244.txt",
    'The Sign of the Four': "http://www.gutenberg.org/files/2097/2097.txt",
    'The Hound of the Baskervilles': "http://www.gutenberg.org/files/2852/2852.txt",
}

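### $\color{brown}{\rm Read~Web~Page~Content}$
Read the corpus from a local file or a web page before processing. Downloaded text is decoded as ASCII (no BOMs). Windows-style carriage returns ('\r') are not stripped at this stage; the patterns below accept an optional '\r' before each newline.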

In [20]:
def get_corpus_from_url(url):
    """Download the resource at `url` and return its content as text.

    The payload is decoded as ASCII, so any non-ASCII byte (a BOM, for
    example) raises UnicodeDecodeError. Line endings are returned exactly
    as they were received.

    Args:
        url (str): Location of the resource to fetch.

    Returns:
        str: Decoded content of the resource.
    """
    with urllib.request.urlopen(url) as response:
        raw_bytes = response.read()
    return raw_bytes.decode('ascii')


def get_corpus_from_file(file):
    """Read a locally stored corpus and return its content as text.

    The file is decoded as UTF-8 (a superset of ASCII) instead of the
    platform's locale encoding, so the same file yields the same text on
    every system.

    Notes:
        * The file is opened in text mode with default newline handling, so
          '\\r\\n' is returned as '\\n'. Offsets computed on text read from a
          file can therefore differ from offsets computed on text that kept
          its '\\r' characters.

    Args:
        file (str): Path of the file to read.

    Returns:
        str: Decoded content of the file.
    """
    with open(file, encoding='utf-8') as fd:
        return fd.read()


def get_corpus(key):
    """Load a corpus identified by a file name, a CORPUS_URL title, or a URL.

    Resolution order:
        1. `key` names an existing local file: read it.
        2. `key` is a title in CORPUS_URL and a local file named like the
           last path component of its URL exists: read that file.
        3. `key` is itself a URL: download it.
        4. `key` is a title in CORPUS_URL: download its URL.

    Args:
        key (str): File name, URL, or title found in CORPUS_URL.

    Returns:
        str: Text of the corpus.

    Raises:
        ValueError: If `key` cannot be resolved by any of the steps above.
    """
    def validate_url(url):
        # A usable URL needs a scheme, a network location, and a path.
        parsed_url = urllib.parse.urlparse(url)
        return all([parsed_url.scheme, parsed_url.netloc, parsed_url.path])

    # Check if a filename was provided
    if os.path.isfile(key):
        return get_corpus_from_file(key)

    # `key` may not be a known title (e.g., when it is a URL), in which case
    # there is no registered URL to derive a local file name from.
    known_url = CORPUS_URL.get(key)
    if known_url is not None:
        local_copy = os.path.basename(known_url)
        if os.path.isfile(local_copy):
            return get_corpus_from_file(local_copy)

    # Check if a URL was provided
    if validate_url(key):
        return get_corpus_from_url(key)
    if known_url is not None and validate_url(known_url):
        return get_corpus_from_url(known_url)

    # Fail loudly instead of returning None to the caller.
    raise ValueError(f'cannot resolve corpus from key: {key!r}')

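### $\color{brown}{\rm Split~into~Parts~and~Chapters}$
Core methods: regular-expression helpers that locate the Gutenberg start/end tags and the part, chapter, and epilogue labels.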

In [21]:
def get_gutenberg_start_tag(text):
    """Locate Gutenberg's START tag (and the producer line, if present).

    Notes:
        * The pattern contains lowercase literals and no IGNORECASE flag, so
          `text` has to be lowercased by the caller.
        * re.search() is used because the tag must be preceded by a newline;
          re.match() would only succeed if the match began at index 0 of
          `text`.

    Args:
        text (str): Lowercased corpus.

    Returns:
        re.Match | None: Match that spans the whitespace before the tag up to
        the whitespace after it (or after the producer line), None if the
        tag is not found.
    """
    start_tag_regex = ''.join((
        r'\s*\r?\n',  # pre-whitespace
        r'\*{3} ',  # 3 asterisks
        r'start[^\r\n]+',  # tag text
        r' \*{3}',  # 3 asterisks
        r'\r?\n\s*',  # post-whitespace
        r'(produced by.+\r?\n\s*)?',  # producer line with post-whitespace
    ))
    return re.compile(start_tag_regex).search(text)


def get_gutenberg_end_tag(text):
    """Locate Gutenberg's END tag (and transcriber's notes, if present).

    Notes:
        * Some texts carry a duplicate/similar 'end ...' line right before
          the real tag; it is included in the match when present.
        * The transcriber note must follow a newline so that similar (but
          indented) notes at the beginning of the text are not matched.
        * DOTALL lets the transcriber's notes span multiple lines. Since
          DOTALL makes '.' match newlines too, single-line pieces use
          '[^\r\n]' instead of '.'.
        * The pattern contains lowercase literals and no IGNORECASE flag, so
          `text` has to be lowercased by the caller.

    Args:
        text (str): Lowercased corpus.

    Returns:
        re.Match | None: Match for the end tag, None if it is not found.
    """
    end_tag_regex = ''.join((
        r'(',
        r'(\s*\r?\noriginal transcriber.+)?',  # transcriber notes with pre-whitespace
        r'\s*\r?\n',  # pre-whitespace
        r'end[^\r\n]+',  # duplicate/similar tag text
        r')?',
        r'\s+',  # pre-whitespace
        r'\*{3} ',  # 3 asterisks
        r"end[^\r\n]+",  # tag text
        r' \*{3}',  # 3 asterisks
        r'\r?\n\s*',  # post-whitespace
    ))
    return re.compile(end_tag_regex, re.DOTALL).search(text)


def get_gutenberg_part_labels(text):
    """Find every part label ('part 1', 'part ii.', 'part 2--title', ...).

    Notes:
        * We consider the start of the text when the first part/chapter starts.
        * The pattern contains lowercase literals and no IGNORECASE flag, so
          `text` has to be lowercased by the caller.
        * The numeral must end at a word boundary. Without it, a prose line
          such as 'part in the affair' is taken for a label because 'i' is a
          valid Roman digit. A line starting with 'part i ' (lowercased
          pronoun) is still indistinguishable from a label.

    Args:
        text (str): Lowercased corpus.

    Returns:
        list[re.Match]: One match per label, in order of appearance. Each
        match includes the whitespace surrounding the label line.
    """
    return list(re.finditer(
        r'\s*\r?\n'  # pre-whitespace
        r'('
        r'part (\d|[ivx])+\b'  # label with Arabic or Roman numbering
        r'(-+|\.)?'  # label-title delimiter
        r'.*'  # title
        r')'
        r'\r?\n\s*',  # post-whitespace
        text
    ))


def get_gutenberg_chapter_labels(text):
    """Find every chapter label ('chapter 1', 'chapter iv.', ...).

    Notes:
        * Some text have the chapter tag and title in different lines.
        * The pattern contains lowercase literals and no IGNORECASE flag, so
          `text` has to be lowercased by the caller.
        * The numeral must end at a word boundary. Without it, a prose line
          such as 'chapter in his life' is taken for a label because 'i' is
          a valid Roman digit.

    Args:
        text (str): Lowercased corpus.

    Returns:
        list[re.Match]: One match per label, in order of appearance. Each
        match includes the whitespace surrounding the label.
    """
    return list(re.finditer(
        r'\s*'  # pre-whitespace
        r'\r?\n'  # no indentation
        r'('
        r'chapter (\d|[ivx])+\b'  # label with Arabic or Roman numbering
        r'(-+|\.)?'  # label-title delimiter
        r'(\s{2})?'  # whitespace for titles two line apart
        r'.*'  # title
        r')'
        r'\r?\n\s*',  # post-whitespace
        text
    ))


def get_gutenberg_epilogue_label(text):
    """Locate the 'epilogue' label, i.e., a line holding only that word.

    The pattern is lowercase and uses no IGNORECASE flag, so `text` has to
    be lowercased by the caller.

    Args:
        text (str): Lowercased corpus.

    Returns:
        re.Match | None: Match including the surrounding whitespace, None if
        there is no such label.
    """
    epilogue_regex = ''.join((
        r'\s*\r?\n',  # pre-whitespace
        r'epilogue',  # tag text
        r'\r?\n\s*',  # post-whitespace
    ))
    return re.compile(epilogue_regex).search(text)


def get_toc(text):
    """Table of contents.

    Placeholder: not implemented yet, always returns None.
    """
    return None


def get_prologue(text):
    """Prologue.

    Placeholder: not implemented yet, always returns None.
    """
    return None


def get_epilogue(text):
    """Get span of the epilogue.

    The epilogue runs from the end of the 'epilogue' label to the start of
    Gutenberg's end tag. If the text has no end tag, the epilogue extends to
    the end of `text` instead of failing.

    Args:
        text (str): Lowercased corpus.

    Returns:
        tuple[int, int] | None: (start, end) offsets into `text`, or None
        when the text has no epilogue label.
    """
    epilogue_label = get_gutenberg_epilogue_label(text)
    if epilogue_label is None:
        return None

    etag = get_gutenberg_end_tag(text)
    # A text without the end tag (e.g., truncated input) ends at its last character.
    end = etag.start() if etag else len(text)
    return epilogue_label.end(), end

Utility methods.

In [22]:
def get_part(text, part_num, *, part_labels=None):
    """Get span of a selected part.

    A part starts right after its label. It ends where the next part label
    starts; the last part ends at the epilogue label, or at Gutenberg's end
    tag, or at the end of `text` when neither is present.

    Args:
        text (str): Lowercased corpus.

        part_num (int): Natural number of parts [1-N]

        part_labels (list[re.Match]): Part labels of `text`. Computed from
            `text` when omitted.

    Returns:
        tuple[int, int]: (start, end) offsets into `text`.

    Raises:
        ValueError: If `part_num` is not in [1-N].
    """
    # NOTE: This can be a required parameter, but simplifies invocation of this function.
    if part_labels is None:
        part_labels = get_gutenberg_part_labels(text)
    if part_num < 1 or part_num > len(part_labels):
        raise ValueError('part number out-of-range')

    start = part_labels[part_num - 1].end()

    # Intermediate part: delimited by the next part label
    if part_num < len(part_labels):
        return start, part_labels[part_num].start()

    # Last part: the end tag is only searched when there is no epilogue
    closing_label = get_gutenberg_epilogue_label(text) or get_gutenberg_end_tag(text)
    end = closing_label.start() if closing_label else len(text)
    return start, end


def get_parts(text):
    """Get iterator of part spans, in order of appearance.

    Args:
        text (str): Lowercased corpus.

    Yields:
        tuple[int, int]: (start, end) offsets of each part, as returned by
        get_part().
    """
    labels = get_gutenberg_part_labels(text)
    # Labels are located once and shared by every get_part() call.
    yield from (
        get_part(text, number, part_labels=labels)
        for number, _ in enumerate(labels, start=1)
    )


def get_chapter(text, chapter_num, part_num=None, *, chapter_labels=None, part_labels=None):
    """Get span of chapter.

    A chapter starts right after its label and ends at the first of: the
    next chapter label, the end of the part it belongs to, the epilogue
    label, Gutenberg's end tag, or the end of `text`.

    Args:
        text (str): Lowercased corpus.

        chapter_num (int): Natural number of chapters [1-N]. When `part_num`
            is given, chapters are counted within that part; otherwise they
            are counted across the whole text.

        part_num (int): Natural number of parts [1-N]

        chapter_labels (list[re.Match]): Chapter labels of `text`. Computed
            from `text` when omitted.

        part_labels (list[re.Match]): Part labels of `text`. Computed from
            `text` when omitted.

    Returns:
        tuple[int, int]: (start, end) offsets into `text`.

    Raises:
        ValueError: If `chapter_num` or `part_num` is out of range.
    """
    # NOTE: This can be a required value. This simplifies invocation of this function.
    if chapter_labels is None:
        chapter_labels = get_gutenberg_chapter_labels(text)
    if chapter_num < 1 or chapter_num > len(chapter_labels):
        raise ValueError('chapter number out-of-range')

    if part_labels is None:
        part_labels = get_gutenberg_part_labels(text)
    if part_num is not None and (part_num < 1 or part_num > len(part_labels)):
        raise ValueError('part number out-of-range')

    part = None
    if part_num is not None:
        # Filter chapters not found in selected part
        part = get_part(text, part_num, part_labels=part_labels)
        chapter_labels = [
            label
            for label in chapter_labels
            if part[0] <= label.end() <= part[1]
        ]
        # The selected part may hold fewer chapters than the whole text
        if chapter_num > len(chapter_labels):
            raise ValueError('chapter number out-of-range')

    start = chapter_labels[chapter_num - 1].end()
    if chapter_num < len(chapter_labels):
        # Followed by another chapter
        end = chapter_labels[chapter_num].start()
    elif part is not None:
        # Last chapter of selected part: ends where the part ends
        end = part[1]
    else:
        # Last chapter of text: the end tag is only searched when there is no epilogue
        closing_label = get_gutenberg_epilogue_label(text) or get_gutenberg_end_tag(text)
        end = closing_label.start() if closing_label else len(text)

    if part is None:
        # Chapters counted across the whole text: do not run into the label
        # of the following part.
        part_boundaries = [
            label.start()
            for label in part_labels
            if start <= label.start() < end
        ]
        if part_boundaries:
            end = min(part_boundaries)

    return start, end


def get_chapters(text, part_num=None):
    """Get iterator of chapter spans.

    When the text is divided into parts, chapters are produced part by part
    (all parts, or only `part_num` if given). When the text has no parts,
    every chapter is produced and `part_num` is ignored with a warning.

    Args:
        text (str): Lowercased corpus.

        part_num (int): Natural number of parts [1-N]

    Yields:
        tuple[int, int]: (start, end) offsets of each chapter, as returned
        by get_chapter().
    """
    all_chapter_labels = get_gutenberg_chapter_labels(text)
    all_part_labels = get_gutenberg_part_labels(text)

    # Text does not have parts
    if not all_part_labels:
        if part_num is not None:
            print('Warning: no parts found, so part-related parameters are ignored')
        for position, _ in enumerate(all_chapter_labels, start=1):
            yield get_chapter(text, position, chapter_labels=all_chapter_labels)
        return

    # Text has parts
    part_count = len(all_part_labels)
    if part_num is not None and (part_num < 1 or part_num > part_count):
        raise Exception('part number out-of-range')

    if part_num is None:
        selected_parts = range(1, part_count + 1)
    else:
        selected_parts = range(part_num, part_num + 1)

    for current_part in selected_parts:
        # Keep only the chapters whose label ends inside the current part
        part_start, part_end = get_part(text, current_part, part_labels=all_part_labels)
        labels_in_part = [
            label
            for label in all_chapter_labels
            if label.end() >= part_start and label.end() <= part_end
        ]
        for position, _ in enumerate(labels_in_part, start=1):
            yield get_chapter(
                text,
                position,
                current_part,
                chapter_labels=labels_in_part,
                part_labels=all_part_labels,
            )

Get paragraphs

In [23]:
#[\d|"|\w](.+\n)+\n*("(.+\n)+\n+)* --> get paragraphs followed by paragraphs that starts with >>"<<
#[\d|"|\w](.+\n)+(.+:)\n+(.+\n?)+  --> get paragraphs that has >>:<< followed by one more paragraph
#[\d|"|\w](.+\n)+(.+:)\n+(.+\n?)+|[\d|"|\w](.+\n)+\n*("(.+\n)+\n+)* --> union of the previous two

def get_paragraphs(text, span):
    """Get spans of the paragraphs found inside a region of the text.

    A paragraph is a run of non-empty lines. A blank-line gap does not end
    the paragraph when the text after the gap starts with a non-alphabetic
    character (quote, number, etc.).

    Args:
        text (str): Corpus.

        span (tuple[int, int]): (start, end) offsets of the region to split.

    Returns:
        list[tuple[int, int]]: (start, end) offsets into `text`. The last
        paragraph always ends at `span[1]`; when no paragraph is found the
        list holds `span` itself.
    """
    region_start, region_end = span[0], span[1]

    paragraph_regex = re.compile(''.join((
        r'(',
        r'([^\r\n]+\r?\n)+',  # (regular text with newline)+
        r'(',
        r'(\r?\n)+',  # (newline)+
        r'[^a-zA-Z]',  # non-alpha character: quote, number, etc.
        r')?',  # handles case of multiple newlines but still same paragraph
        r')+',  # (full regex)+
    )))

    # Matches are relative to the region, so shift them by the region's offset
    found = []
    for match in paragraph_regex.finditer(text[region_start:region_end]):
        first, last = match.span()
        found.append((first + region_start, last + region_start))

    if not found:
        return [span]

    # Extend last paragraph to end of region
    found[-1] = (found[-1][0], region_end)
    return found

### $\color{brown}{\rm Preprocess~Corpus}$


In [24]:
def print_parts_chapters():
    """Print the part spans and chapter spans of every title in CORPUS_URL.

    For each title: the title underlined with dashes, one line per part
    span, an empty line, one line per chapter span, and two empty lines.
    """
    for title in CORPUS_URL:
        underline = '-' * len(title)
        print(title)
        print(underline)

        # Labels are searched on a lowercased copy of the corpus
        lowered_corpus = get_corpus(title).lower()

        # Parts first, then chapters; each listing is followed by an empty line
        for span_source in (get_parts, get_chapters):
            for span in span_source(lowered_corpus):
                print(span)
            print()

        print()

In [25]:
# Print the part and chapter spans of every corpus listed in CORPUS_URL.
print_parts_chapters()

The Valley of Fear
------------------
(771, 153679)
(153721, 315571)

(801, 17931)
(17986, 33434)
(33487, 52429)
(52466, 73638)
(73690, 96082)
(96126, 121931)
(121972, 153679)
(153747, 170681)
(170724, 200879)
(200927, 231808)
(231855, 251712)
(251757, 277395)
(277430, 295740)
(295798, 315571)


A Study of Scarlet
------------------
(1856, 125158)
(125217, 245813)

(2014, 17680)
(17733, 38387)
(38448, 60325)
(60381, 74188)
(74250, 88120)
(88184, 106475)
(106526, 125158)
(125260, 146010)
(146057, 160651)
(160716, 171025)
(171071, 190062)
(190109, 210109)
(210193, 233926)
(233970, 245813)


The Sign of the Four
--------------------

(759, 17754)
(17776, 28385)
(28408, 38358)
(38380, 59609)
(59630, 74129)
(74151, 91615)
(91638, 115314)
(115338, 134278)
(134300, 154062)
(154083, 172511)
(172533, 184243)
(184266, 238563)


The Hound of the Baskervilles
-----------------------------

(745, 13659)
(13717, 37232)
(37272, 53593)
(53643, 76314)
(76363, 95354)
(95399, 114729)
(114790, 141797)
(14

In [26]:
# Load one novel. `corpus` keeps the original casing; `_corpus` is the
# lowercased copy handed to the label-based helpers.
corpus = get_corpus('The Valley of Fear')
_corpus = corpus.lower()

for chapter_num, chapter_span in enumerate(get_chapters(_corpus), start=1):
    print(f'Chapter {chapter_num} - {chapter_span}')
    print('=' * 40)
    # The span was computed on `_corpus` but is applied to `corpus`.
    # NOTE(review): this assumes lower() keeps the string length unchanged
    # (true for ASCII text) - confirm if non-ASCII input is ever loaded.
    for par in get_paragraphs(corpus, chapter_span):
        print(par)
        #print(corpus[par[0]:par[1]])
        #print('-' * 40)
    # Only the first chapter is listed.
    break

Chapter 1 - (801, 17931)
(801, 896)
(898, 1099)
(1101, 1748)
(1750, 4841)
(4843, 6300)
(6302, 7328)
(7330, 8695)
(8697, 13036)
(13038, 14372)
(14374, 15155)
(15157, 15451)
(15453, 16042)
(16044, 17305)
(17307, 17756)
(17758, 17931)


In [27]:
# Show every chapter span of `_corpus` (assigned in an earlier cell).
list(get_chapters(_corpus))

[(801, 17931),
 (17986, 33434),
 (33487, 52429),
 (52466, 73638),
 (73690, 96082),
 (96126, 121931),
 (121972, 153679),
 (153747, 170681),
 (170724, 200879),
 (200927, 231808),
 (231855, 251712),
 (251757, 277395),
 (277430, 295740),
 (295798, 315571)]