<a href="https://colab.research.google.com/github/edponce/DoyleInvestigators/blob/master/Project1_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import re
import urllib.request
import urllib.parse


### $\color{brown}{\rm Corpus~Selection}$

In [4]:
# Maps each novel title (used as the lookup key throughout the notebook) to
# the URL of its plain-text edition hosted on gutenberg.org.
# NOTE(review): the novel is presumably "A Study in Scarlet"; the key below is
# left untouched because it is a runtime lookup string -- confirm before renaming.
CORPUS_URL = {
    'The Valley of Fear': "http://www.gutenberg.org/files/3289/3289.txt",
    'A Study of Scarlet': "http://www.gutenberg.org/files/244/244.txt",
    'The Sign of the Four': "http://www.gutenberg.org/files/2097/2097.txt",
    'The Hound of the Baskervilles': "http://www.gutenberg.org/files/2852/2852.txt",
}

### $\color{brown}{\rm Read~Web~Page~Content}$
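Read the corpus from a web page (or from a local copy of the file) to start processing. The text is expected to be in ASCII format (no BOMs). Note that Windows-style carriage returns '\r' are not stripped from downloaded text, so the regular expressions below accept both '\r\n' and '\n' line endings.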

In [5]:
def get_corpus_from_url(url):
    """Fetch the resource at ``url`` and return its content as a string.

    Args:
        url (str): Address opened with ``urllib.request.urlopen()``.

    Returns:
        str: The response body decoded as ASCII. Carriage returns are kept
            exactly as received.

    Raises:
        UnicodeDecodeError: If the body contains non-ASCII bytes.
    """
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    # Carriage returns are intentionally left in place here.
    return payload.decode('ascii')


def get_corpus_from_file(file):
    """Return the whole content of a local text file.

    The file is opened with the defaults of ``open()`` (text mode), so the
    encoding and newline handling are whatever those defaults provide.

    Args:
        file (str): Path of the file to read.

    Returns:
        str: Full content of the file.
    """
    with open(file) as handle:
        contents = handle.read()
    return contents


def get_corpus(key):
    """Load a corpus identified by a file path, a URL or a known title.

    The key is resolved in this order:
        1. ``key`` is the path of an existing local file.
        2. ``key`` is a title in ``CORPUS_URL`` and a file named like the
           last component of its URL exists relative to the working
           directory (a previously downloaded copy).
        3. ``key`` is itself a well-formed URL.
        4. ``key`` is a title in ``CORPUS_URL`` whose URL is well-formed.

    Args:
        key (str): File path, URL, or title listed in ``CORPUS_URL``.

    Returns:
        str: Text of the corpus.

    Raises:
        ValueError: If ``key`` is neither an existing file, a well-formed
            URL, nor a title listed in ``CORPUS_URL``.
    """
    def validate_url(url):
        """Tell whether ``url`` has a scheme, a network location and a path."""
        if not url:
            return False
        parsed_url = urllib.parse.urlparse(url)
        return all([parsed_url.scheme, parsed_url.netloc, parsed_url.path])

    # Check if a filename was provided
    if os.path.isfile(key):
        return get_corpus_from_file(key)

    # None when the key is not a known title; every use below must guard
    # against that, otherwise a direct URL key can never be reached.
    known_url = CORPUS_URL.get(key)

    # Check if a local copy of a known title is available
    if known_url:
        file = os.path.basename(known_url)
        if os.path.isfile(file):
            return get_corpus_from_file(file)

    # Check if a URL was provided
    if validate_url(key):
        return get_corpus_from_url(key)
    if validate_url(known_url):
        return get_corpus_from_url(known_url)

    raise ValueError(
        'Cannot resolve corpus {!r}: not an existing file, a valid URL, '
        'or a known title'.format(key)
    )

### $\color{brown}{\rm Split~into~Parts~and~Chapters}$
CORE METHODS.

In [6]:
def get_gutenberg_start_tag(text):
    """Locate Gutenberg's START tag (and the producer line, if present).

    Args:
        text (str): Corpus to scan.

    Returns:
        re.Match or None: First match of the tag, including the whitespace
            around it; group 2 holds the optional 'Produced by' line.

    Notes:
        * re.search() is used rather than re.match() because re.match()
          only matches at the very beginning of the string, whereas the
          pattern starts with whitespace found somewhere inside the text.
    """
    start_tag_pattern = re.compile(
        r'\s+\*{3} START (\w+ )+\*{3}\s+(Produced by.+\s+)?'
    )
    return start_tag_pattern.search(text)


def get_gutenberg_end_tag(text):
    """Locate Gutenberg's END tag (and transcriber's notes, if present).

    Args:
        text (str): Corpus to scan.

    Returns:
        re.Match or None: First match covering the optional transcriber
            notes, the optional 'End ...' line and the END tag itself.

    Notes:
        * Texts may carry duplicate/similar end tags; the optional leading
          'End ...' line is folded into the same match.
        * The transcriber note must follow a newline so that similar (but
          indented) notes at the beginning of the text are not matched.
        * DOTALL lets the notes span several lines. Because of it, other
          parts of the pattern use an explicit negated character class
          instead of the dot operator.
    """
    end_tag_pattern = re.compile(
        r'((\s+\r?\nOriginal transcriber.*)?\s+End[^\r\n]+)?\s+\*{3} END (\w+ )+\*{3}\s+',
        flags=re.IGNORECASE | re.DOTALL,
    )
    return end_tag_pattern.search(text)


def get_gutenberg_part_labels(text):
    """Find every 'Part <number>' heading in the text.

    Args:
        text (str): Corpus to scan.

    Returns:
        list: One re.Match per heading, in order of appearance. Group 1 is
            the heading line, group 2 the part number.

    Notes:
        * The text is considered to start where the first part/chapter
          starts.
        * Numbers may be Arabic or Roman.
    """
    part_pattern = re.compile(
        r'\r?\n+(Part (\d+|[IVX]+)(--|\.)?.*)\s+',
        flags=re.IGNORECASE,
    )
    return [found for found in part_pattern.finditer(text)]


def get_gutenberg_chapter_labels(text):
    """Find every 'Chapter <number>' heading in the text.

    Args:
        text (str): Corpus to scan.

    Returns:
        list: One re.Match per heading, in order of appearance. Group 1 is
            the heading (tag and title), group 2 the chapter number.

    Notes:
        * Numbers may be Arabic or Roman.
        * Some texts put the chapter tag and its title on different lines;
          both layouts are matched.
    """
    chapter_pattern = re.compile(
        r'\r?\n+(Chapter (\d+|[IVX]+)(--|\.)?(.+|\s+.+))\s+',
        flags=re.IGNORECASE,
    )
    return [found for found in chapter_pattern.finditer(text)]

Utility methods.

In [7]:
def get_part(corpus, n, part_labels=None):
    """Get the character span of the n-th part of the corpus.

    The span begins right after the n-th part label. It ends where the next
    part label begins or, for the last part, where Gutenberg's end tag
    begins (the end of the corpus when no end tag is found).

    Args:
        corpus (str): Text of the corpus.

        n (int): Natural number of parts [1-N]

        part_labels (list): Part label matches as produced by
            get_gutenberg_part_labels(). Computed from ``corpus`` when
            omitted.

    Returns:
        tuple or None: ``(start, end)`` offsets into ``corpus``, or None
            when ``n`` is outside [1-N].
    """
    if part_labels is None:
        part_labels = get_gutenberg_part_labels(corpus)

    # Reject n <= 0 explicitly: negative indexing would otherwise silently
    # build a span out of unrelated labels.
    if not 1 <= n <= len(part_labels):
        return None

    begin = part_labels[n - 1].end()
    if n < len(part_labels):
        return (begin, part_labels[n].start())

    # Final part: runs up to the end tag, or to the end of the text when the
    # corpus carries no end tag.
    etag = get_gutenberg_end_tag(corpus)
    return (begin, etag.start() if etag is not None else len(corpus))


def get_parts(corpus, part_labels=None):
    """Lazily yield the span of every part, in order.

    Args:
        corpus (str): Text of the corpus.

        part_labels (list): Part label matches; computed from ``corpus``
            with get_gutenberg_part_labels() when omitted.

    Yields:
        The result of get_part() for part numbers 1 through N.
    """
    labels = part_labels
    if labels is None:
        labels = get_gutenberg_part_labels(corpus)

    total = len(labels)
    yield from (
        get_part(corpus, number, labels) for number in range(1, total + 1)
    )


def get_chapter(corpus, n, part_num=None, chapter_labels=None):
    """Get the character span of the n-th chapter of the corpus.

    The span begins right after the n-th chapter label. It ends where the
    next chapter label begins or, for the last chapter, where Gutenberg's
    end tag begins (the end of the corpus when no end tag is found).

    Args:
        corpus (str): Text of the corpus.

        n (int): Natural number of chapters [1-N]

        part_num (int): Natural number of parts [1-N]. Accepted for
            interface compatibility but not used: chapters are counted
            over the whole corpus.

        chapter_labels (list): Chapter label matches as produced by
            get_gutenberg_chapter_labels(). Computed from ``corpus`` when
            omitted.

    Returns:
        tuple or None: ``(start, end)`` offsets into ``corpus``, or None
            when ``n`` is outside [1-N].
    """
    if chapter_labels is None:
        chapter_labels = get_gutenberg_chapter_labels(corpus)

    # Reject n <= 0 explicitly: negative indexing would otherwise silently
    # build a span out of unrelated labels.
    if not 1 <= n <= len(chapter_labels):
        return None

    begin = chapter_labels[n - 1].end()
    if n < len(chapter_labels):
        return (begin, chapter_labels[n].start())

    # Final chapter: runs up to the end tag, or to the end of the text when
    # the corpus carries no end tag.
    etag = get_gutenberg_end_tag(corpus)
    return (begin, etag.start() if etag is not None else len(corpus))


def get_chapters(corpus, part_num=None, chapter_labels=None):
    """Lazily yield the span of every chapter, in order.

    Args:
        corpus (str): Text of the corpus.

        part_num (int): Forwarded unchanged to get_chapter().

        chapter_labels (list): Chapter label matches; computed from
            ``corpus`` with get_gutenberg_chapter_labels() when omitted.

    Yields:
        The result of get_chapter() for chapter numbers 1 through N.
    """
    if chapter_labels is None:
        chapter_labels = get_gutenberg_chapter_labels(corpus)

    for n in range(1, len(chapter_labels) + 1):
        # Pass by keyword: the third positional parameter of get_chapter()
        # is part_num, so passing the labels positionally would drop them
        # and force a fresh regex scan for every chapter.
        yield get_chapter(corpus, n, part_num=part_num,
                          chapter_labels=chapter_labels)

### $\color{brown}{\rm Preprocess~Corpus}$


In [40]:
def preprocess_text(key):
    """Load a corpus and print a diagnostic report of its structure.

    Printed, in order: every part label (heading and span), the number of
    chapter labels, every chapter label (heading and span), the start tag
    match, the end tag match, and two blank lines.

    Args:
        key (str): Identifier handed to get_corpus().

    Returns:
        The corpus returned by get_corpus().
    """
    def report(matches):
        """Print heading text and span of each label match."""
        for found in matches:
            print(found.group(1))
            print(found.span())

    text = get_corpus(key)

    # Part headings
    report(get_gutenberg_part_labels(text))

    # Chapter headings, preceded by their count
    chapters = get_gutenberg_chapter_labels(text)
    print(len(chapters))
    report(chapters)

    # Gutenberg boundary tags
    print(get_gutenberg_start_tag(text))
    print(get_gutenberg_end_tag(text))

    # Two blank separator lines
    for _unused in range(2):
        print()

    return text

In [None]:
# Print the structure report of every title listed in CORPUS_URL. The corpus
# returned by preprocess_text() is bound to `_` and otherwise unused.
for title in CORPUS_URL:
    _ = preprocess_text(title)
# Single-title variant:
#preprocess_text('The Valley of Fear')

In [None]:
# Load one novel and print the span yielded by get_parts() for each part.
corpus = get_corpus('The Valley of Fear')
for span in get_parts(corpus):
    print(span)
print()

# Same for each chapter, as yielded by get_chapters().
for span in get_chapters(corpus):
    print(span)

(771, 153689)
(153721, 319492)
(801, 17939)
(17986, 33442)
(33487, 52437)
(52466, 73646)
(73690, 96090)
(96126, 121939)
(121972, 153719)
(153747, 170689)
(170724, 200887)
(200927, 231816)
(231855, 251720)
(251757, 277403)
(277430, 295748)
(295798, 319492)
