<a href="https://colab.research.google.com/github/edponce/DoyleInvestigators/blob/master/Project1_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import urllib.request
import urllib.parse

### $\color{brown}{\rm Corpus~Selection}$

In [2]:
# Maps a corpus title (the lookup key accepted by get_corpus) to the URL of
# its Project Gutenberg plain-text file.
# NOTE(review): the key 'A Study of Scarlet' presumably refers to the novel
# "A Study in Scarlet"; it is left unchanged because titles are used as exact
# lookup keys -- confirm no caller depends on it before renaming.
CORPUS_URL = {
    'The Valley of Fear': "http://www.gutenberg.org/files/3289/3289.txt",
    'A Study of Scarlet': "http://www.gutenberg.org/files/244/244.txt",
    'The Sign of the Four': "http://www.gutenberg.org/files/2097/2097.txt",
    'The Hound of the Baskervilles': "http://www.gutenberg.org/files/2852/2852.txt",
}

### $\color{brown}{\rm Read~Web~Page~Content}$
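Read the corpus from a web page (or from a local copy of it) to start processing. The text is expected to be plain ASCII (no BOMs). Windows-style carriage returns ('\r') are not stripped from the downloaded text; instead, the regular expressions below accept an optional '\r' before each newline.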

In [3]:
def get_corpus_from_url(url):
    """Fetch a corpus from a URL and return it as text.

    The response body is decoded strictly as ASCII, so a non-ASCII byte
    raises UnicodeDecodeError. Line endings are returned exactly as they
    were served; carriage returns are not removed.

    Args:
        url (str): Address of the plain-text resource.

    Returns:
        str: The decoded response body.
    """
    with urllib.request.urlopen(url) as response:
        raw_bytes = response.read()
    return raw_bytes.decode('ascii')


def get_corpus_from_file(file):
    """Read a corpus from a local file and return it as text.

    The file is read the same way get_corpus_from_url() treats downloaded
    data: decoded as ASCII and with line endings left untouched. Without
    this, a locally cached copy would be decoded with the platform's default
    encoding and have its '\\r\\n' sequences translated to '\\n', so character
    offsets computed on the text would depend on where the corpus came from.

    Args:
        file (str): Path of the text file.

    Returns:
        str: File content with original line endings preserved.

    Raises:
        UnicodeDecodeError: If the file contains non-ASCII bytes.
    """
    # newline='' turns off universal-newline translation on read.
    with open(file, encoding='ascii', newline='') as fd:
        return fd.read()


def get_corpus(key):
    """Load a corpus identified by a file path, a URL, or a CORPUS_URL title.

    Sources are tried in this order:
        1. `key` is the path of an existing local file.
        2. `key` is a title in CORPUS_URL and a file named like the last
           path component of its URL exists in the current directory.
        3. `key` is itself a URL.
        4. `key` is a title in CORPUS_URL; its URL is downloaded.

    Args:
        key (str): File path, URL, or title from CORPUS_URL.

    Returns:
        str: The corpus text.

    Raises:
        ValueError: If `key` matches none of the sources above.
    """
    def validate_url(url):
        # A title missing from CORPUS_URL reaches this point as None.
        if not url:
            return False
        parsed_url = urllib.parse.urlparse(url)
        return all([parsed_url.scheme, parsed_url.netloc, parsed_url.path])

    # Check if a filename was provided
    if os.path.isfile(key):
        return get_corpus_from_file(key)

    # Only look for a local copy when the title is known; calling
    # os.path.basename() on a missing entry (None) raises TypeError and would
    # prevent a directly supplied URL from ever being downloaded.
    known_url = CORPUS_URL.get(key)
    if known_url is not None:
        local_copy = os.path.basename(known_url)
        if os.path.isfile(local_copy):
            return get_corpus_from_file(local_copy)

    # Check if a URL was provided, then fall back to the registered URL
    for url in (key, known_url):
        if validate_url(url):
            return get_corpus_from_url(url)

    raise ValueError(
        'Cannot resolve corpus {!r}: not a file, a URL, or a known title'
        .format(key)
    )

### $\color{brown}{\rm Split~into~Parts~and~Chapters}$
Simple regular expressions locate the Gutenberg start/end tags and the part and chapter headings.

In [4]:
def get_gutenberg_start_tag(text):
    """Locate Gutenberg's START marker, plus the producer line if present.

    The match covers the whitespace before the '*** START ... ***' line, the
    line itself, the whitespace after it and, when it follows directly, a
    'Produced by ...' line with its trailing whitespace.

    Notes:
        * re.search() is used instead of re.match() because the marker is
          preceded by header text and leading whitespace, so it does not
          sit at the very beginning of the input.

    Args:
        text (str): Full corpus text.

    Returns:
        re.Match or None: Match object for the marker, or None if absent.
    """
    start_tag = re.compile(
        r'\s+\*{3} START (\w+ )+\*{3}\s+(Produced by.+\s+)?'
    )
    return start_tag.search(text)


def get_gutenberg_end_tag(text):
    """Locate Gutenberg's END marker, plus trailing notes if present.

    The match always covers the '*** END ... ***' line with its surrounding
    whitespace. When they precede it, an 'End ...' line and an
    'Original transcriber ...' section are included in the match as well.

    Notes:
        * Gutenberg texts carry two similar end markers (the 'End of ...'
          line and the starred END line); both are treated as one tag.
        * The transcriber note must start on its own line, which avoids
          matching similar (but indented) notes near the top of the text.
        * DOTALL lets the transcriber section span several lines. Any part
          of the pattern that must stay on one line therefore uses an
          explicit "anything but a line break" class instead of a dot.
        * Matching is case-insensitive.

    Args:
        text (str): Full corpus text.

    Returns:
        re.Match or None: Match object for the marker, or None if absent.
    """
    end_tag = re.compile(
        r'((\s+\r?\nOriginal transcriber.*)?\s+End[^\r\n]+)?\s+\*{3} END (\w+ )+\*{3}\s+',
        flags=re.I|re.DOTALL,
    )
    return end_tag.search(text)


def get_gutenberg_part_labels(text):
    """Find every 'Part N' heading in the text.

    Each match starts at the line break(s) before the heading and ends after
    the whitespace that follows it. Group 1 holds the heading line and
    group 2 the part number.

    Notes:
        * We consider the start of the text when the first part/chapter
          starts.
        * Numbers may be Arabic or Roman.
        * Matching is case-insensitive.

    Args:
        text (str): Full corpus text.

    Returns:
        list: Match objects, in order of appearance.
    """
    part_heading = re.compile(
        r'\r?\n+(Part (\d+|[IVX]+)(--|\.)?.*)\s+',
        flags=re.I,
    )
    return [label for label in part_heading.finditer(text)]


def get_gutenberg_chapter_labels(text):
    """Find every 'Chapter N' heading in the text.

    Each match starts at the line break(s) before the heading and ends after
    the whitespace that follows it. Group 1 holds the heading (including
    its title) and group 2 the chapter number.

    Notes:
        * Numbers may be Arabic or Roman.
        * Some texts put the chapter tag and its title on different lines;
          in that case the title line is part of the heading too.
        * Matching is case-insensitive.

    Args:
        text (str): Full corpus text.

    Returns:
        list: Match objects, in order of appearance.
    """
    chapter_heading = re.compile(
        r'\r?\n+(Chapter (\d+|[IVX]+)(--|\.)?(.+|\s+.+))\s+',
        flags=re.I,
    )
    return [label for label in chapter_heading.finditer(text)]

In [6]:
def get_part(corpus, n, part_labels=None):
    """Get span of a part's text.

    The span starts right after the n-th part label and ends where the next
    part label starts. For the last part it ends where Gutenberg's end tag
    starts, or at the end of the corpus when no end tag is found.

    Args:
        corpus (str): Full corpus text.

        n (int): Natural number of parts [1-N]

        part_labels (list): Match objects of the part labels. Computed from
            the corpus when omitted.

    Returns:
        tuple or None: (start, end) offsets into the corpus, or None when
        `n` is outside [1-N].
    """
    if part_labels is None:
        part_labels = get_gutenberg_part_labels(corpus)

    # Reject n <= 0 explicitly: negative indexing would otherwise wrap
    # around and silently return a span built from the wrong labels.
    if not 1 <= n <= len(part_labels):
        return None

    start = part_labels[n-1].end()
    if n < len(part_labels):
        return (start, part_labels[n].start())

    # Last part: runs up to the end tag, if the text has one.
    etag = get_gutenberg_end_tag(corpus)
    end = etag.start() if etag is not None else len(corpus)
    return (start, end)


def get_parts(corpus, part_labels=None):
    """Yield the span of every part, in order.

    Args:
        corpus (str): Full corpus text.

        part_labels (list): Match objects of the part labels. Computed from
            the corpus when omitted.

    Yields:
        The result of get_part() for part numbers 1 through the number of
        part labels.
    """
    labels = part_labels
    if labels is None:
        labels = get_gutenberg_part_labels(corpus)

    yield from (
        get_part(corpus, number, labels)
        for number in range(1, len(labels) + 1)
    )


def get_chapter(corpus, n, part_num=None, chapter_labels=None):
    """Get span of a chapter's text.

    The span starts right after the n-th chapter label and ends where the
    next chapter label starts. For the last chapter it ends where
    Gutenberg's end tag starts, or at the end of the corpus when no end tag
    is found.

    Args:
        corpus (str): Full corpus text.

        n (int): Natural number of chapters [1-N], counted over all the
            chapter labels in use.

        part_num (int): Natural number of parts [1-N]. Accepted for
            interface compatibility; it is currently not used to restrict
            the chapters to one part.

        chapter_labels (list): Match objects of the chapter labels. Computed
            from the corpus when omitted.

    Returns:
        tuple or None: (start, end) offsets into the corpus, or None when
        `n` is outside [1-N].
    """
    if chapter_labels is None:
        chapter_labels = get_gutenberg_chapter_labels(corpus)

    # Reject n <= 0 explicitly: negative indexing would otherwise wrap
    # around and silently return a span built from the wrong labels.
    if not 1 <= n <= len(chapter_labels):
        return None

    start = chapter_labels[n-1].end()
    if n < len(chapter_labels):
        return (start, chapter_labels[n].start())

    # Last chapter: runs up to the end tag, if the text has one.
    etag = get_gutenberg_end_tag(corpus)
    end = etag.start() if etag is not None else len(corpus)
    return (start, end)


def get_chapters(corpus, part_num=None, chapter_labels=None):
    """Yield the span of every chapter, in order.

    Args:
        corpus (str): Full corpus text.

        part_num (int): Natural number of parts [1-N]. Forwarded unchanged
            to get_chapter().

        chapter_labels (list): Match objects of the chapter labels. Computed
            from the corpus when omitted.

    Yields:
        The result of get_chapter() for chapter numbers 1 through the number
        of chapter labels.
    """
    if chapter_labels is None:
        chapter_labels = get_gutenberg_chapter_labels(corpus)

    for n in range(1, len(chapter_labels) + 1):
        # Pass by keyword: positionally, the labels would land in the
        # `part_num` slot and get_chapter() would ignore them and search
        # the corpus again for every chapter.
        yield get_chapter(corpus, n, part_num=part_num,
                          chapter_labels=chapter_labels)

In [10]:
def preprocess_text(key):
    """Load a corpus and print a summary of its structure.

    Printed, in this order: the number of part labels followed by each
    label's text and span, the same for the chapter labels, the match for
    Gutenberg's start tag, and the match for Gutenberg's end tag.

    Args:
        key (str): Corpus identifier handed to get_corpus().

    Returns:
        The corpus returned by get_corpus().
    """
    def report_labels(labels):
        # Count first, then the heading text and (start, end) of each label.
        print(len(labels))
        for match in labels:
            print(match.group(1))
            print(match.span())

    corpus = get_corpus(key)

    report_labels(get_gutenberg_part_labels(corpus))
    report_labels(get_gutenberg_chapter_labels(corpus))

    print(get_gutenberg_start_tag(corpus))
    print(get_gutenberg_end_tag(corpus))

    return corpus

In [11]:
# Print the part/chapter labels and the Gutenberg start/end tags for every
# title in CORPUS_URL. The returned corpus text is not needed here, so it is
# bound to the throwaway name `_`.
for title in CORPUS_URL:
    _ = preprocess_text(title)

2
Part 1--The Tragedy of Birlstone
(727, 771)
Part 2--The Scowrers
(153689, 153721)
14
(769, 801)
Chapter 2--Sherlock Holmes Discourses
(17939, 17986)
Chapter 3--The Tragedy of Birlstone
(33442, 33487)
Chapter 4--Darkness
(52437, 52466)
Chapter 5--The People of the Drama
(73646, 73690)
Chapter 6--A Dawning Light
(96090, 96126)
Chapter 7--The Solution
(121939, 121972)
Chapter 1--The Man
(153719, 153747)
Chapter 2--The Bodymaster
(170689, 170724)
Chapter 3--Lodge 341, Vermissa
(200887, 200927)
Chapter 4--The Valley of Fear
(231816, 231855)
Chapter 5--The Darkest Hour
(251720, 251757)
Chapter 6--Danger
(277403, 277430)
Chapter 7--The Trapping of Birdy Edwards
(295748, 295798)
<_sre.SRE_Match object; span=(557, 672), match='\r\n\r\n*** START OF THIS PROJECT GUTENBERG EBOOK>
<_sre.SRE_Match object; span=(319492, 319646), match="\r\n\r\n\r\n\r\n\r\n\r\nEnd of Project Gutenberg'>
2
PART I.
(1843, 1856)
PART II. _The Country of the Saints._
(125168, 125217)
14
CHAPTER I. MR. SHE

In [7]:
# Load one novel and print the spans produced by get_parts() and
# get_chapters(), one per line, with an empty line between the two listings.
corpus = get_corpus('The Valley of Fear')
for span in get_parts(corpus):
    print(span)
print()

# No part number is given, so every chapter label found in the corpus is used.
for span in get_chapters(corpus):
    print(span)

(771, 153689)
(153721, 319492)
(801, 17939)
(17986, 33442)
(33487, 52437)
(52466, 73646)
(73690, 96090)
(96126, 121939)
(121972, 153719)
(153747, 170689)
(170724, 200887)
(200927, 231816)
(231855, 251720)
(251757, 277403)
(277430, 295748)
(295798, 319492)


In [None]:
# Structure description:
#
# dict_sections = {
#     "part_1": {
#         "part_title": "The Tragedy of Birlstone",
#         "chapter_titles": ["The Warning", "Sherlock Holmes Discourses", "The Tragedy of Birlstone", ...],
#         "chapters": ["I believe that I am one of th...", "It was one of those dramati...", ...]
#     },
#
#     "part_2": {
#         "part_title": "The Scowrers",
#         "chapter_titles": ["The Man", "The Bodymaster", "Lodge 341, Vermissa", ...],
#         "chapters": ["...", "...", ...]
#     },
#
#     ...,
#
#     "part_n": {
#         "part_title": "xyz",
#         "chapter_titles": ["...", "...", ...],
#         "chapters": ["...", "...", ...]
#     }
# }

# Load the novel explicitly so that this cell runs on a fresh kernel instead
# of relying on a `main_text` variable that no cell in the notebook defines.
main_text = get_corpus('The Valley of Fear')

# Dictionary containing the main parts of the novel; every part contains a
# list of chapters.
dict_sections = {}


# Get the trimmed text of the novel
# --------------------------
# flags=re.I -> ignore case.
# Raw strings keep '\d' from being treated as an (invalid) string escape, and
# '\d+' also accepts part numbers with more than one digit.
start_match = re.search(r'Part \d+--', main_text, flags=re.I)
end_match = re.search(r'End of Project Gutenberg', main_text, flags=re.I)
if start_match is None or end_match is None:
    raise ValueError("Could not find the first 'Part N--' heading or the "
                     "'End of Project Gutenberg' line in the text")
start_novel_position = start_match.start()
end_novel_position = end_match.start()

# Trim text
main_text_trimmed = main_text[start_novel_position:end_novel_position].strip()
# --------------------------


# Get the parts of the novel
# --------------------------
list_text_parts = re.split(r'Part \d+--', main_text_trimmed, flags=re.I)

# The first item on the list is empty; this line gets rid of the empty item
list_text_parts = [part for part in list_text_parts if part]
# --------------------------


# Get the chapters
# --------------------------
# Iterate over the 'Parts' to get chapters
for part_index, part in enumerate(list_text_parts):

    # Get 'Part' title and 'Part' text. str.partition() also copes with a
    # part that has no line break (everything is then the title). The title
    # is stripped so it does not keep the '\r' of a Windows line ending.
    part_title, _sep, part_text = part.partition('\n')
    part_title = part_title.strip()
    part_text = part_text.strip()

    # Get chapters list
    list_chapters = re.split(r'Chapter \d+--', part_text, flags=re.I)
    list_chapters = [chapter for chapter in list_chapters if chapter]

    # Iterate over every chapter
    title_list = []
    chapter_list = []
    for chapter in list_chapters:
        # Get 'Chapter' title and 'Chapter' text
        chapter_title, _sep, chapter_text = chapter.partition('\n')
        chapter_title = chapter_title.strip()
        chapter_text = chapter_text.strip()

        # Add to lists
        title_list.append(chapter_title)
        chapter_list.append(chapter_text)

    # Create a dictionary for this part
    part_dictionary = {"part_title":part_title,
                       "chapter_titles":title_list,
                       "chapters":chapter_list}

    # Add to the main dictionary
    dict_sections["part_"+str(part_index+1)] = part_dictionary
# --------------------------


print(dict_sections["part_2"]["part_title"])

The Scowrers
