# Metadata

```yaml
Course:    DS 5001 
Module:    Final Project
Author:    Chris Longchamp
Date:      2 May 2023
```


### Preprocessing Data

In [1]:
data_home = "../DS5001"
local_lib = "../DS5001/lib"
source_files = f'{data_home}/final-set'
data_prefix = 'final'

In [2]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import sys
sys.path.append(local_lib)
from textparser import TextParser

In [3]:
clip_pats = [
    r"\*\*\*\s*START OF",
    r"\*\*\*\s*END OF"
]

# All are 'chap'and 'm'
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
ohco_pat_list = [
    (805,   rf"^\s*CHAPTER\d+\."),
    (4368,  rf"^\s+{roman}\s*"),
    (64317,  rf"^\s*Chapter\s+{roman}\."),
    (6695, rf"^[A-Z\s]+$"),
    (68229, rf"^[A-Z\s]+$"),
    (144, rf"^\s*Chapter\s+{roman}"),
    (1245, rf"^\s*Chapter\s+{roman}"),
    (5670, rf"^[A-Z\s]+$"),
    (29220, rf"^[A-Z\s]+$"),
    (61085, rf"^\s*chapter\d+"),
    (63022, rf"^[MR. BENNETT AND MRS. BROWN]\[\d]"),
    (63107, rf"^[MRS DALLOWAY IN BOND STREET\s]+$"),
    (64457, rf"^__[A-Z]__+$"),
    (67138, rf"^\s*CHAPTER\n\d+"),
    (69683, rf"^[A-Z\s]+$")
]

In [4]:
source_file_list = sorted(glob(f"{source_files}/*.*"))

In [5]:
book_data = []
for source_file_path in source_file_list:
    book_id = int(source_file_path.split('-')[-1].split('.')[0].replace('pg',''))
    book_title = source_file_path.split('\\')[-1].split('-')[0].replace('_', ' ')
    book_data.append((book_id, source_file_path, book_title))

In [6]:
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','raw_title'])\
    .set_index('book_id').sort_index()

In [7]:
LIB

Unnamed: 0_level_0,source_file_path,raw_title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
144,../DS5001/final-set\VIRGINIA_WOOLF_THE_VOYAGE_...,VIRGINIA WOOLF THE VOYAGE OUT
805,../DS5001/final-set\F.SCOTT_FITZGERALD_THIS_SI...,F.SCOTT FITZGERALD THIS SIDE OF PARADISE
1245,../DS5001/final-set\VIRGINIA_WOOLF_NIGHT_AND_D...,VIRGINIA WOOLF NIGHT AND DAY
4368,../DS5001/final-set\F.SCOTT_FITZGERALD_FLAPPER...,F.SCOTT FITZGERALD FLAPPERS AND PHILOSOPHERS
5670,../DS5001/final-set\VIRGINIA_WOOLF_JACOBS_ROOM...,VIRGINIA WOOLF JACOBS ROOM
6695,../DS5001/final-set\F.SCOTT_FITZGERALD_TALES_O...,F.SCOTT FITZGERALD TALES OF THE JAZZ AGE
29220,../DS5001/final-set\VIRGINIA_WOOLF_MONDAY_OR_T...,VIRGINIA WOOLF MONDAY OR TUESDAY
61085,../DS5001/final-set\ERNEST_HEMINGWAY_IN_OUR_TI...,ERNEST HEMINGWAY IN OUR TIME
63022,../DS5001/final-set\VIRGINIA_WOOLF_MR_BENNETT_...,VIRGINIA WOOLF MR BENNETT AND MRS BROWN
63107,../DS5001/final-set\VIRGINIA_WOOLF_MRS_DALLOWA...,VIRGINIA WOOLF MRS DALLOWAY IN BOND STREET


In [8]:
try:
    LIB['author'] = LIB.raw_title.apply(lambda x: ', '.join(x.split()[:2]))
    LIB['title'] = LIB.raw_title.apply(lambda x: ' '.join(x.split()[2:]))
    LIB = LIB.drop('raw_title', axis=1)
except AttributeError:
    pass

In [9]:
LIB['chap_regex'] = LIB.index.map(pd.Series({x[0]:x[1] for x in ohco_pat_list}))

In [10]:
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
144,../DS5001/final-set\VIRGINIA_WOOLF_THE_VOYAGE_...,"VIRGINIA, WOOLF",THE VOYAGE OUT,^\s*Chapter\s+[IVXLCM]+
805,../DS5001/final-set\F.SCOTT_FITZGERALD_THIS_SI...,"F.SCOTT, FITZGERALD",THIS SIDE OF PARADISE,^\s*CHAPTER\d+\.
1245,../DS5001/final-set\VIRGINIA_WOOLF_NIGHT_AND_D...,"VIRGINIA, WOOLF",NIGHT AND DAY,^\s*Chapter\s+[IVXLCM]+
4368,../DS5001/final-set\F.SCOTT_FITZGERALD_FLAPPER...,"F.SCOTT, FITZGERALD",FLAPPERS AND PHILOSOPHERS,^\s+[IVXLCM]+\s*
5670,../DS5001/final-set\VIRGINIA_WOOLF_JACOBS_ROOM...,"VIRGINIA, WOOLF",JACOBS ROOM,^[A-Z\s]+$
6695,../DS5001/final-set\F.SCOTT_FITZGERALD_TALES_O...,"F.SCOTT, FITZGERALD",TALES OF THE JAZZ AGE,^[A-Z\s]+$
29220,../DS5001/final-set\VIRGINIA_WOOLF_MONDAY_OR_T...,"VIRGINIA, WOOLF",MONDAY OR TUESDAY,^[A-Z\s]+$
61085,../DS5001/final-set\ERNEST_HEMINGWAY_IN_OUR_TI...,"ERNEST, HEMINGWAY",IN OUR TIME,^\s*chapter\d+
63022,../DS5001/final-set\VIRGINIA_WOOLF_MR_BENNETT_...,"VIRGINIA, WOOLF",MR BENNETT AND MRS BROWN,^[MR. BENNETT AND MRS. BROWN]\[\d]
63107,../DS5001/final-set\VIRGINIA_WOOLF_MRS_DALLOWA...,"VIRGINIA, WOOLF",MRS DALLOWAY IN BOND STREET,^[MRS DALLOWAY IN BOND STREET\s]+$


In [11]:
def tokenize_collection(LIB):

    clip_pats = [
        r"\*\*\*\s*START OF",
        r"\*\*\*\s*END OF"
    ]

    books = []
    for book_id in LIB.index:

        # Announce
        print("Tokenizing", book_id, LIB.loc[book_id].title)

        # Define vars
        chap_regex = LIB.loc[book_id].chap_regex
        ohco_pats = [('chap', chap_regex, 'm')]
        src_file_path = LIB.loc[book_id].source_file_path

        # Create object
        text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens();

        # Name things
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)
        
    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    # Clean up
    del(books)
    del(text)
        
    print("Done")
        
    return CORPUS

In [12]:
CORPUS = tokenize_collection(LIB)

Tokenizing 144 THE VOYAGE OUT
Importing  ../DS5001/final-set\VIRGINIA_WOOLF_THE_VOYAGE_OUT-pg144.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*Chapter\s+[IVXLCM]+
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model


AttributeError: 'Series' object has no attribute 'stack'