# Metadata

```yaml
Course:    DS 5001 
Module:    Final Project
Author:    Chris Longchamp
Date:      2 May 2023
```


### Preprocessing Data

In [1]:
# Project locations. The helper-library folder and the source-text folder
# both live under the course data root, so they are derived from it.
data_home = "../DS5001"
local_lib = f"{data_home}/lib"
source_files = f"{data_home}/NLPProjectGutenberg/final-set"
# Label for this data set; presumably a prefix for output file names -- TODO confirm
data_prefix = "final"

In [2]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import sys
sys.path.append(local_lib)
from textparser import TextParser

In [3]:
# Start/end marker patterns used to clip each source text to its body
clip_pats = [
    r"\*\*\*\s*START OF",
    r"\*\*\*\s*END OF"
]

# Chapter-heading regex for each book, keyed by book id.
# All of them are applied as 'chap'-level milestone ('m') patterns.
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
ohco_pat_list = [
    # The alternation is grouped and has no literal spaces around `|`, so both
    # branches are anchored at line start and neither needs a stray space.
    (805,   rf"^\s*(?:CHAPTER\s\d+\.|INTERLUDE)"),
    (4368,  rf"^\s*CHAPTER\s+{roman}$"),
    (64317,  rf"^\s*{roman}$"),
    (6695, rf"^\s*CHAPTER\s+{roman}"),
    (68229, rf"^[A-Z\s]+$"),
    (144, rf"^\s*CHAPTER\s+{roman}"),
    (1245, rf"^\s*CHAPTER\s+{roman}"),
    (5670, rf"^\s*CHAPTER\s"),
    (29220, rf"^[A-Z\s]+$"),
    (61085, rf"^\s*chapter\s*\d+\s*"),
    (63022, rf"^\s*Chapter\s\d+"),
    # NOTE(review): this is a character class, not a literal title -- it matches
    # any line made up only of these letters and whitespace. Verify against the
    # source text before tightening it to the literal heading.
    (63107, rf"^[MRS DALLOWAY IN BOND STREET\s]+$"),
    (64457, rf"^_([A-Za-z\s]+)_+$"),
    (67138, rf"^\s*CHAPTER\n\s*\d+$"),
    (69683, rf"^[A-Z\s]+$")
]

In [4]:
# Gather every file that has an extension in the source folder, in sorted order
source_file_list = glob(f"{source_files}/*.*")
source_file_list.sort()

In [5]:
# Build (book_id, path, raw title) records from file names shaped like
# AUTHOR_NAME_TITLE_WORDS-pg<ID>.txt
book_data = []
for source_file_path in source_file_list:
    # Split on both '/' and '\' so the file name is isolated on any OS;
    # otherwise the '-' in the 'final-set' directory name truncates the title.
    file_name = re.split(r'[\\/]', source_file_path)[-1]
    book_id = int(file_name.split('-')[-1].split('.')[0].replace('pg',''))
    book_title = file_name.split('-')[0].replace('_', ' ')
    book_data.append((book_id, source_file_path, book_title))

In [6]:
# Library table: one row per book, keyed and ordered by book_id
LIB = (
    pd.DataFrame(book_data, columns=['book_id','source_file_path','raw_title'])
    .set_index('book_id')
    .sort_index()
)

In [7]:
# Sanity check: (number of books, number of columns) in the library table
LIB.shape

(15, 2)

In [8]:
# Split the raw title into author (first two words, comma-joined) and title
# (the remaining words). Guarding on the column keeps the cell re-runnable
# after raw_title has been dropped, without hiding unrelated errors the way
# a blanket `except AttributeError: pass` would.
if 'raw_title' in LIB.columns:
    LIB['author'] = LIB.raw_title.apply(lambda x: ', '.join(x.split()[:2]))
    LIB['title'] = LIB.raw_title.apply(lambda x: ' '.join(x.split()[2:]))
    LIB = LIB.drop('raw_title', axis=1)

In [9]:
# Attach each book's chapter regex by looking up its book_id in ohco_pat_list
LIB['chap_regex'] = LIB.index.map(pd.Series(dict(ohco_pat_list)))

In [10]:
# Peek at the first library row to confirm author, title and chap_regex were added
LIB.iloc[[0]]

Unnamed: 0_level_0,source_file_path,author,title,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
144,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",THE VOYAGE OUT,^\s*CHAPTER\s+[IVXLCM]+


In [11]:
def tokenize_collection(LIB, clip_pats=None):
    """Tokenize every book listed in LIB and stack the results into one table.

    Parameters
    ----------
    LIB : pandas.DataFrame
        Library table indexed by book_id, with `title`, `chap_regex` and
        `source_file_path` columns.
    clip_pats : list of str, optional
        Regexes marking the start and end of the text body to keep. Defaults
        to the "*** START OF" / "*** END OF" marker patterns.

    Returns
    -------
    pandas.DataFrame
        The TOKENS tables of all books concatenated, indexed by book_id
        followed by the parser's OHCO levels, and sorted by that index.

    Raises
    ------
    ValueError
        If LIB has no rows (there is nothing to tokenize).
    """
    if clip_pats is None:
        clip_pats = [
            r"\*\*\*\s*START OF",
            r"\*\*\*\s*END OF"
        ]

    # Fail early with a clear message instead of the opaque error pd.concat
    # raises when handed an empty list.
    if len(LIB.index) == 0:
        raise ValueError("LIB is empty: no books to tokenize")

    books = []
    for book_id in LIB.index:

        # Look the row up once rather than once per field
        book = LIB.loc[book_id]

        # Announce
        print("Tokenizing", book_id, book['title'])

        # Each book has a single chapter-level milestone pattern
        ohco_pats = [('chap', book['chap_regex'], 'm')]
        src_file_path = book['source_file_path']

        # Create object
        text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens()

        # Prefix the parser's OHCO index with the book id so that rows from
        # different books stay distinguishable after concatenation
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)

    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    print("Done")

    return CORPUS

In [12]:
# NOTE(review): only the book at positional row 5 of LIB is tokenized here;
# pass the full LIB to build the whole corpus.
CORPUS = tokenize_collection(LIB.iloc[[5]])

Tokenizing 6695 TALES OF THE JAZZ AGE
Importing  ../DS5001/NLPProjectGutenberg/final-set\F.SCOTT_FITZGERALD_TALES_OF_THE_JAZZ_AGE-pg6695.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^^\s*CHAPTER\s+[IVXLCM]+
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Done


In [13]:
# Inspect the resulting token table (indexed by book_id and the OHCO levels)
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6695,1,1,0,0,"(Jim, NNP)",NNP,Jim,jim
6695,1,1,0,1,"(Powell, NNP)",NNP,Powell,powell
6695,1,1,0,2,"(was, VBD)",VBD,was,was
6695,1,1,0,3,"(a, DT)",DT,a,a
6695,1,1,0,4,"(Jelly, NNP)",NNP,Jelly,jelly
6695,...,...,...,...,...,...,...,...
6695,62,24,0,12,"(they, PRP)",PRP,they,they
6695,62,24,0,13,"(made, VBD)",VBD,made,made
6695,62,24,0,14,"(were, VBD)",VBD,were,were
6695,62,24,0,15,"(as, IN)",IN,as,as
