# Metadata

```yaml
Course:    DS 5001 
Module:    Final Project
Author:    Chris Longchamp
Date:      2 May 2023
```


## Preprocessing Data

In [1]:
# Root of the course directory; the other paths below are derived from it.
data_home = "../DS5001"

# Folder containing the course helper modules (appended to sys.path before
# importing TextParser).
local_lib = f"{data_home}/lib"

# Folder containing the Project Gutenberg text files used in this project.
source_files = data_home + "/NLPProjectGutenberg/final-set"

# Short label for this project's data.
# NOTE(review): not referenced in the cells shown here; presumably used to
# name output files later in the notebook -- confirm.
data_prefix = 'final'

### Importing Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import sys
sys.path.append(local_lib)
from textparser import TextParser

### Setting Chapter Patterns

In [3]:
# Regexes for the Project Gutenberg "*** START OF" / "*** END OF" banner lines.
clip_pats = [r"\*\*\*\s*START OF", r"\*\*\*\s*END OF"]

# Reusable regex fragments.
roman = '[IVXLCM]+'    # one or more Roman-numeral letters
caps = "[A-Z';, -]+"   # upper-case heading characters (not used by the patterns below)

# "CHAPTER <roman numeral>" at the start of a line; shared by several books.
_chapter_roman = r"^\s*CHAPTER\s+" + roman

# One (book_id, chapter-heading regex) pair per book. Each regex is later
# wrapped as ('chap', regex, 'm') inside tokenize_collection.
# NOTE(review): unless the parser compiles these with re.VERBOSE, the spaces
#   around "|" in the 805 pattern are literal, so the alternatives are
#   "CHAPTER n. " (trailing space) and " INTERLUDE" (leading space) -- confirm
#   this is intended.
# NOTE(review): the 63107 pattern is a character class, not a literal title; it
#   matches any line made up only of those letters and whitespace -- confirm.
# NOTE(review): the 6695 pattern starts with a doubled "^"; the second anchor
#   is redundant.
ohco_pat_list = [
    (805,   r"^\s*CHAPTER\s\d+\. | INTERLUDE"),
    (4368,  _chapter_roman + "$"),
    (64317, r"^\s*" + roman + "$"),
    (6695,  "^" + _chapter_roman),
    (144,   _chapter_roman),
    (1245,  _chapter_roman),
    (5670,  r"^\s*CHAPTER\s"),
    (29220, r"^\b[A-Z\s]+\b$"),
    (61085, r"^\s*chapter\s*\d+\s*"),
    (63022, r"^\s*Chapter\s\d+"),
    (63107, r"^[MRS DALLOWAY IN BOND STREET\s]+$"),
    (67138, r"^\s*CHAPTER\s+\d+$"),
    (69683, r"^[A-Z\s]+$"),
]

In [4]:
# Gather every "<name>.<ext>" file in the source folder, in sorted order.
source_file_list = glob(f"{source_files}/*.*")
source_file_list.sort()

In [5]:
# Build one (book_id, source_file_path, raw_title) record per source file.
# File names look like "<AUTHOR>_<TITLE>-pg<ID>.txt".
book_data = []
for source_file_path in source_file_list:
    # Isolate the file name by splitting on either path separator. Splitting
    # on '\\' alone only works on Windows; with '/'-separated paths the
    # "title" would come out as the directory prefix instead.
    file_name = re.split(r'[\\/]', source_file_path)[-1]
    # "pg144.txt" -> "pg144" -> 144
    book_id = int(file_name.split('-')[-1].split('.')[0].replace('pg', ''))
    # "VIRGINIA_WOOLF_THE_VOYAGE_OUT" -> "VIRGINIA WOOLF THE VOYAGE OUT"
    book_title = file_name.split('-')[0].replace('_', ' ')
    book_data.append((book_id, source_file_path, book_title))

### Creating LIB Table

In [6]:
# Library table: one row per book, keyed by book_id and sorted by that key.
LIB = pd.DataFrame(book_data, columns=['book_id', 'source_file_path', 'raw_title'])
LIB = LIB.set_index('book_id').sort_index()

In [7]:
# Sanity check: (number of books, number of columns) in the library table.
LIB.shape

(13, 2)

In [8]:
# Split raw_title into author and title: the first two words become the
# author (joined with ", "), the remaining words become the title.
# The explicit column check keeps this cell safe to re-run after raw_title has
# been dropped, without silencing unrelated errors the way a blanket
# `except AttributeError: pass` would.
if 'raw_title' in LIB.columns:
    LIB['author'] = LIB['raw_title'].apply(lambda x: ', '.join(x.split()[:2]))
    LIB['title'] = LIB['raw_title'].apply(lambda x: ' '.join(x.split()[2:]))
    LIB = LIB.drop('raw_title', axis=1)

In [9]:
# Attach each book's chapter-heading regex by looking up its book_id.
chap_regex_by_book = pd.Series(dict(ohco_pat_list))
LIB['chap_regex'] = LIB.index.map(chap_regex_by_book)

In [10]:
# Display the library table to check the author, title and chap_regex columns.
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
144,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",THE VOYAGE OUT,^\s*CHAPTER\s+[IVXLCM]+
805,../DS5001/NLPProjectGutenberg/final-set\F.SCOT...,"F.SCOTT, FITZGERALD",THIS SIDE OF PARADISE,^\s*CHAPTER\s\d+\. | INTERLUDE
1245,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",NIGHT AND DAY,^\s*CHAPTER\s+[IVXLCM]+
4368,../DS5001/NLPProjectGutenberg/final-set\F.SCOT...,"F.SCOTT, FITZGERALD",FLAPPERS AND PHILOSOPHERS,^\s*CHAPTER\s+[IVXLCM]+$
5670,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",JACOBS ROOM,^\s*CHAPTER\s
6695,../DS5001/NLPProjectGutenberg/final-set\F.SCOT...,"F.SCOTT, FITZGERALD",TALES OF THE JAZZ AGE,^^\s*CHAPTER\s+[IVXLCM]+
29220,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",MONDAY OR TUESDAY,^\b[A-Z\s]+\b$
61085,../DS5001/NLPProjectGutenberg/final-set\ERNEST...,"ERNEST, HEMINGWAY",IN OUR TIME,^\s*chapter\s*\d+\s*
63022,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",MR BENNETT AND MRS BROWN,^\s*Chapter\s\d+
63107,../DS5001/NLPProjectGutenberg/final-set\VIRGIN...,"VIRGINIA, WOOLF",MRS DALLOWAY IN BOND STREET,^[MRS DALLOWAY IN BOND STREET\s]+$


In [11]:
def tokenize_collection(LIB, clip_pats=None):
    """Tokenize every book listed in LIB and return one combined token table.

    For each row of LIB a TextParser is created from the row's source file and
    chapter regex, the source is imported and parsed into tokens, and the
    resulting token table is re-indexed by ``book_id`` followed by the
    parser's OHCO levels. The per-book tables are then concatenated and
    sorted by index.

    Parameters
    ----------
    LIB : pandas.DataFrame
        Library table indexed by book_id, with the columns ``title``,
        ``chap_regex`` and ``source_file_path``.
    clip_pats : list of str, optional
        Regexes passed to TextParser as ``clip_pats`` (presumably the start
        and end markers of the text body -- confirm against TextParser).
        Defaults to the Project Gutenberg "*** START OF" / "*** END OF"
        patterns.

    Returns
    -------
    pandas.DataFrame
        Concatenation of every book's TOKENS table, sorted by index.

    Raises
    ------
    ValueError
        If LIB has no rows, since there would be nothing to concatenate.
    """
    if clip_pats is None:
        clip_pats = [
            r"\*\*\*\s*START OF",
            r"\*\*\*\s*END OF"
        ]

    # Fail early with a clear message instead of pandas' generic
    # "No objects to concatenate" error further down.
    if len(LIB.index) == 0:
        raise ValueError("LIB contains no books to tokenize")

    books = []
    for book_id in LIB.index:

        # Look the row up once rather than once per field
        book = LIB.loc[book_id]

        # Announce
        print("Tokenizing", book_id, book['title'])

        # Define vars: a single 'chap' level whose pattern type is 'm'
        ohco_pats = [('chap', book['chap_regex'], 'm')]

        # Create object
        text = TextParser(book['source_file_path'], ohco_pats=ohco_pats,
                          clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens()

        # Name things: prefix the parser's OHCO index with the book_id
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)

    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    print("Done")

    return CORPUS

### Tokenizing the Corpus

In [12]:
# Parse every book in LIB and combine the results into one token table.
CORPUS = tokenize_collection(LIB)

Tokenizing 144 THE VOYAGE OUT
Importing  ../DS5001/NLPProjectGutenberg/final-set\VIRGINIA_WOOLF_THE_VOYAGE_OUT-pg144.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*CHAPTER\s+[IVXLCM]+
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 805 THIS SIDE OF PARADISE
Importing  ../DS5001/NLPProjectGutenberg/final-set\F.SCOTT_FITZGERALD_THIS_SIDE_OF_PARADISE-pg805.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*CHAPTER\s\d+\. | INTERLUDE
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 1245 NIGHT AND DAY
Importing  ../DS5001/NLPProjectGutenberg/final-set\VIRGINIA_WOOLF_NIGHT_AND_DAY-pg1245.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*CHAPTER\

In [13]:
# Display the combined token table (pandas abbreviates long frames).
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
144,1,1,0,0,"(As, IN)",IN,As,as
144,1,1,0,1,"(the, DT)",DT,the,the
144,1,1,0,2,"(streets, NNS)",NNS,streets,streets
144,1,1,0,3,"(that, WDT)",WDT,that,that
144,1,1,0,4,"(lead, VBP)",VBP,lead,lead
...,...,...,...,...,...,...,...,...
69683,14,89,10,15,"(it, PRP)",PRP,it,it
69683,14,89,10,16,"(would, MD)",MD,would,would
69683,14,89,10,17,"(fix, VB)",VB,fix,fix
69683,14,89,10,18,"(up, RP)",RP,up,up
