# HW 04

Charlie Perez (cwp5xyj)

## Part 1: setting things up

In [15]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import plotly_express as px
import configparser

In [16]:
# Load machine-specific locations from the shared env.ini file.
env_file = "../../../env.ini"
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so stop here rather than with an opaque KeyError further down.
if not config.read(env_file):
    raise FileNotFoundError(f"Could not read config file: {env_file}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [17]:
# Corpus prefix and the directory (under data_home) that holds the source texts.
data_prefix = 'eliot'
source_files = data_home + '/eliot-set'

In [18]:
import os
import sys

# Put the directory holding the local helper modules on the import path.
# NOTE(review): this is an absolute, user-specific path — consider reading it from env.ini.
local_lib = '/home/cwp5xyj/Documents/MSDS/DS5001/repo-main/lessons/lib'
lib_dir = os.path.abspath(local_lib)
sys.path.append(lib_dir)

In [19]:
local_lib

'/home/cwp5xyj/Documents/MSDS/DS5001/repo-main/lessons/lib'

In [20]:
from textparser import TextParser

In [21]:
# Chapter-level OHCO milestone patterns, one per book.

roman = '[IVXLCM]+'    # one or more Roman-numeral characters
caps = "[A-Z';, -]+"   # capitals plus heading punctuation (not used by the patterns below)

# One chapter-heading regex per book; the number in each name is the book id
# that appears in the source file name.
chap_pat_507 = rf"^(?:Chapter\s+{roman}|Epilogue)\.?\s*$"
chap_pat_145 = r"^(?:PRELUDE|CHAPTER|FINALE)"
chap_pat_6688 = rf"^\s*Chapter\s+{roman}\.?\s*$"

# (book_id, chapter regex) pairs
ohco_pat_list = [
    (507, chap_pat_507),
    (145, chap_pat_145),
    (6688, chap_pat_6688),
]

In [22]:
# Every entry in the source directory whose name contains a dot, in sorted order.
source_file_list = glob(f"{source_files}/*.*")
source_file_list.sort()
source_file_list

['/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_ADAM_BEDE-pg507.txt',
 '/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_MIDDLEMARCH-pg145.txt',
 '/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_THE_MILL_ON_THE_FLOSS-pg6688.txt']

In [23]:
# Build (book_id, path, raw title) records from file names shaped like
# WORDS_OF_AUTHOR_AND_TITLE-pg<ID>.txt
book_data = []
for source_file_path in source_file_list:
    # Parse the bare file name so directory names and the platform's path
    # separator cannot interfere.
    file_name = os.path.basename(source_file_path)
    # Split on the LAST hyphen only, so a hyphenated title is not truncated.
    name_part, id_part = file_name.rsplit('-', 1)
    book_id = int(id_part.split('.')[0].replace('pg', ''))
    book_title = name_part.replace('_', ' ')
    book_data.append((book_id, source_file_path, book_title))

book_data

[(507,
  '/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_ADAM_BEDE-pg507.txt',
  'ELIOT GEORGE ADAM BEDE'),
 (145,
  '/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_MIDDLEMARCH-pg145.txt',
  'ELIOT GEORGE MIDDLEMARCH'),
 (6688,
  '/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_THE_MILL_ON_THE_FLOSS-pg6688.txt',
  'ELIOT GEORGE THE MILL ON THE FLOSS')]

In [24]:
# Library table: one row per book, keyed and ordered by book_id.
LIB = (
    pd.DataFrame(book_data, columns=['book_id', 'source_file_path', 'raw_title'])
    .set_index('book_id')
    .sort_index()
)
LIB

Unnamed: 0_level_0,source_file_path,raw_title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
145,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,ELIOT GEORGE MIDDLEMARCH
507,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,ELIOT GEORGE ADAM BEDE
6688,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,ELIOT GEORGE THE MILL ON THE FLOSS


In [25]:
# Split raw_title: the first two words become the author ("WORD1, WORD2"),
# the remaining words become the title.
# The explicit column check keeps the cell safe to re-run once raw_title has
# been dropped, without hiding unrelated errors the way a blanket except would.
if 'raw_title' in LIB.columns:
    title_words = LIB['raw_title'].str.split()
    LIB['author'] = title_words.str[:2].str.join(', ')
    LIB['title'] = title_words.str[2:].str.join(' ')
    LIB = LIB.drop(columns='raw_title')

In [26]:
LIB

Unnamed: 0_level_0,source_file_path,author,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
145,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",MIDDLEMARCH
507,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",ADAM BEDE
6688,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",THE MILL ON THE FLOSS


In [27]:
# Attach each book's chapter regex by looking up its book_id.
chap_regex_by_id = dict(ohco_pat_list)
LIB['chap_regex'] = LIB.index.map(chap_regex_by_id)
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
145,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",MIDDLEMARCH,^(?:PRELUDE|CHAPTER|FINALE)
507,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",ADAM BEDE,^(?:Chapter\s+[IVXLCM]+|Epilogue)\.?\s*$
6688,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",THE MILL ON THE FLOSS,^\s*Chapter\s+[IVXLCM]+\.?\s*$


In [31]:
def tokenize_collection(LIB, clip_overrides=None):
    """
    Tokenize every book listed in LIB and stack the results into one table.

    Parameters
    ----------
    LIB : pandas.DataFrame
        Library table indexed by book_id. Each row must provide `title`,
        `chap_regex` and `source_file_path`.
    clip_overrides : dict, optional
        Maps a book_id to the list of clip patterns passed to TextParser for
        that book. Books without an entry use the `*** START OF` /
        `*** END OF` patterns. When omitted, the single override this corpus
        needs is applied: book 507 uses an `Epilogue` line as its first clip
        pattern.

    Returns
    -------
    pandas.DataFrame
        The TOKENS tables of all books, concatenated and sorted by index. The
        index is book_id followed by the parser's OHCO levels.

    Raises
    ------
    ValueError
        If LIB contains no books.
    """
    default_clip_pats = [
        r"\*\*\*\s*START OF",
        r"\*\*\*\s*END OF"
    ]
    if clip_overrides is None:
        # NOTE(review): presumably this skips a table of contents that ends with
        # an "Epilogue" entry — confirm against the source file.
        clip_overrides = {
            507: [
                r"^(?:Epilogue)$",
                r"\*\*\*\s*END OF"
            ]
        }

    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    if len(LIB.index) == 0:
        raise ValueError("LIB is empty: there are no books to tokenize")

    books = []
    for book_id in LIB.index:
        book = LIB.loc[book_id]

        # Give the parser its own list so nothing is shared between books.
        clip_pats = list(clip_overrides.get(book_id, default_clip_pats))

        # Announce
        print("Tokenizing", book_id, book['title'])

        # Define vars
        ohco_pats = [('chap', book['chap_regex'], 'm')]
        src_file_path = book['source_file_path']

        # Create object
        text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens()

        # Prefix the parser's OHCO index with the book id
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)

    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    print("Done")

    return CORPUS

In [32]:
CORPUS = tokenize_collection(LIB)

Tokenizing 145 MIDDLEMARCH
Importing  /home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_MIDDLEMARCH-pg145.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^(?:PRELUDE|CHAPTER|FINALE)
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 507 ADAM BEDE
Importing  /home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_ADAM_BEDE-pg507.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^(?:Chapter\s+[IVXLCM]+|Epilogue)\.?\s*$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 6688 THE MILL ON THE FLOSS
Importing  /home/cwp5xyj/Documents/MSDS/DS5001/data/eliot-set/ELIOT_GEORGE_THE_MILL_ON_THE_FLOSS-pg6688.txt
Clipping text
Parsing OHCO level 0 chap_id by

In [33]:
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
145,1,1,0,0,"(Who, WP)",WP,Who,who
145,1,1,0,1,"(that, WDT)",WDT,that,that
145,1,1,0,2,"(cares, VBZ)",VBZ,cares,cares
145,1,1,0,3,"(much, RB)",RB,much,much
145,1,1,0,4,"(to, TO)",TO,to,to


This looks like it was successful; I'm just not sure what the best way to check my work is.

In [34]:
CORPUS.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6688,58,69,0,2,"(death, NN)",NN,death,death
6688,58,69,0,3,"(they, PRP)",PRP,they,they
6688,58,69,0,4,"(were, VBD)",VBD,were,were
6688,58,69,0,5,"(not, RB)",RB,not,not
6688,58,69,0,6,"(divided.”, JJ)",JJ,divided.”,divided


In [35]:
CORPUS.index[400000] # just to get something from the middle

(507, 18, 81, 6, 18)

Feels totally acceptable to me.

In [36]:
# Book length = number of term_str values recorded for each book.
tokens_per_book = CORPUS.groupby('book_id')['term_str'].count()
LIB['book_len'] = tokens_per_book
LIB.sort_values(by='book_len')

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6688,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",THE MILL ON THE FLOSS,^\s*Chapter\s+[IVXLCM]+\.?\s*$,207461
507,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",ADAM BEDE,^(?:Chapter\s+[IVXLCM]+|Epilogue)\.?\s*$,215403
145,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",MIDDLEMARCH,^(?:PRELUDE|CHAPTER|FINALE),317805


In [37]:
# Number of distinct chapter-level chunks (chap_id values) in each book.
LIB['n_chaps'] = CORPUS.reset_index().groupby('book_id')['chap_id'].nunique()
LIB.sort_values(by='n_chaps')

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
507,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",ADAM BEDE,^(?:Chapter\s+[IVXLCM]+|Epilogue)\.?\s*$,215403,56
6688,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",THE MILL ON THE FLOSS,^\s*Chapter\s+[IVXLCM]+\.?\s*$,207461,58
145,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",MIDDLEMARCH,^(?:PRELUDE|CHAPTER|FINALE),317805,88


Yessssss fixed the problem with *Adam Bede*. That's so so awesome yay

In [38]:
# Token count per (book, chapter); show the five shortest chapters.
chapter_lengths = CORPUS.groupby(level=['book_id', 'chap_id']).size()
chapter_lengths.sort_values().head(5)

book_id  chap_id
507      47         343
145      1          501
         80         855
6688     1          898
507      55         970
dtype: int64

In [40]:
CORPUS[CORPUS.term_str == '']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
145,5,46,1,0,"(…, NN)",NN,…,
145,6,0,0,39,"(…, NNP)",NNP,…,
145,30,33,5,0,"():, VB)",VB,):,
145,39,55,4,8,"(…, NNP)",NNP,…,
145,43,9,2,17,"(&, CC)",CC,&,
145,59,21,3,0,"();, NN)",NN,);,
145,59,53,1,0,"();, IN)",IN,);,
145,72,3,1,0,"(;”, NNS)",NNS,;”,
145,88,4,0,26,"(&, CC)",CC,&,
507,21,11,2,68,"((&), NNP)",NNP,(&),


In [41]:
# Drop tokens whose normalized term is empty (e.g. the "…" and "&" tokens listed above).
# .copy() makes CORPUS an independent frame, so adding columns to it afterwards
# does not raise pandas' SettingWithCopyWarning.
CORPUS = CORPUS[CORPUS.term_str != ''].copy()

In [43]:
# Coarse POS group = first two characters of the tag (e.g. VBZ -> VB).
CORPUS['pos_group'] = CORPUS['pos'].str.slice(stop=2)
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
145,1,1,0,0,"(Who, WP)",WP,Who,who,WP
145,1,1,0,1,"(that, WDT)",WDT,that,that,WD
145,1,1,0,2,"(cares, VBZ)",VBZ,cares,cares,VB
145,1,1,0,3,"(much, RB)",RB,much,much,RB
145,1,1,0,4,"(to, TO)",TO,to,to,TO


In [47]:
# Vocabulary table: one row per distinct term.
#   n       - number of occurrences in CORPUS
#   n_chars - length of the term in characters
#   p       - relative frequency (n divided by the total of n)
#   i       - -log2(p)

VOCAB = (
    CORPUS['term_str'].value_counts()
    .to_frame('n')
    .sort_index()
    .rename_axis('term_str')
    .assign(
        n_chars=lambda v: v.index.str.len(),
        p=lambda v: v['n'] / v['n'].sum(),
        i=lambda v: -np.log2(v['p']),
    )
)

VOCAB.head(10)

Unnamed: 0_level_0,n,n_chars,p,i
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,1,1e-06,19.49843
1790,1,4,1e-06,19.49843
1799,2,4,3e-06,18.49843
1801more,1,8,1e-06,19.49843
1807,1,4,1e-06,19.49843
1825,1,4,1e-06,19.49843
1826,1,4,1e-06,19.49843
1828,1,4,1e-06,19.49843
1829,2,4,3e-06,18.49843
1831,1,4,1e-06,19.49843


In [48]:
# Part-of-speech statistics per term, for full tags (pos) and coarse groups (pos_group).

# Count every (term, tag) pair once and reuse the result for all derived columns.
pos_counts = CORPUS[['term_str', 'pos']].value_counts()
pos_group_counts = CORPUS[['term_str', 'pos_group']].value_counts()

# Wide term x tag tables; NaN marks a tag that never occurs with the term.
pos_table = pos_counts.unstack()
pos_group_table = pos_group_counts.unstack()

# Most frequent tag / group for each term.
VOCAB['max_pos'] = pos_table.fillna(0).idxmax(1)
VOCAB['max_pos_group'] = pos_group_table.fillna(0).idxmax(1)

# How many distinct groups each term occurs with, and which ones.
VOCAB['n_pos_group'] = pos_group_table.count(1)
VOCAB['cat_pos_group'] = pos_group_counts.to_frame('n').reset_index()\
    .groupby('term_str').pos_group.apply(set)

# The same two columns for full tags.
VOCAB['n_pos'] = pos_table.count(1)
VOCAB['cat_pos'] = pos_counts.to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(set)

VOCAB.head()

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1,1,1e-06,19.49843,CD,CD,1,{CD},1,{CD}
1790,1,4,1e-06,19.49843,CD,CD,1,{CD},1,{CD}
1799,2,4,3e-06,18.49843,CD,CD,1,{CD},1,{CD}
1801more,1,8,1e-06,19.49843,CD,CD,1,{CD},1,{CD}
1807,1,4,1e-06,19.49843,CD,CD,1,{CD},1,{CD}


In [49]:
# Flag stop words: 1 if the term is in NLTK's English stop-word list, otherwise 0.

# Lookup table indexed by stop word, with a constant marker column.
sw = pd.DataFrame(
    {'dummy': 1},
    index=pd.Index(nltk.corpus.stopwords.words('english'), name='term_str'),
)

# Terms missing from the lookup map to NaN, which becomes 0.
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

# Fixed seed so the displayed sample is identical on every run.
VOCAB[VOCAB.stop == 1].sample(10, random_state=0)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
between,375,7,0.000506,10.947684,IN,IN,2,"{NN, IN}",3,"{NNP, NN, IN}",1
or,1511,2,0.00204,8.937143,CC,CC,6,"{IN, CC, RB, JJ, RP, NN}",7,"{IN, CC, RB, JJ, RP, NN, NNP}",1
ours,21,4,2.8e-05,15.106113,NN,NN,5,"{IN, PR, VB, JJ, NN}",7,"{VBN, PRP, IN, JJR, NNS, NN, NNP}",1
had,7743,3,0.010454,6.579754,VBD,VB,4,"{JJ, NN, RB, VB}",10,"{VBN, VBZ, VBD, VBP, NNS, VB, RB, JJ, NN, NNP}",1
against,543,7,0.000733,10.413622,IN,IN,3,"{NN, IN, VB}",7,"{VBN, IN, VBP, NNS, VB, NN, NNP}",1
will,1947,4,0.002629,8.571393,MD,MD,8,"{WP, IN, MD, VB, RB, WD, JJ, NN}",13,"{WDT, VBZ, VBD, WP, VBP, IN, NNS, MD, VB, RB, ...",1
him,4161,3,0.005618,7.475716,PRP,PR,8,"{CD, VB, PR, RB, RP, JJ, PD, NN}",16,"{VBN, VBZ, VBD, PRP, PDT, RBR, VBP, JJR, NNS, ...",1
ourselves,56,9,7.6e-05,13.691076,PRP,NN,7,"{IN, MD, VB, PR, JJ, RP, NN}",9,"{VBZ, PRP, IN, NNS, MD, VB, JJ, RP, NN}",1
yours,82,5,0.000111,13.140878,NN,NN,6,"{CD, PR, RB, VB, JJ, NN}",9,"{VBZ, PRP, NNS, CD, VB, RB, JJ, NN, NNP}",1
to,22828,2,0.030822,5.019914,TO,TO,8,"{FW, TO, IN, VB, RB, JJ, RP, NN}",10,"{FW, TO, IN, NNS, RB, VB, JJ, RP, NN, NNP}",1


In [50]:
# Stem every vocabulary term with the Porter, Snowball and Lancaster stemmers.

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

stemmer1 = PorterStemmer()
stemmer2 = SnowballStemmer("english")
stemmer3 = LancasterStemmer()

# The terms are the index of VOCAB, so map each stemmer over the index directly.
VOCAB['stem_porter'] = VOCAB.index.map(stemmer1.stem)
VOCAB['stem_snowball'] = VOCAB.index.map(stemmer2.stem)
VOCAB['stem_lancaster'] = VOCAB.index.map(stemmer3.stem)

In [51]:
# Fixed seed so the displayed sample is identical on every run.
VOCAB.sample(10, random_state=0)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
stared,20,6,2.7e-05,15.176502,VBD,VB,1,{VB},2,"{VBN, VBD}",0,stare,stare,star
chapbut,1,7,1e-06,19.49843,NN,NN,1,{NN},1,{NN},0,chapbut,chapbut,chapbut
cushioned,1,9,1e-06,19.49843,JJ,JJ,1,{JJ},1,{JJ},0,cushion,cushion,cush
orthoptera,2,10,3e-06,18.49843,NN,NN,1,{NN},1,{NN},0,orthoptera,orthoptera,orthopter
festoons,1,8,1e-06,19.49843,NNS,NN,1,{NN},1,{NNS},0,festoon,festoon,festoon
allthan,1,7,1e-06,19.49843,RP,RP,1,{RP},1,{RP},0,allthan,allthan,allth
contrived,6,9,8e-06,16.913468,VBN,VB,1,{VB},1,{VBN},0,contriv,contriv,cont
meperhaps,1,9,1e-06,19.49843,VB,VB,1,{VB},1,{VB},0,meperhap,meperhap,meperhap
ceiled,1,6,1e-06,19.49843,JJ,JJ,1,{JJ},1,{JJ},0,ceil,ceil,ceil
unforeseen,3,10,4e-06,17.913468,JJ,JJ,1,{JJ},1,{JJ},0,unforeseen,unforeseen,unforeseen


I chose to include the Snowball and Lancaster stems as well, even though they were not explicitly required as part of the VOCAB table, since they will be used in just a moment to answer questions 4 and 5. I believe that my LIB, CORPUS, and VOCAB tables all have the required features.

## Part 2: Answering the Questions

### Question 1: *Middlemarch* chunking pattern

In [53]:
LIB.loc[145, "chap_regex"]

'^(?:PRELUDE|CHAPTER|FINALE)'

All the chapter headers were in all caps, and there was no Table of Contents, so this was very simple. I used the pattern for Melville's *Moby Dick* as a template. I probably could have used "all caps" as my only criterion, but this pattern is more specific.

### Question 2: Title of Book with Most Tokens

In [55]:
LIB.sort_values('book_len', ascending=False)

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
145,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",MIDDLEMARCH,^(?:PRELUDE|CHAPTER|FINALE),317805,88
507,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",ADAM BEDE,^(?:Chapter\s+[IVXLCM]+|Epilogue)\.?\s*$,215403,56
6688,/home/cwp5xyj/Documents/MSDS/DS5001/data/eliot...,"ELIOT, GEORGE",THE MILL ON THE FLOSS,^\s*Chapter\s+[IVXLCM]+\.?\s*$,207461,58


In [65]:
# to be fancy: order the books longest-first and read off the first title
books_by_length = LIB.sort_values(by='book_len', ascending=False)
books_by_length['title'].iloc[0]

'MIDDLEMARCH'

The book with the most tokens is *Middlemarch*, which makes sense because it also has the most chapters.

### Question 3: How many chapters in *Middlemarch*

In [64]:
# Look the chapter count up by Middlemarch's own book_id (145) instead of
# relying on Middlemarch being the longest book.
LIB.loc[145, 'n_chaps']

88

*Middlemarch* has 88 chapters. Or rather, 86 with a "Prelude" and a "Finale", for 88 chapter-level chunks.

### Question 4: Which Stemmer is Most Aggressive?

In [69]:
# count the distinct stems the Porter stemmer produces

VOCAB['stem_porter'].nunique()

17546

In [70]:
# count the distinct stems the Snowball stemmer produces
VOCAB['stem_snowball'].nunique()

17209

In [71]:
# count the distinct stems the Lancaster stemmer produces
VOCAB['stem_lancaster'].nunique()

14618

In [73]:
# just for context
len(VOCAB)

26351

In [76]:
# Fixed seed so the rows inspected below are identical on every run.
VOCAB.sample(20, random_state=0)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
hanged,12,6,1.6e-05,15.913468,VBN,VB,3,"{JJ, NN, VB}",3,"{VBN, JJ, NN}",0,hang,hang,hang
electioneering,3,14,4e-06,17.913468,VBG,VB,2,"{NN, VB}",2,"{VBG, NN}",0,election,election,elect
forsaken,19,8,2.6e-05,15.250503,VBN,VB,3,"{JJ, NN, VB}",4,"{VBN, JJ, NN, VBZ}",0,forsaken,forsaken,forsak
clapping,5,8,7e-06,17.176502,VBG,VB,1,{VB},1,{VBG},0,clap,clap,clap
hassock,1,7,1e-06,19.49843,NN,NN,1,{NN},1,{NN},0,hassock,hassock,hassock
worritin,1,8,1e-06,19.49843,NN,NN,1,{NN},1,{NN},0,worritin,worritin,worritin
wastes,1,6,1e-06,19.49843,VBZ,VB,1,{VB},1,{VBZ},0,wast,wast,wast
distributes,1,11,1e-06,19.49843,VBZ,VB,1,{VB},1,{VBZ},0,distribut,distribut,distribut
motherknew,1,10,1e-06,19.49843,FW,FW,1,{FW},1,{FW},0,motherknew,motherknew,motherknew
davy,4,4,5e-06,17.49843,NNP,NN,1,{NN},1,{NNP},0,davi,davi,davy


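The most aggressive stemmer is the Lancaster stemmer: it collapses the 26,351 terms in VOCAB into only 14,618 distinct stems, compared with 17,546 for Porter and 17,209 for Snowball. This makes sense, because the visual inspection of the DataFrame above shows that Lancaster generally produces shorter stems.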

### Question 5: Lancaster stem with most associated terms

I am operating under the assumption that this question wants the number of different terms associated with a stem, not the total number of token occurrences in the corpus. Otherwise the answer would probably be something like "and", which is not as interesting.

In [89]:
VOCAB['stem_lancaster'].value_counts().head(1)

stem_lancaster
cont    34
Name: count, dtype: int64

The Lancaster stem with the most associated terms is "cont", with 34 different associated terms.