## Get Data from Book

In [1]:
import re

from nltk import tokenize
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup

from itertools import combinations, chain

In [2]:
def get_section_header(text):
    return text.strip().split("\n")[0]

In [3]:
book_choice = '../books/The Overstory - Richard Powers.epub'

In [4]:
book = epub.read_epub(book_choice)
items = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)

d = {}
for item in items:
    name = item.get_name()
    id_ = item.get_id()
    
    content = item.get_body_content()
    soup = BeautifulSoup(content, features="lxml")
    text = soup.get_text()
    
    header = get_section_header(text)
    
    d[id_] = dict(
        section_name=name,
        section_id=id_,
        text=text,
        header=header,
    )

In [5]:
d.keys()

dict_keys(['Cover', 'toc', 'title', 'ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8', 'ch9', 'ch10', 'ch11', 'ch12', 'also', 'ded', 'epi', 'copy'])

In [6]:
good_parts = {k: v for k, v in d.items() if k.startswith('ch')}

In [7]:
good_parts.keys()

dict_keys(['ch1', 'ch2', 'ch3', 'ch4', 'ch5', 'ch6', 'ch7', 'ch8', 'ch9', 'ch10', 'ch11', 'ch12'])

In [8]:
data = []
count = 0
for section in good_parts.values():
    
    # split text into lines
    text = section['text']
    lines = tokenize.sent_tokenize(text)
    lines = list(chain.from_iterable([x.split('\n') for x in lines]))
    lines = [x for x in lines if len(x) > 1]
    
    # form into data
    for line in lines:
        item = dict(
            id=count,
            text=line,
            section=section['section_id']
        )
        data.append(item)
        count += 1

In [9]:
def get_line(i):
    """Find tweet with given id"""
    try:
        return next(x for x in data if x["id"] == i)
    except StopIteration as e:
        print("Error: No tweet with that ID")
        return None
    
showlen = lambda data: print(f"Length: {len(data):,}")

In [10]:
get_line(0)

{'id': 0, 'text': 'ROOTS', 'section': 'ch1'}

In [11]:
showlen(data)

Length: 16,531


## Look for Stanzas

In [30]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.insert(0, '..')

from datetime import datetime
from multiprocessing import Pool
import os
import pickle

from tqdm.auto import tqdm, trange

from paradeller.dataprep import (
    tokenize,
    find_duplicates,
    filter_out_duplicates,
    filter_out_short,
    filter_out_oddballs,
    filter_out_oddballs_recursive,
    restructure_data,
    create_adj_list_by_word,
    create_adj_list_by_id
)
from paradeller.analysis import (
    find_matches,
    find_matches_for_start_pairs,
    get_stanzas,
    find_final_stanzas
)
from paradeller.postprocess import(
    stanza_sorter_maker
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
showlen(data)
print("\nCleaning up data...")

# remove too short
print("> Remove too short")
data = filter_out_short(data)
showlen(data)

# remove duplicate phrases
print("> Remove duplicate phrases")
duplicates = find_duplicates(data)
data = filter_out_duplicates(data, duplicates)
showlen(data)

# remove oddballs (too few matches)
print("> Recursively remove oddballs")
data = filter_out_oddballs_recursive(data)
showlen(data)

print("\nCreating adjacency lists...")
# make adj lists
adj_list_words, adj_list_ids = restructure_data(data)

stuff = {
    "data": data, "duplicates": duplicates, "adj_list_words": adj_list_words, "adj_list_ids": adj_list_ids
}

for k, v in stuff.items():
    print(f"{k:15} type: {type(v)}\tlen: {len(v):,}")

Length: 2,662

Cleaning up data...
> Remove too short


HBox(children=(IntProgress(value=0, max=2662), HTML(value='')))


Length: 2,662
> Remove duplicate phrases


HBox(children=(IntProgress(value=0, max=2662), HTML(value='')))


Length: 2,662
> Recursively remove oddballs


HBox(children=(IntProgress(value=0, max=2662), HTML(value='')))


Nothing removed. Done filtering.
Length: 2,662

Creating adjacency lists...


HBox(children=(IntProgress(value=0, max=2662), HTML(value='')))


data            type: <class 'list'>	len: 2,662
duplicates      type: <class 'dict'>	len: 2,662
adj_list_words  type: <class 'dict'>	len: 964
adj_list_ids    type: <class 'dict'>	len: 2,662


In [14]:
ids = list(adj_list_ids.keys())

#TMP
# ids = ids[:10]

pairs = list(combinations(ids, 2))
len(pairs)

3541791

In [15]:
# all_valid = find_matches_for_start_pairs(pairs, adj_list_ids, adj_list_words)

#### Look for stanzas

In [16]:
def find_matches_for_pair(p):
    return find_matches(p[0], p[1], adj_list_ids, adj_list_words)

In [17]:
with Pool(os.cpu_count()) as pool:
    res = list(tqdm(
        pool.imap(find_matches_for_pair, pairs),
        total=len(pairs)
    ))

HBox(children=(IntProgress(value=0, max=3541791), HTML(value='')))




In [18]:
# zip results with search pairs, filter out empty
all_valid = [x for x in list(zip(pairs, res)) if x[1]]
print("Found {} results.".format(len(all_valid)))

# get filename
d = datetime.utcnow()
filename = "../data/found_stanzas_book_{}.pickle".format(d.strftime("%Y-%m-%d-%H-%M"))

# save to file
with open(filename, "wb") as f:
    pickle.dump(all_valid, f)
print("stanzas saved to data/found_stanzas_book_[datetime].pickle")

Found 54 results.
stanzas saved to data/found_stanzas_book_[datetime].pickle


In [20]:
stanzas = get_stanzas(all_valid)

In [35]:
stanza_sorter = stanza_sorter_maker(adj_list_ids)
sorted_stanzas = sorted(stanzas, key=stanza_sorter)
view_stanzas = sorted_stanzas[:100]

for stanza in view_stanzas:
    print("~"*50)
    #print(stanza_sorter(stanza))
    for i in [0,0,1,1,2,3]:
        t = stanza[i]
        tweet = get_line(t)
        print(f"@{tweet['section']:20} {tweet['text']} ")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ch3                  “Who are they?” 
@ch3                  “Who are they?” 
@ch8                  where are we? 
@ch8                  where are we? 
@ch10                 “Where are they?” 
@ch8                  Who are we? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ch3                  She doesn’t care. 
@ch3                  She doesn’t care. 
@ch10                 He doesn’t understand. 
@ch10                 He doesn’t understand. 
@ch12                 She doesn’t understand. 
@ch4                  He doesn’t care. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ch4                  He doesn’t care. 
@ch4                  He doesn’t care. 
@ch12                 She doesn’t understand. 
@ch12                 She doesn’t understand. 
@ch10                 He doesn’t understand. 
@ch3                  She doesn’t care. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ch8                  Who are we? 
@ch8       

#### Look for complete paradelles

In [22]:
def find_final_stanzas_helper(stanzas):
    find_final_stanzas(*stanzas, adj_list_ids, adj_list_words)

In [23]:
if len(all_valid) >= 0:
    stanzas = get_stanzas(all_valid)

    # find combos to chcek
    print("Finding combos to check...")
    all_combos = combinations(stanzas, 3)
    combos = [c for c in all_combos if len(set().union(*c)) == 12]

    # look for complete paradelles
    print("searching for complete paradelles within found stanzas")
    with Pool(os.cpu_count()) as pool:
        res = list(
            tqdm(pool.imap(find_final_stanzas_helper, combos), total=len(combos))
        )
    all_poems = [x for x in list(zip(combos, res)) if x[1]]
    print("Found {} poems.".format(len(all_poems)))
    
    if len(all_poems) > 0:
        # get filename
        filename = "../data/found_poems_book_{}.pickle".format(d.strftime("%Y-%m-%d-%H-%M"))

        # save to file
        with open(filename, "wb") as f:
            pickle.dump(all_valid, f)
        print("results saved to data/found_poems_book_[datetime].pickle")

else:
    print("No complete poems found :(")

Finding combos to check...
searching for complete paradelles within found stanzas


HBox(children=(IntProgress(value=0, max=15304), HTML(value='')))


Found 0 poems.
results saved to data/found_poems_book_[datetime].pickle
