In [165]:
import os
import re
import random
import spacy
from ebooklib import epub
from bs4 import BeautifulSoup

In [93]:
def list_dir_recursive(d, files=None):
    """Collect ``d`` and, when it is a directory, every path beneath it.

    Paths are gathered depth-first and each directory is appended *after*
    its contents, so the starting path ``d`` is always the last entry.
    Directories themselves are included in the result, not only files.

    Args:
        d: Path to a file or directory.
        files: Optional list to append results to. When omitted a fresh
            list is created for this call (a mutable default would be
            shared between calls and accumulate stale entries).

    Returns:
        The list that the paths were appended to.
    """
    if files is None:
        files = []
    if os.path.isdir(d):
        for f in os.listdir(d):
            list_dir_recursive(os.path.join(d, f), files)
    files.append(d)
    return files

In [170]:
# Walk the data directory, then keep only the paths whose name ends in "epub".
files = list_dir_recursive("../data/jk_rowling/")
books = []
for candidate in files:
    if re.search("epub$", candidate) is not None:
        books.append(candidate)

In [171]:
class EBook(object):
    """Wrapper around an EPUB file that exposes its text as paragraphs.

    The EPUB is read when the object is constructed. Paragraph extraction
    is deferred until ``paragraphs`` is first accessed and is then cached.
    """

    # Item-type code reported by the HTML portions of the ebook.
    # NOTE(review): presumably the same value as ebooklib.ITEM_DOCUMENT -- confirm.
    _HTML_ITEM_TYPE = 9

    def __init__(self, path):
        """Read the EPUB located at ``path``.

        Args:
            path: Filesystem path of the ``.epub`` file.
        """
        self._path = path
        self._epub = epub.read_epub(path)
        self._paragraphs = None  # filled lazily by the ``paragraphs`` property

    def _docs(self):
        """Yield the items of the book whose type marks them as HTML."""
        for doc in self._epub.get_items():
            if doc.get_type() == self._HTML_ITEM_TYPE:
                yield doc

    @property
    def n_paragraphs(self):
        """Number of paragraphs extracted from the book."""
        return len(self.paragraphs)

    @property
    def paragraphs(self):
        """List with the text of every ``<p>`` element, in document order.

        The list is built on first access and reused afterwards.
        """
        if self._paragraphs is None:
            ps = []
            for doc in self._docs():
                # Name the parser explicitly: without it bs4 picks whichever
                # parser happens to be installed and emits a warning.
                soup = BeautifulSoup(doc.content, "html.parser")
                ps.extend(item.text for item in soup.find_all("p"))
            self._paragraphs = ps
        return self._paragraphs

    def sample(self, n=5):
        """Return ``n`` consecutive paragraphs starting at a random position.

        Args:
            n: Number of paragraphs wanted. Values larger than the book are
                clamped to the whole book; values below zero are treated
                as zero.

        Returns:
            The selected paragraphs joined by blank lines. An empty string
            when the book has no paragraphs or ``n`` is zero.
        """
        total = self.n_paragraphs
        n = max(0, min(n, total))
        # randint is inclusive at both ends, so the last valid start index
        # is total - n; this lets the final paragraph be part of a sample.
        start = random.randint(0, total - n)
        return "\n\n".join(self.paragraphs[start:start + n])
    

In [172]:
# Open the book at index 4 of the discovered EPUBs and parse its paragraphs
# eagerly so later cells can reuse the cached list.
book_path = books[4]
e = EBook(book_path)
paragraphs = e.paragraphs

In [173]:
# Show a random run of 15 consecutive paragraphs. The parenthesised call form
# prints the same text on Python 2 and is also valid syntax on Python 3.
print(e.sample(15))

Harry and Sirius were both laughing. Mundungus, who had toppled backward off his chair, was swearing as he got to his feet. Crookshanks had given an angry hiss and shot off under the dresser, from whence his large yellow eyes glowed in the darkness.

“Boys,” Mr. Weasley said, lifting the stew back into the middle of the table, “your mother’s right, you’re supposed to show a sense of responsibility now you’ve come of age —”

“— none of your brothers caused this sort of trouble!” Mrs. Weasley raged at the twins, slamming a fresh flagon of butterbeer onto the table and spilling almost as much again. “Bill didn’t feel the need to Apparate every few feet! Charlie didn’t Charm everything he met! Percy —”

She stopped dead, catching her breath with a frightened look at her husband, whose expression was suddenly wooden.

“Let’s eat,” said Bill quickly.

“It looks wonderful, Molly,” said Lupin, ladling stew onto a plate for her and handing it across the table.

For a few minutes there was silen

In [216]:
# Load the spaCy pipeline named "en_core_web_md"; every later cell that calls
# `nlp` or reads `nlp.vocab` depends on this object.
nlp = spacy.load("en_core_web_md")

In [175]:
# Run the spaCy pipeline over a random ten-paragraph excerpt of the book.
excerpt = e.sample(10)
doc = nlp(excerpt)

In [176]:
# Display the dimensions of the parsed document's tensor (the recorded run
# shows a 2-D shape). The first dimension varies with the random excerpt.
doc.tensor.shape

(324, 384)

In [213]:
# Keep a handle on the pipeline's vocabulary for interactive inspection.
# NOTE(review): a later cell rebinds `v` to `nlp.vocab.vectors`.
v = nlp.vocab

In [218]:
# Rebind `v` to the vocabulary's vectors object; this replaces whatever `v`
# held before, so the cell order matters when re-running the notebook.
v = nlp.vocab.vectors