In [1]:
import spacy
import re
import en_core_web_sm
import en_core_web_lg
import math

In [2]:
# Load the small and the large English pipelines once, up front, so every
# later cell can reuse them.
nlp_sm = en_core_web_sm.load()
nlp_lg = en_core_web_lg.load()

# Both pipelines get the same max_length of 2,500,000.
# NOTE(review): presumably raised so that a whole chapter can be passed to the
# pipeline in a single call -- confirm against the longest chapter.
nlp_sm.max_length = nlp_lg.max_length = 2500000

In [8]:
# Smoke test of the small pipeline on a two-sentence sample: print each
# token's lemma next to its surface form, then every named entity with its
# character offsets and label.
sample_text = u"Apple is looking at buying U.K. startup for $1 billion. Bran Stark is a boy."
doc = nlp_sm(sample_text)

for token in doc:
    print(token.lemma_, token.text)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple Apple
be is
look looking
at at
buy buying
U.K. U.K.
startup startup
for for
$ $
1 1
billion billion
. .
Bran Bran
Stark Stark
be is
a a
boy boy
. .
Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY
Stark 61 66 PERSON


In [22]:
def union_entities(listA, listB):
    """Merge two lists of entity spans, keeping the longer span on overlap.

    Each item is a ``(text, start, end)`` triple. Every span of ``listA`` is
    kept initially; each span of ``listB`` is then considered in order:

    * if it overlaps nothing kept so far, it is appended;
    * if it is strictly longer than every kept span it overlaps, those spans
      are removed and it is appended;
    * otherwise (some overlapping kept span is at least as long) it is dropped.

    Args:
        listA: list of ``(text, start, end)`` triples; never modified.
        listB: list of ``(text, start, end)`` triples; never modified.

    Returns:
        A new list of ``(text, start, end)`` tuples.
    """
    # Work on a copy: the previous `result = listA` aliased the caller's list,
    # mutating it and changing the sequence being iterated.
    result = list(listA)

    for ent_b, y1, y2 in listB:
        # Both bounds are compared inclusively, so spans that merely touch
        # (end of one == start of the other) count as overlapping.
        # NOTE(review): if `end` is an exclusive offset, `<` may be the
        # intended comparison -- confirm before tightening.
        clashes = [kept for kept in result if kept[1] <= y2 and y1 <= kept[2]]

        # Compare against the real length of each clashing span (x2 - x1);
        # the old `(x2-x2)` was always 0, so B won every overlap.
        # `all()` over an empty list is True, which covers the no-overlap case.
        if all((y2 - y1) > (x2 - x1) for _, x1, x2 in clashes):
            for kept in clashes:
                result.remove(kept)
            result.append((ent_b, y1, y2))

    return result


def process_chapter(chapter):
    """Extract PERSON entities from a chapter with both spaCy pipelines.

    The chapter text is run through ``nlp_sm`` and then ``nlp_lg``. From each
    result, the entities labelled ``'PERSON'`` are collected as
    ``(text, start_char, end_char)`` tuples, and the two lists are combined
    by ``union_entities``.

    Args:
        chapter: the chapter text to analyse.

    Returns:
        Whatever ``union_entities`` returns for the two PERSON lists
        (small-model list first, large-model list second).
    """
    def person_spans(pipeline):
        # One (text, start, end) tuple per PERSON entity found by `pipeline`.
        return [
            (entity.text, entity.start_char, entity.end_char)
            for entity in pipeline(chapter).ents
            if entity.label_ == 'PERSON'
        ]

    from_small = person_spans(nlp_sm)
    from_large = person_spans(nlp_lg)
    return union_entities(from_small, from_large)


def process_book(head, tail, book_number):
    """Split one book file into chapters and extract entities per chapter.

    The file ``head + str(book_number) + tail`` is read in full and split on
    heading lines: runs of capital letters/whitespace preceded by at least one
    newline and followed by at least two. Because the pattern has a capturing
    group, ``re.split`` returns the headings interleaved with the text around
    them.

    Args:
        head: path prefix placed before the book number.
        tail: path suffix placed after the book number.
        book_number: number of the book; also used as the key prefix.

    Returns:
        dict mapping ``"<book>–<index // 2>-<Heading In Title Case>"`` to the
        result of ``process_chapter`` on the cleaned chapter text. Note that
        the first separator in the key is an en dash and the second a hyphen.
    """
    with open(head + str(book_number) + tail) as handle:
        text = handle.read()

    pieces = re.split(r"\n{1,}([A-Z\s]+)\n{2,}", text)

    # Clean-up applied, in this order, to every chapter body.
    substitutions = (('\t', ''), ('"', ''), ('\n', ' '), (' . . .', '...'))

    per_chapter = {}
    for position, piece in enumerate(pieces):
        # Only all-uppercase pieces are treated as chapter headings; the piece
        # that follows a heading is taken as that chapter's body.
        if not piece.isupper():
            continue

        heading = piece.strip().title()
        key = str(book_number) + '–' + str(position // 2) + '-' + heading

        body = pieces[position + 1].strip()
        for old, new in substitutions:
            body = body.replace(old, new)

        per_chapter[key] = process_chapter(body)

    return per_chapter

def process_book_for_stats(head, tail, book_number):
    """Collect line and sentence lengths for one book file.

    The file ``head + str(book_number) + tail`` is read in full and split on
    the same heading pattern used elsewhere in this notebook. Every resulting
    piece that is not all-uppercase is split once on newlines and once on
    full stops; the character length of each fragment is recorded, empty
    fragments included (they contribute a length of 0).

    Args:
        head: path prefix placed before the book number.
        tail: path suffix placed after the book number.
        book_number: number of the book to read.

    Returns:
        ``(paragraphs, sentences)``: two lists of ints, the lengths of the
        newline-separated and of the '.'-separated fragments respectively.
    """
    with open(head + str(book_number) + tail) as handle:
        text = handle.read()

    paragraph_lengths = []
    sentence_lengths = []
    for piece in re.split(r"\n{1,}([A-Z\s]+)\n{2,}", text):
        if piece.isupper():
            # All-uppercase pieces are chapter headings: not counted.
            continue
        paragraph_lengths.extend(len(line) for line in piece.split('\n'))
        sentence_lengths.extend(len(part) for part in piece.split('.'))

    return paragraph_lengths, sentence_lengths

In [5]:
# Run the entity extraction over books 1 to 5 and pool the per-chapter
# results into a single dict (keys already carry the book number).
characters = {}
for i in range(1, 6):
    book_entities = process_book('books/GOT', '.txt', i)
    characters.update(book_entities)
    print('book', i, 'done')

book 1 done
book 2 done
book 3 done
book 4 done
book 5 done


In [7]:
import pickle

# Persist the raw per-chapter entity dict so later work can reload it
# instead of re-running both spaCy pipelines over every book.
with open('GOT-characters-raw.pickle', 'wb+') as pickle_file:
    pickle.dump(characters, pickle_file)

In [23]:
# Gather the line-length and sentence-length samples of books 1 to 5 into
# two flat lists.
paragraphs, sentences = [], []
for i in range(1, 6):
    book_paragraphs, book_sentences = process_book_for_stats('books/GOT', '.txt', i)
    paragraphs.extend(book_paragraphs)
    sentences.extend(book_sentences)
    print('book', i, 'done')

book 1 done
book 2 done
book 3 done
book 4 done
book 5 done


In [25]:
import numpy as np

# Mean fragment length in characters: newline-separated fragments first,
# then '.'-separated fragments.
# The first print used to end in a dangling `+np.`, which is a syntax error;
# the recorded output is two plain averages, so the stray fragment is removed.
print(np.average(paragraphs))
print(np.average(sentences))

235.97917504019293
60.226964439857994
