In [None]:
!pip install spacy==2.0.13 # Above 2.0.13 doesn't work with the neuralcoref resolution
!pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz # This is the coref language model
!pip install networkx
!pip install pydot # To draw our graphs in graphviz
!pip install graphviz

In [4]:
import spacy
from spacy import displacy
from collections import Counter
import re
import os
import pandas as pd
import networkx as nx
import sys
import pydot
import matplotlib.pyplot as plt
import graphviz

# Utils
from tqdm import tqdm
from collections import defaultdict

# NLTK Stuff
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.data import load as nltk_load
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [31]:
# Base directory that the data path below is built from (relative to the
# notebook's working directory).
ROOT_DIR = '../../..'

# Text file that is read into `text` below.
HP1_PATH = ROOT_DIR + '/data/www.glozman.com/harry_potter_1_sorcerer_s_stone.txt'

# Use a context manager so the file handle is released even if read() raises
# (for example on a decoding error); the previous open()/close() pair leaked
# the handle in that case.
with open(HP1_PATH, mode='r', encoding='utf-8') as text_file:
    text = text_file.read()

# text = re.sub(r'(?:[A-Z]{2,}\s+)', '', text)
# text = text[39:]

# Split on each chapter heading: the literal "CHAPTER ", upper-case letters,
# optional whitespace, a run of upper-case letters/whitespace, and a closing
# newline or carriage return. Everything before the first heading becomes the
# first element of `chapters`.
chapters = re.split(r"CHAPTER [A-Z]*[\n\r\s]*[A-Z\s]*[\n\r]", text)

# Sanity check: show the first and last 500 characters and the total length.
print(text[0:500])
print('\n...\n')
print(text[-500:])
print('Length: {}'.format(len(text)))

Harry Potter and the Sorcerer's Stone

CHAPTER ONE

THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache.

...

boy, we haven't got all day." He walked away.

Harry hung back for a last word with Ron and Hermione.

"See you over the summer, then."

"Hope you have -- er -- a good holiday," said Hermione, looking uncertainly after Uncle Vernon, shocked that anyone could be so unpleasant.

"Oh, I will," said Harry, and they were surprised at the grin that was spreading over his face. "They don't know we're not allowed to use magic at home. I'm going to have a lot of fun with Dudley this summer...."

T

In [6]:
! python -m spacy download en_core_web_md


[93m    Linking successful[0m
    /root/anaconda3/lib/python3.6/site-packages/en_core_web_md -->
    /root/anaconda3/lib/python3.6/site-packages/spacy/data/en_core_web_md

    You can now load the model via spacy.load('en_core_web_md')



In [7]:
# Word-level tokens for the whole text; print the first hundred as a quick check.
tokens = word_tokenize(text)
print(tokens[:100])

['Harry', 'Potter', 'and', 'the', 'Sorcerer', "'s", 'Stone', 'CHAPTER', 'ONE', 'THE', 'BOY', 'WHO', 'LIVED', 'Mr.', 'and', 'Mrs.', 'Dursley', ',', 'of', 'number', 'four', ',', 'Privet', 'Drive', ',', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', ',', 'thank', 'you', 'very', 'much', '.', 'They', 'were', 'the', 'last', 'people', 'you', "'d", 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', ',', 'because', 'they', 'just', 'did', "n't", 'hold', 'with', 'such', 'nonsense', '.', 'Mr.', 'Dursley', 'was', 'the', 'director', 'of', 'a', 'firm', 'called', 'Grunnings', ',', 'which', 'made', 'drills', '.', 'He', 'was', 'a', 'big', ',', 'beefy', 'man', 'with', 'hardly', 'any', 'neck', ',', 'although', 'he', 'did', 'have', 'a', 'very']


In [8]:
def sentence_tokenize(text, language='english'):
    """
    Return a sentence-tokenized copy of *text*, using NLTK's pre-trained
    Punkt sentence tokenizer for the specified language.

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus (default ``'english'``)
    :return: the sentences produced by the tokenizer's ``tokenize`` method
    """
    # Use the canonical resource name so NLTK resolves the model through its
    # own data search path, rather than a '../nltk_data/...' path that only
    # works relative to one particular data directory.
    tokenizer = nltk_load('tokenizers/punkt/{}.pickle'.format(language))
    return tokenizer.tokenize(text)


# Collapse every run of whitespace (newlines included) into a single space
# before splitting into sentences, then preview the first three sentences.
sentences = sentence_tokenize(re.sub(pattern=r'\s+', repl=' ', string=text))
print('\n\n'.join(sentences[0:3]))

Harry Potter and the Sorcerer's Stone CHAPTER ONE THE BOY WHO LIVED Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.

They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made drills.


In [9]:
# Load the `en_core_web_md` spaCy model; `nlp` is the callable applied to each
# sentence further down to produce the parsed documents.
nlp = spacy.load('en_core_web_md')

In [10]:
# Run the spaCy pipeline over each sentence separately (tqdm reports progress),
# then print the first parsed document as a sanity check.
docs = list(map(nlp, tqdm(sentences)))
print(docs[0])

100%|██████████| 6394/6394 [01:07<00:00, 83.72it/s] 

Harry Potter and the Sorcerer's Stone CHAPTER ONE THE BOY WHO LIVED Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.





In [11]:
import csv

In [12]:
# Dump one CSV row per named entity, together with the sentence it came from.
# newline='' is what the csv module requires of files it writes to (otherwise
# line endings can be doubled on some platforms), and an explicit UTF-8
# encoding keeps the output independent of the platform default and matches
# the encoding the source text was read with.
with open('entities.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['token', 'label', 'start_char', 'end_char', 'context'])
    for doc in docs:
        # str(doc) is written as the 'context' column for every entity of that doc.
        context = str(doc)
        for ent in doc.ents:
            writer.writerow([ent.text, str(ent.label_), str(ent.start_char), str(ent.end_char), context])

In [13]:
# Load the entity rows written to entities.csv back in as a DataFrame.
entities = pd.read_csv('entities.csv')

In [14]:
# Preview the first rows of the entity table.
entities.head()

Unnamed: 0,token,label,start_char,end_char,context
0,Harry Potter,PERSON,0,12,Harry Potter and the Sorcerer's Stone CHAPTER ...
1,the Sorcerer's Stone CHAPTER,ORG,17,45,Harry Potter and the Sorcerer's Stone CHAPTER ...
2,ONE,CARDINAL,46,49,Harry Potter and the Sorcerer's Stone CHAPTER ...
3,Dursley,PERSON,81,88,Harry Potter and the Sorcerer's Stone CHAPTER ...
4,four,CARDINAL,100,104,Harry Potter and the Sorcerer's Stone CHAPTER ...


In [15]:
# Spot-check: the first few rows whose entity text is exactly 'Hagrid'.
entities.loc[entities['token'] == 'Hagrid'].head()

Unnamed: 0,token,label,start_char,end_char,context
200,Hagrid,PERSON,95,101,"It must have made sense to Dumbledore, though,..."
220,Hagrid,PERSON,1,7,"""Hagrid's bringing him."""
221,Hagrid,PERSON,34,40,"""You think it -- wise -- to trust Hagrid with ..."
222,Hagrid,PERSON,14,20,"I would trust Hagrid with my life,"" said Dumbl..."
226,Hagrid,PERSON,1,7,"""Hagrid,"" said Dumbledore, sounding relieved."


In [16]:
# counts[token][label] -> number of times `token` was tagged with `label`;
# counts[token]['TOTAL'] -> number of rows for `token` across all labels.
counts = defaultdict(lambda: defaultdict(int))
for token, label in zip(entities['token'], entities['label']):
    per_label = counts[token]
    per_label[label] += 1
    per_label['TOTAL'] += 1

In [17]:
# Write one row per (token, label) pair with its count and its share of the
# token's total. newline='' is what the csv module requires of files it writes
# to, and the explicit UTF-8 encoding keeps the output independent of the
# platform default.
with open('entity_counts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['token', 'entity', 'count', 'percentage'])
    # The per-token dict is called `label_counts` so it does not reuse the
    # name `entity_counts`, which the notebook binds to a DataFrame.
    for token, label_counts in counts.items():
        for entity, count in label_counts.items():
            # 'TOTAL' is the per-token sum stored next to the labels, not a label itself.
            if entity == 'TOTAL':
                continue
            writer.writerow([token, entity, str(count), '{:0.1f}%'.format(count / label_counts['TOTAL'] * 100)])

In [18]:
# Load the per-(token, entity) counts from entity_counts.csv and preview the first rows.
entity_counts = pd.read_csv('entity_counts.csv')
entity_counts.head()

Unnamed: 0,token,entity,count,percentage
0,Harry Potter,PERSON,17,65.4%
1,Harry Potter,WORK_OF_ART,9,34.6%
2,the Sorcerer's Stone CHAPTER,ORG,1,100.0%
3,ONE,CARDINAL,1,100.0%
4,Dursley,PERSON,52,100.0%


I want to show every token with a total count of more than 2, sorted by total count, showing only the entity label that has the highest count for each token. For tie-breaking, it can just use the first label encountered.

In [19]:
# token => {'max': highest single-label count, 'entity': the label with that
#           count, 'total': rows for the token across all labels,
#           'winning_percentage': max / total as a fraction}
highest_counts = defaultdict(lambda: defaultdict(int))
for row_index, record in entity_counts.iterrows():
    best = highest_counts[record.token]
    # Skip unless this label strictly beats the current best, so on a tie the
    # label seen first is kept.
    if best['max'] >= record['count']:
        continue
    token_total = counts[record.token]['TOTAL']
    best['max'] = record['count']
    best['entity'] = record.entity
    best['total'] = token_total
    best['winning_percentage'] = record['count'] / token_total


# Write one row per token: its winning entity label, its total count, and the
# winner's share formatted as a percentage. newline='' is what the csv module
# requires of files it writes to, and the explicit UTF-8 encoding keeps the
# output independent of the platform default.
with open('max_counts.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['token', 'entity', 'total', 'winning_percentage'])
    for token, data in highest_counts.items():
        # 'winning_percentage' is stored as a fraction; scale it to percent here.
        writer.writerow([token, data['entity'], data['total'], '{:0.1f}%'.format(data['winning_percentage'] * 100)])

In [20]:
# Load the per-token winning-entity summary from max_counts.csv and preview the first rows.
max_counts = pd.read_csv('max_counts.csv')
max_counts.head()

Unnamed: 0,token,entity,total,winning_percentage
0,Harry Potter,PERSON,26,65.4%
1,the Sorcerer's Stone CHAPTER,ORG,1,100.0%
2,ONE,CARDINAL,1,100.0%
3,Dursley,PERSON,52,100.0%
4,four,CARDINAL,26,100.0%


In [21]:
# Order tokens from most to least frequent, then show the per-column non-null
# counts followed by the 20 most frequent tokens.
sorted_max_counts = max_counts.sort_values(by='total', ascending=False)
display(sorted_max_counts.count())
display(sorted_max_counts.head(20))

token                 984
entity                984
total                 984
winning_percentage    984
dtype: int64

Unnamed: 0,token,entity,total,winning_percentage
22,Harry,PERSON,1284,99.1%
393,Ron,PERSON,426,100.0%
74,Hagrid,PERSON,363,74.7%
447,Hermione,PERSON,252,99.2%
533,Snape,PERSON,170,100.0%
8,Dudley,PERSON,136,75.0%
61,Dumbledore,PERSON,122,92.6%
397,Neville,PERSON,115,60.0%
457,Malfoy,ORG,112,92.9%
302,Quirrell,ORG,111,82.0%


In [22]:
# Count the tokens whose total count is greater than 2.
print('Tokens with count > 2: {}'.format((sorted_max_counts['total'] > 2).sum()))

Tokens with count > 2: 214


Even in the first 20, there are errors: "Malfoy" and "Quirrell" are labeled `ORG` for the majority of cases. "Gryffindor" is labeled as a person most of the time, when it would actually probably more appropriately labeled `ORG`. Hogwarts is labeled `PERSON`, but is definitely more of an `ORG`. Quidditch is labeled `PERSON`, but should be 