In [None]:
# Install the third-party packages this notebook imports.
# NOTE(review): `!pip` runs whichever `pip` is first on the shell PATH, which is
# not guaranteed to be the kernel's environment -- confirm, or switch to `%pip`
# if the installed IPython provides it.
# NOTE(review): only spacy is pinned; networkx, pydot and graphviz float.
!pip install spacy==2.0.12 # Above 2.0.13 doesn't work with the neuralcoref resolution
# !pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz # This is the coref language model
!pip install networkx
!pip install pydot # To draw our graphs in graphviz
!pip install graphviz

In [None]:
import spacy
from spacy import displacy
from collections import Counter
import re
import os
import pandas as pd
import networkx as nx
import sys
import pydot
import matplotlib.pyplot as plt
import graphviz

# Utils
from tqdm import tqdm
from collections import defaultdict

# NLTK Stuff
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.data import load as nltk_load
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer

In [None]:
HP1_PATH = '../data/www.glozman.com/harry_potter_1_sorcerer_s_stone.txt'

# Read the whole book into memory. The context manager closes the file even if
# the read raises (e.g. on a decoding error), which the manual open()/close()
# pair did not guarantee.
with open(HP1_PATH, mode='r', encoding='utf-8') as text_file:
    text = text_file.read()

# text = re.sub(r'(?:[A-Z]{2,}\s+)', '', text)
# text = text[39:]

# Split on headings of the form "CHAPTER <UPPERCASE WORD>" followed by an
# upper-case title and a line break; element 0 is whatever precedes the first
# heading.
chapters = re.split(r"CHAPTER [A-Z]*[\n\r\s]*[A-Z\s]*[\n\r]", text)

# Spot-check the load: first and last 500 characters plus the total length.
print(text[0:500])
print('\n...\n')
print(text[-500:])
print('Length: {}'.format(len(text)))

In [None]:
# Download the large English spaCy model that a later cell loads by the name
# 'en_core_web_lg'.
# NOTE(review): `python` here is resolved by the shell, so it may not be the
# kernel's interpreter -- confirm.
! python -m spacy download en_core_web_lg

In [None]:
# Word-level tokenization of the full book with NLTK; print the first 100
# tokens as a spot check.
tokens = word_tokenize(text)
print(tokens[:100])

In [None]:
def sentence_tokenize(text, language='english'):
    """
    Return a sentence-tokenized copy of *text*, using the pickled Punkt
    sentence tokenizer for *language*.

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus; defaults to
        ``'english'``, the model this function previously hard-coded
    :return: the result of the loaded tokenizer's ``tokenize(text)`` call
        (the sentences of *text*)
    """
    # NOTE(review): the resource name keeps the original '../nltk_data/'
    # prefix, so it presumably only resolves relative to a directory that has
    # a sibling 'nltk_data' folder -- confirm whether the plain
    # 'tokenizers/punkt/<language>.pickle' resource name was intended.
    resource = '../nltk_data/tokenizers/punkt/{}.pickle'.format(language)
    tokenizer = nltk_load(resource)
    return tokenizer.tokenize(text)


# Collapse every whitespace run (including line breaks) to a single space
# before splitting, then show the first three sentences separated by blank lines.
sentences = sentence_tokenize(re.sub(r'\s+', ' ', text))
print(*sentences[:3], sep='\n\n')

In [None]:
# Load the 'en_core_web_lg' spaCy model; `nlp` is the pipeline applied to each
# sentence in the following cell.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Run the spaCy pipeline over every sentence independently (tqdm reports
# progress), then print the first parsed sentence as a spot check.
docs = list(map(nlp, tqdm(sentences)))
print(docs[0])

In [None]:
import csv

In [None]:
# Dump every entity spaCy recognised: one CSV row per entity, with the text of
# the doc it came from as context.
# newline='' is required by the csv module so it controls line endings itself
# (otherwise platforms that translate '\n' emit blank rows); the explicit
# UTF-8 encoding matches what pd.read_csv assumes by default when reading the
# file back.
with open('entities.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['token', 'label', 'start_char', 'end_char', 'context'])
    for doc in docs:
        for ent in doc.ents:
            # Character offsets are those reported by the entity span; since
            # each doc was built from one sentence they are presumably
            # relative to that sentence, not the whole book -- confirm.
            writer.writerow([ent.text, str(ent.label_), str(ent.start_char), str(ent.end_char), str(doc)])

In [None]:
# Read the entity rows written above back in as a DataFrame for analysis.
# NOTE(review): read_csv's default NA handling turns tokens such as "NA" or
# "null" into NaN -- confirm that no entity text is affected.
entities = pd.read_csv('entities.csv')

In [None]:
# Preview the first rows of the entity table.
entities.head()

In [None]:
# Spot-check a single well-known character: the first rows whose token is 'Hagrid'.
entities.loc[entities['token'] == 'Hagrid'].head()

In [None]:
# counts[token][label] -> number of times `token` received `label`;
# counts[token]['TOTAL'] -> number of times `token` was tagged at all.
counts = defaultdict(lambda: defaultdict(int))
for token, label in zip(entities['token'], entities['label']):
    per_token = counts[token]
    per_token[label] += 1
    per_token['TOTAL'] += 1

In [None]:
# Write one row per (token, label) pair with its count and its share of all the
# token's mentions.
# newline='' is required by the csv module so it controls line endings itself;
# the explicit UTF-8 encoding matches pd.read_csv's default when the file is
# read back.
with open('entity_counts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['token', 'entity', 'count', 'percentage'])
    # `label_counts` (rather than `entity_counts`) avoids sharing a name with
    # the DataFrame that is later loaded from this file.
    for token, label_counts in counts.items():
        total = label_counts['TOTAL']
        for entity, count in label_counts.items():
            # 'TOTAL' is the bookkeeping key holding the sum, not a real label.
            if entity == 'TOTAL':
                continue
            writer.writerow([token, entity, str(count), '{:0.1f}%'.format(count / total * 100)])

In [None]:
# Load the per-(token, label) counts back as a DataFrame and preview them.
entity_counts = pd.read_csv('entity_counts.csv')
entity_counts.head()

I want to show all tokens with a total count greater than 2, sorted by total count, showing for each token only the entity label that has the highest count. Ties are broken by keeping the first label encountered.

In [None]:
# token => {'max': highest count of any single label,
#           'entity': the label with that count,
#           'total': number of times the token was tagged at all,
#           'winning_percentage': max / total (a fraction, not yet in percent)}
highest_counts = defaultdict(lambda: defaultdict(lambda: 0))
for index, row in entity_counts.iterrows():
    best = highest_counts[row.token]
    # row['count'] must use brackets: `row.count` is the Series method.
    # `>=` keeps the first label seen when two labels tie.
    if best['max'] >= row['count']:
        continue
    total = counts[row.token]['TOTAL']
    best['max'] = row['count']
    best['entity'] = row.entity
    best['total'] = total
    best['winning_percentage'] = row['count'] / total


# newline='' is required by the csv module so it controls line endings itself;
# the explicit UTF-8 encoding matches pd.read_csv's default when the file is
# read back.
with open('max_counts.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['token', 'entity', 'total', 'winning_percentage'])
    for token, data in highest_counts.items():
        writer.writerow([token, data['entity'], data['total'], '{:0.1f}%'.format(data['winning_percentage'] * 100)])

In [None]:
# Load the per-token winning label table back as a DataFrame and preview it.
max_counts = pd.read_csv('max_counts.csv')
max_counts.head()

In [None]:
# Rank tokens from most to least frequently tagged, then show the non-null
# count of each column followed by the 40 most frequent tokens.
sorted_max_counts = max_counts.sort_values('total', ascending=False)
display(sorted_max_counts.count())
display(sorted_max_counts.head(40))

In [None]:
# Number of distinct tokens that were tagged more than twice.
print('Tokens with count > 2: {}'.format((sorted_max_counts['total'] > 2).sum()))

Even in the first 20, there are errors: "Malfoy" and "Quirrell" are labeled `ORG` for the majority of cases. "Gryffindor" is labeled as a person most of the time, when it would actually probably more appropriately labeled `ORG`. Hogwarts is labeled `PERSON`, but is definitely more of an `ORG`. Quidditch is labeled `PERSON`, but should be 