In [3]:
import codecs
import os
import spacy
import json
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from pathlib import Path
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
def common_words(path):
    with codecs.open(path) as f:
        words = f.read()
        words = json.loads(words)

    return set(words)

def read_novel(path):
    f =  codecs.open(path, 'r', encoding='utf-8', errors='ignore')  
    data = f.read().replace('\r', ' ').replace('\n', ' ').replace("\'", "'")
    return data

def flatten(input_list):
    flat_list = []
    for i in input_list:
        if type(i) == list:
            flat_list += flatten(i)
        else:
            flat_list += [i]

    return flat_list


def name_entity_recognition(sentence):
    doc = nlp(sentence)
    # retrieve person and organization's name from the sentence
    name_entity = [x for x in doc.ents if x.label_ in ['PERSON', 'ORG']]
    # convert all names to lowercase and remove 's in names
    name_entity = [str(x).lower().replace("'s","") for x in name_entity]
    # split names into single words ('Harry Potter' -> ['Harry', 'Potter'])
    #name_entity = [x.split(' ') for x in name_entity]
    # flatten the name list
    name_entity = flatten(name_entity)
    # remove name words that are less than 3 letters to raise recognition accuracy
    name_entity = [x for x in name_entity if len(x) >= 3]
    # remove name words that are in the set of 4000 common words
    name_entity = [x for x in name_entity if x not in words]

    return name_entity


def iterative_NER(sentence_list, threshold_rate=0.0005):
    '''
    A function to execute the name entity recognition function iteratively. The purpose of this
    function is to recognise all the important names while reducing recognition errors.
    :param sentence_list: the list of sentences from the novel
    :param threshold_rate: the per sentence frequency threshold, if a word's frequency is lower than this
    threshold, it would be removed from the list because there might be recognition errors.
    :return: a non-duplicate list of names in the novel.
    '''

    output = []
    for i in sentence_list:
        name_list = name_entity_recognition(i)
        if name_list != []:
            output.append(name_list)
    output = flatten(output)
    from collections import Counter
    output = Counter(output)
    output = [x for x in output if output[x] >= threshold_rate * len(sentence_list)]

    return output


def top_names(name_list, novel, top_num=20):
    '''
    A function to return the top names in a novel and their frequencies.
    :param name_list: the non-duplicate list of names of a novel.
    :param novel: the novel text.
    :param top_num: the number of names the function finally output.
    :return: the list of top names and the list of top names' frequency.
    '''

    vect = CountVectorizer(vocabulary=name_list, stop_words='english')
    name_frequency = vect.fit_transform([novel.lower()])
    name_frequency = pd.DataFrame(name_frequency.toarray(), columns=vect.get_feature_names())
    name_frequency = name_frequency.T
    name_frequency = name_frequency.sort_values(by=0, ascending=False)
    name_frequency = name_frequency[0:top_num]
    names = list(name_frequency.index)
    name_frequency = list(name_frequency[0])

    return name_frequency, names


nlp = spacy.load('en_core_web_sm')
words = common_words('common_words.txt')

In [49]:
novel_folder = Path(os.getcwd()) / 'novels' / 'Harry Potter 1.txt'
novel = read_novel(novel_folder)
print(novel[:1000])

Harry Potter and the Sorcerer's Stone  CHAPTER ONE  THE BOY WHO LIVED  Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.  Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.  The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs

In [40]:
sentence_list = sent_tokenize(novel)
preliminary_name_list = iterative_NER(sentence_list)
print(preliminary_name_list)

['harry potter', 'dursley', 'dursleys', 'dudley', 'potters', 'potter', 'harry', 'albus dumbledore', 'mcgonagall', 'dedalus diggle', 'dumbledore', 'voldemort', 'madam pomfrey', 'professor mcgonagall', 'hagrid', 'uncle vernon', 'vernon', 'figg', 'marge', 'h. potter', 'yeh', 'hogwarts', 'don', 'knuts', 'nah', 'gringotts', 'the leaky cauldron', 'quirrell', 'diagon alley', 'griphook', 'madam malkin', 'quidditch', 'hufflepuff', 'ollivander', 'king cross', 'hedwig', 'fred', 'ron', 'neville', 'percy', 'lee jordan', 'george weasley', 'george', 'charlie', 'chocolate frogs', 'nicolas flamel', 'hermione granger', 'hermione', 'ravenclaw', 'goyle', 'malfoy', 'draco malfoy', 'weasleys', 'gryffindor', 'seamus', 'slytherin', 'weasley', 'nearly headless nick', 'seamus finnigan', 'gryffindors', 'snape', 'filch', 'madam hooch', 'peeves', 'norris', 'sprout', 'flitwick', 'malfoy, crabbe', 'fang', 'slytherins', 'dean', 'longbottom', 'oliver wood', 'bludgers', 'snitch', 'granger', 'fred weasley', 'marcus flin

In [41]:
name_frequency, name_list = top_names(preliminary_name_list, novel, 100)
print(name_list)

['harry', 'ron', 'hagrid', 'hermione', 'don', 'snape', 'dumbledore', 'dudley', 'malfoy', 'yeh', 'neville', 'vernon', 'quirrell', 'potter', 'mcgonagall', 'gryffindor', 'hogwarts', 'quidditch', 'dursley', 'filch', 'slytherin', 'dursleys', 'weasley', 'percy', 'voldemort', 'fred', 'peeves', 'george', 'gringotts', 'flamel', 'fluffy', 'norbert', 'granger', 'goyle', 'charlie', 'ronan', 'flitwick', 'seamus', 'firenze', 'fang', 'ollivander', 'snitch', 'hufflepuff', 'sorcerer', 'bane', 'gryffindors', 'slytherins', 'norris', 'bludgers', 'hedwig', 'griphook', 'weasleys', 'dean', 'potters', 'ravenclaw', 'longbottom', 'figg', 'nah', 'knuts', 'marge', 'sprout', 'oliver wood', 'the ministry of magic', 'albus dumbledore', 'madam pomfrey', 'dedalus diggle', 'professor mcgonagall', 'marcus flint', 'fred weasley', 'madam hooch', 'malfoy, crabbe', 'uncle vernon', 'lee jordan', 'h. potter', 'seamus finnigan', 'nearly headless nick', 'the leaky cauldron', 'diagon alley', 'draco malfoy', 'madam malkin', 'herm

In [38]:
for sentence in sentence_list[:100]:
    names = name_entity_recognition(sentence)
    print(names)

['harry potter', 'lived', 'dursley']
[]
['dursley', 'grunnings']
[]
['dursley']
['dursleys', 'dudley']
['dursleys']
['potters']
['potter', 'dursley', 'dursley']
['dursleys', 'potters']
['dursleys', 'potters']
['dudley']
['dursley']
['dursley', 'dursley', 'dudley']
[]
['dursley', 'dursley', 'dudley', 'dudley']
['dursley']
[]
[]
['dursley']
[]
[]
[]
['dursley']
[]
['dursley']
[]
['dursley']
[]
[]
[]
[]
['dursley']
[]
[]
[]
['dursley']
[]
['dursley']
['dursley', 'grunnings']
['dursley']
[]
[]
[]
['dursley']
[]
[]
[]
['baker']
[]
[]
[]
[]
['harry', 'dursley']
[]
[]
[]
[]
[]
['harry']
['harry']
[]
[]
[]
['dursley']
[]
[]
[]
['dursley']
[]
[]
[]
[]
['dursley']
['dursley']
[]
[]
[]
[]
[]
[]
[]
[]
['dursley']
[]
[]
[]
['dursley']
[]
[]
['dursley']
['dudley']
['dursley']
['dudley']
[]
[]
[]
[]
['jim mcguffin']
['jim']


In [55]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
 
 
from tqdm import tqdm
import re
import string
from itertools import combinations
from collections import Counter
 
 
from flair.models import SequenceTagger
from flair.data import Sentence

# Use flair named entity recognition
tagger = SequenceTagger.load('ner')
 

2022-01-31 11:47:46,162 --------------------------------------------------------------------------------
2022-01-31 11:47:46,162 The model key 'ner' now maps to 'https://huggingface.co/flair/ner-english' on the HuggingFace ModelHub
2022-01-31 11:47:46,163  - The most current version of the model is automatically downloaded from there.
2022-01-31 11:47:46,163  - (you can alternatively manually download the original model at https://nlp.informatik.hu-berlin.de/resources/models/ner/en-ner-conll03-v0.4.pt)
2022-01-31 11:47:46,164 --------------------------------------------------------------------------------


[nltk_data] Downloading package stopwords to /Users/Xuan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Xuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2022-01-31 11:47:46,503 loading file /Users/Xuan/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4


In [56]:

# Get all the names of entities tagged as people
x = []
 
for line in tqdm(sentence_list):
#for line in sentence_list:
 sentence = Sentence(line)
 tagger.predict(sentence)
 for entity in sentence.to_dict(tag_type='ner')['entities']:
   #if entity['type'] == 'PER':
    x.append(entity['text'])

In [57]:
# Remove any punctuation within the names
names = []
for name in x:
  names.append(name.translate(str.maketrans('', '', string.punctuation)))

# List characters by the frequency with which they are mentioned
result = [item for items, c in Counter(x).most_common() 
                                      for item in [items] * c] 

print(Counter(names).most_common())

[('Harry', 1286), ('Ron', 426), ('Hagrid', 365), ('Hermione', 253), ('Snape', 169), ('Dumbledore', 145), ('Dudley', 135), ('Neville', 115), ('Malfoy', 114), ('Quirrell', 111), ('Uncle Vernon', 108), ('McGonagall', 95), ('Gryffindor', 76), ('Hogwarts', 67), ('Potter', 59), ('Aunt Petunia', 51), ('Dursleys', 50), ('Dursley', 49), ('Filch', 49), ('Wood', 48), ('Quidditch', 47), ('Slytherin', 43), ('Stone', 43), ('Voldemort', 36), ('Percy', 36), ('Peeves', 32), ('Fred', 29), ('Harry Potter', 28), ('Norbert', 26), ('Gringotts', 25), ('Goyle', 24), ('Weasley', 24), ('George', 21), ('Charlie', 20), ('Crabbe', 20), ('Ronan', 20), ('Quaffle', 19), ('Firenze', 19), ('Muggles', 18), ('Fang', 18), ('Fluffy', 18), ('Muggle', 17), ('Ollivander', 17), ('Hermione Granger', 17), ('Flitwick', 16), ('Bane', 16), ('Great Hall', 15), ('Flamel', 15), ('Privet Drive', 14), ('Hufflepuff', 14), ('Seamus', 14), ('Madam Pomfrey', 13), ('Piers', 13), ('London', 13), ('Nimbus Two Thousand', 13), ('Hedwig', 13), ('