# Harry Potter Network

### Import packages

In [None]:
# Generic Python
import os
import re
from tqdm import tqdm
import pandas as pd
from itertools import combinations
from collections import Counter

# viz
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure, text

### NLP
# nltk
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# spacy
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

### SNA
# networkx
import networkx as nx
from networkx.algorithms import community as nx_comm

## NLP

### Import Data

In [None]:
# Location of the raw text of book 1, given relative to the current working
# directory (a raw string, so nothing in the path is treated as an escape).
harry_potter_1_path = r"../Data/nlp/texts/J. K. Rowling - Harry Potter 1 - Sorcerer's Stone.txt"

In [None]:
# Read the whole book into one string.
# The encoding is stated explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes the text file is UTF-8 encoded -- confirm against the data.
with open(harry_potter_1_path, encoding="utf-8") as f:
    harry_potter = f.read()

### Clean text
Because spaCy's NER relies on sentence structure and capitalization, the text is left otherwise unchanged; only newline characters are removed.

In [None]:
# Collapse the text onto a single line: every newline character becomes one
# space (split on '\n', then re-join the pieces with ' ').
harry_potter_clean = ' '.join(harry_potter.split('\n'))

### Tokenize into sentences
Using NLTK's sentence tokenizer

In [None]:
# Split the cleaned text with NLTK's sent_tokenize; hp_sentences is a list in
# which each element is one sentence of the text.
hp_sentences = sent_tokenize(harry_potter_clean)

### Find characters using spaCy Named Entity Recognition (NER)

In [None]:
# example of spaCy NER
# Runs the pipeline on one sentence and renders its entities with displaCy's
# "ent" style inside the notebook (jupyter=True).
# Index 420 is a hard-coded sample; it raises IndexError if the text yields
# fewer than 421 sentences.
doc = nlp(hp_sentences[420])
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# run NER over all sentences
# nlp.pipe streams the sentences through the pipeline in batches instead of
# making one nlp() call per sentence; it yields one Doc per input, in order.
# total= is passed because tqdm cannot take len() of the generator.
sent_docs = list(tqdm(nlp.pipe(hp_sentences), total=len(hp_sentences)))

In [None]:
# For every sentence, build a list holding one {entity text: entity label}
# dict per recognised entity. Entities with a date/time/number label are
# dropped.
_skipped_labels = ('DATE', 'TIME', 'ORDINAL', 'CARDINAL')
sent_ents = [
    [{ent.text: ent.label_} for ent in parsed.ents if ent.label_ not in _skipped_labels]
    for parsed in sent_docs
]
    

In [None]:
# Flatten the per-sentence entity lists into one list of
# {entity text: entity label} dicts, keeping sentence order.
all_ents = [entity for sentence_entities in sent_ents for entity in sentence_entities]


In [None]:
# Count how often each entity text occurs and list the most frequent ones.
# Every element of all_ents is a single-key dict, so its only key is the text.
entity_text = [next(iter(ent)) for ent in all_ents]

# One-row frame (a column per entity) flipped into one row per entity.
ent_counts = pd.DataFrame(Counter(entity_text), index=[0]).T.reset_index()

_named_columns = ent_counts.rename(columns={'index': 'ent', 0: 'count'})
ent_counts_final = _named_columns.sort_values(by='count', ascending=False)

# show the 50 most frequent entities
ent_counts_final.head(50)

### Name cleaning
Character names appear in many different variations (e.g. Harry Potter, Harry, Potter, Mr. Potter, etc.). Create a dictionary whose keys are the cleaned character names and whose values are lists of each name's variations.

In [None]:
# Canonical character name -> every spelling under which that character is
# looked up in the entity data.
character2name_variants = {
    'Harry Potter': ['Harry', 'Potter', 'Harry Potter'],
    'Vernon Dursley': ['Dursley', 'Vernon', 'Uncle Vernon', 'Mr. Dursley', 'Dursleys', 'Vernon Dursley'],
    'Petunia Dursley': ['Petunia', 'Mrs. Dursley', 'Petunia Dursley', 'Aunt Petunia'],
    'Dudley Dursley': ['Dudley', 'Dudley Dursley', 'Duddy'],
    'Lily Potter': ['Lily', 'Potters', 'Lily Potter'],
    'James Potter': ['James', 'Potters', 'James Potter'],
    'Albus Dumbledore': ['Dumbledore', 'Albus', 'Albus Dumbledore'],
    'Voldemort': ['YouKnow', 'You Know Who', 'He Who Must Not Be Named', 'Voldemort'],
    'Minerva McGonagall': ['McGonagall', 'Professor McGonagall', 'Minerva', 'Minerva McGonagall'],
    'Severus Snape': ['Snape', 'Severus', 'Severus Snape', 'Professor Snape'],
    'Ron Weasley': ['Ron', 'Weasley', 'Ron Weasley', 'Ronald Weasley'],
    'Hermione Granger': ['Hermione', 'Granger', 'Hermione Granger'],
    'Draco Malfoy': ['Malfoy', 'Draco', 'Draco Malfoy'],
    'Vincent Crabbe': ['Crabbe', 'Vincent Crabbe'],
    'Gregory Goyle': ['Goyle', 'Gregory Goyle'],
    'Hagrid': ['Hagrid', 'Rubeus Hagrid'],
    'Quirrell': ['Quirrell', 'Professor Quirrell'],
    'Percy Weasley': ['Percy', 'Percy Weasley'],
    'Fred Weasley': ['Fred', 'Fred Weasley'],
    'George Weasley': ['George', 'George Weasley'],
    'Neville Longbottom': ['Neville', 'Longbottom', 'Neville Longbottom'],
}

# Inverted lookup: variant spelling -> canonical character name.
# A variant listed under more than one character ('Potters' appears for both
# Lily and James) keeps the character that comes last in the dict above.
name_variant2character = {
    spelling: character
    for character, spellings in character2name_variants.items()
    for spelling in spellings
}
        

### Find co-occurrence of entities within the same sentence

In [None]:
# build co-occurrence data frame
# Each pair of entities found in the same sentence becomes one row
# (ent1, ent1_type, ent2, ent2_type), indexed by the sentence number.
# Rows are gathered in plain lists and the frame is built once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every
# previous row on each iteration.
_pair_rows = []
_pair_sentence_ids = []
for i, sentence in enumerate(sent_ents):
    # combinations() yields nothing for fewer than two entities and exactly
    # one pair for two, so no special-casing on len(sentence) is needed.
    for first, second in combinations(sentence, 2):
        # every entity is a single-key dict {text: label}
        k0, v0 = next(iter(first.items()))
        k1, v1 = next(iter(second.items()))
        _pair_rows.append((k0, v0, k1, v1))
        _pair_sentence_ids.append(i)

co_occurrence = pd.DataFrame(
    _pair_rows,
    columns=['ent1', 'ent1_type', 'ent2', 'ent2_type'],
    index=_pair_sentence_ids,
)
        

In [None]:
# Keep only the two entity-name columns and renumber the rows from 0,
# discarding the old index instead of turning it into a column.
connections_df = co_occurrence[['ent1', 'ent2']].reset_index(drop=True)
connections_df

In [None]:
# standardize names in connections_df and ent_counts
# Each name is looked up once in name_variant2character; names that are not a
# known variant are kept unchanged. This replaces one whole-frame replace()
# per variant with a single pass over each name column.
def _canonical_name(name):
    """Return the canonical character for a known variant, else *name* itself."""
    return name_variant2character.get(name, name)


connections_df = connections_df.assign(
    ent1=connections_df['ent1'].map(_canonical_name),
    ent2=connections_df['ent2'].map(_canonical_name),
)
ent_counts_final = ent_counts_final.assign(
    ent=ent_counts_final['ent'].map(_canonical_name),
)


In [None]:
# standardize ent counts
# After name standardization several rows can share the same 'ent'; add their
# counts together. The 'count' column is selected explicitly: the positional
# argument of GroupBy.sum() is `numeric_only`, not a column name, so
# .sum('count') only worked by accident.
grouped = ent_counts_final.groupby('ent')[['count']].sum()
ent_counts_cleaned = grouped.sort_values(by='count', ascending=False).reset_index()
ent_counts_cleaned

In [None]:
# Lookup of entity name -> total mention count, used later for the
# visualization effects. Built by pairing the two columns directly.
character2count = dict(zip(ent_counts_cleaned['ent'], ent_counts_cleaned['count']))

In [None]:
# Keep only the connections whose two ends are both canonical character names
# (the keys of character2name_variants), then renumber the rows from 0.
_known_characters = character2name_variants.keys()
_first_known = connections_df['ent1'].isin(_known_characters)
_second_known = connections_df['ent2'].isin(_known_characters)
filtered_connections = connections_df[_first_known & _second_known].reset_index(drop=True)
filtered_connections


# Network

In [None]:
### create nodes and relationship lists
# nodes and node count
# characters: every name appearing on either end of a filtered connection
# (built from a set, so the order is arbitrary).
# character_count: mention count per character, in the same order as
# `characters`.
characters = list(set(filtered_connections['ent1']) | set(filtered_connections['ent2']))
character_count = [character2count[char] for char in characters]

# edges and weights
# relationships keeps one (ent1, ent2) tuple per co-occurrence row.
relationships = list(zip(filtered_connections['ent1'], filtered_connections['ent2']))

# The graphs built from these edges are undirected, so (A, B) and (B, A) are
# the same relationship. Each pair is sorted before counting so both
# orientations add up to a single weight; counted separately, whichever
# orientation was added to the graph last would overwrite the other.
relationship2weight = dict(Counter(tuple(sorted(pair)) for pair in relationships))
edges_with_weights = [(a, b, weight) for (a, b), weight in relationship2weight.items()]

## Visualizations

### 1. Base Network

In [None]:
# initiate graph object
# Undirected graph: one node per character, one edge per co-occurring pair.
G = nx.Graph()

# add nodes and edges
# Repeated pairs in `relationships` collapse into a single edge in nx.Graph.
G.add_nodes_from(characters)
G.add_edges_from(relationships)

# draw
# Quick look with networkx's default layout and built-in labels; self-loops
# are not removed in this cell.
nx.draw(G, with_labels=True)


In [None]:
# Base network: unweighted, uniform node size, labels drawn manually.
# initiate graph object
G = nx.Graph()

# add nodes and edges
G.add_nodes_from(characters)
G.add_edges_from(relationships)

# remove self loops (a character co-occurring with itself)
G.remove_edges_from(nx.selfloop_edges(G))

# base graph
figure(figsize=(10,6))
pos = nx.spring_layout(G)
nx.draw(G, pos=pos, with_labels=False)
# Labels are placed by hand, centred on each node position.
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=10, ha='center', va='center')
# Title spelling corrected ("Sorcerer's").
plt.title("Harry Potter and the Sorcerer's Stone")
plt.show()

### 2. Size nodes according to count

In [None]:
# Network with node size proportional to how often each character is mentioned.
# initiate graph object
G = nx.Graph()

# add nodes and edges
G.add_nodes_from(characters)
G.add_edges_from(relationships)

# remove self loops (a character co-occurring with itself)
G.remove_edges_from(nx.selfloop_edges(G))

figure(figsize=(10,6))
pos = nx.spring_layout(G)
### size nodes based on counts
# character_count is in the same order as `characters`, which is the order
# the nodes were added to G.
nx.draw(G, pos=pos, with_labels=False, node_size=character_count)
# Labels are placed by hand, centred on each node position.
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=10, ha='center', va='center')
# Title spelling corrected ("Sorcerer's").
plt.title("Harry Potter and the Sorcerer's Stone")
plt.show()

### 3. Weight edges based on relationship counts

In [None]:
# Network with node size from mention counts and edge width from
# co-occurrence counts.
# initiate graph object
G = nx.Graph()

# add nodes and edges
G.add_nodes_from(characters)
G.add_weighted_edges_from(edges_with_weights)

# remove self loops (a character co-occurring with itself)
G.remove_edges_from(nx.selfloop_edges(G))

# get list of weights
# Read the weights back from the graph, in G.edges() order, so each width
# lands on the edge it belongs to. relationship2weight.values() cannot be
# used here: it still contains self-loops and is not in the graph's edge order.
weights = [w for _, _, w in G.edges(data='weight')]

# draw graph
figure(figsize=(10,6))
pos = nx.spring_layout(G)
nx.draw(G, pos=pos, with_labels=False, node_size=character_count, width=[0.1*w for w in weights])
# Labels are placed by hand, centred on each node position.
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=10, ha='center', va='center')
# Title spelling corrected ("Sorcerer's").
plt.title("Harry Potter and the Sorcerer's Stone")
plt.show()


## Centrality

In [None]:
# All four measures are computed on whatever graph `G` is currently bound to
# (the graph left by the most recently executed cell). Each call returns a
# mapping from node to score, which the plotting cells index by node.
# NOTE(review): G carries 'weight' edge attributes; with networkx's default
# arguments only pagerank appears to take them into account -- confirm
# whether weighted variants are wanted for the other measures.

# degree centrality
degree_centrality = nx.degree_centrality(G)

# betweenness centrality
betweenness_centrality = nx.betweenness_centrality(G)

# eigenvector centrality
eigenvector_centrality = nx.eigenvector_centrality(G)

# page rank
page_rank = nx.pagerank(G)

### 1. Degree Centrality

In [None]:
# Weighted network with nodes coloured by degree centrality.
# initiate graph object
G = nx.Graph()

# add nodes and edges
G.add_nodes_from(characters)
G.add_weighted_edges_from(edges_with_weights)

# remove self loops (a character co-occurring with itself)
G.remove_edges_from(nx.selfloop_edges(G))

# get list of weights
# Read the weights back from the graph, in G.edges() order, so each width
# lands on the edge it belongs to. relationship2weight.values() cannot be
# used here: it still contains self-loops and is not in the graph's edge order.
weights = [w for _, _, w in G.edges(data='weight')]

# assign centrality score to color map (one value per node, in node order)
color_map = [degree_centrality[node] for node in G]

# draw graph
figure(figsize=(10,6))
pos = nx.spring_layout(G)
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and later removed.
nx.draw(G, pos=pos, with_labels=False,
        node_size=character_count,
        width=[0.1*w for w in weights],
        node_color=color_map, vmin=0, vmax=max(color_map), cmap=plt.get_cmap('rainbow'))
# Labels are placed by hand, centred on each node position.
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=10, ha='center', va='center')
# Title spelling corrected ("Sorcerer's").
plt.title("Harry Potter and the Sorcerer's Stone")
plt.show()


### 2. Betweenness Centrality

In [None]:
# Weighted network with nodes coloured by betweenness centrality.
# initiate graph object
G = nx.Graph()

# add nodes and edges
G.add_nodes_from(characters)
G.add_weighted_edges_from(edges_with_weights)

# remove self loops (a character co-occurring with itself)
G.remove_edges_from(nx.selfloop_edges(G))

# get list of weights
# Read the weights back from the graph, in G.edges() order, so each width
# lands on the edge it belongs to. relationship2weight.values() cannot be
# used here: it still contains self-loops and is not in the graph's edge order.
weights = [w for _, _, w in G.edges(data='weight')]

# assign centrality score to color map (one value per node, in node order)
color_map = [betweenness_centrality[node] for node in G]

# draw graph
figure(figsize=(10,6))
pos = nx.spring_layout(G)
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and later removed.
nx.draw(G, pos=pos, with_labels=False,
        node_size=character_count,
        width=[0.1*w for w in weights],
        node_color=color_map, vmin=0, vmax=max(color_map), cmap=plt.get_cmap('rainbow'))
# Labels are placed by hand, centred on each node position.
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=10, ha='center', va='center')
# Title spelling corrected ("Sorcerer's").
plt.title("Harry Potter and the Sorcerer's Stone")
plt.show()


### 3. Eigenvector Centrality

In [None]:
# Weighted network with nodes coloured by eigenvector centrality.
# initiate graph object
G = nx.Graph()

# add nodes and edges
G.add_nodes_from(characters)
G.add_weighted_edges_from(edges_with_weights)

# remove self loops (a character co-occurring with itself)
G.remove_edges_from(nx.selfloop_edges(G))

# get list of weights
# Read the weights back from the graph, in G.edges() order, so each width
# lands on the edge it belongs to. relationship2weight.values() cannot be
# used here: it still contains self-loops and is not in the graph's edge order.
weights = [w for _, _, w in G.edges(data='weight')]

# assign centrality score to color map (one value per node, in node order)
color_map = [eigenvector_centrality[node] for node in G]

# draw graph
figure(figsize=(10,6))
pos = nx.spring_layout(G)
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and later removed.
nx.draw(G, pos=pos, with_labels=False,
        node_size=character_count,
        width=[0.1*w for w in weights],
        node_color=color_map, vmin=0, vmax=max(color_map), cmap=plt.get_cmap('rainbow'))
# Labels are placed by hand, centred on each node position.
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=10, ha='center', va='center')
# Title spelling corrected ("Sorcerer's").
plt.title("Harry Potter and the Sorcerer's Stone")
plt.show()


### 4. PageRank

In [None]:
# Weighted network with nodes coloured by PageRank.
# initiate graph object
G = nx.Graph()

# add nodes and edges
G.add_nodes_from(characters)
G.add_weighted_edges_from(edges_with_weights)

# remove self loops (a character co-occurring with itself)
G.remove_edges_from(nx.selfloop_edges(G))

# get list of weights
# Read the weights back from the graph, in G.edges() order, so each width
# lands on the edge it belongs to. relationship2weight.values() cannot be
# used here: it still contains self-loops and is not in the graph's edge order.
weights = [w for _, _, w in G.edges(data='weight')]

# assign centrality score to color map (one value per node, in node order)
color_map = [page_rank[node] for node in G]

# draw graph
figure(figsize=(10,6))
pos = nx.spring_layout(G)
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and later removed.
nx.draw(G, pos=pos, with_labels=False,
        node_size=character_count,
        width=[0.1*w for w in weights],
        node_color=color_map, vmin=0, vmax=max(color_map), cmap=plt.get_cmap('rainbow'))
# Labels are placed by hand, centred on each node position.
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=10, ha='center', va='center')
# Title spelling corrected ("Sorcerer's").
plt.title("Harry Potter and the Sorcerer's Stone")
plt.show()


## Community Detection


In [None]:
# Weighted network with nodes coloured by their Louvain community.
# initiate graph object
G = nx.Graph()

# add nodes and edges
G.add_nodes_from(characters)
G.add_weighted_edges_from(edges_with_weights)

# remove self loops (a character co-occurring with itself)
G.remove_edges_from(nx.selfloop_edges(G))

# get list of weights
# Read the weights back from the graph, in G.edges() order, so each width
# lands on the edge it belongs to. relationship2weight.values() cannot be
# used here: it still contains self-loops and is not in the graph's edge order.
weights = [w for _, _, w in G.edges(data='weight')]

# run Louvain community detection
# Louvain makes random choices; the seed fixes them so re-running this cell on
# the same graph returns the same partition.
louvain = nx_comm.louvain_communities(G, seed=0)

# Colour nodes from the partition that was actually computed, rather than
# from character lists copied out of an earlier run (which stop matching as
# soon as the partition changes).
# Communities are ordered largest first, ties broken by their alphabetically
# first member, so the colour assignment does not depend on the order in
# which Louvain happens to return them.
_palette = ['green', 'gray', 'yellow', 'red', 'blue']
_ordered_communities = sorted(louvain, key=lambda members: (-len(members), min(members)))
_node2community = {
    node: community_id
    for community_id, members in enumerate(_ordered_communities)
    for node in members
}
# Communities beyond the palette fall back to 'tan'.
color_map = [
    _palette[_node2community[node]] if _node2community[node] < len(_palette) else 'tan'
    for node in G
]

# draw graph
figure(figsize=(10,6))
pos = nx.spring_layout(G)
nx.draw(G, pos=pos, with_labels=False,
        node_size=character_count,
        width=[0.1*w for w in weights],
        node_color=color_map)
# Labels are placed by hand, centred on each node position.
for node, (x, y) in pos.items():
    text(x, y, node, fontsize=10, ha='center', va='center')
# Title spelling corrected ("Sorcerer's").
plt.title("Harry Potter and the Sorcerer's Stone")
plt.show()
