In [1]:
import os
import re
from typing import List

import pandas as pd
import numpy as np
import spacy
from pandas import DataFrame
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
pd.set_option('display.max_rows', 50)

In [None]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
nlp = spacy.load('en_core_web_md')

# Approach

* Read source
* Split into sentences
* Extract names of characters
* Go through sentences and sum up how often two characters are referenced within x (window size) sentences

In [None]:
# Read the book txt files that are in the data directory
book_files = [b for b in os.scandir('data')]
book_files

[<DirEntry 'I - Blood of Elves.txt'>,
 <DirEntry 'B - The Sword of Destiny.txt'>,
 <DirEntry 'II - Times of Contempt.txt'>,
 <DirEntry 'E - something ends something begins.txt'>,
 <DirEntry 'IV - The Tower of the Swallow.txt'>,
 <DirEntry 'C - The Last Wish.txt'>,
 <DirEntry 'V - The Lady of the Lake.txt'>,
 <DirEntry 'III - Baptism of Fire.txt'>]

In [None]:
first_book = book_files[0]
first_book_text = open(first_book).read()
first_book_text = first_book_text.replace('\n', '')#.replace("'", "")
first_book_text[:500]

"\ufeffVerily I say unto you, the era of the sword and axe is nigh, the era of the wolf's blizzard. The Time of the White Chill and the White Light is nigh, the Time of Madness and the Time of Contempt: Tedd Deireddh, the Time of End. The world will die amidst frost and be reborn with the new sun. It will be reborn of the Elder Blood, of Hen Ichaer, of the seed that has been sown. A seed which will not sprout but will burst into flame. Ess'tuath esse! Thus it shall be! Watch for the signs! What signs "

In [None]:
first_book_doc = nlp(first_book_text)
displacy.render(first_book_doc[2000:4000], style='ent', jupyter=True)

### Read character list

In [None]:
df_characters = pd.read_csv('characters.csv')
df_characters

Unnamed: 0,book,character
0,Category:Baptism of Fire characters,Adalia
1,Category:Baptism of Fire characters,Adela
2,Category:Baptism of Fire characters,Aen Saevherne
3,Category:Baptism of Fire characters,Aevenien
4,Category:Baptism of Fire characters,Aglaïs
...,...,...
1269,Category:Time of Contempt characters,Yanna of Murivel
1270,Category:Time of Contempt characters,Yarpen Zigrin
1271,Category:Time of Contempt characters,Yennefer of Vengerberg
1272,Category:Time of Contempt characters,Yiolenta Suarez


In [None]:
# remove annotations like "Aubry ->(first born)<-"
df_characters['character'] = df_characters['character'].apply(lambda name: name.split('(', 1)[0].strip())
# save first names because characters are often referred to by first name only
df_characters['character_first_name'] = df_characters['character'].apply(lambda name: name.split(' ', 1)[0])
df_characters

Unnamed: 0,book,character,character_first_name
0,Category:Baptism of Fire characters,Adalia,Adalia
1,Category:Baptism of Fire characters,Adela,Adela
2,Category:Baptism of Fire characters,Aen Saevherne,Aen
3,Category:Baptism of Fire characters,Aevenien,Aevenien
4,Category:Baptism of Fire characters,Aglaïs,Aglaïs
...,...,...,...
1269,Category:Time of Contempt characters,Yanna of Murivel,Yanna
1270,Category:Time of Contempt characters,Yarpen Zigrin,Yarpen
1271,Category:Time of Contempt characters,Yennefer of Vengerberg,Yennefer
1272,Category:Time of Contempt characters,Yiolenta Suarez,Yiolenta


In [None]:
# This is not good obviously
print(df_characters[df_characters['character_first_name'] == 'Two'])
# Exception for this name: no first name
df_characters.loc[df_characters['character'] == 'Two Tusks', 'character_first_name'] = ""
print(df_characters[df_characters['character'] == 'Two Tusks'])

                                      book  character character_first_name
1254  Category:Time of Contempt characters  Two Tusks                  Two
                                      book  character character_first_name
1254  Category:Time of Contempt characters  Two Tusks                     


In [None]:
# Some expert knowledge: 'Emiel Regis Rohellec Terzieff-Godefroy' is most often referred to as 'Regis'
df_characters.loc[df_characters['character'] == 'Emiel Regis Rohellec Terzieff-Godefroy', 'character_first_name'] = "Regis"
print(df_characters[df_characters['character'] == 'Emiel Regis Rohellec Terzieff-Godefroy'])

                                             book  \
48            Category:Baptism of Fire characters   
646      Category:The Lady of the Lake characters   
944  Category:The Tower of the Swallow characters   

                                  character character_first_name  
48   Emiel Regis Rohellec Terzieff-Godefroy                Regis  
646  Emiel Regis Rohellec Terzieff-Godefroy                Regis  
944  Emiel Regis Rohellec Terzieff-Godefroy                Regis  


In [None]:
# Rename imposter Ciri
print(df_characters[df_characters['character'] == 'Cirilla Fiona'])
df_characters.loc[df_characters['character'] == 'Cirilla Fiona', 'character'] = "Imposter Cirilla"
df_characters.loc[df_characters['character'] == 'Imposter Cirilla', 'character_first_name'] = ""
print(df_characters[df_characters['character'] == 'Imposter Cirilla'])

                                              book      character  \
27             Category:Baptism of Fire characters  Cirilla Fiona   
613       Category:The Lady of the Lake characters  Cirilla Fiona   
923   Category:The Tower of the Swallow characters  Cirilla Fiona   
1107          Category:Time of Contempt characters  Cirilla Fiona   

     character_first_name  
27                Cirilla  
613               Cirilla  
923               Cirilla  
1107              Cirilla  
                                              book         character  \
27             Category:Baptism of Fire characters  Imposter Cirilla   
613       Category:The Lady of the Lake characters  Imposter Cirilla   
923   Category:The Tower of the Swallow characters  Imposter Cirilla   
1107          Category:Time of Contempt characters  Imposter Cirilla   

     character_first_name  
27                         
613                        
923                        
1107                       


In [None]:
# Ciris full name is somehow not in the dataset
df_characters.loc[df_characters['character_first_name'] == 'Ciri', 'character'] = "Cirilla Fiona Elen Riannon"
print(df_characters[df_characters['character_first_name'] == 'Ciri'])

                                                   book  \
26                  Category:Baptism of Fire characters   
162                  Category:Blood of Elves characters   
287                Category:Season of Storms characters   
384   Category:Something Ends, Something Begins char...   
461                Category:Sword of Destiny characters   
611            Category:The Lady of the Lake characters   
612            Category:The Lady of the Lake characters   
922        Category:The Tower of the Swallow characters   
1106               Category:Time of Contempt characters   

                       character character_first_name  
26    Cirilla Fiona Elen Riannon                 Ciri  
162   Cirilla Fiona Elen Riannon                 Ciri  
287   Cirilla Fiona Elen Riannon                 Ciri  
384   Cirilla Fiona Elen Riannon                 Ciri  
461   Cirilla Fiona Elen Riannon                 Ciri  
611   Cirilla Fiona Elen Riannon                 Ciri  
612   Cirilla Fio

In [None]:
# Add at least one instance of Ciris correct first name to catch these rare instances
idx = (df_characters['character_first_name'] == 'Ciri').idxmax()
df_characters.loc[idx, 'character_first_name'] = 'Cirilla'
print(df_characters[df_characters['character_first_name'] == 'Cirilla'])

                                                  book  \
26                 Category:Baptism of Fire characters   
385  Category:Something Ends, Something Begins char...   

                      character character_first_name  
26   Cirilla Fiona Elen Riannon              Cirilla  
385                     Cirilla              Cirilla  


In [None]:
# Remove Dryad Ciri
df_characters = df_characters.drop(df_characters[df_characters['character'] == 'Cirilla'].index)
print(df_characters[df_characters['character_first_name'] == 'Cirilla'])

                                   book                   character  \
26  Category:Baptism of Fire characters  Cirilla Fiona Elen Riannon   

   character_first_name  
26              Cirilla  


### Named characters per sentence

In [None]:
entities_per_sentence = []
for sentence in first_book_doc.sents:
    entities_in_sentence: List[str] = [e.text for e in sentence.ents]
    entities_per_sentence.append({'sentence': sentence, 'entities': entities_in_sentence})

df_entities_in_sentences = pd.DataFrame(entities_per_sentence)

In [None]:
df_entities_in_sentences.head(50)

Unnamed: 0,sentence,entities
0,"(﻿Verily, I, say, unto, you, ,, the, era, of, ...",[]
1,"(The, Time, of, the, White, Chill, and, the, W...","[the Time of Madness, the Time of Contempt:, T..."
2,"(The, world, will, die, amidst, frost, and, be...",[]
3,"(It, will, be, reborn, of, the, Elder, Blood, ...","[the Elder Blood, Hen Ichaer]"
4,"(A, seed, which, will, not, sprout, but, will,...",[]
5,"(Ess'tuath, esse, !)",[]
6,"(Thus, it, shall, be, !)",[]
7,"(Watch, for, the, signs, !)",[]
8,"(What, signs, these, shall, be, ,, I, say, unt...","[first, Aen Seidhe, the Blood of Elves]"
9,"(Aen, Ithlinnespeath, ,, Ithlinne, Aegli, aep,...","[Aen Ithlinnespeath, Ithlinne Aegli]"


In [None]:
# filter entities: only keep entities that are present in the names df

def filter_entities(all_entities: List[str], characters: DataFrame) -> List[str]:
    by_name = [e for e in all_entities if e in list(characters['character'])]
    by_first_name = [characters[characters['character_first_name'] == e]['character'].iloc[0] for e in all_entities if e in list(characters['character_first_name']) and e not in by_name]
    return by_name + by_first_name

filter_entities(['Papa Schlumpf', 'Triss Merigold', 'Cahir', 'Cintra', 'Novigrad', '3', 'Emiel Regis Rohellec Terzieff-Godefroy', 'Emiel', 'Regis'], df_characters)

['Triss Merigold',
 'Emiel Regis Rohellec Terzieff-Godefroy',
 'Cahir Mawr Dyffryn aep Ceallach',
 'Emiel Regis Rohellec Terzieff-Godefroy']

In [None]:
df_entities_in_sentences['character_entities'] = df_entities_in_sentences['entities'].apply(lambda ent_list: filter_entities(ent_list, df_characters))
df_entities_in_sentences.head(50)

Unnamed: 0,sentence,entities,character_entities
0,"(﻿Verily, I, say, unto, you, ,, the, era, of, ...",[],[]
1,"(The, Time, of, the, White, Chill, and, the, W...","[the Time of Madness, the Time of Contempt:, T...",[]
2,"(The, world, will, die, amidst, frost, and, be...",[],[]
3,"(It, will, be, reborn, of, the, Elder, Blood, ...","[the Elder Blood, Hen Ichaer]",[]
4,"(A, seed, which, will, not, sprout, but, will,...",[],[]
5,"(Ess'tuath, esse, !)",[],[]
6,"(Thus, it, shall, be, !)",[],[]
7,"(Watch, for, the, signs, !)",[],[]
8,"(What, signs, these, shall, be, ,, I, say, unt...","[first, Aen Seidhe, the Blood of Elves]",[]
9,"(Aen, Ithlinnespeath, ,, Ithlinne, Aegli, aep,...","[Aen Ithlinnespeath, Ithlinne Aegli]",[]


In [None]:
df_entities_in_sentences = df_entities_in_sentences[df_entities_in_sentences['character_entities'].map(len) > 0]
df_entities_in_sentences.head(10)

Unnamed: 0,sentence,entities,character_entities
178,"(Geralt, of, Rivia, ,, the, White, Wolf, ,, an...","[Geralt of Rivia, the White Wolf]",[Geralt of Rivia]
213,"(Geralt, the, Witcher, ,, who, had, found, her...","[Geralt, Witcher]",[Geralt of Rivia]
222,"(', Thank, you, ,, Master, Dandelion, ,, ', he...",[Dandelion],[Dandelion]
223,"(', Allow, me, ,, Radcliffe, of, Oxenfurt, ,, ...","[Radcliffe, Arcana]",[Radcliffe]
236,"(Wizard, Radcliffe, lost, himself, in, quiet, ...","[Radcliffe, Vilibert]",[Radcliffe]
266,"(Wizard, Radcliffe, lost, himself, in, quiet, ...","[Radcliffe, Vilibert]",[Radcliffe]
290,"(Master, Dandelion, bestowed, an, approving, s...",[Dandelion],[Dandelion]
291,"(', Master, !, ', shouted, a, sizeable, woman,...",[Vera Loewenhaupt],[Vera Loewenhaupt]
300,"(The, tradeswoman, did, n't, give, up, ,, gene...",[Dandelion],[Dandelion]
302,"(Your, songs, mention, no, names, ,, but, we, ...","[Geralt of Rivia, Yennefer]","[Geralt of Rivia, Yennefer of Vengerberg]"


### Relationships

In [21]:
WINDOW_SIZE = 4

last_index = df_entities_in_sentences.index[-1] - WINDOW_SIZE
#last_index = 100 - WINDOW_SIZE
relationships = []

for i in range(last_index):
    window_end_i = i + WINDOW_SIZE
    character_in_window = df_entities_in_sentences.iloc[i:window_end_i]['character_entities'].sum()
    character_in_window = list(set(character_in_window))
    print(character_in_window)

    for idx in range(len(character_in_window)):
        character_a = character_in_window[idx]
        #character_b = character_in_window[idx + 1]
        rels = [{'source': character_a, 'target': character_b} if character_a<character_b else {'source': character_b, 'target': character_a} for character_b in character_in_window[idx + 1:]]
        if len(rels) > 0:
            relationships.extend(rels)
            print(f"Relationships: f{rels}")

['Dandelion', 'Radcliffe', 'Geralt of Rivia']
Relationships: f[{'source': 'Dandelion', 'target': 'Radcliffe'}, {'source': 'Dandelion', 'target': 'Geralt of Rivia'}]
Relationships: f[{'source': 'Geralt of Rivia', 'target': 'Radcliffe'}]
['Dandelion', 'Radcliffe', 'Geralt of Rivia']
Relationships: f[{'source': 'Dandelion', 'target': 'Radcliffe'}, {'source': 'Dandelion', 'target': 'Geralt of Rivia'}]
Relationships: f[{'source': 'Geralt of Rivia', 'target': 'Radcliffe'}]
['Radcliffe', 'Dandelion']
Relationships: f[{'source': 'Dandelion', 'target': 'Radcliffe'}]
['Dandelion', 'Radcliffe']
Relationships: f[{'source': 'Dandelion', 'target': 'Radcliffe'}]
['Vera Loewenhaupt', 'Dandelion', 'Radcliffe']
Relationships: f[{'source': 'Dandelion', 'target': 'Vera Loewenhaupt'}, {'source': 'Radcliffe', 'target': 'Vera Loewenhaupt'}]
Relationships: f[{'source': 'Dandelion', 'target': 'Radcliffe'}]
['Vera Loewenhaupt', 'Dandelion', 'Radcliffe']
Relationships: f[{'source': 'Dandelion', 'target': 'Vera L

TypeError: 'int' object is not iterable

In [22]:
relationships[:50]

[{'source': 'Dandelion', 'target': 'Radcliffe'},
 {'source': 'Dandelion', 'target': 'Geralt of Rivia'},
 {'source': 'Geralt of Rivia', 'target': 'Radcliffe'},
 {'source': 'Dandelion', 'target': 'Radcliffe'},
 {'source': 'Dandelion', 'target': 'Geralt of Rivia'},
 {'source': 'Geralt of Rivia', 'target': 'Radcliffe'},
 {'source': 'Dandelion', 'target': 'Radcliffe'},
 {'source': 'Dandelion', 'target': 'Radcliffe'},
 {'source': 'Dandelion', 'target': 'Vera Loewenhaupt'},
 {'source': 'Radcliffe', 'target': 'Vera Loewenhaupt'},
 {'source': 'Dandelion', 'target': 'Radcliffe'},
 {'source': 'Dandelion', 'target': 'Vera Loewenhaupt'},
 {'source': 'Radcliffe', 'target': 'Vera Loewenhaupt'},
 {'source': 'Dandelion', 'target': 'Radcliffe'},
 {'source': 'Vera Loewenhaupt', 'target': 'Yennefer of Vengerberg'},
 {'source': 'Geralt of Rivia', 'target': 'Vera Loewenhaupt'},
 {'source': 'Dandelion', 'target': 'Vera Loewenhaupt'},
 {'source': 'Geralt of Rivia', 'target': 'Yennefer of Vengerberg'},
 {'sour

In [23]:
df_relationships = pd.DataFrame(relationships)
df_relationships.sort_values(by=['source', 'target'])

Unnamed: 0,source,target
154,Abrad the Old Oak,Dezmod
166,Abrad the Old Oak,Dezmod
181,Abrad the Old Oak,Dezmod
192,Abrad the Old Oak,Dezmod
174,Abrad the Old Oak,Radcliffe
...,...,...
1596,Yarpen Zigrin,Yennefer of Vengerberg
1808,Yarpen Zigrin,Yennefer of Vengerberg
1813,Yarpen Zigrin,Yennefer of Vengerberg
1819,Yarpen Zigrin,Yennefer of Vengerberg


In [None]:
df_relationships = df_relationships.groupby(by=['source', 'target'], sort=False,).size().reset_index(name='value')
df_relationships

### Network Visualization

In [None]:
graph = nx.from_pandas_edgelist(df=df_relationships, source='source', target='target', edge_attr='value', create_using=nx.Graph())

In [None]:
plt.figure(figsize=(10,10))
positions = nx.kamada_kawai_layout(graph)
nx.draw(graph, with_labels=True, pos=positions, node_color='skyblue', edge_cmap=plt.cm.Blues)

### Interactive with PyVis

In [None]:
network = Network(notebook=True, height='1200px', width='1400px', bgcolor='#222222', font_color='white')

#Prevent Network from wobbeling around
network.set_options("""
const options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -36000,
      "springLength": 440
    },
    "minVelocity": 0.75
  }
}
""")

# Show physics settings
#network.show_buttons(filter_=['physics'])

# Make Nodes larger if they have more connections
node_degrees = dict(graph.degree)
nx.set_node_attributes(graph, node_degrees, 'size')


network.from_nx(graph)
network.show('net.html')

### Communities

In [None]:
import community as community_louvain

In [None]:
communities = community_louvain.best_partition(graph)
communities

In [None]:
network = Network(notebook=True, height='1200px', width='1400px', bgcolor='#222222', font_color='white')

#Prevent Network from wobbeling around
network.set_options("""
const options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -36000,
      "springLength": 440
    },
    "minVelocity": 0.75
  }
}
""")

# Show physics settings
#network.show_buttons(filter_=['physics'])

# Make Nodes larger if they have more connections
node_degrees = dict(graph.degree)
nx.set_node_attributes(graph, node_degrees, 'size')
nx.set_node_attributes(graph, communities, 'group')


network.from_nx(graph)
network.show('net.html')

In [30]:
import community as community_louvain

In [32]:
communities = community_louvain.best_partition(graph)
communities

{'Dandelion': 0,
 'Radcliffe': 1,
 'Geralt of Rivia': 2,
 'Vera Loewenhaupt': 1,
 'Yennefer of Vengerberg': 3,
 'Cirilla Fiona Elen Riannon': 3,
 'Sheldon Skaggs': 1,
 'Donimir of Troy': 1,
 'Foltest': 4,
 'Pavetta': 4,
 'Eist Tuirseach': 4,
 'Abrad the Old Oak': 1,
 'Sambuk': 1,
 'Dezmod': 1,
 'Mama Lantieri': 1,
 'Rience': 0,
 'Gruzila': 0,
 'Calanthe': 4,
 'Niedamir': 4,
 'Vizimir II': 4,
 'Roach': 5,
 'Eskel': 3,
 'Lambert': 3,
 'Nicodemus de Boot': 0,
 'Triss Merigold': 5,
 'Vesemir': 3,
 'Nenneke': 3,
 'Vanielle': 3,
 'Axel Esparza': 3,
 'Fercart': 3,
 'Keira Metz': 3,
 'Artaud Terranova': 3,
 'Vilgefortz': 3,
 'Yarpen Zigrin': 5,
 'Paulie Dahlberg': 5,
 'Regan Dahlberg': 5,
 'Yannick Brass': 5,
 'Xavier Moran': 5,
 'Lucas Corto': 5,
 'Vilfrid Wenck': 5,
 'Henselt': 4,
 'Schrader': 5,
 'Zyvik': 5,
 'King of the Wild Hunt': 5,
 'Aelirenn': 2,
 'Radovid III': 2,
 'Everett': 2,
 'Iola': 3,
 'Boatbug': 2,
 'Linus Pitt': 2,
 'Boratek': 2,
 'Olsen': 2,
 'Stran': 2,
 'Shani': 0,
 'Ori R

In [34]:
network = Network(notebook=True, height='1200px', width='1400px', bgcolor='#222222', font_color='white')

#Prevent Network from wobbeling around
network.set_options("""
const options = {
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -36000,
      "springLength": 440
    },
    "minVelocity": 0.75
  }
}
""")

# Show physics settings
#network.show_buttons(filter_=['physics'])

# Make Nodes larger if they have more connections
node_degrees = dict(graph.degree)
nx.set_node_attributes(graph, node_degrees, 'size')
nx.set_node_attributes(graph, communities, 'group')


network.from_nx(graph)
network.show('net.html')