In [25]:
import os
import re
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', 50)

In [2]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [3]:
nlp = spacy.load('en_core_web_md')

# Approach

* Read source
* Split into sentences
* Extract names of characters
* Go through sentences and sum up how often two characters are referenced within x (window size) sentences

In [7]:
# Read the book txt files that are in the data directory
book_files = [b for b in os.scandir('data')]
book_files

[<DirEntry 'I - Blood of Elves.txt'>,
 <DirEntry 'B - The Sword of Destiny.txt'>,
 <DirEntry 'II - Times of Contempt.txt'>,
 <DirEntry 'E - something ends something begins.txt'>,
 <DirEntry 'IV - The Tower of the Swallow.txt'>,
 <DirEntry 'C - The Last Wish.txt'>,
 <DirEntry 'V - The Lady of the Lake.txt'>,
 <DirEntry 'III - Baptism of Fire.txt'>]

In [19]:
first_book = book_files[0]
first_book_text = open(first_book).read()
first_book_doc = nlp(first_book_text)

In [None]:
displacy.render(first_book_doc[2000:4000], style='ent', jupyter=True)

### Read character list

In [22]:
df_characters = pd.read_csv('characters.csv')
df_characters

Unnamed: 0,book,character
0,Category:Baptism of Fire characters,Adalia
1,Category:Baptism of Fire characters,Adela
2,Category:Baptism of Fire characters,Aen Saevherne
3,Category:Baptism of Fire characters,Aevenien
4,Category:Baptism of Fire characters,Aglaïs
...,...,...
1269,Category:Time of Contempt characters,Yanna of Murivel
1270,Category:Time of Contempt characters,Yarpen Zigrin
1271,Category:Time of Contempt characters,Yennefer of Vengerberg
1272,Category:Time of Contempt characters,Yiolenta Suarez


In [26]:
# remove annotations like "Aubry ->(first born)<-"
df_characters['character'] = df_characters['character'].apply(lambda name: name.split('(', 1)[0].strip())
# save first names because characters are often referred to by first name only
df_characters['character_first_name'] = df_characters['character'].apply(lambda name: name.split(' ', 1)[0])
df_characters

Unnamed: 0,book,character,character_first_name
0,Category:Baptism of Fire characters,Adalia,Adalia
1,Category:Baptism of Fire characters,Adela,Adela
2,Category:Baptism of Fire characters,Aen Saevherne,Aen
3,Category:Baptism of Fire characters,Aevenien,Aevenien
4,Category:Baptism of Fire characters,Aglaïs,Aglaïs
...,...,...,...
1269,Category:Time of Contempt characters,Yanna of Murivel,Yanna
1270,Category:Time of Contempt characters,Yarpen Zigrin,Yarpen
1271,Category:Time of Contempt characters,Yennefer of Vengerberg,Yennefer
1272,Category:Time of Contempt characters,Yiolenta Suarez,Yiolenta
