In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx

import matplotlib.pyplot as plt

In [2]:
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 29.7 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Load spacy English languague model
NER = spacy.load("en_core_web_sm")
NER.max_length = 1500000

In [4]:
import os
#Get txt file from data folder
book_data = [b for b in os.scandir('data') if '.txt' in b.name]

In [5]:
book_data

[<DirEntry 'A-Song-of-Ice-and-Fire-2-_-Martin_-George-R.-R.-A-Clash-of-Kings-libgen.li.txt'>,
 <DirEntry 'A-Song-of-Ice-and-Fire-A-Targaryen-History-_-Martin_-George-R.R.txt'>]

In [6]:
book = book_data[1]
book_text = open(book, encoding="utf-8").read()
book_doc = NER(book_text)

In [7]:

# Visualize identified entities
displacy.render(book_doc[0:2000], style="ent", jupyter=True)

# Load character names

In [8]:
character_df = pd.read_csv("characters.csv")

In [9]:
# Remove brackets and text within brackets for names wit (extra information) behind it
import re 
character_df['character'] = character_df['character'].apply(lambda x: re.sub("[\(].*?[\)]", "", x)) 
#filter firstname from names
character_df['character_firstname'] = character_df['character'].apply(lambda x: x.split(' ', 1)[0])

In [10]:
pd.set_option('display.max_rows', None)
character_df

Unnamed: 0.1,Unnamed: 0,character,url,character_firstname
0,0,Boremund Baratheon,https://gameofthrones.fandom.com/wiki/Boremund...,Boremund
1,1,Borros Baratheon,https://gameofthrones.fandom.com/wiki/Borros_B...,Borros
2,2,Ellyn Baratheon,https://gameofthrones.fandom.com/wiki/Ellyn_Ba...,Ellyn
3,3,Floris Baratheon,https://gameofthrones.fandom.com/wiki/Floris_B...,Floris
4,4,Randyll Barret,https://gameofthrones.fandom.com/wiki/Randyll_...,Randyll
5,5,Lyman Beesbury,https://gameofthrones.fandom.com/wiki/Lyman_Be...,Lyman
6,6,Jerrel Bracken,https://gameofthrones.fandom.com/wiki/Jerrel_B...,Jerrel
7,7,Caraxes,https://gameofthrones.fandom.com/wiki/Caraxes,Caraxes
8,8,Arryk Cargyll,https://gameofthrones.fandom.com/wiki/Arryk_Ca...,Arryk
9,9,Erryk Cargyll,https://gameofthrones.fandom.com/wiki/Erryk_Ca...,Erryk


# Get named entity list per sentence

In [11]:
sent_entity_df = []

# Loop through sentences, store named entity list for each sentence
for sent in book_doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent, "entities": entity_list})
    
sent_entity_df = pd.DataFrame(sent_entity_df)

In [12]:
sent_entity_df

Unnamed: 0,sentence,entities
0,"(Fire, &, Blood, is, a, work, of, fiction, .)",[Fire & Blood]
1,"(Names, ,, places, ,, and, incidents, are, pro...",[]
2,"(Any, resemblance, to, actual, events, ,, loca...",[]
3,"(Copyright, ©, 2018, by, George, R., R., Marti...","[2018, George R. R. Martin, 2018, Penguin Rand..."
4,"(Published, in, the, United, States, by, Banta...","[the United States, Bantam Books, Random House..."
5,"(BANTAM, BOOKS, and, the, HOUSE, colophon, are...","[HOUSE, Penguin Random House]"
6,"(Portions, of, this, book, were, previously, p...","[The World of Ice & Fire, George R. R. Martin,..."
7,"(Hardback, ISBN, , 9781524796280, \n\n, Ebook...","[Virginia Norey, David G. Stevenson\n\nCover]"
8,"(Bastien, Lecouffe, Deharme, \n\n, v5.4, \n\n,...",[Bastien Lecouffe Deharme\n\nv5.4\n\nep]
9,"(Conquest, \n\n, Reign, of, the, Dragon, —, Th...",[]


In [13]:
# Function to filter out non-character entities
def filter_entity(ent_list, character_df):
    return [ent for ent in ent_list 
            if ent in list(character_df.character) 
            or ent in list(character_df.character_firstname)]

In [15]:
filter_entity(["Boremund", "Thu", "2"], character_df)

['Boremund']

In [16]:
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, character_df))

# Filter out sentences that don't have any character entities
sent_entity_df_filtered = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0]
sent_entity_df_filtered.head(10)

Unnamed: 0,sentence,entities,character_entities
11,"(AC, \n\n, A, Surfeit, of, Rulers, \n\n, A, Ti...","[Betrayal, Alysanne—Their Triumphs and Tragedi...",[Aegon]
15,"(More, than, two, years, passed, between, Aego...","[More than two years, Aegon, Oldtown, Conquest...",[Aegon]
16,"(Sporadic, attempts, to, bring, the, Dornishme...",[Aegon],[Aegon]
20,"(The, day, of, Aegon, ’s, Landing, was, celebr...","[Aegon, Conqueror, the day, the Starry Sept of...",[Aegon]
21,"(This, coronation, took, place, two, years, af...","[two years, Aegon, three, the Wars of Conquest]",[Aegon]
22,"(Thus, it, can, be, seen, that, most, of, Aego...","[Aegon, BC, Conquest]",[Aegon]
35,"(Gaemon, ’s, son, Aegon, and, his, daughter, E...",[Aegon],[Aegon]
38,"(The, Aegon, who, would, be, known, to, histor...","[Aegon, Aegon, 27, BC]","[Aegon, Aegon]"
40,"(Aegon, had, two, trueborn, siblings, ;, an, e...","[Aegon, two, Visenya, Rhaenys]","[Aegon, Rhaenys]"
41,"(It, had, long, been, the, custom, amongst, th...","[Valyria, Aegon]",[Aegon]


In [17]:
# Take only first name of characters (bvb Aegon,Aegon)
sent_entity_df_filtered['character_entities'] = sent_entity_df_filtered['character_entities'].apply(lambda x: [item.split()[0] 
                                                                                                               for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_entity_df_filtered['character_entities'] = sent_entity_df_filtered['character_entities'].apply(lambda x: [item.split()[0]


In [18]:
pd.reset_option('^display.', silent=True)
sent_entity_df_filtered

Unnamed: 0,sentence,entities,character_entities
11,"(AC, \n\n, A, Surfeit, of, Rulers, \n\n, A, Ti...","[Betrayal, Alysanne—Their Triumphs and Tragedi...",[Aegon]
15,"(More, than, two, years, passed, between, Aego...","[More than two years, Aegon, Oldtown, Conquest...",[Aegon]
16,"(Sporadic, attempts, to, bring, the, Dornishme...",[Aegon],[Aegon]
20,"(The, day, of, Aegon, ’s, Landing, was, celebr...","[Aegon, Conqueror, the day, the Starry Sept of...",[Aegon]
21,"(This, coronation, took, place, two, years, af...","[two years, Aegon, three, the Wars of Conquest]",[Aegon]
...,...,...,...
11901,"(“, Lord, Stackspear, ,, Lord, Grandison, ,, L...","[Grandison, Merryweather]",[Merryweather]
11920,"(Lineages, and, Family, Tree, \n\n\n\n\n\n, 1–...","[1–37, Dragon, 37–42, 42–48, Maegor I, Visenya...",[Rhaenyra]
11922,"(131–157, \n\n\n, Aegon, III, \n\n\n\n, the, D...","[131–157, Aegon III, Rhaenyra, Aegon]","[Rhaenyra, Aegon]"
11927,"(161–171, \n\n\n, Baelor, I, \n\n\n\n, the, Be...","[161–171, septon, Aegon, 171–172, Viserys II, ...","[Aegon, Aegon]"


# Create relationships