# GAME OF THRONES: A GRAPH EXPERIMENT



In [99]:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://awoiaf.westeros.org/index.php/List_of_characters')
soup = BeautifulSoup(r.text)

### get all the link that point to a character in the wiki
the get the links we browse the HTML looking for 

```html
    <a href="">name</a>
```
tag and get only those who follow a certain template. 
Then we have a list of pages to get as tuples (href, name)

In [117]:
pages = []

links = soup.find_all('a')
S1 = set(['class', 'tabindex','accesskey','id'])
for link in links:
    if link.parent.name == 'li': 
        S2 = set(link.attrs.keys())
        if not bool(S1 & S2):
            href = link['href']
            if 'A_Wiki_of_Ice_and_Fire:' not in href and 'Category:' not in href:
                characters_page = 'https://awoiaf.westeros.org' + href
                pages.append((characters_page,href.split('/')[-1]))

### request all the pages and save them for future analysis
The pages are downloaded one by one with a pause of one second to not overload the server. They are saved for the analysis of the infobox

In [128]:
import time
import os
from os.path import join

already_done = set()
for root, dirs, files in os.walk('.'):
    for name in files:
        if name.endswith('html'):
            already_done.add(name.split('.')[0])
to_be_done = set()
to_be_done_pages = {}
for href, name in pages:
    to_be_done.add(name)
    to_be_done_pages[name] = href
to_be_done = to_be_done - already_done

In [129]:
for name in to_be_done:
    href = to_be_done_pages[name]
    r = requests.get(href)
    with open(name+'.html','w+') as fp:
        fp.write(r.text)
        time.sleep(1)

### Read the characters html files and get the infobox informations
We are after a series of informations, namely:
* __books__ but only in the list of the five book published. used to disambiguate
* __name__ for obvious reasons.
* __aliases__ to get alternative use of the name
* __family__ OPTIONAL for use in the graph as a relationship in the future


In [2]:
import pickle
with open('characters.pickle','rb') as fp:
    characters = pickle.load(fp)

In [97]:
import json
import re

regex_alias = r"([a-z])(\s\s)?([A-Z])"
subst_alias = r"\1,\3"



{'Addam Marbrand': {'alias': [], 'books': {1, 2, 3, 4, 5}}, 'Addison Hill': {'alias': [], 'books': {4}}, 'Aegon Blackfyre': {'alias': [], 'books': {3}}, 'Aegon Frey': {'alias': ['Jinglebell', 'Aegon Bloodborn'], 'books': {2, 3, 4, 5}}, 'Aegon Targaryen': {'alias': ["Young Griff (possible)The Mummer's Dragon"], 'books': {1, 2, 3, 4, 5}}, 'Aegor Rivers': {'alias': ['Bittersteel'], 'books': {4, 5}}, 'Rhaenyra Targaryen': {'alias': ["The Realm's Delight", 'The Whore of Dragonstone', 'King Maegor with Teats', "Maegor's Teats", 'The Half-Year Queen'], 'books': {3, 4, 5}}, 'Aemon Blackfyre': {'alias': [], 'books': {3}}, 'Aemon Costayne': {'alias': [], 'books': {3}}, 'Aemon Estermont': {'alias': [], 'books': {2, 3, 4, 5}}, 'Aemon Rivers': {'alias': [], 'books': {2, 3, 4}}, 'Aemon Targaryen': {'alias': ['The Dragonknight', 'The Knight of Tears', 'Aemon Targaryen', 'Maester Aemon', 'Uncle Maester'], 'books': {1, 2, 3, 4, 5}}, 'Aemond Targaryen': {'alias': ['Aemond One-Eye Aemond the Kinslayer'],

## Read the chapters and the book they belong to
The books have been cut into chapters themselves into a book directory named GOT{x} where x is the book in order of publication.
We read the whole directory to retain only text files. we create a chapter list containing the text of the chapter and the book index

In [78]:
import os
from os.path import join

chapters = []

for root, dirs, files in os.walk('data'):
    for name in files:
        if name.endswith('txt'):
            book = int(root.split('/')[-1][-1:])
            with open(join(root,name),'r') as fp:
                chapters.append((book,fp.read()))
                
print('found',len(chapters),'chapters')
#print(chapters[1])

found 345 chapters


## For each Chapter, we analyze the entities and prepare the sentences
Each chapter contains some Persons, that might be characters. We save them and their position together with the book they belong to. 
This will allow us to 
* get a distance between persons
* disambiguate the characters using the book they appear in to

In [77]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import networkx as nx

STOPLIST = set(["n't", "'s", "'m", "ca"] + list(STOP_WORDS))
#SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", '\t','\n']
nlp = spacy.load('en_core_web_sm')

characters = {}
sentences = [] 
for i in range(len(chapters)):
    if i > 2:
        break
    book, chapter = chapters[i]
    chapter = chapter.strip().replace("\n", " ").replace("\t", " ")
    doc = nlp(chapter)

    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            entity = ent.text.replace("'s",'')
            if ent.text not in characters.keys():
                characters[entity] = {'books':set(),'positions':[]}
            
            characters[entity]['books'].add(book)
            characters[entity]['positions'].append((ent.start, ent.end))
            ent.merge(ent.root.tag_,ent.text, ent.label_)
    
    sentence = []
    for token in doc:
       #print(token.ent_type_, token.text, token.pos_)
        if token.ent_type_ == 'PERSON':
            text = token.text.replace(' ', '_').replace("'s",'')
            tag = token.ent_type_
            sentence.append('%s|%s' % (text, tag))
        elif token.pos_ not in ['PUNCT','SPACE']:
            sentence.append(token.lemma_.strip() if token.lemma_ != "-PRON-" else token.lower_)
        elif token.pos_ == 'PUNCT' and token.text == '.':
            sentences.append(sentence)
            sentences.append([word for word in sentence if word not in STOPLIST])
            sentence = []
            
print(characters.keys())

dict_keys(['Ned', 'Sandor Clegane', 'Robert', 'Ned knelt', 'Eddard', 'Jaime', 'Neck', 'Starks', 'Highgarden', 'Flowers', 'Robert Baratheon', 'Eddard Stark', 'Brandon', 'Kings', 'Rickard Stark', 'Catelyn Tully', 'Lyanna', 'Howland Reed', 'Targaryen', 'Grace', 'Jon', 'Eyrie', 'Tywin', 'Lannisters', 'Robert Arryn', 'Benjen', 'Jon Arryn', 'Wardens', 'Joff', 'Dothraki', 'Ser Jorah', 'Viserys', 'Faster', 'Daenerys Targaryen', 'Rhaegar', 'Jhiqui', 'Jorah Mormont', 'Mirri Maz Duur', 'Khaleesi', 'Irri', 'Maz Duur', 'Drink', 'Dany', 'Dragondew', 'Khal Drogo', 'Doreah', 'Cold', 'Jorah', 'Iron Lord', 'Aggo', 'Rakharo', 'Jhogo', 'Ko Pono', 'Khal Pono', 'Mago', 'Khal Jhaqo', 'Ko Jhaqo', 'Eroeh', 'Daenerys Stormborn', 'Cruel', 'Valyria', 'the Great Shepherd', 'Fly', 'Bran', 'Maester Luwin', 'Robb', 'Bite', 'Vaes Dothrak', 'Wall', 'North'])


In [73]:
from gensim.models import Word2Vec
model = Word2Vec(sentences, size=50, window=5, min_count=1, workers=4)
model.wv.most_similar('Bran|PERSON')

[('by', 0.9998288154602051),
 ('lannister', 0.9998278021812439),
 ('jon', 0.9998244643211365),
 ('Ned|PERSON', 0.9998223781585693),
 ('Robert|PERSON', 0.9998213052749634),
 ('brother', 0.9998184442520142),
 ('stone', 0.9998167157173157),
 ('old', 0.9998142719268799),
 ('great', 0.9998133778572083),
 ('child', 0.9998103380203247)]

In [25]:
import itertools
couples = list(itertools.combinations(characters, 2))
G.add_edges_from(couples)

In [29]:
print(nx.info(G))
nx.write_gexf(G, 'one-chapter.gexf')

Name: 
Type: Graph
Number of nodes: 17
Number of edges: 136
Average degree:  16.0000
