# GAME OF THRONES: A GRAPH EXPERIMENT



### get the extracted characters informations from pickles

The dictionaries are loaded directly

In [26]:
import pickle
with open('characters.pickle','rb') as fp:
    characters = pickle.load(fp)

In [27]:
print(characters)

[{'short_name': 'Sumner Crakehall', 'common_name': 'Sumner Crakehall', 'fullname': None, 'aliases': None, 'books': [3, 4], 'url': 'Sumner_Crakehall'}, {'short_name': None, 'common_name': 'Lamprey', 'fullname': None, 'aliases': ['Lamprey'], 'books': [3], 'url': 'Lamprey'}, {'short_name': 'Rhaego', 'common_name': 'Rhaego', 'fullname': None, 'aliases': ['The stallion who mounts the world'], 'books': [1, 2, 3, 4], 'url': 'Rhaego'}, {'short_name': 'Jeyne Waters', 'common_name': 'Jeyne Waters', 'fullname': None, 'aliases': None, 'books': [], 'url': 'Jeyne_Waters'}, {'short_name': 'Sybell Spicer', 'common_name': 'Sybell Spicer', 'fullname': None, 'aliases': None, 'books': [3, 4], 'url': 'Sybell_Spicer'}, {'short_name': 'Dennett', 'common_name': 'Dennet', 'fullname': None, 'aliases': None, 'books': [3], 'url': 'Dennet'}, {'short_name': 'Lord Ashford', 'common_name': 'Lord Ashford', 'fullname': None, 'aliases': None, 'books': [], 'url': 'Lord_Ashford'}, {'short_name': 'Selyse Florent', 'common_

In [28]:
with open('characters_links.pickle','rb') as fp:
    character_links = pickle.load(fp)

## Read the chapters and the book they belong to
The books have been cut into chapters themselves into a book directory named GOT{x} where x is the book in order of publication.
We read the whole directory to retain only text files. we create a chapter list containing the text of the chapter and the book index

In [29]:
import os
from os.path import join

chapters = []

for root, dirs, files in os.walk('data'):
    for name in files:
        if name.endswith('txt'):
            book = int(root.split('/')[-1][-1:])
            with open(join(root,name),'r') as fp:
                chapters.append((book,fp.read()))
                
print('found',len(chapters),'chapters')
#print(chapters[1])

found 345 chapters


## For each Chapter, we analyze the entities and prepare the sentences
Each chapter contains some Persons, that might be characters. We save them and their position together with the book they belong to. 
This will allow us to 
* get a distance between persons
* disambiguate the characters using the book they appear in to

In [76]:
def getCandidates(name, shortened, key, equal=True):
    candidates = []
    print(name, key, equal)
    for s in shortened:
        if key != 'aliases':
            if equal and s[key] is not None:
                if name == s[key]:
                    candidates.append(s)
            elif not equal and s[key] is not None:
                if name in s[key]:
                    candidates.append(s)
        else:
            if equal and s['aliases'] is not None:
                for a in s['aliases']:
                    if name == a:
                        candidates.append(s)
            elif not equal and s['aliases'] is not None:
                for a in s['aliases']:
                    if name in a:
                        candidates.append(s)
                
    if len(candidates)>0:
        #if len(candidates)>1:
        #    mvp = sorted(candidates.items(), key=lambda kv: len(kv[1]['books']))[0]
        #    return mvp['common_name']
        #else:
        return candidates[0]['common_name']
    else:
        None

def lemmatize(name, book):
    shortened = []
    for c in characters:
        if c['books'] is not None:
            if book in c['books']:
                shortened.append(c)
    
    priorities = ['common_name', 'short_name', 'fullname', 'aliases']
    
    for priority in priorities:
        candidate = getCandidates(name, shortened, priority)
        if candidate is not None:
            return candidate
    
    for priority in priorities:
        candidate = getCandidates(name, shortened, priority, equal=False)
        if candidate is not None:
            return candidate
    
    return None
    

In [77]:
print(lemmatize('Daenerys',1))

Daenerys common_name True
Daenerys short_name True
Daenerys fullname True
Daenerys aliases True
Daenerys common_name False
Daenerys short_name False
Daenerys fullname False
Daenerys aliases False
None


In [11]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import networkx as nx

STOPLIST = set(["n't", "'s", "'m", "ca"] + list(STOP_WORDS))
#SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", '\t','\n']
nlp = spacy.load('en_core_web_sm')

characters = {}
sentences = [] 
for i in range(len(chapters)):
    if i > 2:
        break
    book, chapter = chapters[i]
    chapter = chapter.strip().replace("\n", " ").replace("\t", " ")
    doc = nlp(chapter)

    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            entity = ent.text.replace("'s",'')
            print(entity, lemmatize(entity,book))
            if ent.text not in characters.keys():
                characters[entity] = {'books':set(),'positions':[]}
            
            characters[entity]['books'].add(book)
            characters[entity]['positions'].append((ent.start, ent.end))
            ent.merge(ent.root.tag_,ent.text, ent.label_)
    
    sentence = []
    for token in doc:
       #print(token.ent_type_, token.text, token.pos_)
        if token.ent_type_ == 'PERSON':
            text = token.text.replace(' ', '_').replace("'s",'')
            tag = token.ent_type_
            sentence.append('%s|%s' % (text, tag))
        elif token.pos_ not in ['PUNCT','SPACE']:
            sentence.append(token.lemma_.strip() if token.lemma_ != "-PRON-" else token.lower_)
        elif token.pos_ == 'PUNCT' and token.text == '.':
            sentences.append(sentence)
            sentences.append([word for word in sentence if word not in STOPLIST])
            sentence = []

print(characters.keys())

Ned None


TypeError: string indices must be integers

In [73]:
from gensim.models import Word2Vec
model = Word2Vec(sentences, size=50, window=5, min_count=1, workers=4)
model.wv.most_similar('Bran|PERSON')

[('by', 0.9998288154602051),
 ('lannister', 0.9998278021812439),
 ('jon', 0.9998244643211365),
 ('Ned|PERSON', 0.9998223781585693),
 ('Robert|PERSON', 0.9998213052749634),
 ('brother', 0.9998184442520142),
 ('stone', 0.9998167157173157),
 ('old', 0.9998142719268799),
 ('great', 0.9998133778572083),
 ('child', 0.9998103380203247)]

In [25]:
import itertools
couples = list(itertools.combinations(characters, 2))
G.add_edges_from(couples)

In [29]:
print(nx.info(G))
nx.write_gexf(G, 'one-chapter.gexf')

Name: 
Type: Graph
Number of nodes: 17
Number of edges: 136
Average degree:  16.0000
