# GAME OF THRONES: A GRAPH EXPERIMENT



### get the extracted characters informations from pickles

The dictionaries are loaded directly

In [9]:
import pickle
with open('characters.pickle','rb') as fp:
    characters = pickle.load(fp)

In [10]:
print(characters[5])

{'aliases': None, 'awoif_in_degree': 0, 'awoif_infobox_length': 0, 'awoif_links': ['Sandor_Clegane', 'Beric_Dondarrion', 'Anguy', 'Kyle', 'Thoros_of_Myr'], 'awoif_out_degree': 5, 'awoif_page_size': 22295, 'books': [3], 'category': 'appears', 'common_name': 'Dennet', 'fullname': None, 'short_name': 'Dennett', 'titles': None, 'url': 'Dennet', 'score': 0.028193470151365908}


## Read the chapters and the book they belong to
The books have been cut into chapters themselves into a book directory named GOT{x} where x is the book in order of publication.
We read the whole directory to retain only text files. we create a chapter list containing the text of the chapter and the book index

In [11]:
import os
from os.path import join

chapters = []

for root, dirs, files in os.walk('data'):
    for name in files:
        if name.endswith('txt'):
            book = int(root.split('/')[-1][-1:])
            with open(join(root,name),'r') as fp:
                chapters.append((book,fp.read()))
                
print('found',len(chapters),'chapters')
#print(chapters[1])

found 345 chapters


## Lemmatize a person entity : Get Most Probable Character for an incomplete NLP person entity
Check partial entity against character properties set in the following order
1. Check for __identity__
    1. check _without_ the title
        1. common name
        2. short name
        3. full name
        4. aliases
    2. check _with_ the title + name
        1. common name
        2. short name
        3. full name
        4. aliases
2. Check for __partial__ inclusion
    1. check _without_ the title
        1. common name
        2. short name
        3. full name
        4. aliases
    2. check _with_ the title + name
        1. common name
        2. short name
        3. full name
        4. aliases

If two candidate appear at the same level, we use the maximal distance scoring to determine the most valuable character

In [12]:
def getMVP(name, shortened, key, equal=True, title=False):
    candidates = []
        
    for s in shortened:
        if key != 'aliases':
            if equal and s[key] is not None:
                if not title:
                    if name == s[key]:
                        candidates.append(s)
                elif s['titles'] is not None:
                    for t in s['titles']:
                        if name == t + ' ' + s[key]:
                            candidates.append(s)
                        
            elif not equal and s[key] is not None:
                if not title:
                    if name in s[key]:
                        candidates.append(s)
                elif s['titles'] is not None:
                    for t in s['titles']:
                        if name in t + ' ' + s[key]:
                            candidates.append(s)
        else:
            if equal and s['aliases'] is not None:
                for a in s['aliases']:
                    if name == a:
                        candidates.append(s)
            elif not equal and s['aliases'] is not None:
                for a in s['aliases']:
                    if name in a:
                        candidates.append(s)
    # at equal level, we give priority to lenghty wikipedia (normalize) * links (normalize)
    if len(candidates)>0:
        mvp = sorted(candidates, key=lambda c: c['score'], reverse=True)[0]
        return mvp['common_name']
    else:
        None

def lemmatize(name, book):
    shortened = []
    for c in characters:
        if c['books'] is not None:
            if book in c['books']:
                shortened.append(c)
    
    priority_fields = ['common_name', 'short_name', 'fullname', 'aliases']
    priority_equality = [True, False]
    priority_title = [False, True]
    
    for pf in priority_fields:
        for pe in priority_equality:
            for pt in priority_title:
                candidate = getMVP(name, shortened, pf, pe, pt)
                if candidate is not None:
                    return candidate
    return None
    

In [13]:
print(lemmatize('Ser Jorah',1))
print(lemmatize('Ned',1))
# Edric Dayne
# Eddard Stark

Jorah Mormont
Eddard Stark


## Prepare the graph using lemmas mentionned in the same chapter
The threshold is the max allowed distance between two lemmas in the text. Typically 500 characters. 

In [14]:
import itertools

def addCharactersToGraph(characters, graph, threshold):
    graph.add_nodes_from(characters.keys())
    pairs = itertools.combinations(characters.keys(),2) 
    for u,v in pairs:
        weight = 0
        for ui in characters[u]['positions']:
            for vi in characters[v]['positions']:
                if abs(vi-ui) < threshold:
                    weight+=1
        if weight > 0:
            if (u,v) not in graph.edges():
                graph.add_edge(u,v,weight=weight)
            else:
                graph[u][v]['weight'] = graph[u][v]['weight'] + weight
            
    return graph

## For each Chapter, we analyze the entities and prepare the sentences
Each chapter contains some Persons, that might be characters. We save them and their position together with the book they belong to. 
This will allow us to 
* get a distance between persons
* disambiguate the characters using the book they appear in to

In [15]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import networkx as nx

STOPLIST = set(["n't", "'s", "'m", "ca"] + list(STOP_WORDS))
#SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", '\t','\n']
nlp = spacy.load('en_core_web_sm')

sentences = []
G = nx.Graph()

for i in range(len(chapters)):
    #if i > 2:
        #break
        
    chapterCharacters = {}
    book, chapter = chapters[i]
    chapter = chapter.strip().replace("\n", " ").replace("\t", " ")
    doc = nlp(chapter)

    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            entity = ent.text.replace("'s",'')
            if entity.startswith('Maester'):
                entity = entity.replace('Maester ','')
            lemma = lemmatize(entity,book)
            #print(entity,'->',lemma)
            
            if lemma is not None:
                if lemma not in chapterCharacters.keys():
                    chapterCharacters[lemma] = {'books':set(),'positions':[]}
                chapterCharacters[lemma]['books'].add(book)
                chapterCharacters[lemma]['positions'].append(ent.start)
            ent.merge(ent.root.tag_,lemma if lemma is not None else ent.text, ent.label_)
    
    sentence = []
    for token in doc:
       #print(token.ent_type_, token.text, token.pos_)
        if token.ent_type_ == 'PERSON':
            text = token.text.replace(' ', '_').replace("'s",'')
            tag = token.ent_type_
            sentence.append('%s|%s' % (text, tag))
        elif token.pos_ not in ['PUNCT','SPACE']:
            sentence.append(token.lemma_.strip() if token.lemma_ != "-PRON-" else token.lower_)
        elif token.pos_ == 'PUNCT' and token.text == '.':
            #sentences.append(sentence)
            sentences.append([word for word in sentence if word not in STOPLIST])
            sentence = []
    
    #print(chapterCharacters)
    G = addCharactersToGraph(chapterCharacters,G,500)
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 1211
Number of edges: 27720
Average degree:  45.7803


In [16]:
nx.write_gexf(G,'GOT-characters-NLP.gexf')

## create a gensim word2vec model from GOT books

In [17]:
from gensim.models import Word2Vec
model = Word2Vec(sentences, size=50, window=5, min_count=1, workers=4)
model.wv.most_similar('Eddard_Stark|PERSON')

  if np.issubdtype(vec.dtype, np.int):


[('Tywin_Lannister|PERSON', 0.9669781923294067),
 ('Arnolf|PERSON', 0.9654096364974976),
 ('Hoster_Tully|PERSON', 0.9652721285820007),
 ('Brandon|PERSON', 0.9609426259994507),
 ('protector', 0.9608691334724426),
 ('brightwater', 0.9599543809890747),
 ('Vale|PERSON', 0.9597084522247314),
 ('grandfather', 0.9590954184532166),
 ('foster', 0.9562659859657288),
 ('liege', 0.9560325145721436)]