# GAME OF THRONES: A GRAPH EXPERIMENT



### get the extracted characters informations from pickles

The dictionaries are loaded directly

In [1]:
import pickle
with open('characters.pickle','rb') as fp:
    characters = pickle.load(fp)

In [2]:
print(characters[5])

{'aliases': None, 'awoif_in_degree': 0, 'awoif_infobox_length': 4, 'awoif_links': ['Sandor_Clegane', 'Beric_Dondarrion', 'Anguy', 'Kyle', 'Thoros_of_Myr'], 'awoif_out_degree': 5, 'awoif_page_size': 22295, 'books': [3], 'category': 'appears', 'common_name': 'Dennet', 'fullname': None, 'score': 0.12814004744487947, 'short_name': 'Dennett', 'titles': None, 'url': 'Dennet'}


## Read the chapters and the book they belong to
The books have been cut into chapters themselves into a book directory named GOT{x} where x is the book in order of publication.
We read the whole directory to retain only text files. we create a chapter list containing the text of the chapter and the book index

In [3]:
import os
from os.path import join

chapters = []

for root, dirs, files in os.walk('data'):
    for name in files:
        if name.endswith('txt'):
            book = int(root.split('/')[-1][-1:])
            with open(join(root,name),'r') as fp:
                chapters.append((book,fp.read()))
                
print('found',len(chapters),'chapters')
#print(chapters[1])

found 345 chapters


## Lemmatize a person entity : Get Most Probable Character for an incomplete NLP person entity
Check partial entity against character properties set in the following order
1. Check for __identity__
    1. check _without_ the title
        1. common name
        2. short name
        3. full name
        4. aliases
    2. check _with_ the title + name
        1. common name
        2. short name
        3. full name
        4. aliases
2. Check for __partial__ inclusion
    1. check _without_ the title
        1. common name
        2. short name
        3. full name
        4. aliases
    2. check _with_ the title + name
        1. common name
        2. short name
        3. full name
        4. aliases

If two candidate appear at the same level, we use the maximal distance scoring to determine the most valuable character

In [5]:
def getMVP(name, shortened, key, equal=True, title=False):
    candidates = []
        
    for s in shortened:
        if key != 'aliases':
            if equal and s[key] is not None:
                if not title:
                    if name == s[key]:
                        candidates.append(s)
                elif s['titles'] is not None:
                    for t in s['titles']:
                        if name == t + ' ' + s[key]:
                            candidates.append(s)
                        
            elif not equal and s[key] is not None:
                if not title:
                    if name in s[key]:
                        candidates.append(s)
                elif s['titles'] is not None:
                    for t in s['titles']:
                        if name in t + ' ' + s[key]:
                            candidates.append(s)
        else:
            if equal and s['aliases'] is not None:
                for a in s['aliases']:
                    if name == a:
                        candidates.append(s)
            elif not equal and s['aliases'] is not None:
                for a in s['aliases']:
                    if name in a:
                        candidates.append(s)
    # at equal level, we give priority to lenghty wikipedia (normalize) * links (normalize)
    if len(candidates)>0:
        mvp = sorted(candidates, key=lambda c: c['score'], reverse=True)[0]
        return mvp['common_name']
    else:
        None

def lemmatize(name, book):
    shortened = []
    for c in characters:
        if c['books'] is not None:
            if book in c['books']:
                shortened.append(c)
    
    priority_fields = ['common_name', 'short_name', 'fullname', 'aliases']
    priority_equality = [True, False]
    priority_title = [False, True]
    
    for pf in priority_fields:
        for pe in priority_equality:
            for pt in priority_title:
                candidate = getMVP(name, shortened, pf, pe, pt)
                if candidate is not None:
                    return candidate
    return None
    

In [8]:
print(lemmatize('Ser Jorah',1))
print(lemmatize('Ned',1))
print(lemmatize('Brandon',1))
# Edric Dayne
# Eddard Stark

Jorah Mormont
Eddard Stark
Brandon Stark


## Prepare the graph using lemmas mentionned in the same chapter
The threshold is the max allowed distance between two lemmas in the text. Typically 500 characters. 

In [9]:
import itertools

def addCharactersToGraph(characters, graph, threshold):
    graph.add_nodes_from(characters.keys())
    pairs = itertools.combinations(characters.keys(),2) 
    for u,v in pairs:
        weight = 0
        for ui in characters[u]['positions']:
            for vi in characters[v]['positions']:
                if abs(vi-ui) < threshold:
                    weight+=1
        if weight > 0:
            if (u,v) not in graph.edges():
                graph.add_edge(u,v,weight=weight)
            else:
                graph[u][v]['weight'] = graph[u][v]['weight'] + weight
            
    return graph

## For each Chapter, we analyze the entities and prepare the sentences
Each chapter contains some Persons, that might be characters. We save them and their position together with the book they belong to. 
This will allow us to 
* get a distance between persons
* disambiguate the characters using the book they appear in to

In [24]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import networkx as nx

STOPLIST = set(["n't", "'s", "'m", "ca"] + list(STOP_WORDS))
#SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", '\t','\n']
nlp = spacy.load('en_core_web_sm')

sentences = []
G = nx.Graph()

for i in range(len(chapters)):
    #if i > 2:
        #break
        
    chapterCharacters = {}
    book, chapter = chapters[i]
    chapter = chapter.strip().replace("\n", " ").replace("\t", " ")
    doc = nlp(chapter)

    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            entity = ent.text.replace("'s",'')
            if entity.startswith('Maester'):
                entity = entity.replace('Maester ','')
            lemma = lemmatize(entity,book)
            #print(entity,'->',lemma)
            
            if lemma is not None:
                if lemma not in chapterCharacters.keys():
                    chapterCharacters[lemma] = {'books':set(),'positions':[]}
                chapterCharacters[lemma]['books'].add(book)
                chapterCharacters[lemma]['positions'].append(ent.start)
            ent.merge(ent.root.tag_,lemma if lemma is not None else ent.text, ent.label_)
    
    sentence = []
    for token in doc:
        print(token.ent_type_, token.text, token.pos_)
        if token.ent_type_ == 'PERSON':
            text = token.text.replace(' ', '_').replace("'s",'')
            tag = token.ent_type_
            sentence.append('%s|%s' % (text, tag))
        elif token.pos_ not in ['PUNCT','SPACE']:
            sentence.append(token.lemma_.strip() if token.lemma_ != "-PRON-" else token.lower_)
        elif token.pos_ == 'PUNCT' and token.text == '.':
            #sentences.append(sentence)
            sentences.append([word for word in sentence if word not in STOPLIST])
            sentence = []
    
    #print(chapterCharacters)
    G = addCharactersToGraph(chapterCharacters,G,500)
print(nx.info(G))

 The DET
 visitors NOUN
 poured VERB
 through ADP
 the DET
 castle ADJ
 gates NOUN
 in ADP
 a DET
 river NOUN
 of ADP
 gold NOUN
 and CCONJ
 silver NOUN
 and CCONJ
 polished ADJ
 steel NOUN
 , PUNCT
CARDINAL three NUM
CARDINAL hundred NUM
 strong ADJ
 , PUNCT
 a DET
 pride NOUN
 of ADP
 bannermen NOUN
 and CCONJ
 knights NOUN
 , PUNCT
 of ADP
 sworn VERB
 swords NOUN
 and CCONJ
 freeriders NOUN
 . PUNCT
 Over ADP
 their ADJ
 heads NOUN
 a DET
 dozen NOUN
 golden ADJ
 banners NOUN
 whipped VERB
 back ADV
 and CCONJ
 forth ADV
 in ADP
 the DET
 northern ADJ
 wind NOUN
 , PUNCT
 emblazoned VERB
 with ADP
 the DET
 crowned ADJ
 stag NOUN
 of ADP
PRODUCT Baratheon PROPN
 . PUNCT
   SPACE
PERSON Ned PROPN
 knew VERB
 many ADJ
 of ADP
 the DET
 riders NOUN
 . PUNCT
 There ADV
 came VERB
ORG Ser PROPN
ORG Jaime PROPN
ORG Lannister PROPN
 with ADP
 hair NOUN
 as ADV
 bright ADJ
 as ADP
 beaten VERB
 gold NOUN
 , PUNCT
 and CCONJ
 there ADV
PERSON Sandor Clegane PROPN
 with ADP
 his ADJ
 terribl

 around ADV
 in ADP
 these DET
 short ADJ
 gowns NOUN
 , PUNCT
 silk VERB
 if ADP
 they PRON
 have VERB
 the DET
 silver NOUN
 and CCONJ
 cotton NOUN
 if ADP
 not ADV
 , PUNCT
 but CCONJ
 it PRON
 's VERB
 all ADJ
 the DET
 same ADJ
 when ADV
 they PRON
 start VERB
 sweating VERB
 and CCONJ
 the DET
 cloth NOUN
 sticks VERB
 to ADP
 their ADJ
 skin NOUN
 , PUNCT
 they PRON
 might VERB
 as ADV
 well ADV
 be VERB
 naked ADJ
 . PUNCT
 " PUNCT
 The DET
 king NOUN
 laughed VERB
 happily ADV
 . PUNCT
   SPACE
PERSON Robert Baratheon PROPN
 had VERB
 always ADV
 been VERB
 a DET
 man NOUN
 of ADP
 huge ADJ
 appetites NOUN
 , PUNCT
 a DET
 man NOUN
 who NOUN
 knew VERB
 how ADV
 to PART
 take VERB
 his ADJ
 pleasures NOUN
 . PUNCT
 That DET
 was VERB
 not ADV
 a DET
 charge NOUN
 anyone NOUN
 could VERB
 lay VERB
 at ADP
 the DET
 door NOUN
 of ADP
PERSON Eddard Stark PROPN
 . PUNCT
 Yet CCONJ
PERSON Ned PROPN
 could VERB
 not ADV
 help VERB
 but CCONJ
 notice VERB
 that ADP
 those DET
 pleasu

 and CCONJ
 Father PROPN
 . PUNCT
 " PUNCT
 He PRON
 could VERB
 hear VERB
 her PRON
 still ADV
 at ADP
 times NOUN
 . PUNCT
 Promise VERB
 me PRON
 , PUNCT
 she PRON
 had VERB
 cried VERB
 , PUNCT
 in ADP
 a DET
 room NOUN
 that ADJ
 smelled VERB
 of ADP
 blood NOUN
 and CCONJ
 roses NOUN
 . PUNCT
 Promise VERB
 me PRON
 , PUNCT
PERSON Ned PROPN
 . PUNCT
 The DET
 fever NOUN
 had VERB
 taken VERB
 her ADJ
 strength NOUN
 and CCONJ
 her ADJ
 voice NOUN
 had VERB
 been VERB
 faint ADJ
 as ADP
 a DET
 whisper NOUN
 , PUNCT
 but CCONJ
 when ADV
 he PRON
 gave VERB
 her PRON
 his ADJ
 word NOUN
 , PUNCT
 the DET
 fear NOUN
 had VERB
 gone VERB
 out ADP
 of ADP
 his ADJ
 sister NOUN
 's PART
 eyes NOUN
 . PUNCT
PERSON Ned PROPN
 remembered VERB
 the DET
 way NOUN
 she PRON
 had VERB
 smiled VERB
 then ADV
 , PUNCT
 how ADV
 tightly ADV
 her ADJ
 fingers NOUN
 had VERB
 clutched VERB
 his ADJ
 as ADP
 she PRON
 gave VERB
 up PART
 her ADJ
 hold NOUN
 on ADP
 life NOUN
 , PUNCT
 the DET
 rose

KeyboardInterrupt: 

In [11]:
nx.write_gexf(G,'GOT-characters-NLP.gexf')

## create a gensim word2vec model from GOT books

In [17]:
from gensim.models import Word2Vec
model = Word2Vec(sentences, size=300, window=10, min_count=1, workers=4)
model.wv.most_similar('Eddard_Stark|PERSON')

[('Eddard|PERSON', 0.9777810573577881),
 ('Hoster_Tully|PERSON', 0.9772862792015076),
 ('Arnolf|PERSON', 0.9724704027175903),
 ('Lady_Lysa_Arryn|PERSON', 0.972131073474884),
 ('foster', 0.9710407257080078),
 ('Harrold|PERSON', 0.967271089553833),
 ('rethink', 0.9664437770843506),
 ('Rickard|PERSON', 0.9651771783828735),
 ('Ned_Stark|PERSON', 0.9646574854850769),
 ('murder', 0.9640084505081177)]

In [23]:
G = nx.Graph()
for word in model.wv.vocab:
    if word.endswith('|PERSON'):
        G.add_node(word[:-7])
        for word2,score in model.wv.most_similar(word):
            if word2.endswith('|PERSON') and score > 0.5:           
                G.add_edge(word,word2[:-7])
nx.write_gexf(G,'GOT-characters-NLP-gensim.gexf')

In [22]:
G = nx.Graph()