In [119]:
import pandas as pd 
import nltk
from pathlib import Path
import re
import string
import requests
from nltk import sent_tokenize
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
from afinn import Afinn
import numpy as np
import networkx as nx

sns.set_style("darkgrid")

In [3]:
def clean_text(text_file):    
    '''
    Reads in text file from GITHUB gets rid of newline and special characters
    '''
    
    url = 'https://raw.githubusercontent.com/justinherman42/Data620/master/GOT_Final_project/textfiles/'+text_file
    page = requests.get(url)
    text=page.text
    data = text.replace('\r', ' ').replace('\n', ' ').replace("\'", "'")   
    novel=''
    novel += ' ' + data
    novel = novel.replace('. . .',';')
    return novel

In [4]:
## build out names for text file URL's
git_text_file_names=['got1.txt','got2.txt','got3.txt','got4.txt','got5.txt']

In [5]:
## Built out dictionary with untokenized data to use with the count vectorizer
## All I really saw in terms of issues was ... being frequently used which I substituted with ;

book_series_untokenized={}
for i,x in enumerate(git_text_file_names):
    book_series_untokenized["book{}".format(i+1)]=clean_text(x)
    
## built out dictionary with tokenized data
book_series={}
for i,x in enumerate(git_text_file_names):
    orig_text=clean_text(x)
    sentence_list = sent_tokenize(orig_text)
    book_series["book{}".format(i+1)]= sentence_list

In [10]:
type(book_series["book1"])

list

In [16]:
book_series.keys()

dict_keys(['book1', 'book2', 'book3', 'book4', 'book5'])

### Name Entity Recognition

In [17]:
#Instantiate NLP for English
nlp = spacy.load('en')

In [18]:
def name_entity_recognition(sentence):
    '''
    A function to retrieve name entities in a sentence.
    :param sentence: the sentence to retrieve names from.
    :return: a name entity list of the sentence.
    '''

    doc = nlp(sentence)
    # retrieve person and organization's name from the sentence
    name_entity = [x for x in doc.ents if x.label_ in ['PERSON']]
    # convert all names to lowercase and remove 's and ’s in names
    name_entity = [str(x).lower().replace("'s","") for x in name_entity]
    name_entity = [x.replace("’s","") for x in name_entity]
    # remove name words that are less than 3 letters to raise recognition accuracy
    name_entity = [x for x in name_entity if len(x) >= 3]

    return name_entity

In [19]:
def flatten(l):
    """A function that flattens a complex list"""
    flat_list = []
    for i in l:
        for j in i:
            flat_list.append(j)
    return flat_list

In [20]:
def nlist(book):
    """Returns a unique list of names from a sentence tokenized book"""
    names = []
    for i in book:
        if name_entity_recognition(i) != []:
            names.append(name_entity_recognition(i))
    names = list(set(flatten(names)))
    return names

In [52]:
names_book1 = nlist(book_series["book1"])
names_book2 = nlist(book_series["book2"])
names_book3 = nlist(book_series["book3"])
names_book4 = nlist(book_series["book4"])
names_book5 = nlist(book_series["book5"])

In [53]:
len(names_book1)

761

In [54]:
names_book1

['dan ares',
 'instruct lord manderly',
 'folly',
 'ser danwell',
 'slowly ned',
 'martyn rivers',
 'rickon stark',
 'renly can’t',
 'stiv',
 'queen naerys',
 'merrett',
 'balon greyjoy',
 'splendid',
 'aegon the fortunate',
 '’m brandon stark',
 'othell yarwyck',
 'benjen',
 'ser gerold hightower',
 'ser preston greenfield',
 'lady tarly',
 'groggy',
 'hal mollen',
 'jon snow',
 'ser willem darry',
 'barristan the bold',
 'jafer flowers',
 'queer',
 'robett',
 'blackwood',
 'ser rodrik',
 'lady sansa',
 'ser waymar royce',
 'ser addam',
 'tyroshi',
 'kiss',
 'ser jaime',
 'ser arys oakheart',
 'bread',
 'ser wylis',
 'khal ogo',
 'riverrun',
 'maestor aemon',
 'king landing',
 'lord tyrell',
 'robin flint',
 'prince viserys',
 'jaime',
 'ser karyl',
 'jacks',
 'quaro',
 'catelyn stark',
 'dragonstone',
 'ser ilyn',
 'chett',
 'daenerys stormborn',
 'ronel rivers',
 'cersei',
 'fat tom',
 'matt',
 'lord jon',
 'aggo',
 'eddard',
 'ser aron santagar',
 'khal jhaqo',
 'walda',
 'sweet sa

In [58]:
len(names_book5)

1452

In [59]:
all_names_list = [names_book1,names_book2,names_book3,names_book4,names_book5]

In [60]:
len(all_names_list)

5

In [61]:
# As shown below, for some unknown reason SpaCy NER was unable to detect Tyrion as a name entity
# in a few of the books
for i in all_names_list:
    print('tyrion' in i)

# We will manually enter Tyrion in for books 1, 2, and 4
names_book1.append('tyrion')
names_book2.append('tyrion')
names_book4.append('tyrion')

False
False
True
False
True


In [62]:
""" We'll load certain words that SpaCy NER pulled as names that are not names to extricate them 
from the list. Game of Thrones names are tricky and it's helpful that some members of our group have 
read the books to be able to recognize some things as not names. For example, Pie is a character, but 
could easily be mistaken as the food.
"""
words = ['hand','father','maester','gods','winterfell','wine','uncle','sleep','broken',
        'wall','battle','watch','sweet','killed','north','child','kill','harrenhal','rock',
         'dothraki','dead','ser','blood','knew','sword','queen','wolf','words','horse','hands',
         'fingers','left','wildlings','daughter','word','fat','bastard','westeros','light',
         'yunkai','aye','meereen','knight','woman','gate',
         'lady', #removing lady. Although Lady is Sansa's wolf, Lady dies early and the NER algorithm
                 # continues to pick up lady due to it's use as in "Lady Stark", etc.
         'grace'] #removing grace since this person changes based on who sits on the iron throne

In [64]:
def top_names(name_list, novel, top_num=25):
    '''
    Returns name freq of a book for each name
    '''

    vect = CountVectorizer(vocabulary=name_list, stop_words='english')
    name_frequency = vect.fit_transform([novel.lower()])
    name_frequency = pd.DataFrame(name_frequency.toarray(), columns=vect.get_feature_names())
    name_frequency = name_frequency.T
    name_frequency = name_frequency.sort_values(by=0, ascending=False)
    name_frequency = name_frequency[0:top_num]
    names = list(name_frequency.index)
    name_frequency = list(name_frequency[0])

    return name_frequency, names

In [65]:
names1 = [x for x in names_book1 if x not in words]
names2 = [x for x in names_book2 if x not in words]
names3 = [x for x in names_book3 if x not in words]
names4 = [x for x in names_book4 if x not in words]
names5 = [x for x in names_book5 if x not in words]

def out_freq_df(name_freq):
    """Takes the items returned from top_names and returns a dataframe that will be fed into viz"""
    df = pd.DataFrame([name_freq[1],name_freq[0]]).transpose()
    df.columns = ['names','freq']
    return df

topnames1_df = out_freq_df(top_names(names1,book_series_untokenized['book1']))
topnames2_df = out_freq_df(top_names(names2,book_series_untokenized['book2']))
topnames3_df = out_freq_df(top_names(names3,book_series_untokenized['book3']))
topnames4_df = out_freq_df(top_names(names4,book_series_untokenized['book4']))
topnames5_df = out_freq_df(top_names(names5,book_series_untokenized['book5']))

In [66]:
topnames1_df

Unnamed: 0,names,freq
0,jon,834
1,ned,791
2,tyrion,619
3,bran,548
4,catelyn,497
5,arya,463
6,sansa,422
7,dany,413
8,robb,410
9,robert,403


In [67]:
topnames2_df

Unnamed: 0,names,freq
0,tyrion,681
1,jon,466
2,arya,444
3,bran,425
4,theon,414
5,stannis,379
6,sansa,302
7,renly,284
8,joffrey,282
9,catelyn,280


In [68]:
topnames3_df

Unnamed: 0,names,freq
0,jon,845
1,tyrion,651
2,jaime,576
3,sansa,508
4,arya,490
5,sam,393
6,robb,374
7,snow,340
8,dany,335
9,catelyn,302


In [84]:
def calculate_align_rate(sentence_list):
    '''
    Function to calculate the align_rate of the whole novel
    :param sentence_list: the list of sentence of the whole novel.
    :return: the align rate of the novel.
    '''
    afinn = Afinn()
    sentiment_score = [afinn.score(x) for x in sentence_list]
    align_rate = np.sum(sentiment_score)/len(np.nonzero(sentiment_score)[0]) * -2

    return align_rate


def calculate_matrix(name_list, sentence_list, align_rate):
    '''
    Function to calculate the co-occurrence matrix and sentiment matrix among all the top characters
    :param name_list: the list of names of the top characters in the novel.
    :param sentence_list: the list of sentences in the novel.
    :param align_rate: the sentiment alignment rate to align the sentiment score between characters due to the writing style of
    the author. Every co-occurrence will lead to an increase or decrease of one unit of align_rate.
    :return: the co-occurrence matrix and sentiment matrix.
    '''

    # calculate a sentiment score for each sentence in the novel
    afinn = Afinn()
    sentiment_score = [afinn.score(x) for x in sentence_list]
    # calculate occurrence matrix and sentiment matrix among the top characters
    name_vect = CountVectorizer(vocabulary=name_list, binary=True)
    occurrence_each_sentence = name_vect.fit_transform(sentence_list).toarray()
    cooccurrence_matrix = np.dot(occurrence_each_sentence.T, occurrence_each_sentence)
    sentiment_matrix = np.dot(occurrence_each_sentence.T, (occurrence_each_sentence.T * sentiment_score).T)
    sentiment_matrix += align_rate * cooccurrence_matrix
    cooccurrence_matrix = np.tril(cooccurrence_matrix)
    sentiment_matrix = np.tril(sentiment_matrix)
    # diagonals of the matrices are set to be 0 (co-occurrence of name itself is meaningless)
    shape = cooccurrence_matrix.shape[0]
    cooccurrence_matrix[[range(shape)], [range(shape)]] = 0
    sentiment_matrix[[range(shape)], [range(shape)]] = 0

    return cooccurrence_matrix, sentiment_matrix

In [85]:
align_rate1 = calculate_align_rate(book_series["book1"])

In [86]:
cooccurrence_matrix1, sentiment_matrix1 = calculate_matrix(list(topnames1_df.names), book_series["book1"], align_rate1)

In [87]:
cooccurrence_matrix1

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0],
       [19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0],
       [22,  7,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0],
       [10, 19, 14, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 8, 12,  0,  8,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 2, 12,  0,  8,  4, 45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0],
       [21,  7,  1, 54, 12, 10,  7,  0,  0,  0,  0,  0,  0,  0, 

In [89]:
align_rate2 = calculate_align_rate(book_series["book2"])
cooccurrence_matrix2, sentiment_matrix2 = calculate_matrix(list(topnames2_df.names), book_series["book2"], align_rate1)

align_rate3 = calculate_align_rate(book_series["book3"])
cooccurrence_matrix3, sentiment_matrix3 = calculate_matrix(list(topnames3_df.names), book_series["book3"], align_rate1)

align_rate4 = calculate_align_rate(book_series["book4"])
cooccurrence_matrix4, sentiment_matrix4 = calculate_matrix(list(topnames4_df.names), book_series["book4"], align_rate1)

align_rate5 = calculate_align_rate(book_series["book5"])
cooccurrence_matrix5, sentiment_matrix5 = calculate_matrix(list(topnames5_df.names), book_series["book5"], align_rate1)

In [120]:
def matrix_to_edge_list(matrix, mode, name_list):
    '''
    Function to convert matrix (co-occurrence/sentiment) to edge list of the network graph. It determines the
    weight and color of the edges in the network graph.
    :param matrix: co-occurrence matrix or sentiment matrix.
    :param mode: 'co-occurrence' or 'sentiment'
    :param name_list: the list of names of the top characters in the novel.
    :return: the edge list with weight and color param.
    '''
    edge_list = []
    shape = matrix.shape[0]
    lower_tri_loc = list(zip(*np.where(np.triu(np.ones([shape, shape])) == 0)))
    normalized_matrix = matrix / np.max(np.abs(matrix))
    if mode == 'co-occurrence':
        weight = np.log(2000 * normalized_matrix + 1) * 0.7
        color = np.log(2000 * normalized_matrix + 1)
    if mode == 'sentiment':
        weight = np.log(np.abs(1000 * normalized_matrix) + 1) * 0.7
        color = 2000 * normalized_matrix
    for i in lower_tri_loc:
        edge_list.append((name_list[i[0]], name_list[i[1]], {'weight': weight[i], 'color': color[i]}))

    return edge_list

In [121]:
def plot_graph(name_list, name_frequency, matrix, plt_name, mode, path=''):
    '''
    Function to plot the network graph (co-occurrence network or sentiment network).
    :param name_list: the list of top character names in the novel.
    :param name_frequency: the list containing the frequencies of the top names.
    :param matrix: co-occurrence matrix or sentiment matrix.
    :param plt_name: the name of the plot (PNG file) to output.
    :param mode: 'co-occurrence' or 'sentiment'
    :param path: the path to output the PNG file.
    :return: a PNG file of the network graph.
    '''

    label = {i: i for i in name_list}
    edge_list = matrix_to_edge_list(matrix, mode, name_list)
    normalized_frequency = np.array(name_frequency) / np.max(name_frequency)

    plt.figure(figsize=(20, 20))
    G = nx.Graph()
    G.add_nodes_from(name_list)
    G.add_edges_from(edge_list)
    pos = nx.circular_layout(G)
    edges = G.edges()
    weights = [G[u][v]['weight'] for u, v in edges]
    colors = [G[u][v]['color'] for u, v in edges]

    if mode == 'co-occurrence':
        nx.draw(G, pos, node_color='#A0CBE2', node_size=np.sqrt(normalized_frequency) * 4000, edge_cmap=plt.cm.Blues,
                linewidths=10, font_size=35, labels=label, edge_color=colors, with_labels=True, width=weights)
    elif mode == 'sentiment':
        nx.draw(G, pos, node_color='#A0CBE2', node_size=np.sqrt(normalized_frequency) * 4000,
                linewidths=10, font_size=35, labels=label, edge_color=colors, with_labels=True,
                width=weights, edge_vmin=-1000, edge_vmax=1000)
    else:
        raise ValueError("mode should be either 'co-occurrence' or 'sentiment'")

    plt.savefig(path + plt_name + '.png')

In [114]:
topnames1_df.freq

0     834
1     791
2     619
3     548
4     497
5     463
6     422
7     413
8     410
9     403
10    395
11    345
12    228
13    219
14    189
15    177
16    167
17    157
18    154
19    151
20    146
21    141
22    139
23    126
24    123
Name: freq, dtype: object