In [1]:
import pandas as pd
import numpy as np
import json
from datetime import datetime
from itertools import combinations
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
# Folder that holds every Game of Thrones JSON input file.
# Change this single value to point the notebook at another copy of the dataset.
DATA_DIR = '/Users/genie/data/game_of_thrones'

# Individual input files, all resolved relative to DATA_DIR.
data_characters = DATA_DIR + '/characters.json'
data_episodes = DATA_DIR + '/episodes.json'
data_characters_gender = DATA_DIR + '/characters-gender.json'
data_characters_groups = DATA_DIR + '/characters-groups.json'
data_characters_houses = DATA_DIR + '/characters-houses.json'
data_locations = DATA_DIR + '/locations.json'

In [3]:
# load data

def _load_json(path):
    """Return the decoded content of the JSON file at ``path``.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the default encoding of the machine running the notebook.
    """
    with open(path, 'r', encoding='utf-8') as read_file:
        return json.load(read_file)


characters_json = _load_json(data_characters)
characters_gender_json = _load_json(data_characters_gender)
characters_groups_json = _load_json(data_characters_groups)
characters_houses_json = _load_json(data_characters_houses)
episodes_json = _load_json(data_episodes)


In [4]:
# global maps

# character name -> 'male' / 'female'.
# Groups carrying any other gender label are skipped.
gender_map = {
    name: group['gender']
    for group in characters_gender_json['gender']
    if group['gender'] in ('male', 'female')
    for name in group['characters']
}

OG = nx.Graph()
N_characters = len(characters_json['characters'])

# Every character name, in file order; used as the key set of the maps below.
_character_names = [entry['characterName'] for entry in characters_json['characters']]

# Death bookkeeping, keyed by character name. The values are placeholders here
# and are overwritten while the seasons are parsed.
dead_map = dict.fromkeys(_character_names, 0)            # 1 once the character has died
mannerOfDeath_map = dict.fromkeys(_character_names, np.nan)
killedBy_map = dict.fromkeys(_character_names, np.nan)
deadInSeason_map = dict.fromkeys(_character_names, np.nan)
deadInEpisode_map = dict.fromkeys(_character_names, np.nan)

# 's1' .. 's7' -> total scene time of that season in minutes (filled in later).
seasonsTotalTime_map = {'s' + str(season_num): 0.0 for season_num in range(1, 8)}

In [5]:
## functions

def derive_scene_time(t1,t2):
    """Return the length of a scene in minutes, rounded to two decimals.

    Parameters
    ----------
    t1 : str
        Scene start as an 'H:M:S' timestamp.
    t2 : str
        Scene end as an 'H:M:S' timestamp.

    Returns
    -------
    float
        ``t2 - t1`` expressed in minutes. The value is negative when ``t2``
        is earlier than ``t1``, so bad timestamps remain visible instead of
        silently wrapping around to almost 24 hours.

    Raises
    ------
    ValueError
        If either timestamp does not match the '%H:%M:%S' format.
    """
    fmt = '%H:%M:%S'
    start = datetime.strptime(t1, fmt)
    end = datetime.strptime(t2, fmt)
    # total_seconds() keeps the sign and the day component of the difference;
    # the .seconds attribute alone would drop both.
    return round((end - start).total_seconds() / 60.0, 2)

In [6]:
# load characters into nodes only network

def _join_if_list(value):
    """Collapse a list of strings into one ';'-separated string.

    Any value that is not a list (including None) is returned unchanged.
    """
    return ';'.join(value) if isinstance(value, list) else value


N_characters = len(characters_json['characters'])
OG = nx.Graph()

temp = []
for character in characters_json['characters']:
    characterName = character['characterName']
    if OG.has_node(characterName):
        # A character with this name was already recorded; keep the first entry only.
        continue
    OG.add_node(characterName)
    temp.append((
        characterName,
        gender_map.get(characterName),
        _join_if_list(character.get('houseName')),
        _join_if_list(character.get('marriedEngaged')),
        _join_if_list(character.get('parents')),
        _join_if_list(character.get('siblings')),
        _join_if_list(character.get('parentOf')),
        # Flags are stored as 0/1 integers; a missing key counts as 0.
        1 if character.get('royal', False) is True else 0,
        1 if character.get('kingsguard', False) is True else 0,
    ))

cols = ['character_name','gender','house','spouse','parents','siblings','parent_of','royal','kingsguard']
character_df = pd.DataFrame.from_records(temp, columns=cols)

In [7]:
def parse_season_to_df(seasonNum, episode_counter):
    """Build the per-character statistics table for one season.

    A copy of the module-level node-only graph ``OG`` is filled with
    co-appearance edges: two characters are linked when they are listed in
    the same scene, and the edge attribute ``edgeScreenTime`` accumulates the
    length in minutes of every scene they share. Names that are not nodes of
    ``OG`` are ignored.

    Side effects on module-level dicts
    ----------------------------------
    * ``seasonsTotalTime_map['s<seasonNum>']`` is set to the summed scene
      time of the season.
    * For every scene entry whose ``alive`` field is ``False``:
      ``dead_map``, ``deadInSeason_map``, ``deadInEpisode_map`` and
      ``mannerOfDeath_map`` are updated for that character;
      ``killedBy_map`` is updated only when ``killedBy`` is a list.

    Parameters
    ----------
    seasonNum : int
        Season to process; matched against ``seasonNum`` of each episode in
        ``episodes_json['episodes']``.
    episode_counter : int
        Number of episodes already processed before this call. It is
        incremented once per episode of this season.

    Returns
    -------
    tuple
        ``(season_df, episode_counter)`` where ``season_df`` has one row per
        node with the columns ``character_name`` and, prefixed with
        ``s<seasonNum>_``: ``episodes``, ``screenTime``,
        ``numOfCharactersInteractedWith``, ``numKilled``, ``bc``, ``ec`` and
        ``shareOfScreenTime``; ``episode_counter`` is the updated running
        episode count.
    """
    season_prefix = 's' + str(seasonNum) + "_"
    episodes_key = season_prefix + 'episodes'
    killed_key = season_prefix + 'numKilled'
    screen_key = season_prefix + 'screenTime'
    degree_key = season_prefix + 'numOfCharactersInteractedWith'
    bc_key = season_prefix + 'bc'
    ec_key = season_prefix + 'ec'
    seasonTotalTimeInMins = 0.0

    # Work on a copy so the node-only template graph stays free of edges.
    G = nx.Graph(OG)

    for n in G.nodes():
        # A set removes repeated episode ids without rebuilding a list on every appearance.
        G.nodes[n][episodes_key] = set()
        G.nodes[n][killed_key] = 0
        G.nodes[n][screen_key] = 0.0

    season = [x for x in episodes_json['episodes'] if x['seasonNum'] == seasonNum]
    for episode in season:
        episode_id = 'S' + str(episode['seasonNum']) + 'E' + str(episode['episodeNum'])
        episode_counter += 1

        for scene in episode['scenes']:
            sceneTime = derive_scene_time(scene['sceneStart'], scene['sceneEnd'])
            seasonTotalTimeInMins += sceneTime
            charactersInScene = [character['name'] for character in scene['characters']]

            # Every pair of known characters sharing the scene gains its duration.
            for u, v in combinations(charactersInScene, 2):
                if not (G.has_node(u) and G.has_node(v)):
                    continue
                if G.has_edge(u, v):
                    G[u][v]['edgeScreenTime'] += sceneTime
                else:
                    G.add_edge(u, v, edgeScreenTime=sceneTime)

            for character in scene['characters']:
                name = character['name']
                if not G.has_node(name):
                    continue

                node = G.nodes[name]
                node[screen_key] += sceneTime
                node[episodes_key].add(episode_id)

                # Only an explicit false marks a death; a missing 'alive' key means alive.
                if character.get('alive', True) is False:
                    dead_map[name] = 1
                    deadInSeason_map[name] = 's' + str(seasonNum)
                    deadInEpisode_map[name] = int(episode_counter)
                    mannerOfDeath_map[name] = character.get('mannerOfDeath', np.nan)
                    killedBy = character.get('killedBy', np.nan)
                    if isinstance(killedBy, list):
                        killedBy_map[name] = ';'.join(killedBy)
                        for killer in killedBy:
                            if G.has_node(killer):
                                G.nodes[killer][killed_key] += 1

    seasonsTotalTime_map['s' + str(seasonNum)] = seasonTotalTimeInMins

    for n in G.nodes():
        G.nodes[n][degree_key] = G.degree(n)
        # Replace the collected episode ids by their count.
        G.nodes[n][episodes_key] = len(G.nodes[n][episodes_key])

    # calculate centralities
    # NOTE(review): networkx treats `weight` as a distance when computing
    # betweenness, so pairs with more shared screen time count as farther
    # apart here - confirm that this is the intended reading.
    # NOTE(review): G keeps every node of OG, so characters without scenes in
    # this season are isolated nodes - confirm both centrality routines behave
    # as intended on such a graph.
    bc = nx.betweenness_centrality(G, weight='edgeScreenTime')
    nx.set_node_attributes(G, bc, bc_key)

    ec = nx.eigenvector_centrality_numpy(G,weight='edgeScreenTime')
    nx.set_node_attributes(G, ec, ec_key)

    temp = []
    for n in G.nodes():
        attrs = G.nodes[n]
        temp.append((n, attrs[episodes_key], round(attrs[screen_key], 2),
                     attrs[degree_key], attrs[killed_key],
                     round(attrs[bc_key], 4), round(attrs[ec_key], 4)))
    cols = ['character_name', episodes_key, screen_key, degree_key, killed_key, bc_key, ec_key]
    temp_df = pd.DataFrame.from_records(temp,columns=cols)
    # Percentage of the season's total scene time in which the character is on screen.
    temp_df[season_prefix+'shareOfScreenTime'] = temp_df.apply(lambda x: round((x[screen_key]/seasonTotalTimeInMins)*100, 2), axis=1)
    return temp_df, episode_counter

In [8]:
# Process seasons 1 to 7 in order. The running episode count is threaded
# through the calls, and each season's columns are appended to character_df.
episode_counter = 0
for season_num in range(1, 8):
    season_df, episode_counter = parse_season_to_df(season_num, episode_counter)
    character_df = character_df.merge(season_df, how='left', on=['character_name'])

In [9]:
# Attach the death-related columns that were collected while parsing the seasons.
# Each map is keyed by character name; one left merge per map adds one column.
death_columns = [
    (dead_map, 'is_dead'),
    (mannerOfDeath_map, 'manner_of_death'),
    (killedBy_map, 'killed_by'),
    (deadInSeason_map, 'dead_in_season'),
    (deadInEpisode_map, 'duration_in_episodes'),
]
for value_by_name, column_name in death_columns:
    column_df = pd.DataFrame(list(value_by_name.items()), columns=['character_name', column_name])
    character_df = pd.merge(character_df, column_df, how='left', on=['character_name'])

# Normalise the flag columns to 0/1 integers.
character_df['royal'] = character_df['royal'].eq(True).astype(int)
character_df['kingsguard'] = character_df['kingsguard'].eq(True).astype(int)

# Characters with no recorded death episode get the total number of episodes processed.
character_df['duration_in_episodes'] = (
    character_df['duration_in_episodes'].fillna(episode_counter).round().astype(int)
)

# Release the bookkeeping maps and the helpers of this cell.
del dead_map, mannerOfDeath_map, killedBy_map
del deadInSeason_map, deadInEpisode_map, seasonsTotalTime_map
del death_columns, value_by_name, column_name, column_df

In [10]:
# Write the final per-character table to disk, without the DataFrame index.
output_csv = '/Users/genie/data/game_of_thrones/got_characters_s1_to_s7.csv'
character_df.to_csv(output_csv, index=False)

In [11]:
# from collections import Counter
# words=["i", "love", "you", "i", "you", "a", "are", "you", "you", "fine", "green"]
# most_common_words= [word for word, word_count in Counter(words).most_common(3)]
# print most_common_words

# x1 = list(df2.columns)
# x2 = list(df.columns)
# # list(set(x2) - set(x1))
# print(x2)

In [12]:
# G = nx.Graph(OG)

# for n in G.nodes():
#     G.nodes[n]['episodes']=[]
#     G.nodes[n]['numKilled']=0
        
# season = [x for x in episodes_json['episodes'] if x['seasonNum'] == 1]
# for i in range(len(season)):
#     episode = season[i]
#     seasonNum = episode['seasonNum']
#     episodeNum = episode['episodeNum']
#     scenes = episode['scenes']
#     for scene in scenes:
#         sceneStart = scene['sceneStart']
#         sceneEnd = scene['sceneEnd']
#         sceneTime = derive_scene_time(sceneStart,sceneEnd)
#         location = scene.get('location', np.nan)
#         subLocation = scene.get('subLocation', np.nan)
#         charactersInScene = [character['name'] for character in scene['characters']]
        
#         if(len(charactersInScene)>0):
#             for u,v in combinations(charactersInScene,2):
#                 if(G.has_node(u)==False):
#                     continue;
#                 if(G.has_node(v)==False):
#                     continue;
                
#                 if G.has_edge(u,v):
#                     G[u][v]['screenTime'] += sceneTime
#                 else:
#                     G.add_edge(u, v, screenTime=sceneTime)
        
#         for character in scene['characters']:
#             if(G.has_node(character['name'])==True):
#                 episodes = G.nodes[character['name']]['episodes']
#                 episodes.append('S'+str(seasonNum)+'-'+'E'+str(episodeNum))
#                 G.nodes[character['name']]['episodes'] = list(set(episodes))
#                 alive = character.get('alive', True)
#                 if(alive==False):
#                     dead_map[character['name']] = 1
#                     dead_type_map[character['name']]=character.get('mannerOfDeath',np.nan)
#                     killedBy = character.get('killedBy', np.nan)
#                     killed_by_map[character['name']] = killedBy
#                     if(pd.notna(killedBy)==True):
#                         for x in killedBy:
#                             if(G.has_node(x)):
#                                 G.nodes[x]['numKilled'] += 1

# for n in G.nodes():
#     G.nodes[n]['numOfCharactersInteractedWith'] = G.degree()[n]
#     G.nodes[n]['screenTime'] = sum(G[u][v]['screenTime'] for u, v in G.edges(n))
#     G.nodes[n]['episodes'] = len(G.nodes[n]['episodes'])
    
# # calculate centralities
# bc = nx.betweenness_centrality(G)
# nx.set_node_attributes(G, bc, 'bc')

# ec = nx.eigenvector_centrality(G)
# nx.set_node_attributes(G, ec, 'ec')

# pr = nx.pagerank(G, alpha=0.9)
# nx.set_node_attributes(G, pr, 'pagerank')

# h,a=nx.hits(G)
# nx.set_node_attributes(G, h, 'hub')
# nx.set_node_attributes(G, a, 'authority')

# temp = list()
# for n in G.nodes():
#     temp.append((n,G.nodes[n]['gender'],G.nodes[n]['house'],G.nodes[n]['spouse'],G.nodes[n]['parents'],
#                 G.nodes[n]['siblings'],G.nodes[n]['parentOf'],G.nodes[n]['royal'],G.nodes[n]['kingsguard'],
#                 G.nodes[n]['numKilled'],G.nodes[n]['episodes'],dead_map.get(n),killed_by_map.get(n),dead_type_map.get(n),
#                 G.nodes[n]['numOfCharactersInteractedWith'],G.nodes[n]['screenTime'],G.nodes[n]['bc'],
#                 G.nodes[n]['ec'],G.nodes[n]['pagerank'],G.nodes[n]['hub'],G.nodes[n]['authority']))
    
# seasonName = 's2'
# df2 = pd.DataFrame.from_records(temp, columns=['character_name','gender','house','spouse','parents','siblings','children','royal',
#                                   'kingsguard',seasonName+'_num_killed',seasonName+'_episodes','alive','killed_by','manner_of_death', seasonName + '_num_characters_interacted',
#                                    seasonName+'_screen_time',seasonName+'_bc',seasonName+'_ec',seasonName+'_pagerank',
#                                    seasonName + '_hub',seasonName + '_authority'])

In [13]:
# # build scenes data

# def derive_scene_time_2(x):
#     fmt = '%H:%M:%S'
#     d2 = datetime.strptime(x['scene_end'], fmt)
#     d1 = datetime.strptime(x['scene_start'], fmt)
#     diff = d2-d1
#     diff_mins = diff.seconds/60
#     return(round(diff_mins, 2))


# N_episodes = len(episodes_json['episodes'])
# temp = list()
# for i in range(N_episodes):
#     curr_episode = episodes_json['episodes'][i]
#     seasonNum = curr_episode['seasonNum']
#     episodeNum = curr_episode['episodeNum']
#     episodeCounter = i+1
#     scenes = curr_episode['scenes']
    
#     for scene in scenes:
#         sceneStart = scene['sceneStart']
#         sceneEnd = scene['sceneEnd']
#         location = scene.get('location', np.nan)
#         subLocation = scene.get('subLocation', np.nan)
#         charactersInScene = [character['name'] for character in scene['characters']]
        
#         temp.append((seasonNum,episodeNum,episodeCounter,sceneStart,sceneEnd,location,subLocation,
#                      ','.join(charactersInScene) if len(charactersInScene)>0 else ''))

# scenes_df = pd.DataFrame.from_records(temp, columns = ['season_num','episode_num','episode_counter','scene_start','scene_end',
#                                                        'location','sub_location','characters_in_scene'])
# scenes_df['screen_time'] = scenes_df.apply(derive_scene_time_2, axis=1)

In [14]:
# scenes_df.head()

In [15]:
# mins = sum(scenes_df.loc[i]['screen_time'] 
#            for i in range(len(scenes_df)) if 'Cersei Lannister' in scenes_df.loc[i]['characters_in_scene'])
# round(mins/60, 2)

In [16]:
# scenes_df[(scenes_df.characters_killed_in_scene.notnull()==True) & (len(scenes_df.characters_killed_in_scene)>2)].head()

In [17]:
# scenes_df.info()

In [18]:
# scenes_df['episode_counter'].max()

# # screen time of characters
# mins = sum(scenes_df.loc[i]['screen_time'] for i in range(len(scenes_df)) if 'Jon Snow' in scenes_df.loc[i]['characters_in_scene'])
# round(mins/60, 2)

In [19]:
# 'Will, Wight Wildling Girl'.split(', ')

In [20]:
# 'Gared,Waymar Royce,Will'.split(',')

In [21]:
# for u, v in combinations('Gared,Waymar Royce,Will,Eddard'.split(','), 2):
#     print(u, v)