## Star Wars 
https://linkedin.com/in/dennisbakhuis

## 5. Star Wars Network analysis

We collected a total of 5334 characters, and it would be great to analyze the relations between them. As a first attempt at working with graph networks, I want to visualize the network around individual characters. For this we need nodes, which are the characters, and their relations, which are called edges in a graph. For example, Anakin Skywalker would have a relation called 'father of' to Luke Skywalker. Extracting the various kinds of relations would require extensive natural language processing to reduce the corpus to node-edge-node triples, which is far from trivial.

To make it a bit easier, we collapse all relations into a single kind of relation, which we call 'connected to'. To find out whether a character is connected to another, we check whether that character's page contains a link to the other character's page. For example, we expect the page of Anakin Skywalker to contain a link to the page of Luke Skywalker. All these links were collected during the scraping process as a list, which we call crosslinks.

In [None]:
import pickle
from pathlib import Path
import urllib
import collections


import pandas as pd
from tqdm import tqdm
from networkx import nx


import matplotlib.pyplot as plt
import seaborn as sns
from pyvis.network import Network
sns.set_context('notebook', font_scale=1.5, rc={'line.linewidth': 2.5})

Import the raw data

In [None]:
# Merge every pickled chunk found in ../Dataset into a single dictionary.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
files = sorted(Path('../Dataset').glob('*.pickle'))
data = {}
for pickle_path in files:
    with pickle_path.open('rb') as handle:
        data.update(pickle.load(handle))

def remove_url_shizzle(text):
    """Return *text* with percent-escapes decoded and all quotes removed.

    The text is first URL-decoded (e.g. ``%20`` becomes a space) and then
    every double quote and single quote character is dropped, including
    quotes that only appear after decoding (``%22``, ``%27``).

    Args:
        text: The raw, possibly percent-encoded string.

    Returns:
        The decoded string without any ``"`` or ``'`` characters.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import unquote

    return unquote(text).replace('"', '').replace("'", '')

# Normalise every page key and every crosslink with remove_url_shizzle.
# Each entry dict is updated in place and re-registered under its cleaned key.
cleaned = {}
for raw_key, entry in tqdm(data.items()):
    entry['crosslinks'] = [remove_url_shizzle(link) for link in entry['crosslinks']]
    cleaned[remove_url_shizzle(raw_key)] = entry
data = cleaned

# The 'key' column of the character table, as a plain list
character_table = pd.read_parquet('../Dataset/StarWars_Characters.parquet')
characters = character_table['key'].tolist()


Create a network that connects all characters using their crosslinks.

In [None]:
# Build an undirected graph with an edge between a character and every
# crosslink on its page that is itself a known character.
graph = nx.Graph()
# Set membership is O(1); testing against the list would scan all characters
# for every single crosslink.
character_set = set(characters)
for key in tqdm(characters):
    for crosslink in data[key]['crosslinks']:
        if crosslink in character_set:
            graph.add_edge(key, crosslink)
print('Nodes: {0:d}, Links: {1:d}'.format(graph.number_of_nodes(),graph.number_of_edges()))

Degree distribution analysis

In [None]:
# Tally how many nodes have each degree (degrees in ascending order)
degree_sequence = sorted(node_degree for _, node_degree in graph.degree())
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
degree_df = pd.DataFrame({'degree': deg, 'count': cnt})

# Bar plot restricted to degrees below 32
low_degree_df = degree_df[degree_df['degree'] < 32]
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
ax.set_facecolor("white")
sns.barplot(x='degree', y='count', data=low_degree_df)
ax.set_ylabel("Count")
ax.set_xlabel("Degree")
sns.despine()

In [None]:
# Erase nodes with degree lower than 2, for a clearer network.
# Removing nodes can lower the degree of the remaining ones, so keep pruning
# until a pass finds nothing left to remove.
while True:
    nodes_to_remove = [name for name, n_links in graph.degree() if n_links < 2]
    if not nodes_to_remove:
        break
    print('Removing {0:d} nodes'.format(len(nodes_to_remove)))
    graph.remove_nodes_from(nodes_to_remove)
    print('Remaining Nodes: {0:d}, Links: {1:d}'.format(graph.number_of_nodes(),graph.number_of_edges()))

In [None]:
# Keep an undirected copy of the graph; the counts printed are those of `graph`
undirected_graph = graph.to_undirected()
n_nodes = graph.number_of_nodes()
n_links = graph.number_of_edges()
print('Nodes: {0:d}, Links: {1:d}'.format(n_nodes, n_links))

In [None]:
# Degree centrality per node, wrapped in a Counter to list the ten highest
node_degree_centrality = nx.degree_centrality(graph)
counter_centrality = collections.Counter(node_degree_centrality)
top_degree_centrality = counter_centrality.most_common(10)
print(top_degree_centrality)

In [None]:
# One row per character with its degree centrality, highest first
network_df = pd.DataFrame({
    'character': list(counter_centrality.keys()),
    'centrality_degree': list(counter_centrality.values()),
})
network_df = network_df.sort_values("centrality_degree", ascending=False)
network_df.head()

In [None]:
# Bar plot of the ten characters with the highest degree centrality.
# The unused seaborn "tips" example dataset is no longer loaded: it was never
# referenced and fetching it needs a network connection.
plt.figure(figsize=(20,7))
sns.set(style="whitegrid")
ax = sns.barplot(x="character", y="centrality_degree", data=network_df.head(10))
plt.title('Centrality Degree',fontsize=20)
plt.xlabel('Character',fontsize=15)
plt.ylabel('Centrality Degree',fontsize=15)
# Rotate the character names so long labels do not overlap
plt.xticks(rotation=60)

plt.show()

In [None]:
# Betweenness centrality per node, wrapped in a Counter to list the ten highest
node_betweeness_centrality = nx.betweenness_centrality(graph)
counter_betweeness = collections.Counter(node_betweeness_centrality)
top_betweenness = counter_betweeness.most_common(10)
print(top_betweenness)

In [None]:
# One row per character with its betweenness centrality, highest first
network_df_betweeness = pd.DataFrame({
    'character': list(counter_betweeness.keys()),
    'betweeness_degree': list(counter_betweeness.values()),
})
network_df_betweeness = network_df_betweeness.sort_values("betweeness_degree", ascending=False)
network_df_betweeness.head()

In [None]:
# Bar plot of the ten characters with the highest betweenness centrality.
# The unused seaborn "tips" example dataset is no longer loaded: it was never
# referenced and fetching it needs a network connection.
plt.figure(figsize=(20,7))
sns.set(style="whitegrid")
ax = sns.barplot(x="character", y="betweeness_degree", data=network_df_betweeness.head(10))
plt.title('Betweeness Centrality Degree',fontsize=20)

plt.xlabel('Character',fontsize=15)
plt.ylabel('Betweeness Centrality Degree',fontsize=15)
# Rotate the character names so long labels do not overlap
plt.xticks(rotation=60)

plt.show()

In [None]:
# Closeness centrality per node, wrapped in a Counter to list the ten highest
node_closeness_centrality = nx.closeness_centrality(graph)
counter_closeness_centrality = collections.Counter(node_closeness_centrality)
top_closeness = counter_closeness_centrality.most_common(10)
print(top_closeness)

### Interactive Network plot using PyVis

In [None]:
def get_crosslink_table(key, n=30, ignore_keys=None):
    """Return the most-linked characters that the page of *key* links to.

    Only crosslinks that are themselves listed in the module-level
    ``characters`` are considered. They are ranked by the number of
    crosslinks on their own page (highest first).

    Args:
        key: Page key whose crosslinks are inspected (looked up in ``data``).
        n: Maximum number of keys to return.
        ignore_keys: Optional iterable of keys to exclude from the result.

    Returns:
        A list of at most *n* character keys; empty when the page links to no
        known character.
    """
    ignored = [] if ignore_keys is None else list(ignore_keys)
    # Build the lookup set once instead of scanning the list for every link
    known_characters = set(characters)
    rows = [
        {'key': link, 'n_links': len(data[link]['crosslinks'])}
        for link in data[key]['crosslinks']
        if link in known_characters
    ]
    if not rows:
        # An empty DataFrame would have no 'key' column to filter on
        return []
    table = pd.DataFrame(rows)
    table = table.loc[~table['key'].isin(ignored)]
    return table.sort_values('n_links', ascending=False)['key'].head(n).tolist()

In [None]:
# Node colour per recursion level; add_node looks up level_colors[level],
# so only levels 0 through 5 have a colour defined.
level_colors =  {
    0:'#7A84DD',
    1:"#B15B60", 
    2:'#8ACAE5', 
    3:'#BD9267', 
    4:'#F1A54D', 
    5:'#020104',
}

def add_node(graph, key, level, max_level=2, n_crosslinks=10, ignore_keys=None):
    """Add *key* and, recursively, its top crosslinks to *graph*.

    The node is labelled with the key (underscores replaced by spaces),
    coloured by ``level_colors[level]`` and given an HTML tooltip built from
    the page's ``paragraph`` text. While ``level < max_level`` the top
    crosslinks of the page are added as child nodes and connected by an edge.

    Args:
        graph: Network object to draw on; must provide ``add_node`` and
            ``add_edge``. All nodes and edges are written to this object.
        key: Page key of the character to add (looked up in ``data``).
        level: Depth of this node, 0 for the start node.
        max_level: Depth at which the recursion stops.
        n_crosslinks: Maximum number of crosslinks followed per node.
        ignore_keys: Optional iterable of keys that must not be followed.
    """
    ignore_keys = [] if ignore_keys is None else list(ignore_keys)
    label = key.replace('_', ' ')
    char = [{'name': label, 'description': data[key]['paragraph'].strip()}]
    textblock = pd.DataFrame(char).to_html(header=False, index=False, columns=['description'])
    graph.add_node(
        label,
        title=textblock,
        size=10,
        color=level_colors[level],
        label=label,
    )
    if level >= max_level:
        return
    next_nodes = get_crosslink_table(key, n=n_crosslinks, ignore_keys=ignore_keys)
    # Children must not link back to this node, to their siblings, or to
    # anything already excluded higher up.
    excluded = next_nodes + [key] + ignore_keys
    for next_key in next_nodes:
        # The child node is added before the edge that refers to it
        add_node(graph, next_key, level + 1, max_level=max_level, n_crosslinks=n_crosslinks, ignore_keys=excluded)
        next_label = next_key.replace('_', ' ')
        graph.add_edge(
            label,
            next_label,
            weight=max_level / (1 + level),
            title=label+' -> '+next_label,
            width=1.5,
        )

# Root character and how far / how wide the network around it is expanded
start_key = 'Anakin_Skywalker'
max_level = 2
n_crosslinks = 15

# Alternative, softer background: bgcolor="#222222"
G = Network(
    height="1000px",
    width="100%",
    bgcolor="#000000",
    font_color="white",
    notebook=True,
)
add_node(G, start_key, 0, max_level=max_level, n_crosslinks=n_crosslinks)

In [None]:
# Barnes-Hut layout settings, then show the network via ../Docs/index.html
physics_options = dict(
    gravity=-5000,
    central_gravity=0,
    spring_length=200,
    spring_strength=0.009,
    damping=0.025,
    overlap=0,
)
G.barnes_hut(**physics_options)
G.show('../Docs/index.html')