# Network Centrality, Similarity Matrix and Similarity Distribution Plots

## Imports

In [1]:
import itertools
import colorcet as cc
import random

import networkx as nx
import numpy as np
import pandas as pd

from bokeh.io import output_file, show, output_notebook, save
from bokeh.plotting import figure, from_networkx
from bokeh.models import Circle, MultiLine
from bokeh.transform import linear_cmap
from bokeh.palettes import Blues8, RdBu, cividis, OrRd, magma, plasma, viridis

from scripts.helpers import get_similarities_from_json

## Betweenness

In [2]:
# Undirected example graph for the betweenness plot.
# No nodes are added explicitly; they come from the edge list below.
G = nx.Graph()

edges = [
    ('a', 'b'), ('a', 'c'), ('a', 'd'), ('c', 'd'), ('c', 'e'), ('b', 'e'),
    ('f', 'e'), ('f', 'h'), ('f', 'g'), ('g', 'h'), ('b', 'd'), ('i', 'g'),
    ('i', 'f'), ('h', 'e'), ('g', 'e'),
]

for u, v in edges:
    G.add_edge(u, v)

In [3]:
# Compute betweenness centrality per node and store it on the graph as the
# 'betweenness' node attribute (read later by the hover tooltip and color map).
betweenness = nx.betweenness_centrality(G)
nx.set_node_attributes(G, betweenness, name='betweenness')

In [4]:
# Store every node's own label as its 'name' attribute.
names = {node: node for node in G.nodes}
nx.set_node_attributes(G, names, name='name')

In [5]:
# Relabel the nodes with consecutive integers (in G.nodes order) before the
# graph is handed to Bokeh; the original note describes this as a Bokeh
# workaround. G itself is left untouched, H is the relabeled graph.
mapping = {node: position for position, node in enumerate(G.nodes)}
H = nx.relabel_nodes(G, mapping)

In [6]:
output_notebook()

# Hover shows the node's betweenness with two decimals.
HOVER_TOOLTIPS = [("Betweenness", "@betweenness{0.00}")]

plot = figure(
    sizing_mode='stretch_width',
    height=455,
    tools="pan, wheel_zoom, save, reset",
    active_scroll='wheel_zoom',
    toolbar_location=None,
    tooltips=HOVER_TOOLTIPS
)

# Bare canvas: no axes, no grid, white background and border.
plot.axis.visible = False
plot.grid.visible = False
plot.background_fill_color = "white"
plot.border_fill_color = "white"

# Extra keyword arguments of from_networkx are forwarded to the layout
# function. The fixed seed makes the spring layout (and therefore the figure)
# identical on every Restart & Run All.
network_graph = from_networkx(H, layout_function=nx.spring_layout, scale=10, seed=42)

# Edge opacity and width.
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=0.5)

# Fixed node size; node color follows betweenness, mapped linearly onto the
# cividis palette between the smallest and largest value in the graph.
color_attribute = 'betweenness'
color_values = network_graph.node_renderer.data_source.data[color_attribute]
min_color = min(color_values)
max_color = max(color_values)
network_graph.node_renderer.glyph = Circle(
    size=40,
    fill_color=linear_cmap(color_attribute, cividis(256), min_color, max_color)
)

plot.renderers.append(network_graph)

show(plot)

# output_file('betweenness.html', mode='inline')
# save(plot)

## Degree

In [7]:
# Undirected example graph for the degree plot.
# No nodes are added explicitly; they come from the edge list below.
G_deg = nx.Graph()

edges = [
    ('b', 'o'), ('f', 'u'),
    ('a', 'b'), ('a', 'c'), ('a', 'd'), ('a', 'e'), ('a', 'f'),
    ('b', 'c'), ('b', 'd'), ('b', 'e'), ('c', 'd'), ('c', 'e'),
    ('f', 'c'), ('f', 'd'), ('f', 'e'), ('g', 'b'), ('g', 'd'), ('g', 'f'),
]

for u, v in edges:
    G_deg.add_edge(u, v)

In [8]:
# Compute degree centrality per node and store it on the graph as the
# 'degree' node attribute (read later by the hover tooltip and color map).
degree = nx.degree_centrality(G_deg)
nx.set_node_attributes(G_deg, degree, name='degree')

In [9]:
# Also attach betweenness centrality, so the hover tooltip of the degree plot
# can show both measures side by side.
betweenness = nx.betweenness_centrality(G_deg)
nx.set_node_attributes(G_deg, betweenness, name='betweenness')

In [10]:
# Store every node's own label as its 'name' attribute.
names = {node: node for node in G_deg.nodes}
nx.set_node_attributes(G_deg, names, name='name')

In [11]:
# Relabel the nodes with consecutive integers (in G_deg.nodes order) before the
# graph is handed to Bokeh; the original note describes this as a Bokeh
# workaround. G_deg itself is left untouched, H_deg is the relabeled graph.
mapping = {node: position for position, node in enumerate(G_deg.nodes)}
H_deg = nx.relabel_nodes(G_deg, mapping)

In [12]:
output_notebook()

# Hover shows both centrality measures with two decimals.
HOVER_TOOLTIPS = [("degree", "@degree{0.00}"), ("betweenness", "@betweenness{0.00}")]

plot = figure(
    sizing_mode='stretch_width',
    height=455,
    tools="pan, wheel_zoom, save, reset",
    active_scroll='wheel_zoom',
    toolbar_location=None,
    tooltips=HOVER_TOOLTIPS
)

# Bare canvas: no axes, no grid, white background and border.
plot.axis.visible = False
plot.grid.visible = False
plot.background_fill_color = "white"
plot.border_fill_color = "white"

# Extra keyword arguments of from_networkx are forwarded to the layout
# function. The fixed seed makes the spring layout (and therefore the figure)
# identical on every Restart & Run All.
network_graph = from_networkx(H_deg, layout_function=nx.spring_layout, scale=10, seed=42)

# Edge opacity and width.
network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=0.5)

# Fixed node size; node color follows degree centrality, mapped linearly onto
# the cividis palette between the smallest and largest value in the graph.
color_attribute = 'degree'
color_values = network_graph.node_renderer.data_source.data[color_attribute]
min_color = min(color_values)
max_color = max(color_values)
network_graph.node_renderer.glyph = Circle(
    size=40,
    fill_color=linear_cmap(color_attribute, cividis(256), min_color, max_color)
)

plot.renderers.append(network_graph)

show(plot)

# output_file('degree.html', mode='inline')
# save(plot)

## Similarity Matrix

In [13]:
# Load the preprocessed movie table. The first CSV column is read as the
# index and then replaced by 'wikipedia_id', so rows can be looked up by id.
df = (
    pd.read_csv('../../data/processed/preprocessed.csv', index_col=0)
    .set_index('wikipedia_id')
)
print(df.shape)
df.head(5)

(22394, 9)


Unnamed: 0_level_0,name,release_year,rating,num_votes,plot,freebase_id,languages,countries,genres
wikipedia_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10109752,Miss Jerry,1894.0,5.3,207,After finding out that her father is suffering...,/m/02q23xk,['Silent film'],['United States of America'],"['Short Film', 'Silent film', 'Indie', 'Black-..."
28703057,The Corbett-Fitzsimmons Fight,1897.0,5.3,484,The film no longer exists in its entirety; how...,/m/0czdh_n,[],[],['Sports']
142995,The Story of the Kelly Gang,1906.0,6.0,855,The Story of the Kelly Gangs tone is of sorrow...,/m/0120y4,['English Language'],['Australia'],"['Crime Fiction', 'Silent film', 'Biography', ..."
32986669,Robbery Under Arms,1907.0,4.3,25,Key scenes of the film included the branding o...,/m/04p7yxx,['Silent film'],['Australia'],"['Silent film', 'Drama']"
32987200,"Captain Midnight, the Bush King",1911.0,5.4,18,Edgar Dalimore is the son of wealthy station o...,/m/0h569x9,['Silent film'],[],['Silent film']


In [14]:
# Pairwise similarities keyed by (id, id) tuples.
# NOTE(review): the argument is presumably a decade/year bucket - confirm
# against scripts.helpers.get_similarities_from_json.
similarity_dict = get_similarities_from_json(2010)

# Distinct ids that occur in any pair. Sorted so the candidate list does not
# depend on set iteration order.
movies = sorted(set(itertools.chain(*similarity_dict.keys())))

# Seeded local RNG: the same 20 movies are drawn on every Restart & Run All,
# without touching the global `random` state.
subsample_movies = random.Random(42).sample(movies, 20)

# Unordered pairs of distinct movies. combinations() never pairs an id with
# itself, so no special case for identical ids is needed.
subsample_combinations = list(itertools.combinations(subsample_movies, 2))

# Collect plain records and build the frame once instead of growing it row by
# row with .loc[len(frame)].
records = []
for id_1, id_2 in subsample_combinations:
    # A pair may be stored under either ordering; try the given one first and
    # fall back to the reversed key (a KeyError there still propagates).
    try:
        similarity = similarity_dict[(id_1, id_2)]
    except KeyError:
        similarity = similarity_dict[(id_2, id_1)]

    records.append({
        'movie_1': df.loc[id_1, 'name'],
        'movie_2': df.loc[id_2, 'name'],
        'similarity': similarity * 100,  # stored as a percentage
    })

similarity_df = pd.DataFrame(records, columns=['movie_1', 'movie_2', 'similarity'])

In [15]:
# Show the pairwise similarity table (movie_1, movie_2, similarity in percent).
similarity_df

Unnamed: 0,movie_1,movie_2,similarity
0,An Inconvenient Tax,Aha Naa Pellanta,71.103093
1,An Inconvenient Tax,Noy,71.731201
2,An Inconvenient Tax,Kadaksham,71.083490
3,An Inconvenient Tax,Quarantine 2: Terminal,70.391632
4,An Inconvenient Tax,Life Express,71.594582
...,...,...,...
185,Brake,Antiviral,77.981437
186,Brake,Grandmaster,73.999715
187,Kick-Ass,Antiviral,73.899170
188,Kick-Ass,Grandmaster,73.382333


In [16]:
from math import pi

import pandas as pd

from bokeh.models import BasicTicker, PrintfTickFormatter
from bokeh.plotting import figure, show
from bokeh.transform import linear_cmap

output_notebook()

# Axis categories must contain every movie that occurs in EITHER column.
# The pairs come from itertools.combinations, so the last sampled movie only
# ever shows up as movie_2; taking movie_1 alone would drop it from both axes
# and its row/column would never be drawn.
movies = pd.concat([similarity_df.movie_1, similarity_df.movie_2]).unique().tolist()

# NOTE: TOOLS is defined but not passed to figure(); the hover tool comes from
# the `tooltips` argument below.
TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

plot = figure(
    x_range=movies,
    y_range=movies[::-1],  # reversed so the matrix reads top-left to bottom-right
    x_axis_location="above",
    sizing_mode='stretch_width',
    aspect_ratio=1,
    tooltips=[('movies', '@movie_1 @ @movie_2'), ('similarity', '@similarity{0.00}')],
    toolbar_location=None
)

# Strip grid lines, axis lines and ticks; keep only the category labels.
plot.grid.grid_line_color = None
plot.axis.axis_line_color = None
plot.axis.major_tick_line_color = None
plot.axis.major_label_text_font_size = "12px"
plot.axis.major_label_standoff = 0
plot.xaxis.major_label_orientation = pi / 3

plot.yaxis.major_label_text_color = "black"
plot.xaxis.major_label_text_color = "black"

plot.background_fill_color = "white"
plot.border_fill_color = "white"

# One color mapping shared by both triangles, spanning the observed range.
similarity_cmap = linear_cmap(
    "similarity",
    cividis(256),
    low=similarity_df.similarity.min(),
    high=similarity_df.similarity.max(),
)

# similarity_df holds each unordered pair once, so draw it twice: as given
# and with the axes swapped, which fills the mirrored half of the matrix.
r = plot.rect(x="movie_1", y="movie_2", width=1, height=1, source=similarity_df,
              fill_color=similarity_cmap,
              line_color=None)

plot.rect(x="movie_2", y="movie_1", width=1, height=1, source=similarity_df,
          fill_color=similarity_cmap,
          line_color=None)

plot.add_layout(r.construct_color_bar(
    major_label_text_font_size="12px",
    ticker=BasicTicker(desired_num_ticks=10),
    formatter=PrintfTickFormatter(format="%d%%"),
    label_standoff=12,
    background_fill_alpha=0,
    major_label_text_color="black",
    border_line_color=None,
    padding=10,
), 'right')

show(plot)

# output_file('similarity_matrix.html', mode='inline')
# save(plot)

## Distribution

In [43]:
from numpy import linspace
from scipy.stats import gaussian_kde

from bokeh.models import ColumnDataSource, FixedTicker, PrintfTickFormatter
from bokeh.plotting import figure, show
from bokeh.sampledata.perceptions import probly

output_notebook()

def ridge(category, data, scale=20):
    """Pair every scaled value with its category label.

    Args:
        category: label repeated as the first element of every pair.
        data: sized sequence of values; it is multiplied by ``scale`` as a
            whole (``scale * data``), so an array is scaled element-wise.
        scale: factor applied to ``data``. Defaults to 20.

    Returns:
        A list of ``(category, value)`` tuples with at most ``len(data)``
        entries.
    """
    scaled = scale * data
    return [(category, value) for _, value in zip(range(len(data)), scaled)]

# One ridge per decade, each with its own color from the rainbow palette.
decades = [str(decade) for decade in range(1910, 1960, 10)]
palette = [cc.rainbow[i * 15] for i in range(len(decades))]

# Shared x grid (similarity in percent) on which every density is evaluated.
x = linspace(60, 90, 500)
source = ColumnDataSource(data=dict(x=x))

p = figure(
    y_range=decades,
    width=900,
    x_range=(65, 85),
    # toolbar_location=None
)

for i, decade in enumerate(reversed(decades)):
    print(decade)
    similarities = get_similarities_from_json(int(decade))

    # Convert to percentages in a new list. The helper's dict is left
    # untouched rather than being rewritten while it is iterated.
    percentages = [value * 100 for value in similarities.values()]
    pdf = gaussian_kde(percentages)

    # Shrink the density before ridge() applies its own scale factor.
    pdf_x = pdf(x) / 5

    y = ridge(decade, pdf_x)
    source.add(y, decade)
    p.patch('x', decade, color=palette[i], alpha=0.6, line_color="black", source=source)

p.outline_line_color = None
p.background_fill_color = "#efefef"

# Ticks every 10 %, labelled with a percent sign.
p.xaxis.ticker = FixedTicker(ticks=list(range(0, 101, 10)))
p.xaxis.formatter = PrintfTickFormatter(format="%d%%")

# Vertical grid lines only, aligned with the x-axis ticks.
p.ygrid.grid_line_color = None
p.xgrid.grid_line_color = "#dddddd"
p.xgrid.ticker = p.xaxis.ticker

p.axis.minor_tick_line_color = None
p.axis.major_tick_line_color = None
p.axis.axis_line_color = None

p.y_range.range_padding = 0.12

show(p)

1950
1940
1930
1920
1910


In [37]:
import colorcet as cc
from numpy import linspace
from scipy.stats import gaussian_kde

from bokeh.models import ColumnDataSource, FixedTicker, PrintfTickFormatter
from bokeh.plotting import figure, show
from bokeh.sampledata.perceptions import probly


def ridge(category, data, scale=20):
    """Pair every scaled value with its category label.

    Args:
        category: label repeated as the first element of every pair.
        data: sized sequence of values; it is multiplied by ``scale`` as a
            whole (``scale * data``), so an array is scaled element-wise.
        scale: factor applied to ``data``. Defaults to 20.

    Returns:
        A list of ``(category, value)`` tuples with at most ``len(data)``
        entries.
    """
    scaled = scale * data
    return [(category, value) for _, value in zip(range(len(data)), scaled)]

# Categories of the `probly` sample data, in reverse key order for the y axis.
cats = list(probly.keys())[::-1]

# One palette entry per ridge, picked at every 15th step of the rainbow map.
palette = [cc.rainbow[15 * i] for i in range(17)]

# Shared x grid on which every density is evaluated.
x = linspace(-20, 110, 500)
source = ColumnDataSource(data={"x": x})

p = figure(y_range=cats, width=900, x_range=(-5, 105), toolbar_location=None)

# Walk the categories in their original key order and add one patch each.
for i, cat in enumerate(cats[::-1]):
    pdf = gaussian_kde(probly[cat])
    y = ridge(cat, pdf(x))
    source.add(y, cat)
    p.patch('x', cat, source=source, color=palette[i], alpha=0.6, line_color="black")

# Frame and background.
p.background_fill_color = "#efefef"
p.outline_line_color = None

# Axis lines and tick marks are hidden; only the labels remain.
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.minor_tick_line_color = None

# Ticks every 10 %, labelled with a percent sign.
p.xaxis.ticker = FixedTicker(ticks=list(range(0, 101, 10)))
p.xaxis.formatter = PrintfTickFormatter(format="%d%%")

# Vertical grid lines only, aligned with the x-axis ticks.
p.xgrid.grid_line_color = "#dddddd"
p.xgrid.ticker = p.xaxis.ticker
p.ygrid.grid_line_color = None

p.y_range.range_padding = 0.12

show(p)