# Embedding 

The aim of this notebook is to test the embedding model and check whether the results look sensible to the naked eye.
In addition, Bokeh and pyvis are tested for generating the network graphs.

## Imports

In [2]:
import copy
# add root path to system path
import sys

sys.path.append('../')

In [13]:
import itertools

import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from tqdm import tqdm
from bokeh.plotting import figure, from_networkx
from bokeh.transform import linear_cmap
from bokeh.models import Circle, MultiLine
from bokeh.io import output_notebook, show
from bokeh.palettes import Blues8, viridis
from scripts.helpers import get_embedding, get_similarities_from_json, get_graph_from_pickle, merge_graph_to_df
from pyvis.network import Network

## Data

Note that data preprocessing has been done in [preprocess.ipynb](preprocess.ipynb). That notebook cleans, merges, and saves the data to `data/processed`.

In [4]:
# Read the preprocessed movie table and key it by Wikipedia ID.
# The first CSV column is consumed as the initial index and then replaced by `wikipedia_id`.
df = (
    pd.read_csv('../data/processed/preprocessed.csv', index_col=0)
    .set_index('wikipedia_id')
)
print(df.shape)
df.head(5)

(22394, 9)


Unnamed: 0_level_0,name,release_year,rating,num_votes,plot,freebase_id,languages,countries,genres
wikipedia_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10109752,Miss Jerry,1894.0,5.3,207,After finding out that her father is suffering...,/m/02q23xk,['Silent film'],['United States of America'],"['Short Film', 'Silent film', 'Indie', 'Black-..."
28703057,The Corbett-Fitzsimmons Fight,1897.0,5.3,484,The film no longer exists in its entirety; how...,/m/0czdh_n,[],[],['Sports']
142995,The Story of the Kelly Gang,1906.0,6.0,855,The Story of the Kelly Gangs tone is of sorrow...,/m/0120y4,['English Language'],['Australia'],"['Crime Fiction', 'Silent film', 'Biography', ..."
32986669,Robbery Under Arms,1907.0,4.3,25,Key scenes of the film included the branding o...,/m/04p7yxx,['Silent film'],['Australia'],"['Silent film', 'Drama']"
32987200,"Captain Midnight, the Bush King",1911.0,5.4,18,Edgar Dalimore is the son of wealthy station o...,/m/0h569x9,['Silent film'],[],['Silent film']


### Data Analysis

### Similarity Matrix

Once the plots have been vectorized, we can calculate the similarity between each pair of plots using the dot product.

## Graph Generation

Now that we have the similarities between the movies, we can set a similarity threshold and generate a network of nodes.

### NetworkX

In [5]:
# Load the pickled graph for 1920 (presumably the decade starting in 1920 — confirm in
# scripts.helpers) and merge it into the movie table; the displayed result gains
# `betweenness` and `degree` columns.
# NOTE(review): `df` is rebound to the merged result, so re-running this cell passes the
# already-merged frame back into merge_graph_to_df. Re-run the data-loading cell first.
G = get_graph_from_pickle(1920)
df = merge_graph_to_df(df, G)
df.head(5)

Unnamed: 0,name,release_year,rating,num_votes,plot,freebase_id,languages,countries,genres,betweenness,degree
28776744,The Amateur Gentleman,1920.0,4.0,14,In Regency Britain a young man tries to establ...,/m/0ddf1g6,['English Language'],['United Kingdom'],['Adventure'],0.018261,196
11396051,L'Atlantide,1921.0,6.7,419,"In 1911, two French officers, Capitaine Morhan...",/m/02rb35g,['Silent film'],['France'],"['Silent film', 'Indie', 'World cinema', 'Blac...",0.000532,129
32932657,The Breaking of the Drought,1920.0,5.2,27,Drought causes Jo Galloway to lose possession...,/m/0h3vzny,"['Silent film', 'English Language']",[],['Silent film'],0.000149,57
8691806,The Flapper,1920.0,6.4,412,"16-year-old Genevieve 'Ginger' King , is growi...",/m/027f4nv,"['Silent film', 'English Language']",['United States of America'],"['Silent film', 'Comedy', 'Black-and-white', '...",5.2e-05,65
9862787,Genuine,1920.0,5.9,1105,"Since completing a portrait of Genuine, a high...",/m/02pv61s,"['Silent film', 'German Language']","['Weimar Republic', 'Germany']","['Silent film', 'Horror', 'Indie']",0.000831,153


### Bokeh

In [7]:
# Collect every node's rating so it can be turned into a node-size attribute.
ratings_dict = nx.get_node_attributes(G, 'rating')
names = list(ratings_dict.keys())
ratings = np.array(list(ratings_dict.values()))

# Min-max scaled ratings. Note: the sizes below are derived from the raw ratings, not from these.
ratings_normalized = (ratings - ratings.min()) / (ratings.max() - ratings.min())

# Node size is three times the node's rating, stored on the graph as a node attribute.
adjusted_node_size = {node_id: rating * 3 for node_id, rating in zip(names, ratings.tolist())}
nx.set_node_attributes(G, name='adjusted_node_size', values=adjusted_node_size)

In [8]:
# hacky way to rename the nodes since bokeh couldn't otherwise handle that
mapping = dict((n, i) for i, n in enumerate(G.nodes))
H = nx.relabel_nodes(G, mapping)

In [76]:
# Path of the JavaScript source that is loaded into `js_callback`.
js_file_path = '../scripts/bokeh_callback.js'

# Decode explicitly as UTF-8 so the loaded source does not depend on the platform's
# default locale encoding.
with open(js_file_path, 'r', encoding='utf-8') as js_file:
    js_callback = js_file.read()

In [84]:
from bokeh.models import Slider, CustomJS, TabPanel, Tabs, MultiChoice, NodesAndLinkedEdges
from bokeh.layouts import column
from bokeh import events
from bokeh.plotting import figure, output_file, save, curdoc

# Names of the node attributes meant to drive node styling in the Bokeh plot.
# NOTE(review): in the code visible in this cell only `color_by_this_attribute` is read
# (inside plot_bokeh_graph); `size_by_this_attribute` is not referenced there.
size_by_this_attribute = 'adjusted_node_size'
color_by_this_attribute = 'betweenness'


def _get_slider(graph, attributes):
    """Create a "Release Year" slider wired to the JavaScript callback in `js_callback`.

    Args:
        graph: Bokeh graph renderer. The renderer itself, plus copies of its node
            and edge data-source dicts, are passed to the callback as arguments.
        attributes: Mapping with a 'release_year' entry; its minimum and maximum
            define the slider range.

    Returns:
        The configured Slider, initially set to the smallest release year.
    """
    years = attributes['release_year']
    first_year = min(years)
    last_year = max(years)

    year_slider = Slider(
        title="Release Year",
        start=first_year,
        end=last_year,
        value=first_year,
        step=1,
    )

    # Copies of the data dicts are taken here and handed to the callback alongside the graph
    node_source = graph.node_renderer.data_source
    edge_source = graph.edge_renderer.data_source
    js_args = dict(
        graph=graph,
        node_dict=node_source.data.copy(),
        edges_dict=edge_source.data.copy(),
    )
    on_change = CustomJS(args=js_args, code=js_callback)

    # Fire the callback on every slider change, and register it for the
    # current document's DocumentReady event as well
    year_slider.js_on_change('value', on_change)
    curdoc().on_event(events.DocumentReady, on_change)

    return year_slider


def _get_multichoice(graph, options):
    """Create a MultiChoice widget wired to the JavaScript callback in `js_callback`.

    Args:
        graph: Bokeh graph renderer. The renderer itself, plus copies of its node
            and edge data-source dicts, are passed to the callback as arguments.
        options: Selectable values; all of them are selected initially.

    Returns:
        The configured MultiChoice widget.
    """
    selector = MultiChoice(options=options, value=options)

    # Copies of the data dicts are taken here and handed to the callback alongside the graph
    node_source = graph.node_renderer.data_source
    edge_source = graph.edge_renderer.data_source
    js_args = dict(
        graph=graph,
        node_dict=node_source.data.copy(),
        edges_dict=edge_source.data.copy(),
    )
    on_change = CustomJS(args=js_args, code=js_callback)

    # Fire the callback whenever the selection changes, and register it for the
    # current document's DocumentReady event as well
    selector.js_on_change('value', on_change)
    curdoc().on_event(events.DocumentReady, on_change)

    return selector


def plot_bokeh_graph(graph, seed=None):
    """Build an interactive Bokeh layout (year selector + network plot) for a graph.

    Nodes are positioned with a spring layout, drawn at a fixed size and colored
    by the node attribute named in the module-level `color_by_this_attribute`.
    Selecting or hovering a node also highlights its linked edges.

    Args:
        graph: NetworkX graph. Its nodes must carry 'release_year' and the
            attribute named by `color_by_this_attribute`; the hover tooltips
            additionally reference 'name', 'rating', 'degree' and 'betweenness'.
        seed: Optional seed forwarded to `nx.spring_layout` to make the node
            positions reproducible. The default (None) keeps the previous
            behavior of a different random layout on every call.

    Returns:
        A Bokeh column layout with the release-year MultiChoice widget above the plot.

    Side effects:
        Sets the theme of the current Bokeh document to 'light_minimal'.
    """
    curdoc().theme = 'light_minimal'

    color_palette = viridis(256)

    # establish which categories will appear when hovering over each node
    tooltips = [
        ("Name", "@name"),
        ("Release year", "@release_year"),
        ("Rating", "@rating{0.0}"),
        ("Degree", "@degree{0}"),
        ("Betweenness", "@betweenness{0.00}"),
    ]

    # create a plot — set dimensions and tools (the toolbar itself is hidden)
    plot = figure(
        sizing_mode='stretch_width',
        height=800,
        tooltips=tooltips,
        toolbar_location=None,
        tools="pan,wheel_zoom,tap",
        active_scroll='wheel_zoom'
    )

    plot.axis.visible = False
    plot.grid.visible = False
    plot.outline_line_color = None

    # create a network graph object with spring layout; extra keyword arguments
    # (scale, seed) are forwarded to the layout function
    bokeh_graph = from_networkx(graph, nx.spring_layout, scale=10, seed=seed)

    # color nodes by `color_by_this_attribute`, mapped linearly onto the palette
    # between the attribute's min and max; node size is fixed
    node_attributes = bokeh_graph.node_renderer.data_source.data
    attribute = node_attributes[color_by_this_attribute]
    colormap = linear_cmap(color_by_this_attribute, color_palette, min(attribute), max(attribute))

    bokeh_graph.node_renderer.glyph = Circle(size=15, fill_color=colormap)

    # set edge opacity and width; selected edges are highlighted in yellow
    bokeh_graph.edge_renderer.glyph = MultiLine(line_alpha=0.3, line_width=1, line_color="#CCCCCC")
    bokeh_graph.edge_renderer.selection_glyph = MultiLine(line_alpha=1, line_width=1.3, line_color="yellow")

    bokeh_graph.selection_policy = NodesAndLinkedEdges()
    bokeh_graph.inspection_policy = NodesAndLinkedEdges()

    # add network graph to the plot
    plot.renderers.append(bokeh_graph)

    # one selectable option per distinct release year present in the graph
    unique_years = [str(int(year)) for year in np.unique(node_attributes['release_year']).tolist()]
    widget = _get_multichoice(bokeh_graph, unique_years)

    # Create a layout with the year selector above the plot
    return column(widget, plot, sizing_mode='stretch_width', height=800)


In [85]:
# Load the pickled graph for 1920, build its interactive layout and display it.
graph = get_graph_from_pickle(1920)
layout = plot_bokeh_graph(graph)
show(layout)

In [82]:
# Build one tab per decade and write the tabbed view to a standalone HTML file.
decades = [1920, 1930, 1940]

# Keep the list of panels under its own name so `tabs` only ever refers to the Tabs model.
tab_panels = []
for decade in decades:
    graph = get_graph_from_pickle(decade)
    layout = plot_bokeh_graph(graph)
    tab_panels.append(TabPanel(child=layout, title=str(decade)))

tabs = Tabs(tabs=tab_panels, sizing_mode='stretch_width', height=800)

# Configure Bokeh's output state to write to this file (relative to the notebook's working directory).
output_file('tabs_test.html')
# Trailing semicolon: do not echo the absolute path returned by save() into the cell output,
# which would leak a machine-specific local path into the shared notebook.
save(tabs);

'/Users/jan.kokla/Documents/EPFL/ada-2023-project-adaroundtheworld2023/notebooks/tabs_test.html'