# Network Graphs

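The aim of this notebook is to build a network graph of movies: each movie is a node, two movies are connected by a weighted edge when their similarity exceeds a chosen threshold, and every node is annotated with metadata (name, Wikipedia id, rating, release year) and graph measures (degree, betweenness centrality).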

## Imports

In [None]:
import itertools

import pandas as pd
import networkx as nx

from tqdm import tqdm
from scripts.helpers import get_similarities_from_json


In [None]:
# Load the preprocessed movie table (first CSV column is the saved index)
# and re-index it by the `wikipedia_id` column.
df = pd.read_csv(
    '../data/processed/preprocessed.csv',
    index_col=0,
).set_index('wikipedia_id')
print(df.shape)
df.head(5)

In [None]:

# Pairwise similarities; the keys are (movie, movie) pairs and the values
# are the similarity scores (see how they are unpacked further below).
similarity_dict = get_similarities_from_json(1920)

# Flatten all key pairs and de-duplicate to get every movie that occurs in
# at least one pair.
unique_movies = set(itertools.chain.from_iterable(similarity_dict))
movies = list(unique_movies)

In [None]:
# Plot a 50-bin histogram of all similarity scores to see how they are
# distributed. The result is bound to `_` so the cell does not echo the
# returned Axes object.
similarity_values = [*similarity_dict.values()]
similarity_series = pd.Series(similarity_values)
_ = similarity_series.plot.hist(
    bins=50,
    title='Distribution of similarity values',
)

In [None]:
# Summary statistics of the similarity scores; the '75%' entry is used
# later as the edge threshold.
similarity_series = pd.Series(similarity_values)
descriptive_stats = similarity_series.describe()
descriptive_stats

In [None]:
# Keep only the strongest links: a pair of movies becomes an edge when its
# similarity is strictly above the 75th percentile of all similarity values.
similarity_threshold = descriptive_stats['75%']

# Collect the qualifying rows first and build the DataFrame in one go.
# Growing a DataFrame row by row with `.loc[len(df)] = ...` re-allocates on
# every insert and becomes very slow for a large number of pairs.
edge_rows = [
    [movie_1, movie_2, similarity]
    for (movie_1, movie_2), similarity in tqdm(similarity_dict.items())
    if similarity > similarity_threshold
]
nodes_df = pd.DataFrame(edge_rows, columns=['Source', 'Target', 'Weight'])

# Undirected graph with one edge per row of the edge list; the similarity is
# stored in the 'Weight' edge attribute.
# NOTE: only movies that have at least one edge above the threshold end up
# in G. If isolated movies should be kept as nodes as well, call
# `G.add_nodes_from(movies)` AFTER this line (doing it before is pointless,
# because this assignment replaces the graph object).
G = nx.from_pandas_edgelist(nodes_df, 'Source', 'Target', 'Weight')

In [None]:
# Attach each movie's title to its node. Node ids are converted with int()
# before being used as labels into `df`'s index.
names = {node_id: df.loc[int(node_id)]['name'] for node_id in G.nodes}
nx.set_node_attributes(G, values=names, name='name')

In [None]:
# Store the integer form of every node id as a `wikipedia_id` attribute.
# NOTE: this rebinds `names`, which from here on maps node -> integer id
# rather than node -> title.
names = {node_id: int(node_id) for node_id in G.nodes}
nx.set_node_attributes(G, values=names, name='wikipedia_id')

In [None]:
# Betweenness centrality of every node, stored as the `betweenness`
# node attribute.
betweenness = nx.betweenness_centrality(G)
nx.set_node_attributes(G, betweenness, 'betweenness')

In [None]:
# Number of edges incident to each node, stored as the `degree`
# node attribute.
degrees = dict(G.degree())
nx.set_node_attributes(G, degrees, 'degree')

In [None]:
# Attach the movie rating (the `rating` column of `df`) to every node.
ratings_dict = {node_id: df.loc[int(node_id)]['rating'] for node_id in G.nodes}
nx.set_node_attributes(G, values=ratings_dict, name='rating')

In [None]:
# Attach the release year (the `release_year` column of `df`) to every node.
date_dict = {
    node_id: df.loc[int(node_id)]['release_year']
    for node_id in G.nodes
}
nx.set_node_attributes(G, values=date_dict, name="release_year")