In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import sparse
from project_utils import *

# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell execution, so edits to local helper modules (e.g. project_utils)
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Build graph

In [None]:
actors_agg_df = pd.read_pickle("actors_agg_df.pkl")

In [None]:
nodes_df = actors_agg_df[["cast","actor_name","crew", "production_companies", "genres", "movie_id"]]
nodes_df.head()

In [None]:
# Disabled: this lookup was placed before `cast_adj`, `test_actor_1_id` and
# `test_actor_2_id` exist (all three are only created in later cells), so it
# raised NameError on a fresh "Restart Kernel -> Run All".
# Run it after the cast-adjacency test cell if the value needs to be inspected:
# cast_adj[test_actor_1_id, test_actor_2_id]

## Get translations of ids to names

In [None]:
actors_col ="actor_name"

In [None]:
dict_id_actor = dict(zip(actors_agg_df.index, actors_agg_df[actors_col]))
dict_actor_id = dict(zip(actors_agg_df[actors_col], actors_agg_df.index))

## Get cast intersections length mat

## Get cast intersections

In [None]:
col = "cast"
cast_col = nodes_df[col]
cast_adj = get_intersections_length_adj_mat(cast_col)

In [None]:
plt.spy(cast_adj)

In [None]:
np.save("cast_adj", cast_adj)

**Test whether `cast_adj[i, j]` equals the cardinality of the intersection of the casts of node `i` and node `j`**

In [None]:
# Pick two known actors and keep the index label of their first matching row.
# The labels are used directly as row/column positions of cast_adj below, which
# is exactly the correspondence this check is meant to confirm.
test_actor_1 = nodes_df.loc[nodes_df["actor_name"] == "Brad Pitt"]
test_actor_1_id = test_actor_1.index[0]
test_actor_2 = nodes_df.loc[nodes_df["actor_name"] == "Angelina Jolie"]
test_actor_2_id = test_actor_2.index[0]

# Expected value: size of the intersection of the two cast collections.
cast_of_actor_1 = test_actor_1["cast"][test_actor_1_id]
cast_of_actor_2 = test_actor_2["cast"][test_actor_2_id]
cast_adj_2_1 = len(cast_of_actor_2.intersection(cast_of_actor_1))

# The matrix must agree with the expected value in both orientations.
assert cast_adj_2_1 == cast_adj[test_actor_1_id, test_actor_2_id], "The cast adjacency entries don't correspond to actor ids"
assert cast_adj_2_1 == cast_adj[test_actor_2_id, test_actor_1_id], "cast_adj is not symmetric"
print("Test passed")

### Print most connected actors

In [None]:
actor_id_col = "actor_id"
cast_node_deg = cast_adj.sum(axis=1)
most_connected_actors_id = np.argsort(-cast_node_deg)[:20]
most_connected_actors = [dict_id_actor.get(id_) for id_ in most_connected_actors_id]
print(most_connected_actors)

In [None]:
cast_node_deg_series = pd.Series(cast_node_deg)
plot_hist(cast_node_deg_series,"Cast adjacency node degree distribution","node degree","count",log = True)

In [None]:
cast_node_deg_series.describe()

## Get cast unions length mat

In [None]:
col = "cast"
cast_col = nodes_df[col]
cast_adj_union_raw = get_unions_length_adj_mat(cast_col)
cast_adj_union_diag = np.diag(np.diag(cast_adj_union_raw))
cast_adj_union = cast_adj_union_raw - cast_adj_union_diag

In [None]:
n_union_cast_members = cast_adj_union.flatten()

In [None]:
n_union_cast_member_series = pd.Series(n_union_cast_members)

In [None]:
np.max(cast_adj_union)

In [None]:
plt.imshow(cast_adj_union, cmap="hot", interpolation="none")

In [None]:
np.save("cast_adj_union", cast_adj_union)

## Get movies intersections length mat

In [None]:
col = "movie_id"
movie_col = nodes_df[col]
movie_adj_raw = get_intersections_length_adj_mat(movie_col)
movie_adj_diag = np.diag(np.diag(movie_adj_raw))
movie_adj = movie_adj_raw - movie_adj_diag

In [None]:
plt.spy(movie_adj)

In [None]:
n_shared_movies = pd.Series(movie_adj.flatten())
plot_hist(n_shared_movies,"Number shared movies histogram","# shared movies","count",log=True)

In [None]:
n_shared_movies.describe()

In [None]:
node_degree_movie_adj = movie_adj.sum(axis = 1)
print("Number of disconnected nodes: " + str((node_degree_movie_adj == 0).sum()))

In [None]:
np.save("movie_adj", movie_adj)

## Get movies union length mat

In [None]:
col = "movie_id"
movie_col = nodes_df[col]
movie_adj_union_raw = get_unions_length_adj_mat(movie_col)
movie_adj_union_diag = np.diag(np.diag(movie_adj_union_raw))
movie_adj_union = movie_adj_union_raw - movie_adj_union_diag

In [None]:
plt.imshow(movie_adj_union, cmap="hot", interpolation="none")

In [None]:
np.save("movie_adj_union", movie_adj_union)

## Get directors intersections length mat

In [None]:
col = "crew"
crew_col = nodes_df[col]
crew_adj_raw = get_intersections_length_adj_mat(crew_col)
crew_adj_diag = np.diag(np.diag(crew_adj_raw))
crew_adj = crew_adj_raw - crew_adj_diag

In [None]:
plt.spy(crew_adj)

In [None]:
np.save("crew_adj", crew_adj)

## Get directors union length mat

In [None]:
col = "crew"
crew_col = nodes_df[col]
crew_adj_union_raw = get_unions_length_adj_mat(crew_col)
crew_adj_union_diag = np.diag(np.diag(crew_adj_union_raw))
crew_adj_union = crew_adj_union_raw - crew_adj_union_diag

In [None]:
plt.imshow(crew_adj_union, cmap="hot", interpolation="none")

In [None]:
np.save("crew_adj_union", crew_adj_union)

## Get production companies intersections length mat

In [None]:
col = "production_companies"
prod_comp_col = nodes_df[col]
prod_comp_adj_raw = get_intersections_length_adj_mat(prod_comp_col)
prod_comp_adj_diag = np.diag(np.diag(prod_comp_adj_raw))
prod_comp_adj = prod_comp_adj_raw - prod_comp_adj_diag

In [None]:
plt.spy(prod_comp_adj)

In [None]:
np.save("prod_comp_adj", prod_comp_adj)

## Get production companies unions length mat

In [None]:
col = "production_companies"
prod_comp_col = nodes_df[col]
prod_comp_adj_union_raw = get_unions_length_adj_mat(prod_comp_col)
prod_comp_adj_union_diag = np.diag(np.diag(prod_comp_adj_union_raw))
prod_comp_adj_union = prod_comp_adj_union_raw - prod_comp_adj_union_diag

In [None]:
plt.imshow(prod_comp_adj_union, cmap="hot", interpolation="none")

In [None]:
np.save("prod_comp_adj_union", prod_comp_adj_union)

## Get genres intersections length mat

In [None]:
col = "genres"
genres_col = nodes_df[col]
genres_adj_raw = get_intersections_length_adj_mat(genres_col)
genres_adj_diag = np.diag(np.diag(genres_adj_raw))
genres_adj = genres_adj_raw - genres_adj_diag

In [None]:
plt.spy(genres_adj)

In [None]:
np.save("genres_adj", genres_adj)

## Get genres unions length mat

In [None]:
col = "genres"
genres_col = nodes_df[col]
genres_adj_union_raw = get_unions_length_adj_mat(genres_col)
genres_adj_union_diag = np.diag(np.diag(genres_adj_union_raw))
genres_adj_union = genres_adj_union_raw - genres_adj_union_diag

In [None]:
plt.imshow(genres_adj_union, cmap="hot", interpolation="none")

In [None]:
np.save("genres_adj_union", genres_adj_union)

## Aggregated adjacency matrix

In [None]:
# agg_adj = np.divide(
#     (
#         0.3 * cast_adj
#         + 0.3 * movie_adj
#         + 0.2 * crew_adj
#         + 0.1 * genres_adj
#         + 0.1 * prod_comp_adj
#     ),
#     (
#         0.3 * cast_adj_union
#         + 0.3 * movie_adj_union
#         + 0.2 * crew_adj_union
#         + 0.1 * genres_adj_union
#         + 0.1 * prod_comp_adj_union
#     ),
# )
agg_adj = cast_adj+  movie_adj + crew_adj + prod_comp_adj
agg_adj = agg_adj/np.max(agg_adj)     
        
agg_adj = np.where(np.isnan(agg_adj), 0, agg_adj)
np.sum(agg_adj)

In [None]:
plt.spy(agg_adj)

In [None]:
plt.imshow(agg_adj, cmap="hot", interpolation="none")

In [None]:
np.min(agg_adj)

In [None]:
np.max(agg_adj)

In [None]:
np.save("agg_adj", agg_adj)

## Sparsify graph

In [None]:
percentile = 70
eps = np.percentile(agg_adj, percentile)
print(eps)

In [None]:
sparse_agg_adj = sparsify_mat(agg_adj, eps)

In [None]:
plt.spy(sparse_agg_adj)

In [None]:
np.save("sparse_agg_actor_adj", sparse_agg_adj)