In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import json
import networkx as nx
from project_utils import *
from pandas.io.json import json_normalize
from functools import reduce
from scipy import sparse

In [None]:
# Read the raw TMDB credits export. The comma separator and the double-quote
# quote character are read_csv's defaults, so they are left implicit.
credits_df = pd.read_csv("./data/tmdb_5000_credits.csv")
credits_df.head()

In [None]:
# Read the raw TMDB movies export and expose its `id` column as `movie_id`,
# the column name the later merges in this notebook join on.
movies_df = pd.read_csv("./data/tmdb_5000_movies.csv").rename(
    columns={"id": "movie_id"}
)
movies_df.head()

In [None]:
# Independent copies of the two loaded frames; all processing below starts
# from these rather than from credits_df / movies_df.
credits_df_cp = credits_df.copy()
movies_df_cp = movies_df.copy()

## Clean movies_df_cp

In [None]:
# Number of movies whose raw `genres` field is the empty JSON list "[]".
# Only the inspected column is cast to str instead of the whole frame.
int((movies_df_cp["genres"].astype(str) == "[]").sum())

In [None]:
# Number of movies whose raw `production_companies` field is the empty JSON
# list "[]". Only the inspected column is cast to str instead of the whole frame.
int((movies_df_cp["production_companies"].astype(str) == "[]").sum())

In [None]:
# Number of movies whose raw `production_countries` field is the empty JSON
# list "[]". Only the inspected column is cast to str instead of the whole frame.
int((movies_df_cp["production_countries"].astype(str) == "[]").sum())

In [None]:
# Number of movies whose raw `spoken_languages` field is the empty JSON list
# "[]". Only the inspected column is cast to str instead of the whole frame.
int((movies_df_cp["spoken_languages"].astype(str) == "[]").sum())

## Process movies df

In [None]:
# NOTE: plain assignment, not a copy -- pr_movies_df and movies_df_cp refer to
# the same DataFrame object at this point.
pr_movies_df = movies_df_cp
# Row count before any processing.
len(pr_movies_df)

In [None]:
# Preview of the movies frame before its JSON columns are processed.
pr_movies_df.head()

In [None]:
# Transform json to list of dictionaries within cols
movies_df_json_cols = [
    "genres",
    "production_companies",
    "keywords",
    "production_countries",
    "spoken_languages",
]
# Free-text columns removed once the JSON columns have been processed.
cols_to_drop = ["homepage", "overview", "tagline", "original_title"]
# Dict field handed to col_dict_to_set for every JSON column.
key = "name"
movie_id_col = "movie_id"

# col_json_to_dict and col_dict_to_set come from project_utils (star import).
# Presumably the first parses the JSON strings into lists of dicts and the
# second reduces each list to the set of its `key` values -- TODO confirm
# against project_utils.
pr_movies_df = col_json_to_dict(pr_movies_df, movies_df_json_cols)
for col in movies_df_json_cols:
    pr_movies_df = col_dict_to_set(pr_movies_df, col, key)
pr_movies_df = pr_movies_df.drop(columns=cols_to_drop)
pr_movies_df.head()

## Process credits df

In [None]:
# Build one row per (movie, selected actor) from the raw credits table.
# NOTE: plain assignment -- pr_credits_df initially aliases credits_df_cp.
pr_credits_df = credits_df_cp
credits_df_json_cols = ["cast", "crew"]
key = "name"
# col_json_to_dict, col_dict_to_set and col_filter_dict_with_vals come from
# project_utils (star import). Presumably they parse JSON strings into lists of
# dicts, reduce a list of dicts to the set of its `key` values, and keep only
# the dicts whose field is in `values` -- TODO confirm against project_utils.
pr_credits_df = col_json_to_dict(pr_credits_df, credits_df_json_cols)
# Duplicate the parsed cast under "actors" before "cast" is reduced below, so
# the full actor dicts stay available.
pr_credits_df = pr_credits_df.assign(actors=pr_credits_df[credits_df_json_cols[0]])
# Only "cast" is reduced here; "crew" is filtered first and reduced afterwards.
for col in credits_df_json_cols:
    if col != "crew":
        pr_credits_df = col_dict_to_set(pr_credits_df, col, key)
crew_col = "crew"
job_field = "job"
values = ["Director"]
# Filter dictionaries with Director as job.
# The former trailing `.rename({crew_col: values[0]})` was removed: without
# `columns=` / `axis=1` it targets index labels, so it never renamed anything,
# and the column has to keep the name "crew" for the steps that follow.
pr_credits_df = col_filter_dict_with_vals(pr_credits_df, crew_col, job_field, values)
pr_credits_df = col_dict_to_set(pr_credits_df, crew_col, key)
# Get actors col
# Only select main actors to reduce the size of the dataset
actors_col = "actors"
order_field = "order"
# Values of the `order` field to keep (0 only).
values = [0]
# Same no-op `.rename(...)` removed here as well.
pr_credits_df = col_filter_dict_with_vals(pr_credits_df, actors_col, order_field, values)

# Reads the module-level `key` at call time.
get_value = lambda dict_: dict_.get(key)

# One row per remaining actor dict; rows whose actor entry is missing are dropped.
pr_credits_df = pr_credits_df.explode(actors_col)
pr_credits_df = pr_credits_df[pr_credits_df[actors_col].notna()]
# Keep the full actor dicts: the next cell builds actors_df from them.
actors_series = pr_credits_df[actors_col]
# Replace each actor dict by its `key` value.
pr_credits_df = pr_credits_df.assign(
    **{actors_col: pr_credits_df[actors_col].apply(get_value)}
)
# Remove actor from cast
actor_cast_col = "actor_cast"  # not referenced in the visible cells
pr_credits_df = pr_credits_df.reset_index(drop=True)
remove_from_cast = lambda row: row[credits_df_json_cols[0]] - {row[actors_col]}
pr_credits_df = pr_credits_df.assign(
    **{credits_df_json_cols[0]: pr_credits_df.apply(remove_from_cast, axis=1)}
).drop(columns="title")
pr_credits_df.head()

In [None]:
# One row per distinct actor name, with a sequential integer actor_id.
actor_cols = ["name", "gender", "order"]
# Use the public pd.json_normalize entry point rather than the deprecated
# pandas.io.json alias, and pass a plain list of dicts.
actors_df = pd.json_normalize(actors_series.tolist())[actor_cols]
actors_df = (
    actors_df.rename(columns={actor_cols[0]: actors_col})
    .groupby(actors_col)
    # "max" as a string: builtin callables passed to aggregate are deprecated.
    .aggregate({actor_cols[1]: "max", actor_cols[2]: list})
    # First reset turns the grouping key back into a column; the second one
    # exposes the fresh 0..n-1 index as a column, renamed to actor_id.
    .reset_index()
    .reset_index()
    .rename(columns={"index": "actor_id"})
)
actors_df

In [None]:
# Attach gender, order list and actor_id to every credits row by actor name.
actors_credits_df = pr_credits_df.merge(actors_df, on=actors_col, how="inner")
actors_credits_df.head(20)

In [None]:
movie_id_col = "movie_id"
new_movie_id_col = "new_movie_id"
# Attach the processed movie attributes to every actor/credits row.
actors_movies_credits_df = actors_credits_df.merge(
    pr_movies_df, on=movie_id_col, how="inner"
)
# Set new movie ids: each distinct movie_id gets the row label it has in
# pr_movies_df as its replacement id.
unique_movies_df = (
    pr_movies_df[[movie_id_col]]
    .drop_duplicates()
    .reset_index()
    .rename(columns={"index": new_movie_id_col})
)
# Swap the original movie_id for the new id, keeping the column name movie_id.
actors_movies_credits_df = actors_movies_credits_df.merge(
    unique_movies_df, on=movie_id_col, how="inner"
)
actors_movies_credits_df = actors_movies_credits_df.drop(
    columns=movie_id_col
).rename(columns={new_movie_id_col: movie_id_col})

actors_movies_credits_df.head()

In [None]:
# Lookup table: new movie id, original movie_id and title.
movie_titles = pr_movies_df[["movie_id", "title"]].drop_duplicates()
aux_movies_df = unique_movies_df.merge(movie_titles, on="movie_id")
aux_movies_df.head()

In [None]:
# Columns available after both merges; the aggregation below refers to them by name.
actors_movies_credits_df.columns

In [None]:
# Number of rows in the merged frame.
len(actors_movies_credits_df)

In [None]:
# Collapse the merged frame to one row per actor.
# get_union folds a group of sets into their union.
get_union = lambda s: reduce(set.union, s)
actor_id_col = "actor_id"
actors_agg_df = (
    actors_movies_credits_df.groupby(actors_col)
    .aggregate(
        {
            "movie_id": set,
            "cast": get_union,
            "crew": get_union,
            # String names instead of the builtins max/sum: passing builtin
            # callables to aggregate is deprecated in recent pandas.
            "actor_id": "max",
            "gender": "max",
            "budget": "mean",
            "genres": get_union,
            "keywords": get_union,
            "original_language": set,
            "popularity": "mean",
            "production_companies": get_union,
            "production_countries": get_union,
            "release_date": list,
            "revenue": "mean",
            # Total runtime over the actor's movies (a sum, unlike the means above).
            "runtime": "sum",
            "spoken_languages": get_union,
            "status": list,
            "title": set,
            "vote_average": "mean",
            "vote_count": "mean",
        }
    )
    # Index by actor_id; the actor name (the grouping key) is dropped from the frame.
    .set_index(actor_id_col)
)
actors_agg_df.head()

## Build graph

In [None]:
# One row per actor (indexed by actor_id) with the set-valued columns that the
# adjacency matrices below are built from.
nodes_df = actors_agg_df[["cast", "crew", "production_companies", "genres", "movie_id"]]
nodes_df.head()

In [None]:
# Two-way lookup between actor names and actor ids, built from actors_df.
dict_actor_id = {
    name: id_ for name, id_ in zip(actors_df[actors_col], actors_df[actor_id_col])
}
dict_id_actor = {
    id_: name for id_, name in zip(actors_df[actor_id_col], actors_df[actors_col])
}

## Get cast intersections length mat

## Get cast intersections

In [None]:
# Actor-by-actor matrix built from the "cast" sets.
# get_intersections_length_adj_mat is a project_utils helper; judging by its
# name it returns, for every pair of rows, the size of the intersection of
# their sets -- TODO confirm against project_utils.
col = "cast"
cast_col = nodes_df[col]
cast_adj_raw = get_intersections_length_adj_mat(cast_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
cast_adj_diag = np.diag(np.diag(cast_adj_raw))
cast_adj = cast_adj_raw - cast_adj_diag

In [None]:
# Sparsity pattern of cast_adj (plt.spy marks the non-zero entries).
plt.spy(cast_adj)

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is cast_adj.npy.
np.save("cast_adj", cast_adj)

## Load cast adjacency matrix

In [None]:
# Reload the matrix from cast_adj.npy, replacing the in-memory cast_adj.
cast_adj = np.load("cast_adj.npy")

In [None]:
# Dimensions of the loaded matrix.
cast_adj.shape

In [None]:
# Column sums of cast_adj: a weighted degree per node, because the entries are
# counts rather than 0/1 flags.
node_degree = cast_adj.sum(axis=0)

In [None]:
# Rank nodes by weighted degree (descending) and keep the 20 largest.
# NOTE(review): positions in cast_adj are used as actor_id values here. That
# only holds if the rows of nodes_df are ordered by a gap-free actor_id
# starting at 0 -- TODO confirm.
most_connected_actors_id = np.argsort(-node_degree)[:20]
most_connected_actors = [dict_id_actor.get(id_) for id_ in most_connected_actors_id]
print(most_connected_actors)
# Keep the frame as the cell's last expression so it is actually rendered;
# it previously sat before print() and its value was silently discarded.
actors_df[actors_df[actor_id_col].isin(most_connected_actors_id)]

## Get cast unions length mat

In [None]:
# Actor-by-actor matrix built from the "cast" sets, union variant.
# get_unions_length_adj_mat is a project_utils helper; judging by its name it
# returns, for every pair of rows, the size of the union of their sets --
# TODO confirm against project_utils.
col = "cast"
cast_col = nodes_df[col]
cast_adj_union_raw = get_unions_length_adj_mat(cast_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
cast_adj_union_diag = np.diag(np.diag(cast_adj_union_raw))
cast_adj_union = cast_adj_union_raw - cast_adj_union_diag

In [None]:
# Heat map of cast_adj_union, drawn without interpolation between cells.
plt.imshow(cast_adj_union, cmap="hot", interpolation="none")

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is cast_adj_union.npy.
np.save("cast_adj_union", cast_adj_union)

## Get movies intersections length mat

In [None]:
# Actor-by-actor matrix built from the "movie_id" sets.
# get_intersections_length_adj_mat is a project_utils helper; judging by its
# name it returns, for every pair of rows, the size of the intersection of
# their sets -- TODO confirm against project_utils.
col = "movie_id"
movie_col = nodes_df[col]
movie_adj_raw = get_intersections_length_adj_mat(movie_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
movie_adj_diag = np.diag(np.diag(movie_adj_raw))
movie_adj = movie_adj_raw - movie_adj_diag

In [None]:
# Sparsity pattern of movie_adj (plt.spy marks the non-zero entries).
plt.spy(movie_adj)

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is movie_adj.npy.
np.save("movie_adj", movie_adj)

## Get movies union length mat

In [None]:
# Actor-by-actor matrix built from the "movie_id" sets, union variant.
# get_unions_length_adj_mat is a project_utils helper; judging by its name it
# returns, for every pair of rows, the size of the union of their sets --
# TODO confirm against project_utils.
col = "movie_id"
movie_col = nodes_df[col]
movie_adj_union_raw = get_unions_length_adj_mat(movie_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
movie_adj_union_diag = np.diag(np.diag(movie_adj_union_raw))
movie_adj_union = movie_adj_union_raw - movie_adj_union_diag

In [None]:
# Heat map of movie_adj_union, drawn without interpolation between cells.
plt.imshow(movie_adj_union, cmap="hot", interpolation="none")

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is movie_adj_union.npy.
np.save("movie_adj_union", movie_adj_union)

## Get directors intersections length mat

In [None]:
# Actor-by-actor matrix built from the "crew" sets (the credits processing
# filtered crew on job == "Director" before reducing it to names).
# get_intersections_length_adj_mat is a project_utils helper; judging by its
# name it returns, for every pair of rows, the size of the intersection of
# their sets -- TODO confirm against project_utils.
col = "crew"
# NOTE: this rebinds crew_col, which the credits-processing cell uses as the
# string "crew", to a Series.
crew_col = nodes_df[col]
crew_adj_raw = get_intersections_length_adj_mat(crew_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
crew_adj_diag = np.diag(np.diag(crew_adj_raw))
crew_adj = crew_adj_raw - crew_adj_diag

In [None]:
# Sparsity pattern of crew_adj (plt.spy marks the non-zero entries).
plt.spy(crew_adj)

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is crew_adj.npy.
np.save("crew_adj", crew_adj)

## Get directors union length mat

In [None]:
# Actor-by-actor matrix built from the "crew" sets, union variant.
# get_unions_length_adj_mat is a project_utils helper; judging by its name it
# returns, for every pair of rows, the size of the union of their sets --
# TODO confirm against project_utils.
col = "crew"
# NOTE: this rebinds crew_col, which the credits-processing cell uses as the
# string "crew", to a Series.
crew_col = nodes_df[col]
crew_adj_union_raw = get_unions_length_adj_mat(crew_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
crew_adj_union_diag = np.diag(np.diag(crew_adj_union_raw))
crew_adj_union = crew_adj_union_raw - crew_adj_union_diag

In [None]:
# Heat map of crew_adj_union, drawn without interpolation between cells.
plt.imshow(crew_adj_union, cmap="hot", interpolation="none")

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is crew_adj_union.npy.
np.save("crew_adj_union", crew_adj_union)

## Get production companies intersections length mat

In [None]:
# Actor-by-actor matrix built from the "production_companies" sets.
# get_intersections_length_adj_mat is a project_utils helper; judging by its
# name it returns, for every pair of rows, the size of the intersection of
# their sets -- TODO confirm against project_utils.
col = "production_companies"
prod_comp_col = nodes_df[col]
prod_comp_adj_raw = get_intersections_length_adj_mat(prod_comp_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
prod_comp_adj_diag = np.diag(np.diag(prod_comp_adj_raw))
prod_comp_adj = prod_comp_adj_raw - prod_comp_adj_diag

In [None]:
# Sparsity pattern of prod_comp_adj (plt.spy marks the non-zero entries).
plt.spy(prod_comp_adj)

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is prod_comp_adj.npy.
np.save("prod_comp_adj", prod_comp_adj)

## Get production companies unions length mat

In [None]:
# Actor-by-actor matrix built from the "production_companies" sets, union variant.
# get_unions_length_adj_mat is a project_utils helper; judging by its name it
# returns, for every pair of rows, the size of the union of their sets --
# TODO confirm against project_utils.
col = "production_companies"
prod_comp_col = nodes_df[col]
prod_comp_adj_union_raw = get_unions_length_adj_mat(prod_comp_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
prod_comp_adj_union_diag = np.diag(np.diag(prod_comp_adj_union_raw))
prod_comp_adj_union = prod_comp_adj_union_raw - prod_comp_adj_union_diag

In [None]:
# Heat map of prod_comp_adj_union, drawn without interpolation between cells.
plt.imshow(prod_comp_adj_union, cmap="hot", interpolation="none")

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is prod_comp_adj_union.npy.
np.save("prod_comp_adj_union", prod_comp_adj_union)

## Get genres intersections length mat

In [None]:
# Actor-by-actor matrix built from the "genres" sets.
# get_intersections_length_adj_mat is a project_utils helper; judging by its
# name it returns, for every pair of rows, the size of the intersection of
# their sets -- TODO confirm against project_utils.
col = "genres"
genres_col = nodes_df[col]
genres_adj_raw = get_intersections_length_adj_mat(genres_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
genres_adj_diag = np.diag(np.diag(genres_adj_raw))
genres_adj = genres_adj_raw - genres_adj_diag

In [None]:
# Sparsity pattern of genres_adj (plt.spy marks the non-zero entries).
plt.spy(genres_adj)

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is genres_adj.npy.
np.save("genres_adj", genres_adj)

## Get genres unions length mat

In [None]:
# Actor-by-actor matrix built from the "genres" sets, union variant.
# get_unions_length_adj_mat is a project_utils helper; judging by its name it
# returns, for every pair of rows, the size of the union of their sets --
# TODO confirm against project_utils.
col = "genres"
genres_col = nodes_df[col]
genres_adj_union_raw = get_unions_length_adj_mat(genres_col)
# Subtract the diagonal part so the final matrix has a zero diagonal.
genres_adj_union_diag = np.diag(np.diag(genres_adj_union_raw))
genres_adj_union = genres_adj_union_raw - genres_adj_union_diag

In [None]:
# Heat map of genres_adj_union, drawn without interpolation between cells.
plt.imshow(genres_adj_union, cmap="hot", interpolation="none")

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is genres_adj_union.npy.
np.save("genres_adj_union", genres_adj_union)

## Aggregated adjacency matrix

In [None]:
# Aggregated edge weight: weighted sum of the five intersection matrices
# divided element-wise by the identically weighted sum of the five union
# matrices. Note this is a ratio of weighted sums, not a weighted sum of the
# per-feature ratios.
weighted_intersections = (
    0.3 * cast_adj
    + 0.3 * movie_adj
    + 0.2 * crew_adj
    + 0.1 * genres_adj
    + 0.1 * prod_comp_adj
)
weighted_unions = (
    0.3 * cast_adj_union
    + 0.3 * movie_adj_union
    + 0.2 * crew_adj_union
    + 0.1 * genres_adj_union
    + 0.1 * prod_comp_adj_union
)
# Divide only where the denominator is non-zero and leave 0 elsewhere (at
# least the zeroed diagonal). This replaces dividing everywhere and patching
# the resulting NaNs afterwards, which raised RuntimeWarnings for every 0/0.
agg_adj = np.divide(
    weighted_intersections,
    weighted_unions,
    out=np.zeros_like(weighted_intersections, dtype=float),
    where=weighted_unions != 0,
)
np.sum(agg_adj)

In [None]:
# Sparsity pattern of agg_adj (plt.spy marks the non-zero entries).
plt.spy(agg_adj)

In [None]:
# Heat map of agg_adj, drawn without interpolation between cells.
plt.imshow(agg_adj, cmap="hot", interpolation="none")

In [None]:
# Smallest aggregated weight.
np.min(agg_adj)

In [None]:
# Largest aggregated weight.
np.max(agg_adj)

In [None]:
# Persist the matrix; np.save appends ".npy", so the file is agg_adj.npy.
np.save("agg_adj", agg_adj)

In [None]:
# Display the saved file's contents; the result is not assigned to a variable.
np.load("agg_adj.npy")

## Sparsify graph

In [None]:
# Reload the aggregated matrix from agg_adj.npy, replacing the in-memory agg_adj.
agg_adj = np.load("agg_adj.npy")

In [None]:
def sparsify_adj(adj, epsilon):
    """Return a copy of ``adj`` with every entry ``<= epsilon`` set to 0.

    Entries strictly greater than ``epsilon`` are kept unchanged; the input
    array itself is not modified.
    """
    at_or_below_threshold = adj <= epsilon
    return np.where(at_or_below_threshold, 0, adj)

In [None]:
# Sparsification threshold: the `percentile`-th percentile over all entries of
# agg_adj (zero entries included). The variable is now actually passed to
# np.percentile instead of a second hard-coded 70.
percentile = 70
eps = np.percentile(agg_adj, percentile)
print(eps)

In [None]:
# Set every weight that is <= eps to 0; larger weights are kept as they are.
sparse_agg_adj = sparsify_adj(agg_adj,eps)

## Tests

### Test consistency of data

In [None]:
# Id assigned to this actor name; .get returns None if the name is not a key.
dict_actor_id.get("Sam Worthington")

In [None]:
# actors_df row(s) for this actor name; an empty frame means the name is absent.
actors_df[actors_df["actors"] == "Adriana Barraza"]

In [None]:
# Id assigned to this actor name; .get returns None if the name is not a key.
dict_actor_id.get("Andreas Berg")

In [None]:
# Reverse check: the actors_df row whose actor_id is 1000.
actors_df[actors_df["actor_id"] == 1000]

In [None]:
# Cast set stored for index label 0 (nodes_df is indexed by actor_id).
nodes_df.loc[0, "cast"]

In [None]:
# Test number of actors: count of distinct actor_id values in the index of
# actors_agg_df.
len(np.unique(actors_agg_df.index.values))

In [None]:
# Display the aggregated per-actor frame.
actors_agg_df

In [None]:
# Set of (new) movie ids aggregated for this actor. If the name is missing
# from dict_actor_id, .get returns None and the .loc lookup is done with None.
test_movie_id = actors_agg_df.loc[dict_actor_id.get("Sam Worthington")]["movie_id"]
test_movie_id

In [None]:
# Translate the new movie ids in test_movie_id back to the original movie rows
# and show both ids next to the title.
test_movies_df = unique_movies_df.loc[
    unique_movies_df[new_movie_id_col].isin(test_movie_id)
]
test_movies_df = test_movies_df.merge(pr_movies_df, on=movie_id_col)
test_movies_df[[movie_id_col, new_movie_id_col, "title"]]

In [None]:
"Sam Worthington" in pr_credits_df.loc[0, "cast"]

In [None]:
# Full first row of the processed credits frame (index label 0).
pr_credits_df.loc[0]