# Build actor DataFrame

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import csv
from project_utils import *
from pandas.io.json import json_normalize
from functools import reduce

%reload_ext lab_black

In [None]:
# Load the raw TMDB credits table. The separator and quote character are
# spelled out explicitly so the parsing rules are visible at a glance.
credits_csv_options = {"sep": ",", "quotechar": '"'}
credits_df = pd.read_csv("./data/tmdb_5000_credits.csv", **credits_csv_options)
credits_df.head()

In [None]:
# Load the raw TMDB movies table, then rename its `id` column to `movie_id`,
# the column name used later when merging with the credits-derived frames.
raw_movies_df = pd.read_csv("./data/tmdb_5000_movies.csv", sep=",", quotechar='"')
movies_df = raw_movies_df.rename(columns={"id": "movie_id"})
movies_df.head()

In [None]:
# `movies_df` is already loaded (with `id` renamed to `movie_id`) by the
# previous cell. This cell used to re-read the very same CSV into the very same
# variable, which only repeated the disk I/O, so it now just previews the frame.
movies_df.head()

## Process movies DataFrame
Flatten and clean movies DataFrame

In [None]:
# Configuration: JSON-encoded columns to flatten, columns to discard, and the
# dictionary field whose value is kept for each flattened entry.
movies_df_json_cols = [
    "genres",
    "production_companies",
    "keywords",
    "production_countries",
    "spoken_languages",
]
cols_to_drop = ["homepage", "overview", "tagline", "original_title"]
key = "name"
movie_id_col = "movie_id"

# Work on a copy so the raw frame stays untouched, and let the project helper
# turn the JSON columns into lists of dictionaries.
pr_movies_df = col_json_to_dict(movies_df.copy(), movies_df_json_cols)

# Reduce each list of dictionaries to the set of its `key` values
# (again through the project helper), one column at a time.
for col in movies_df_json_cols:
    pr_movies_df = col_dict_to_set(pr_movies_df, col, key)

# Discard the columns that are not used downstream.
pr_movies_df = pr_movies_df.drop(labels=cols_to_drop, axis="columns")
pr_movies_df.head()

## Process credits DataFrame
Flatten and clean credits DataFrame

In [None]:
credits_df_json_cols = ["cast", "crew"]
key = "name"

# Parse the JSON columns of a copy of the raw credits into lists of
# dictionaries (project helper); the raw frame is left untouched.
pr_credits_df = col_json_to_dict(credits_df.copy(), credits_df_json_cols)

# Keep the parsed cast dictionaries in a separate "actors" column before the
# cast column itself is reduced to a set of values below.
pr_credits_df = pr_credits_df.assign(actors=pr_credits_df[credits_df_json_cols[0]])

# Reduce every JSON column except "crew" to the set of its `key` values.
for col in credits_df_json_cols:
    if col == "crew":
        # The crew dictionaries are handled separately, so skip them here.
        continue
    pr_credits_df = col_dict_to_set(pr_credits_df, col, key)

In [None]:
# Keep only the crew dictionaries whose job is "Director", then reduce the
# remaining dictionaries to the set of their `key` values.
crew_col = "crew"
job_field = "job"
values = ["Director"]
# NOTE: a trailing `.rename({crew_col: values[0]})` was removed from this call.
# Without `columns=` / `axis=1`, DataFrame.rename maps *index* labels, so it
# never renamed the column -- and the column has to keep the name "crew" for
# the `col_dict_to_set` call right below.
pr_credits_df = col_filter_dict_with_vals(pr_credits_df, crew_col, job_field, values)
pr_credits_df = col_dict_to_set(pr_credits_df, crew_col, key)

In [None]:
# Select which cast members become actor nodes.
actors_col = "actors"
order_field = "order"
# Values of the "order" field that qualify an actor as a node
# (the main actor of a movie has order 0).
values = [0]
# NOTE: a copy-pasted `.rename({crew_col: values[0]})` was removed from this
# call. It mapped index labels (no `columns=` / `axis=1`), so it had no effect
# on the frame, and it made this cell depend on `crew_col` for no reason.
pr_credits_df = col_filter_dict_with_vals(
    pr_credits_df, actors_col, order_field, values
)


def get_value(dict_):
    """Return the value stored under the global `key`, or None if absent."""
    return dict_.get(key)


# One row per selected actor; rows left without any actor are dropped.
pr_credits_df = pr_credits_df.explode(actors_col)
pr_credits_df = pr_credits_df[pr_credits_df[actors_col].notna()]
# Keep the full actor dictionaries before they are replaced by a single value.
actors_series = pr_credits_df[actors_col]
# Replace every actor dictionary by its `key` value.
pr_credits_df = pr_credits_df.assign(
    **{actors_col: pr_credits_df[actors_col].apply(get_value)}
)

In [None]:
# Drop the actor chosen as node from the cast set of the same row.
actor_cast_col = "actor_cast"
pr_credits_df = pr_credits_df.reset_index(drop=True)


def remove_from_cast(row):
    """Return the row's cast set without the row's own actor."""
    return row[credits_df_json_cols[0]] - {row[actors_col]}


cast_without_actor = pr_credits_df.apply(remove_from_cast, axis=1)
pr_credits_df = pr_credits_df.assign(**{credits_df_json_cols[0]: cast_without_actor})
pr_credits_df = pr_credits_df.drop(columns="title")
pr_credits_df.head()

## Build actors index DataFrame
Get actors index and most relevant features

In [None]:
# Build one row per distinct actor from the saved actor dictionaries.
actor_cols = ["name", "gender", "order"]
# `pd.json_normalize` is the public entry point (pandas >= 1.0); the
# `pandas.io.json.json_normalize` spelling is deprecated.
actors_idx_df = pd.json_normalize(actors_series)[actor_cols]
actors_idx_df = (
    actors_idx_df.rename(columns={actor_cols[0]: actors_col})
    .groupby(actors_col)
    # "max" (string) instead of the builtin `max`: pandas currently maps the
    # builtin to the groupby max anyway, but passing the builtin is deprecated
    # and will change meaning; the string keeps today's result.
    .aggregate({actor_cols[1]: "max", actor_cols[2]: list})
    # First reset: the actor name goes from group key back to a column.
    .reset_index()
    # Second reset: the fresh positional index becomes the "index" column,
    # which is then used as the actor id.
    .reset_index()
    .rename(columns={"index": "actor_id"})
)
actors_idx_df.head()

## Build actors_index-credits DataFrame
Add actor features to the credits DataFrame

In [None]:
# Attach the per-actor features to the credits rows; both frames share the
# actor column, and the inner join keeps only actors present on both sides.
actors_idx_credits_df = pd.merge(
    pr_credits_df, actors_idx_df, on=actors_col, how="inner"
)
actors_idx_credits_df.head(20)

## Build movies index DataFrame
Re-index movies.

In [None]:
movie_id_col = "movie_id"
title_col = "title"
new_movie_id_col = "new_movie_id"

# Distinct (movie id, title) pairs.
unique_movies_df = pr_movies_df[[movie_id_col, title_col]].drop_duplicates()
# Turn the frame's index into the new movie id column.
# NOTE(review): the index is not renumbered after `drop_duplicates`, so the new
# ids are the original row labels and may have gaps if duplicates were removed.
movies_idx_df = unique_movies_df.reset_index()
movies_idx_df = movies_idx_df.rename(columns={"index": new_movie_id_col})
movies_idx_df.head()

## Build actors-movies-credits DataFrame
Get actors from each movie and append their respective credits

In [None]:
# Step 1: attach the processed movie attributes to every actor/credits row.
actors_movies_credits_df = pd.merge(
    actors_idx_credits_df, pr_movies_df, on=movie_id_col, how="inner"
)

# Step 2: bring in the re-indexed movie id (the title is dropped from the
# lookup so it is not duplicated in the result).
movie_id_lookup_df = movies_idx_df.drop(columns=title_col)
actors_movies_credits_df = pd.merge(
    actors_movies_credits_df, movie_id_lookup_df, on=movie_id_col, how="inner"
)

# Step 3: the new id replaces the original one under the original column name.
actors_movies_credits_df = actors_movies_credits_df.drop(columns=movie_id_col)
actors_movies_credits_df = actors_movies_credits_df.rename(
    columns={new_movie_id_col: movie_id_col}
)
actors_movies_credits_df.head()

## Build actors aggregated DataFrame

In [None]:
def get_union(s):
    """Return the union of all the sets contained in `s`."""
    return reduce(set.union, s)


# Collapse the actor/movie rows into one row per actor.
# "max" / "sum" are passed as strings rather than as the builtins `max` / `sum`:
# pandas currently maps those builtins to the groupby max / sum anyway, but
# passing the builtins is deprecated and will change meaning, so the strings
# keep today's results.
actor_id_col = "actor_id"
actors_agg_df = (
    actors_movies_credits_df.groupby(actors_col, as_index=False)
    .aggregate(
        {
            "movie_id": set,
            "cast": get_union,
            "crew": get_union,
            "actor_id": "max",
            "gender": "max",
            "budget": "mean",
            "genres": get_union,
            "keywords": get_union,
            "original_language": set,
            "popularity": "mean",
            "production_companies": get_union,
            "production_countries": get_union,
            "release_date": list,
            "revenue": "mean",
            "runtime": "sum",
            "spoken_languages": get_union,
            "status": list,
            "title": set,
            "vote_average": "mean",
            "vote_count": "mean",
        }
    )
    # The actor id becomes the index of the aggregated frame.
    .set_index(actor_id_col)
)
actors_agg_df.head()

In [None]:
# Write the aggregated actors table to a CSV file in the working directory.
actors_agg_df.to_csv(path_or_buf="actors_agg_df.csv")

## Tests

In [None]:
# Read the exported table back, then strip every apostrophe from its string
# cells (regex replacement applied to the whole frame).
actors_agg_df = pd.read_csv("actors_agg_df.csv")
actors_agg_df = actors_agg_df.replace(to_replace="'", value="", regex=True)

In [None]:
# Preview the first rows of the reloaded table.
actors_agg_df.head(n=5)