# Build IMDb actor dataframe

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import csv
from ast import literal_eval
from project_utils import *
from pandas.io.json import json_normalize
from functools import reduce

%reload_ext lab_black
%load_ext autoreload
%autoreload 2

In [None]:
# Raw credits table; its cast and crew columns are parsed as JSON further down.
credits_df = pd.read_csv(
    "./data/tmdb_5000_credits.csv",
    quotechar='"',
    sep=",",
)
credits_df.head()

In [None]:
# Raw movies table. The "id" column is renamed to "movie_id" so it shares its
# name with the key column used when merging with the credits table.
movies_df = pd.read_csv("./data/tmdb_5000_movies.csv", sep=",", quotechar='"')
movies_df = movies_df.rename(columns={"id": "movie_id"})
movies_df.head()

## Process movies DataFrame
Flatten and clean movies DataFrame

1. Remove movies with invalid data

In [None]:
# Work on a copy so the raw `movies_df` stays available, then keep only the
# movies whose metadata passes every check below.
pr_movies_df = movies_df.copy()
# Minimum number of votes a movie needs in order to be kept.
min_vote_count = 40
pr_movies_df = pr_movies_df[
    # Only the inspected column is cast to str (not the whole frame); an empty
    # list serialises to the literal "[]".
    (pr_movies_df["genres"].astype(str) != "[]")
    & (pr_movies_df["production_companies"].astype(str) != "[]")
    # Presumably a 0 in these numeric columns marks a missing value -- confirm
    # against the dataset description.
    & (pr_movies_df["budget"] != 0)
    & (pr_movies_df["revenue"] != 0)
    & (pr_movies_df["popularity"] != 0)
    & (pr_movies_df["runtime"] != 0)
    & (pr_movies_df["vote_average"] != 0)
    & (pr_movies_df["vote_count"] >= min_vote_count)
]

2. Save a valid movies dataframe for unit tests

In [None]:
# Snapshot of the filtered movies taken before any column is added, parsed or
# dropped; the tests at the end of the notebook join against it.
valid_movies_df = pr_movies_df.copy()

3. Normalize vote_average according to weighted rating formula:

\begin{equation*}
    WR = \frac{vr+mc}{v+m}
\end{equation*}
where: <br>
v: is the vote_count <br>
r: is the average rating of the movie <br>
m: is the minimum vote count <br>
c: is the average of the average rating across all the movies <br>

In [None]:
# Name of the column that stores the normalized rating.
norm_vote_avg_col = "norm_vote_avg"
# `normalize_vote_rating` comes from project_utils; presumably it implements
# the weighted-rating formula -- confirm against its implementation.
norm_vote_avg = normalize_vote_rating(
    pr_movies_df.loc[:, "vote_average"], pr_movies_df.loc[:, "vote_count"]
)
pr_movies_df = pr_movies_df.assign(**{norm_vote_avg_col: norm_vote_avg})

4. **Check if normalization using the weighted rating formula gives reasonable values:**

In [None]:
# First rows: raw rating, vote count and normalized rating side by side.
pr_movies_df[["vote_average", "vote_count", norm_vote_avg_col]].head()

In [None]:
# Last rows of the same three columns.
pr_movies_df[["vote_average", "vote_count", norm_vote_avg_col]].tail()

5. Transform json to list of dictionaries for the selected cols

In [None]:
# Movie columns handed to `col_json_to_dict` (project_utils); presumably each
# cell holds a JSON string that becomes a list of dicts -- confirm in the helper.
movies_df_json_cols = [
    "genres",
    "production_companies",
    "keywords",
    "production_countries",
]
pr_movies_df = col_json_to_dict(pr_movies_df, movies_df_json_cols)
pr_movies_df.head()

5. Transform the list of dictionaries to a set with relevant values

In [None]:
# Dict field passed to `col_dict_to_set` (project_utils) for every parsed
# column; presumably each cell becomes the set of its "name" values -- confirm.
key = "name"
for col in movies_df_json_cols:
    pr_movies_df = col_dict_to_set(pr_movies_df, col, key)
pr_movies_df.head()

6. Drop columns that are not needed

In [None]:
# Columns removed from the processed movies table.
cols_to_drop = [
    "homepage",
    "overview",
    "tagline",
    "original_title",
    "spoken_languages",
]
pr_movies_df = pr_movies_df.drop(cols_to_drop, axis=1)
pr_movies_df.head()

7. Reindex movies

In [None]:
new_movie_id_col = "new_movie_id"
# reset_index() moves the current row labels into a column named "index",
# which is then renamed to become the new movie id.
# NOTE(review): the new ids are therefore the labels the rows carried after
# filtering, not a compact 0..n-1 range -- confirm this is intended.
movie_id_with_label = pr_movies_df["movie_id"].reset_index()
new_movie_id_df = movie_id_with_label.rename(columns={"index": new_movie_id_col})
new_movie_id_df.head()

In [None]:
# Number of movies that survived the cleaning steps.
print(f"Number of movies reduced to: {len(new_movie_id_df)}")

## Process credits DataFrame
Flatten and clean credits DataFrame

1. Filter the credits only for valid movies

In [None]:
# The inner join keeps only the credits of movies present in `new_movie_id_df`
# and attaches their new id. merge() returns a new frame, so `credits_df` is
# left untouched.
pr_credits_df = credits_df.merge(new_movie_id_df, how="inner", on="movie_id")
pr_credits_df.head()

2. Clean the credits

In [None]:
print(f"Raw credits dataframe length: {len(pr_credits_df)}")
# A row is dropped only when BOTH cast and crew are empty lists ("[]").
# NOTE(review): rows with just one of the two empty are kept here -- confirm
# that this is intended.
pr_credits_df = pr_credits_df[
    ~(
        (pr_credits_df["cast"].astype(str) == "[]")
        & (pr_credits_df["crew"].astype(str) == "[]")
    )
]
print(f"Clean credits dataframe length: {len(pr_credits_df)}")

3. Parse json cols to list of dictionaries

In [None]:
# Credits columns handed to `col_json_to_dict` (project_utils); presumably each
# cell becomes a list of dicts -- confirm in the helper.
credits_df_json_cols = ["cast", "crew"]
pr_credits_df = col_json_to_dict(pr_credits_df, credits_df_json_cols)
pr_credits_df.head()

4. Create the actors column

In [None]:
# New column that starts as a copy of the first JSON column ("cast"); later
# cells filter it and reduce it to a single actor name per row.
actors_col = "actor_name"
pr_credits_df = pr_credits_df.assign(
    **{actors_col: pr_credits_df[credits_df_json_cols[0]]}
)
pr_credits_df.head()

5. Transform the list of dictionaries in the cast column into sets with relevant values

In [None]:
# Dict field used for the cast column; `key` is also read by later cells.
key = "name"
# `col_dict_to_set` is a project_utils helper; the later set difference on this
# column relies on it producing sets.
pr_credits_df = col_dict_to_set(pr_credits_df, "cast", key)
pr_credits_df.head()

6. Filter crew dictionaries with Director as job

In [None]:
crew_col = "crew"
job_field = "job"
# Only crew entries whose job is one of these values are requested.
values = ["Director"]
pr_credits_df = col_filter_dict_with_vals(pr_credits_df, crew_col, job_field, values)
# `key` is the "name" field set in an earlier cell.
pr_credits_df = col_dict_to_set(pr_credits_df, crew_col, key)
# Drop the rows whose crew column ended up missing.
pr_credits_df = pr_credits_df.dropna(subset=[crew_col])
pr_credits_df.head()

7. Get the list of unique actors

In [None]:
order_key = "order"
# Billing positions (the "order" field; the main actor has order 0) whose
# actors are taken as nodes.
values = [0]


def get_value(dict_):
    """Return the entry stored under the notebook-level `key` ("name" here)."""
    return dict_.get(key)


sel_order_actors_df = col_filter_dict_with_vals(
    pr_credits_df, actors_col, order_key, values
)
# One row per selected cast entry; the NaN rows are counted and then dropped.
actors_series = sel_order_actors_df[actors_col].explode()
n_nan = actors_series.isna().sum()
print(f"There are {n_nan:2d} missing values")
actors_series = actors_series.dropna()
# dict.fromkeys de-duplicates while keeping first-seen order, so the list (and
# the sample printed below) is identical on every run; the order of a plain
# set of strings changes between interpreter sessions.
actors_list = list(dict.fromkeys(actors_series.apply(get_value)))
print(f"The list of actors contains {len(actors_list):2d} entries")
print(actors_list[:5])

8. Get the actors that are in the list

In [None]:
name_key = "name"
# Keep only the cast entries whose name is in the list of selected actors.
values = actors_list
pr_credits_df = col_filter_dict_with_vals(pr_credits_df, actors_col, name_key, values)
# One row per (movie, selected actor); rows left without an actor are dropped.
pr_credits_df = pr_credits_df.explode(actors_col).dropna(subset=[actors_col])
# Keep the full actor dicts: the actors index is built from them later, after
# the column below has been reduced to plain names.
actors_series = pr_credits_df[actors_col]
pr_credits_df = pr_credits_df.assign(**{actors_col: actors_series.map(get_value)})
pr_credits_df.head()

9. Remove actors taken as nodes from cast column

In [None]:
actor_cast_col = "actor_cast"
# The explode step left duplicated row labels; start again from a fresh index.
pr_credits_df = pr_credits_df.reset_index(drop=True)


def remove_from_cast(row):
    """Return the row's cast set without the actor that the row belongs to."""
    return row[credits_df_json_cols[0]] - {row[actors_col]}


pr_credits_df = pr_credits_df.assign(
    **{credits_df_json_cols[0]: pr_credits_df.apply(remove_from_cast, axis=1)}
).drop("title", axis=1)
pr_credits_df.head()

10. **Check that the actor was actually removed from the cast**

In [None]:
test_actor_name = "Sam Worthington"
test_df = pr_credits_df.loc[pr_credits_df[actors_col] == test_actor_name]
# The loop is a no-op (and the check passes) when the actor has no rows.
for cast in test_df["cast"]:
    assert test_actor_name not in cast, "Actor taken as node should not be in cast"
print("Test passed")

## Build actors index DataFrame

1. Get actors index and most relevant features

In [None]:
# Fields of each actor dict that are kept in the actors index.
actor_cols = ["name", "gender", "order"]
# `pd.json_normalize` is the supported top-level entry point (the
# `pandas.io.json.json_normalize` alias is deprecated); it is fed a plain list
# of dicts.
actors_idx_df = pd.json_normalize(actors_series.tolist())[actor_cols]
actors_idx_df = (
    actors_idx_df.rename(columns={actor_cols[0]: actors_col})
    .groupby(actors_col)
    # "max" is passed by name: handing the builtin `max` to aggregate() is
    # deprecated in recent pandas.
    .aggregate({actor_cols[1]: "max", actor_cols[2]: list})
    # First reset turns the actor name back into a column; the second one
    # materialises the 0..n-1 row number, which becomes the actor id.
    .reset_index()
    .reset_index()
    .rename(columns={"index": "actor_id"})
)
actors_idx_df.head()

In [None]:
# Size of the actors index (one row per actor name).
print(f"The number of unique actors is: {len(actors_idx_df)}")

## Build actors_index-credits DataFrame
Add actor features to the credits DataFrame

In [None]:
# Attach the actor id and per-actor features to every credit row. The new
# movie id column is dropped from the credits before the join.
credits_without_new_id = pr_credits_df.drop(columns=new_movie_id_col)
actors_idx_credits_df = credits_without_new_id.merge(
    actors_idx_df, on=actors_col, how="inner"
)
actors_idx_credits_df.head(15)

## Build movies index DataFrame
Re-index movies 

In [None]:
# movie_id_col = "movie_id"
# title_col = "title"
# new_movie_id_col = "new_movie_id"
# pr_movies_new_id_df = pr_movies_df.merge(new_movie_id_df, on=movie_id_col, how="inner")
# movies_idx_df = (
#     pr_movies_new_id_df[[movie_id_col, title_col, new_movie_id_col]].drop_duplicates()
#     #     .reset_index()
#     .rename(columns={"index": new_movie_id_col})
# )
# movies_idx_df.head()

In [None]:
# print("Number of unique movies: " + str(len(movies_idx_df)))

In [None]:
# movies_idx_csv_df = movies_idx_df.set_index(new_movie_id_col)
# movies_idx_csv_df.to_csv("movies_idx.csv")

## Build actors-movies-credits DataFrame
Get actors from each movie and append their respective credits

In [None]:
movie_id_col = "movie_id"
# Append the movie features to every (actor, movie) credit row.
actors_movies_credits_df = actors_idx_credits_df.merge(
    pr_movies_df, on=movie_id_col, how="inner"
)
# Swap the original movie id for the new one. The mapping is taken from
# `new_movie_id_df`, which holds exactly the (movie_id, new_movie_id) pairs;
# `movies_idx_df` and `title_col` are only defined in commented-out cells and
# referencing them raised a NameError.
movie_id_map = new_movie_id_df[[movie_id_col, new_movie_id_col]].drop_duplicates()
actors_movies_credits_df = (
    actors_movies_credits_df.merge(movie_id_map, on=movie_id_col, how="inner")
    .drop(columns=movie_id_col)
    .rename(columns={new_movie_id_col: movie_id_col})
)
actors_movies_credits_df.head()

## Build actors aggregated DataFrame

In [None]:
def get_union(s):
    """Fold an iterable of sets into the union of all of them."""
    return reduce(set.union, s)


actor_id_col = "actor_id"
# One row per actor: set-valued columns are merged with a union, numeric
# features are averaged (runtime is summed), the remaining columns are
# collected into sets or lists.
# Reductions are passed by name ("max", "sum"): handing the builtins to
# aggregate() is deprecated in recent pandas.
actors_agg_df = (
    actors_movies_credits_df.groupby(actors_col, as_index=False)
    .aggregate(
        {
            "movie_id": set,
            "cast": get_union,
            "crew": get_union,
            "actor_id": "max",
            "gender": "max",
            "budget": "mean",
            "genres": get_union,
            "keywords": get_union,
            "original_language": set,
            "popularity": "mean",
            "production_companies": get_union,
            "production_countries": get_union,
            "release_date": list,
            "revenue": "mean",
            "runtime": "sum",
            "status": list,
            "title": set,
            "vote_average": "mean",
            "vote_count": "mean",
            norm_vote_avg_col: "mean",
        }
    )
    .set_index(actor_id_col)
)
actors_agg_df.head()
# Analysis notes:
# - Smoothness.
# - Supervised: the python-louvain labels will be our labels for the group.
# - We want to check if the features can explain the clusters.
# - Use features as affinity to ... (note left unfinished).
# - Logistic regression adds interpretability.
# - Predict the average rating for a new actor.

In [None]:
# Persist the aggregated actors table in the current working directory.
actors_agg_df.to_pickle("actors_agg_df.pkl")

## Tests

1. Get the raw joined actor-movie dataframe

In [None]:
# Credit rows (one per movie/actor pair) indexed by the original movie id.
raw_actor_df = pr_credits_df.set_index("movie_id")

In [None]:
# Join the credit rows with the snapshot of valid movies; on the left side
# "movie_id" is the index name set in the previous cell, on the right it is a
# column.
raw_actor_movie_df = raw_actor_df.merge(valid_movies_df, on=movie_id_col, how="inner")
raw_actor_movie_df.head()

2. Select 1 actor and compare two sets of movies he/she participated in. The first set is extracted from the raw actor-movie dataframe and the second from the aggregated actor dataframe. **Before testing any actor, make sure that none of the movies he/she participated in had been removed in the data cleaning process**

In [None]:
# Actor whose movie sets are compared in the following cells.
test_actor_name = "Sam Worthington"

In [None]:
# `title_col` was only ever defined in a commented-out cell, so it is defined
# here to avoid a NameError.
title_col = "title"
from_raw_actor_movie_df = raw_actor_movie_df[[actors_col, title_col]]
# Titles of all the movies of the test actor in the raw joined table.
from_raw_movies_performed_in = set(
    from_raw_actor_movie_df.loc[
        from_raw_actor_movie_df[actors_col] == test_actor_name, title_col
    ]
)

In [None]:
from_agg_actor_movie_df = actors_agg_df[[actors_col, title_col]]
# The aggregated table stores the titles of each actor as a set in one cell;
# take that cell from the first row matching the test actor.
is_test_actor = from_agg_actor_movie_df[actors_col] == test_actor_name
from_agg_movies_performed_in = from_agg_actor_movie_df.loc[
    is_test_actor, title_col
].iloc[0]

In [None]:
# Both title sets must be equal for the test actor.
assert (
    from_raw_movies_performed_in == from_agg_movies_performed_in
), "Aggregated actors dataframe has missing data or movies were removed in the data cleaning process"
print("Test passed")

3. Test whether the number of entries in the actors_agg_df is equal to the number of actors

In [None]:
# Number of rows (actors) in the aggregated table.
len(actors_agg_df)

In [None]:
# The aggregated table must have one row per distinct actor name found in the
# raw joined table.
n_raw_actors = len(set(raw_actor_movie_df[actors_col]))
assert (
    len(actors_agg_df) == n_raw_actors
), "Aggregated actors dataframe has missing data"
print("Test passed")