# Explore TMDB movie and credits dataframes

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from pandas.io.json import json_normalize
from project_utils import *

%load_ext autoreload
%autoreload 2

## Import data

In [None]:
# Load the movies CSV (comma separator and double-quote quoting are the
# read_csv defaults) and rename the "id" column to "movie_id".
movies_df = pd.read_csv("./data/tmdb_5000_movies.csv")
movies_df = movies_df.rename(columns={"id": "movie_id"})
movies_df.head()

In [None]:
# Load the credits CSV; comma separator and double-quote quoting are the
# read_csv defaults, so they are not spelled out.
credits_df = pd.read_csv("./data/tmdb_5000_credits.csv")

In [None]:
# Flag "ill-conditioned" movies, i.e. rows where at least one of these holds:
# - empty genre list
# - no production company
# - no production country
# - no spoken language
# - runtime, revenue or budget equal to 0
json_list_columns = [
    "genres",
    "production_companies",
    "production_countries",
    "spoken_languages",
]
zero_checked_columns = ["runtime", "revenue", "budget"]

# Cast only the four JSON columns to str, once, instead of casting the whole
# frame for every comparison; an empty JSON list is the literal string "[]".
has_empty_list = (movies_df[json_list_columns].astype(str) == "[]").any(axis=1)
# NaN == 0 is False, so rows with a missing numeric value are not flagged here.
has_zero_value = (movies_df[zero_checked_columns] == 0).any(axis=1)
ill_condition = has_empty_list | has_zero_value

# Keep only the columns that take part in the checks, plus the title.
ill_conditioned_movies_df = movies_df.loc[
    ill_condition, ["title", *json_list_columns, *zero_checked_columns]
]
print(len(ill_conditioned_movies_df))
ill_conditioned_movies_df.head()

In [None]:
# Movies whose budget is recorded as 0.
# The frame was previously only called `no_revenue_df`, although the filter is
# on "budget"; it is now named after what it actually selects.
no_budget_df = movies_df.loc[movies_df["budget"] == 0, ["genres", "revenue", "title"]]
no_revenue_df = no_budget_df  # backward-compatible alias for the old name
print(len(no_budget_df))
display(no_budget_df)

## Explore data

In [None]:
# Column names of the credits table as a plain Python list.
credits_df.columns.tolist()

In [None]:
# Column names of the movies table as a plain Python list.
movies_df.columns.tolist()

In [None]:
# Number of missing values in each column of the credits table.
credits_df.isnull().sum(axis="index")

In [None]:
# Number of missing values in each column of the movies table.
movies_df.isnull().sum(axis="index")

In [None]:
# Show the dtype pandas assigned to each column of the movies table.
movies_df.dtypes

In [None]:
# Show the dtype pandas assigned to each column of the credits table.
credits_df.dtypes

### Explore JSON field values

### Most relevant jobs of the crew
- Creator
- Director
- Producer
- Co-Director
- Editor
- Executive Producer
- Co-Executive Producer
- Co-Producer
- Script Editor
- Executive In Charge Of Production
- Author
- Dialogue Editor
- Script
- Original Story

In [None]:
# Query the "job" field of the crew column through get_json_values_from_col
# (not defined in this notebook; presumably provided by project_utils).
# Only the first element of the returned pair is used; preview 20 entries.
values, _ = get_json_values_from_col(credits_df["crew"], "job")
values[:20]

In [None]:
# Query the "gender" field of the crew column (helper presumably from
# project_utils; second element of the returned pair is unused) and show all
# results.
values, _ = get_json_values_from_col(credits_df["crew"], "gender")
values

In [None]:
# Query the "name" field of the genres column (helper presumably from
# project_utils; second element of the returned pair is unused) and show all
# results.
values, _ = get_json_values_from_col(movies_df["genres"], "name")
values

In [None]:
# Query the "name" field of the keywords column (helper presumably from
# project_utils; second element of the returned pair is unused) and preview
# the first 20 entries.
values, _ = get_json_values_from_col(movies_df["keywords"], "name")
values[:20]

In [None]:
# Query the "name" field of the production_countries column (helper presumably
# from project_utils; second element of the returned pair is unused) and
# preview the first 20 entries.
values, _ = get_json_values_from_col(movies_df["production_countries"], "name")
values[:20]

In [None]:
# Query the "name" field of the spoken_languages column (helper presumably
# from project_utils; second element of the returned pair is unused) and
# preview the first 20 entries.
values, _ = get_json_values_from_col(movies_df["spoken_languages"], "name")
values[:20]

### Explore JSON keys

In [None]:
# Inspect the cast column with get_json_keys_from_col (not defined in this
# notebook; presumably from project_utils and presumably returning the JSON
# keys it finds). The second element of the returned pair is unused.
keys, _ = get_json_keys_from_col(credits_df["cast"])
keys

In [None]:
# Same inspection for the crew column (helper presumably from project_utils;
# second element of the returned pair is unused).
keys, _ = get_json_keys_from_col(credits_df["crew"])
keys

In [None]:
# Same inspection for the genres column (helper presumably from
# project_utils; second element of the returned pair is unused).
keys, _ = get_json_keys_from_col(movies_df["genres"])
keys

In [None]:
# Same inspection for the keywords column (helper presumably from
# project_utils; second element of the returned pair is unused).
keys, _ = get_json_keys_from_col(movies_df["keywords"])
keys

In [None]:
# NOTE(review): this repeats the preceding "keywords" cell verbatim; possibly
# a leftover. Confirm whether a different column was intended.
keys, _ = get_json_keys_from_col(movies_df["keywords"])
keys

In [None]:
# Same inspection for the production_companies column (helper presumably from
# project_utils; second element of the returned pair is unused).
keys, _ = get_json_keys_from_col(movies_df["production_companies"])
keys

In [None]:
# Same inspection for the production_countries column (helper presumably from
# project_utils; second element of the returned pair is unused).
keys, _ = get_json_keys_from_col(movies_df["production_countries"])
keys

In [None]:
# Same inspection for the spoken_languages column (helper presumably from
# project_utils; second element of the returned pair is unused).
keys, _ = get_json_keys_from_col(movies_df["spoken_languages"])
keys