# Explore IMDb movies dataset

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from pandas.io.json import json_normalize
from project_utils import *

%reload_ext lab_black
%load_ext autoreload
%autoreload 2

## Import data

In [None]:
# Load the movies table, then expose its "id" column under the name "movie_id".
movies_df = pd.read_csv("./data/tmdb_5000_movies.csv", sep=",", quotechar='"')
movies_df = movies_df.rename(columns={"id": "movie_id"})
movies_df.head()

In [None]:
# Display the column labels of the movies table.
movies_df.columns

In [None]:
# Load the credits table from its CSV file.
credits_df = pd.read_csv("./data/tmdb_5000_credits.csv", sep=",", quotechar='"')

In [None]:
# Display the column labels of the credits table.
credits_df.columns

## Explore movies data

In [None]:
# Summary statistics of the movies table (pandas describe() defaults).
movies_df.describe()

### Numerical features exploration
Plot the histograms of the numerical features to identify outliers

In [None]:
# log10 of the smallest budget, shifted by 1e-6 before taking the logarithm.
np.log10(movies_df["budget"].min() + 1e-6)

In [None]:
# Histogram of the "budget" column. plot_hist is not defined in this notebook
# (presumably provided by project_utils); log=True is assumed to select the
# log10 scale named in the x-axis label -- confirm in the helper.
col = movies_df["budget"]
plot_hist(col, "Budget histogram", "budget [$us] (log10 scale)", "count", log=True)

In [None]:
# Histogram of the "popularity" column, requested with log=True (the x-axis
# label announces a log10 scale).
col = movies_df["popularity"]
plot_hist(col, "Popularity histogram", "popularity (log10 scale)", "count", log=True)

In [None]:
# Histogram of the "revenue" column, requested with log=True (the x-axis label
# announces a log10 scale).
col = movies_df["revenue"]
plot_hist(col, "Revenue histogram", "revenue (log10 scale)", "count", log=True)

In [None]:
# Histogram of the "vote_average" column with log=False and bins=105
# (presumably the number of histogram bins -- confirm in plot_hist).
col = movies_df["vote_average"]
plot_hist(col, "Vote average histogram", "vote_average", "count", log=False, bins=105)

In [None]:
# Histogram of the "vote_count" column, requested with log=True (the x-axis
# label announces a log10 scale).
col = movies_df["vote_count"]
plot_hist(col, "Vote count histogram", "vote_count (log10 scale)", "count", log=True)

### Number of movies to remove

In [None]:
# Flag every movie with a zero in at least one of these numerical features;
# the flagged rows are the candidates for removal.
zero_checked_cols = [
    "budget",
    "popularity",
    "revenue",
    "runtime",
    "vote_average",
    "vote_count",
]
cond_to_remove = (movies_df[zero_checked_cols] == 0).any(axis=1)

# Keep a few descriptive columns next to the checked ones for inspection.
preview_cols = ["title", "genres", "production_companies", *zero_checked_cols]
to_remove_movies_df = movies_df.loc[cond_to_remove, preview_cols]
print(f"Number of movies to remove: {len(to_remove_movies_df)}")
to_remove_movies_df.head()

### Categorical features exploration

In [None]:
# Collect the values stored under the "name" key of the "genres" column.
# get_json_values_from_col is not defined here (presumably from project_utils);
# its second return value is discarded.
values, _ = get_json_values_from_col(movies_df["genres"], "name")
values

In [None]:
# Same extraction for the "production_companies" column; show the first 20.
values, _ = get_json_values_from_col(movies_df["production_companies"], "name")
values[:20]

In [None]:
# Same extraction for the "keywords" column; show the first 20.
values, _ = get_json_values_from_col(movies_df["keywords"], "name")
values[:20]

### Time features exploration

In [None]:
# Parse the release dates into datetimes so the .dt accessor can be used below.
# pd.to_datetime replaces astype("datetime64"): pandas >= 2.0 rejects the
# unit-less "datetime64" dtype with a TypeError.
movies_df["release_date"] = pd.to_datetime(movies_df["release_date"])

In [None]:
fig, ax = plt.subplots(figsize=(20, 5))
# Drop missing release dates and cast the years to int: a missing date turns
# the year column into floats, which would label the bars "2009.0".
years = movies_df["release_date"].dt.year
years = years[years.notna()].astype(int)
# Count the movies per release year. groupby sorts the years and pandas labels
# the bars from that index, so no separate tick_label list is passed (the
# previous one came from unique(), whose first-appearance order did not match
# the sorted bars).
years.groupby(years).count().plot(ax=ax, kind="bar", grid=True, width=1)

## Explore credits data

In [None]:
# Column labels of the credits table as a plain Python list.
list(credits_df.columns)

### Explore json fields

#### Explore json keys

In [None]:
# Inspect the keys found in the "cast" column. get_json_keys_from_col is not
# defined here (presumably from project_utils); its second return value is
# discarded and only the first 20 results are shown.
col = credits_df["cast"]
values, _ = get_json_keys_from_col(col)
values[:20]

In [None]:
# Same key inspection for the "crew" column; show the first 20 results.
col = credits_df["crew"]
values, _ = get_json_keys_from_col(col)
values[:20]

In [None]:
# Collect the values stored under the "job" key of the "crew" column; show the
# first 20.
values, _ = get_json_values_from_col(credits_df["crew"], "job")
values[:20]

#### Most relevant jobs of the crew
Manually selected from the whole list of jobs
- Director
- Co-Director
- Creator
- Producer
- Editor
- Executive Producer
- Co-Executive Producer
- Co-Producer
- Script Editor
- Executive In Charge Of Production
- Author
- Dialogue Editor
- Script
- Original Story

In [None]:
# Collect the values stored under the "gender" key of the "crew" column and
# show all of them.
values, _ = get_json_values_from_col(credits_df["crew"], "gender")
values