# Explore the TMDB 5000 movies and credits dataframes

In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
from pandas.io.json import json_normalize
from project_utils import *

%load_ext autoreload
%autoreload 2

## Import data

In [2]:
# Load the TMDB movie metadata. The "id" column is renamed to "movie_id",
# the same key name the credits table uses.
movies_df = pd.read_csv(
    "./data/tmdb_5000_movies.csv",
    sep=",",
    quotechar='"',
).rename(columns={"id": "movie_id"})
movies_df.head()

Unnamed: 0,budget,genres,homepage,movie_id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [3]:
# Load the TMDB credits table (columns: movie_id, title, cast, crew).
credits_df = pd.read_csv(
    "./data/tmdb_5000_credits.csv",
    sep=",",
    quotechar='"',
)

In [4]:
# Flag "ill-conditioned" movies, i.e. rows where at least one of these holds:
# - empty genre list
# - no production company
# - no production country
# - no spoken languages
# - runtime, revenue or budget equal to 0
# Note: a missing (NaN) runtime does not compare equal to 0, so such rows are
# not flagged by this check.
json_list_cols = [
    "genres",
    "production_companies",
    "production_countries",
    "spoken_languages",
]
numeric_cols = ["runtime", "revenue", "budget"]

# Convert only the columns being tested to str (once), instead of converting
# the whole frame once per condition.
has_empty_json_list = movies_df[json_list_cols].astype(str).eq("[]").any(axis=1)
has_zero_numeric = movies_df[numeric_cols].eq(0).any(axis=1)
ill_condition = has_empty_json_list | has_zero_numeric

# Single .loc selection (rows + columns) rather than chained indexing.
ill_conditioned_movies_df = movies_df.loc[
    ill_condition, ["title", *json_list_cols, *numeric_cols]
]
print(len(ill_conditioned_movies_df))
ill_conditioned_movies_df.head()

1620


Unnamed: 0,title,genres,production_companies,production_countries,spoken_languages,runtime,revenue,budget
83,The Lovers,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""name"": ""Corsan"", ""id"": 7299}, {""name"": ""Bli...","[{""iso_3166_1"": ""AU"", ""name"": ""Australia""}, {""...","[{""iso_639_1"": ""en"", ""name"": ""English""}]",109.0,0,27000000
135,The Wolfman,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 27, ""name...","[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{""iso_639_1"": ""en"", ""name"": ""English""}]",102.0,0,150000000
265,The Cat in the Hat,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 14, ""nam...","[{""name"": ""Imagine Entertainment"", ""id"": 23}, ...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{""iso_639_1"": ""en"", ""name"": ""English""}]",82.0,0,0
272,Town & Country,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...","[{""name"": ""New Line Cinema"", ""id"": 12}]",[],"[{""iso_639_1"": ""en"", ""name"": ""English""}]",104.0,10372291,90000000
309,Son of the Mask,"[{""id"": 14, ""name"": ""Fantasy""}, {""id"": 35, ""na...","[{""name"": ""New Line Cinema"", ""id"": 12}, {""name...","[{""iso_3166_1"": ""DE"", ""name"": ""Germany""}, {""is...","[{""iso_639_1"": ""en"", ""name"": ""English""}]",94.0,0,84000000


In [5]:
# Movies whose budget is recorded as 0 (presumably a placeholder for
# "unknown" — TODO confirm). Revenue is kept alongside because some of these
# rows still report a non-zero revenue.
zero_budget_df = movies_df.loc[movies_df["budget"] == 0, ["genres", "revenue", "title"]]
# Backward-compatible alias: this frame was originally named `no_revenue_df`,
# which is misleading because the filter is on budget, not revenue.
no_revenue_df = zero_budget_df
print(len(zero_budget_df))
display(zero_budget_df)

1037


Unnamed: 0,genres,revenue,title
265,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 14, ""nam...",0,The Cat in the Hat
321,"[{""id"": 35, ""name"": ""Comedy""}]",104907746,The Campaign
359,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 16, ""...",233755553,Alvin and the Chipmunks: The Road Chip
406,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 16, ""name...",0,Arthur Christmas
409,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10402, ""n...",37823676,All That Jazz
...,...,...,...
4795,"[{""id"": 18, ""name"": ""Drama""}]",0,Bang
4797,"[{""id"": 10769, ""name"": ""Foreign""}, {""id"": 53, ...",0,Cavite
4800,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",0,"Signed, Sealed, Delivered"
4801,[],0,Shanghai Calling


## Explore data

In [6]:
# Column names of the credits table, as a plain Python list.
credits_df.columns.tolist()

['movie_id', 'title', 'cast', 'crew']

In [7]:
# Column names of the movies table, as a plain Python list.
movies_df.columns.tolist()

['budget',
 'genres',
 'homepage',
 'movie_id',
 'keywords',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'vote_average',
 'vote_count']

In [8]:
# Number of missing values per column in the credits table.
credits_df.isna().sum(axis="index")

movie_id    0
title       0
cast        0
crew        0
dtype: int64

In [9]:
# Number of missing values per column in the movies table.
movies_df.isna().sum(axis="index")

budget                     0
genres                     0
homepage                3091
movie_id                   0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [10]:
# Column dtypes of the movies table. The JSON-like columns and release_date
# are loaded as plain `object` columns (release_date is not parsed as a date).
movies_df.dtypes

budget                    int64
genres                   object
homepage                 object
movie_id                  int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
dtype: object

In [11]:
# Column dtypes of the credits table; cast and crew are `object` columns
# holding JSON-encoded text.
credits_df.dtypes

movie_id     int64
title       object
cast        object
crew        object
dtype: object

### Explore json fields

### Most relevant jobs of the crew
- Creator
- Director
- Producer
- Co-Director
- Editor
- Executive Producer
- Co-Executive Producer
- Co-Producer
- Script Editor
- Executive In Charge Of Production
- Author
- Dialogue Editor
- Script
- Original Story

In [12]:
# Values found under the "job" key of the JSON-encoded crew column
# (project helper; its second return value is not needed here).
# NOTE(review): the order of `values` looks arbitrary, so this 20-item preview
# may differ between runs — confirm how the helper orders its result.
values, _ = get_json_values_from_col(credits_df["crew"], "job")
values[:20]

['Hair Supervisor',
 'Second Unit Cinematographer',
 '2D Artist',
 'Color Designer',
 'Dialogue',
 'Co-Costume Designer',
 'Co-Editor',
 'Public Relations',
 'Story',
 'Special Effects Coordinator',
 'Storyboard Designer',
 'Animation Director',
 'Orchestrator',
 'Picture Car Coordinator',
 'Visual Effects Design Consultant',
 'Executive In Charge Of Production',
 'Theatre Play',
 'ADR Editor',
 'Additional Dialogue',
 'Transportation Captain']

In [13]:
# Values found under the "gender" key of the crew records.
# NOTE(review): presumably TMDB's coding (0 = unspecified, 1 = female,
# 2 = male) — TODO confirm against the dataset documentation.
values, _ = get_json_values_from_col(credits_df["crew"], "gender")
values

[0, 1, 2]

In [14]:
# Genre names that occur in the JSON-encoded genres column.
values, _ = get_json_values_from_col(movies_df["genres"], "name")
values

['Action',
 'History',
 'Adventure',
 'Mystery',
 'Music',
 'TV Movie',
 'Romance',
 'Foreign',
 'Animation',
 'Comedy',
 'Documentary',
 'Drama',
 'Crime',
 'Fantasy',
 'Horror',
 'Family',
 'War',
 'Science Fiction',
 'Thriller',
 'Western']

In [15]:
# Preview (first 20) of the keyword names in the JSON-encoded keywords column.
# Note: some names carry stray whitespace (e.g. 'carousel ' in the output),
# so strip them before grouping or matching on keywords.
values, _ = get_json_values_from_col(movies_df["keywords"], "name")
values[:20]

['israeli',
 'johannesburg',
 'dvd',
 'h. r. giger',
 'prehistoric times',
 'olympic athlete',
 'war correspondent',
 'drug use',
 'sailboat',
 'film industry',
 'carousel ',
 'merlin',
 'solidarity',
 'guerrilla warfare',
 'exploding airplane',
 'cole younger',
 'mother daughter relationship',
 'attractiveness',
 'crew',
 'tolerance']

In [16]:
# Preview (first 20) of the country names in the production_countries column.
values, _ = get_json_values_from_col(movies_df["production_countries"], "name")
values[:20]

['Czech Republic',
 'Mexico',
 'Singapore',
 'Colombia',
 'Libyan Arab Jamahiriya',
 'Monaco',
 'Iceland',
 'Philippines',
 'Germany',
 'United States of America',
 'Japan',
 'Brazil',
 'Hungary',
 'Panama',
 'Kazakhstan',
 'Ecuador',
 'Dominican Republic',
 'Bolivia',
 'Fiji',
 'China']

In [17]:
# Preview (first 20) of the language names in the spoken_languages column.
# Note: the names are not clean — the output contains an empty string and a
# '??????' placeholder — so the iso_639_1 code is likely the safer identifier.
values, _ = get_json_values_from_col(movies_df["spoken_languages"], "name")
values[:20]

['',
 'Nederlands',
 'svenska',
 'Eesti',
 'Español',
 'اردو',
 'Latin',
 '한국어/조선말',
 'Magyar',
 'پښتو',
 'ქართული',
 'Cymraeg',
 'Český',
 'No Language',
 'Português',
 'Hrvatski',
 'Wolof',
 'עִבְרִית',
 '??????',
 'Italiano']

### Explore json keys

In [18]:
# Keys that occur in the JSON records of the cast column
# (project helper; its second return value is not needed here).
keys, _ = get_json_keys_from_col(credits_df["cast"])
keys

['order', 'name', 'cast_id', 'id', 'credit_id', 'gender', 'character']

In [19]:
# Keys that occur in the JSON records of the crew column.
keys, _ = get_json_keys_from_col(credits_df["crew"])
keys

['name', 'id', 'credit_id', 'gender', 'job', 'department']

In [20]:
# Keys that occur in the JSON records of the genres column.
keys, _ = get_json_keys_from_col(movies_df["genres"])
keys

['name', 'id']

In [21]:
# Keys that occur in the JSON records of the keywords column.
keys, _ = get_json_keys_from_col(movies_df["keywords"])
keys

['name', 'id']

In [22]:
# NOTE(review): this cell is an exact duplicate of the preceding keywords
# cell (same call, same output) and can be deleted.
keys, _ = get_json_keys_from_col(movies_df["keywords"])
keys

['name', 'id']

In [23]:
# Keys that occur in the JSON records of the production_companies column.
keys, _ = get_json_keys_from_col(movies_df["production_companies"])
keys

['name', 'id']

In [24]:
# Keys that occur in the JSON records of the production_countries column
# (countries are identified by an ISO 3166-1 code rather than a numeric id).
keys, _ = get_json_keys_from_col(movies_df["production_countries"])
keys

['name', 'iso_3166_1']

In [25]:
# Keys that occur in the JSON records of the spoken_languages column
# (languages are identified by an ISO 639-1 code rather than a numeric id).
keys, _ = get_json_keys_from_col(movies_df["spoken_languages"])
keys

['name', 'iso_639_1']