In [2]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Loading the data

In [3]:
# reading the csv as data frames
movies = pd.read_csv('data/movies_metadata.csv')
credits = pd.read_csv('data/credits.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# set movie id as index
movies.set_index('id', inplace=True)

In [5]:
# extract all actors
cast_in_movie = []
for _, r in credits.iterrows():
    for g in ast.literal_eval(r.cast):
        cast_in_movie.append([g["name"], r["id"]])

In [6]:
# convert actor movie to dataframe
cast_in_movie = pd.DataFrame(cast_in_movie)
cast_in_movie.columns = ["actor", "movie_id"]

In [7]:
# group actors and count number of movies + sort
sorted_actors = cast_in_movie.groupby('actor').count().rename(columns={"movie_id": "movie_count"}).sort_values(by="movie_count", ascending=False)

In [8]:
# keep only actors that were in 20 or more movies
top_actors = sorted_actors[sorted_actors.movie_count >= 20]

In [9]:
actor_movie = cast_in_movie[cast_in_movie.actor.isin(top_actors.index)]

In [10]:
actor_movie.movie_id.dtype

dtype('int64')

In [11]:
# remove bad indices
movies = movies[~movies.index.str.contains('-')]

In [12]:
movies.index = movies.index.astype('int64')

In [13]:
actor_movie = actor_movie.merge(movies, left_on="movie_id", right_index=True, how="inner")

In [14]:
actor_movie.budget = actor_movie.budget.astype('float64')

In [15]:
actor_movie.columns

Index(['actor', 'movie_id', 'adult', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [16]:
actor_stats = actor_movie.groupby('actor').agg({'title': 'count','vote_average': ['mean'], 'budget': ['mean', 'sum'], 'revenue': ['mean', 'sum']})

In [17]:
actor_stats.columns = ['_'.join(col).strip() for col in actor_stats.columns.values]

In [18]:
actor_stats.columns = ['Total movies', 'Average rating', 'Average budget', 'Total budget', 'Average revenue', 'Total revenue',]

In [19]:
def format_money(value):
    if (value >= 1e9):
        return '$' + str("{:.2f}").format(value/1e9) + 'B'
    elif (value >= 1e6):
        return '$' + str("{:.2f}").format(value/1e6) + 'M'
    elif (value >= 1e3):
        return '$' + str("{:.2f}").format(value/1e3) + 'K'
    else: return '$' + str("{:.2f}").format(value)


def format_number(value):
    return str("{:.2f}").format(value)

In [20]:

actor_stats['Average rating'] = actor_stats['Average rating'].apply(format_number)
actor_stats['Average budget'] = actor_stats['Average budget'].apply(format_money)
actor_stats['Total budget'] = actor_stats['Total budget'].apply(format_money)
actor_stats['Average revenue'] = actor_stats['Average revenue'].apply(format_money)
actor_stats['Total revenue'] = actor_stats['Total revenue'].apply(format_money)

In [21]:
actor_stats.to_json('docs/data/actors_stats.json', orient='index')

In [171]:
actor_movie.release_date = pd.to_datetime(actor_movie.release_date)

In [183]:
actors_releases_per_year = actor_movie.groupby(['actor',actor_movie.release_date.dt.year]).agg({'title': 'count'})

In [184]:
actors_releases_per_year

Unnamed: 0_level_0,Unnamed: 1_level_0,title
actor,release_date,Unnamed: 2_level_1
'Snub' Pollard,1915.0,1
'Snub' Pollard,1916.0,1
'Snub' Pollard,1919.0,2
'Snub' Pollard,1932.0,1
'Snub' Pollard,1934.0,1
...,...,...
Моррис Честнат,2005.0,1
Моррис Честнат,2007.0,2
Моррис Честнат,2009.0,1
Моррис Честнат,2013.0,4


In [186]:
actors_releases_per_year.to_json('docs/data/actors_releases_per_year.json', orient='table')