In [25]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Loading the data

In [26]:
# reading the csv as data frames
# NOTE(review): the recorded output of this cell echoes the movies_metadata
# read_csv line, which looks like a pandas warning raised while parsing that
# file -- presumably mixed-dtype columns; confirm and consider passing an
# explicit `dtype=` mapping.
movies = pd.read_csv('data/movies_metadata.csv')
credits = pd.read_csv('data/credits.csv')

  movies = pd.read_csv('data/movies_metadata.csv')


In [27]:
# use the movie id column as the index of the movies frame
movies = movies.set_index("id")

# 2. Genres

In [28]:
# Extract one [genre name, movie id] pair for every genre listed on a movie.
# Each `genres` cell is a stringified list of dicts, so it is parsed with
# ast.literal_eval (evaluates literals only, never arbitrary code).
# Only the needed column is iterated (no per-row Series from iterrows), and the
# movie id is bound to `movie_id` so the builtin `id()` is not shadowed for
# every later cell of the notebook.
genre_in_movie = [
    [genre["name"], movie_id]
    for movie_id, raw_genres in movies["genres"].items()
    for genre in ast.literal_eval(raw_genres)
]

In [29]:
# Convert the collected [genre, movie id] pairs to a two-column dataframe.
# The column labels are given to the constructor, so an empty list still
# produces a valid (empty) frame instead of failing on a label-count mismatch.
genre_in_movie = pd.DataFrame(genre_in_movie, columns=["genre", "movie_id"])

In [30]:
# preview the (genre, movie_id) pairs
genre_in_movie

Unnamed: 0,genre,movie_id
0,Animation,862
1,Comedy,862
2,Family,862
3,Adventure,8844
4,Fantasy,8844
...,...,...
91101,Family,439050
91102,Drama,111109
91103,Action,67758
91104,Drama,67758


In [31]:
# keep only the first listed genre of each movie
genre_in_movie = genre_in_movie.drop_duplicates(subset="movie_id", keep="first")

In [32]:
# collect, for every genre, the list of movie ids assigned to it
genre_movies = (
    genre_in_movie
    .groupby("genre")["movie_id"]
    .apply(list)
)

In [33]:
# count the movies per genre and order the genres from most to least frequent
sorted_genres = (
    genre_in_movie
    .groupby("genre")
    .agg(movie_count=("movie_id", "count"))
    .sort_values(by="movie_count", ascending=False)
)

In [34]:
# Keep only genres assigned to more than one movie.
# The mask is built from the named `movie_count` column (a 1-D boolean Series)
# rather than from the frame's 2-D `.values` array, which only behaved as a row
# filter because the frame happens to have a single column.
sorted_genres = sorted_genres[sorted_genres["movie_count"] > 1]

In [35]:
# restrict the per-genre movie lists to the retained genres, in count order
genre_movies = genre_movies.loc[sorted_genres.index]

In [36]:
# preview the per-genre lists of movie ids
genre_movies

genre
Drama              [524, 4584, 1710, 12665, 451, 16420, 17015, 37...
Comedy             [31357, 11862, 11860, 9087, 12110, 8012, 9263,...
Action             [949, 45325, 9091, 1408, 11517, 9691, 11443, 9...
Documentary        [124626, 63076, 89333, 51352, 123360, 26564, 5...
Horror             [755, 9102, 34574, 628, 56428, 92769, 9059, 11...
Crime              [5, 9273, 807, 2086, 96357, 103, 33542, 30157,...
Thriller           [99040, 8068, 48787, 79593, 18256, 61813, 1839...
Adventure          [8844, 710, 139405, 10530, 11359, 17414, 11780...
Romance            [15602, 78802, 146599, 22279, 2293, 1909, 1578...
Animation          [862, 22586, 18242, 812, 10895, 11827, 10112, ...
Fantasy            [902, 9598, 577, 27793, 8839, 11980, 58372, 43...
Science Fiction    [63, 9482, 9348, 63105, 193, 78, 8069, 62, 601...
Mystery            [20649, 8973, 26203, 23210, 426, 213, 963, 15,...
Family             [21032, 46785, 8587, 1634, 41579, 532, 110465,...
Music              [2054, 12

# 3. Actors and movies

In [37]:
# Extract one [actor name, movie id] pair for every cast entry of every movie.
# Each `cast` cell is parsed with ast.literal_eval and every parsed entry is
# read by its "name" key. Only the two needed columns are iterated, which
# avoids building a full row Series per movie with iterrows() and avoids
# rebinding `_` (IPython's last-output shortcut).
cast_in_movie = [
    [member["name"], movie_id]
    for raw_cast, movie_id in zip(credits["cast"], credits["id"])
    for member in ast.literal_eval(raw_cast)
]

In [38]:
# Convert the collected [actor, movie id] pairs to a two-column dataframe.
# The column labels are given to the constructor, so an empty list still
# produces a valid (empty) frame instead of failing on a label-count mismatch.
cast_in_movie = pd.DataFrame(cast_in_movie, columns=["actor", "movie_id"])

In [39]:
# count the movies per actor and order the actors from most to least credited
sorted_actors = (
    cast_in_movie
    .groupby("actor")
    .agg(movie_count=("movie_id", "count"))
    .sort_values(by="movie_count", ascending=False)
)

In [40]:
# retain the actors credited in at least 20 movies
top_actors = sorted_actors.loc[sorted_actors["movie_count"] >= 20]

In [41]:
# keep only the credit rows that belong to one of the retained actors
is_top_actor = cast_in_movie["actor"].isin(top_actors.index)
actor_movie = cast_in_movie[is_top_actor]


In [42]:
def extract_ids(rows, genre_to_movies=None):
    """Split one group's movie ids by genre.

    Parameters
    ----------
    rows : DataFrame
        Frame exposing a ``movie_id`` column; when called through
        ``groupby('actor').apply`` this is the group of rows of one actor.
    genre_to_movies : Series, optional
        Maps each genre to an iterable of movie ids (ints or numeric strings).
        Defaults to the notebook-level ``genre_movies`` so existing
        single-argument calls keep working.

    Returns
    -------
    Series
        Same index as ``genre_to_movies``; each value is the set of integer
        movie ids found both in that genre's list and in ``rows.movie_id``.
    """
    if genre_to_movies is None:
        genre_to_movies = genre_movies

    # Built once per call instead of once per genre: it does not depend on the
    # genre currently being processed.
    group_movie_ids = set(rows.movie_id)

    def get_intersection(ids):
        # ids are cast to int so they compare equal to the ids in `rows`
        return set(map(int, ids)).intersection(group_movie_ids)

    return genre_to_movies.apply(get_intersection)

In [43]:
# For every actor, compute the per-genre sets of movie ids via extract_ids.
# `movie_id` is selected explicitly so the applied function does not operate on
# the grouping column (deprecated in recent pandas); extract_ids only reads
# `rows.movie_id`, so the result is unchanged.
actors_genres_movies = actor_movie.groupby('actor')[['movie_id']].apply(extract_ids)

In [44]:
# Export the per-actor, per-genre movie-id sets as JSON keyed by index label.
# `orient` is passed by keyword: positional arguments after the path are
# deprecated in recent pandas.
actors_genres_movies.to_json('docs/data/actors_genres_movies.json', orient='index')

In [45]:
# parse the budget column as numbers; values that cannot be parsed become NaN
movies = movies.assign(
    budget=pd.to_numeric(movies["budget"], errors="coerce"),
)

In [46]:
# preview the movies frame after the budget conversion
movies

Unnamed: 0_level_0,adult,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
8844,False,,65000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
15602,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
31357,False,,16000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
11862,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0.0,"[{'id': 35, 'name': 'Comedy'}]",,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439050,False,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
111109,False,,0.0,"[{'id': 18, 'name': 'Drama'}]",,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
67758,False,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
227506,False,,0.0,[],,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [47]:
# drop rows whose movie id already appeared earlier, keeping the first one
movies = movies.loc[~movies.index.duplicated(keep="first")]

In [48]:
# Export the movies frame as JSON keyed by movie id (the frame's index).
# `orient` is passed by keyword: positional arguments after the path are
# deprecated in recent pandas.
movies.to_json('docs/data/movies.json', orient='index')