In [1]:
import pandas as pd
import ast
import json

%load_ext autoreload
%autoreload 2

# 1. Loading the data

In [2]:
# reading the csv as data frames
movies = pd.read_csv('data/movies_metadata.csv')
credits = pd.read_csv('data/credits.csv')

  movies = pd.read_csv('data/movies_metadata.csv')


# 2. Genres

In [9]:
# extract all genres
genre_in_movie = []
for _, row in movies.iterrows():
    for g in ast.literal_eval(row.genres):
        genre_in_movie.append([g["name"], g["id"]])

In [10]:
# convert genres to dataframe
genre_in_movie = pd.DataFrame(genre_in_movie)
genre_in_movie.columns = ["genre", "id"]
genre_in_movie["count"] = 0

In [11]:
# group genres and count number of movies + sort
sorted_genres = genre_in_movie.groupby(['genre', 'id']).count().sort_values(by="id").reset_index()


In [12]:
# only keep relevant genres and change columns
sorted_genres = sorted_genres[sorted_genres['count'] > 1]
sorted_genres.drop(columns=['count'], inplace=True)
sorted_genres['color'] = '#000000'

In [13]:
# display genres by usage
genres = sorted_genres.reset_index().to_dict('records')
genres

[{'index': 0, 'genre': 'Adventure', 'id': 12, 'color': '#000000'},
 {'index': 1, 'genre': 'Fantasy', 'id': 14, 'color': '#000000'},
 {'index': 2, 'genre': 'Animation', 'id': 16, 'color': '#000000'},
 {'index': 3, 'genre': 'Drama', 'id': 18, 'color': '#000000'},
 {'index': 4, 'genre': 'Horror', 'id': 27, 'color': '#000000'},
 {'index': 5, 'genre': 'Action', 'id': 28, 'color': '#000000'},
 {'index': 6, 'genre': 'Comedy', 'id': 35, 'color': '#000000'},
 {'index': 7, 'genre': 'History', 'id': 36, 'color': '#000000'},
 {'index': 8, 'genre': 'Western', 'id': 37, 'color': '#000000'},
 {'index': 9, 'genre': 'Thriller', 'id': 53, 'color': '#000000'},
 {'index': 10, 'genre': 'Crime', 'id': 80, 'color': '#000000'},
 {'index': 11, 'genre': 'Documentary', 'id': 99, 'color': '#000000'},
 {'index': 12, 'genre': 'Science Fiction', 'id': 878, 'color': '#000000'},
 {'index': 17, 'genre': 'Mystery', 'id': 9648, 'color': '#000000'},
 {'index': 18, 'genre': 'Music', 'id': 10402, 'color': '#000000'},
 {'ind

In [14]:
# Define genre colors (partially from https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=12)
genres = [
    {'genre': 'Adventure', 'id': 12, 'color': '#fdbf6f'},
    {'genre': 'Action', 'id': 28, 'color': '#ff7f00'},
    {'genre': 'Romance', 'id': 10749, 'color': '#fb9a99'},
    {'genre': 'Drama', 'id': 18, 'color': '#e31a1c'},
    {'genre': 'Fantasy', 'id': 14, 'color': '#a6cee3'},
    {'genre': 'Science Fiction', 'id': 878, 'color': '#1f78b4'},
    {'genre': 'Animation', 'id': 16, 'color': '#ffff99'},
    {'genre': 'Comedy', 'id': 35, 'color': '#b2df8a'},
    {'genre': 'Family', 'id': 10751, 'color': '#33a02c'},
    {'genre': 'Horror', 'id': 27, 'color': '#5e7563'},
    {'genre': 'Thriller', 'id': 53, 'color': '#2f3b32'},
    {'genre': 'Mystery', 'id': 9648, 'color': '#b816b8'},
    {'genre': 'Crime', 'id': 80, 'color': '#cab2d6'},
    {'genre': 'Western', 'id': 37, 'color': '#b15928'},
    {'genre': 'War', 'id': 10752, 'color': '#6a3d9a'},
    {'genre': 'Foreign', 'id': 10769, 'color': '#820096'},
    {'genre': 'History', 'id': 36, 'color': '#300008'},
    {'genre': 'Music', 'id': 10402, 'color': '#e5f53b'},
    {'genre': 'Documentary', 'id': 99, 'color': '#1403a6'},
    {'genre': 'TV Movie', 'id': 10770, 'color': '#8a6629'}
]

# 3. Actors and movies

In [15]:
# extract all actors
cast_in_movie = []
for _, r in credits.iterrows():
    for g in ast.literal_eval(r['cast']):
        cast_in_movie.append([g["id"], g["name"], g['gender'], g['profile_path'], r['id']])

In [16]:
# convert actor movie to dataframe
cast_in_movie = pd.DataFrame(cast_in_movie)
cast_in_movie.columns = ["id", "name", 'gender', 'profile_path', 'movie_id']
cast_in_movie.movie_id = cast_in_movie.movie_id.astype('string')

In [17]:
# Aggregate movie ids
actors = cast_in_movie.groupby(['name', 'id', 'gender', 'profile_path'])['movie_id'].apply(list).reset_index().rename(columns={'movie_id': 'movies'})

In [18]:
# Some ids are not unique (different name, gender, profile_path
actors['id'].unique().shape

(77372,)

In [19]:
top_actors = actors[actors.movies.map(len) >= 20]

In [20]:
top_actors.shape

(3680, 5)

In [21]:
# number of movies 
movies.shape

(45466, 24)

In [22]:
# get ids of the movies of the top actors
top_actors_movies_ids = set()
for ids in top_actors.movies: top_actors_movies_ids.update(ids)

In [63]:
top_actors_movies = movies[movies['id'].isin(map(str,top_actors_movies_ids))]


In [64]:
# display filtered number of movies
top_actors_movies.shape

(29649, 24)

In [65]:
top_actors_movies.drop(columns=['adult', 'belongs_to_collection', 'homepage', 'original_language', 'original_title', 'production_companies', 'production_countries', 'spoken_languages', 'status', 'video', 'overview'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_actors_movies.drop(columns=['adult', 'belongs_to_collection', 'homepage', 'original_language', 'original_title', 'production_companies', 'production_countries', 'spoken_languages', 'status', 'video', 'overview'], inplace=True)


In [66]:
top_actors_movies.head()

Unnamed: 0,budget,genres,id,imdb_id,popularity,poster_path,release_date,revenue,runtime,tagline,title,vote_average,vote_count
0,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,373554033.0,81.0,,Toy Story,7.7,5415.0
1,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,262797249.0,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,0.0,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,81452156.0,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,0,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,76578911.0,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


In [67]:
genre_ids = list(sorted_genres.id)
def get_ids(genres_str):
    return [g['id'] for g in ast.literal_eval(genres_str) if g['id'] in genre_ids]

In [69]:
top_actors_movies.genres = top_actors_movies.genres.map(get_ids)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_actors_movies.genres = top_actors_movies.genres.map(get_ids)


In [70]:
top_actors_movies.head()

Unnamed: 0,budget,genres,id,imdb_id,popularity,poster_path,release_date,revenue,runtime,tagline,title,vote_average,vote_count
0,30000000,"[16, 35, 10751]",862,tt0114709,21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1995-10-30,373554033.0,81.0,,Toy Story,7.7,5415.0
1,65000000,"[12, 14, 10751]",8844,tt0113497,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,1995-12-15,262797249.0,104.0,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0
2,0,"[10749, 35]",15602,tt0113228,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,1995-12-22,0.0,101.0,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0
3,16000000,"[35, 18, 10749]",31357,tt0114885,3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1995-12-22,81452156.0,127.0,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0
4,0,[35],11862,tt0113041,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,1995-02-10,76578911.0,106.0,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0


# 4. Write data to json

In [71]:
data = {
    "actors": top_actors.fillna('').reset_index().to_dict('records'),
    "movies": top_actors_movies.fillna('').reset_index().to_dict('records'),
    "genres": genres
}

with open('data/data.json', 'w') as f:
    json.dump(data, f, indent=4)