In [1]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval

pd.options.mode.chained_assignment = None

In [2]:
data_path = "./data"
if os.listdir(data_path) != ['tmdb_5000_credits.csv', 'tmdb_5000_movies.csv']:
    print("[ERROR] Please download and unzip the dataset in a subdirectory './data'.")
else:
    print("[INFO] The dataset is correctly placed.")

[INFO] The dataset is correctly placed.


### Loading the data

In [47]:
tmdb_credits = pd.read_csv(os.path.join(data_path, "tmdb_5000_credits.csv"))
tmdb_credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [48]:
tmdb_movies = pd.read_csv(os.path.join(data_path, "tmdb_5000_movies.csv"))
tmdb_movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [49]:
def get_first_genre(genres):
    L=[]
    for genre in genres:
        L.append(genre['name'])
    if len(L) == 0:
        return np.nan
    return L[0]

In [50]:
movie_genres = tmdb_movies[["genres", "release_date","original_title","overview","popularity", "tagline", "vote_average","vote_count"]]
movie_genres["genres"] = movie_genres.genres.apply(lambda x: get_first_genre(literal_eval(x)))
movie_genres.dropna(inplace=True)
movie_genres.head()

Unnamed: 0,genres,release_date,original_title,overview,popularity,tagline,vote_average,vote_count
0,Action,2009-12-10,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,Enter the World of Pandora.,7.2,11800
1,Adventure,2007-05-19,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"At the end of the world, the adventure begins.",6.9,4500
2,Action,2015-10-26,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,A Plan No One Escapes,6.3,4466
3,Action,2012-07-16,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,The Legend Ends,7.6,9106
4,Action,2012-03-07,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"Lost in our world, found in another.",6.1,2124


In [51]:
movie_genres.sort_values("popularity", ascending=False, inplace=True)

In [57]:
df = movie_genres.groupby('genres').head()
df

Unnamed: 0,genres,release_date,original_title,overview,popularity,tagline,vote_average,vote_count
546,Family,2015-06-17,Minions,"Minions Stuart, Kevin and Bob are recruited by...",875.581305,"Before Gru, they had a history of bad bosses",6.4,4571
95,Adventure,2014-11-05,Interstellar,Interstellar chronicles the adventures of a gr...,724.247784,Mankind was born on Earth. It was never meant ...,8.1,10867
788,Action,2016-02-09,Deadpool,Deadpool tells the origin story of former Spec...,514.569956,Witness the beginning of a happy ending,7.4,10995
94,Action,2014-07-30,Guardians of the Galaxy,"Light years from Earth, 26 years after being a...",481.098624,All heroes start somewhere.,7.9,9742
127,Action,2015-05-13,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,434.278564,What a Lovely Day.,7.2,9427
...,...,...,...,...,...,...,...,...
4096,Documentary,2010-10-08,Inside Job,A film that exposes the shocking truth behind ...,16.930914,"The film that cost $20,000,000,000,000 to make.",7.7,286
3446,Documentary,2004-06-25,Fahrenheit 9/11,Michael Moore's view on what happened to the U...,16.209850,Controversy... what controversy?,6.8,397
4356,Documentary,2008-09-07,"Food, Inc.",Documentary filmmaker Robert Kenner examines h...,15.017162,You'll never look at dinner the same way again.,7.4,217
3557,Documentary,2002-10-21,Jackass: The Movie,Johnny Knoxville and his crazy friends appear ...,14.752587,Do not attempt this at home.,6.1,345


In [74]:
column_names = {v: k for k, v in zip(df.columns, range(1, len(df.columns)+1))}    
print(column_names)

{1: 'genres', 2: 'release_date', 3: 'original_title', 4: 'overview', 5: 'popularity', 6: 'tagline', 7: 'vote_average', 8: 'vote_count'}


In [75]:
from collections import defaultdict
results = defaultdict(lambda: defaultdict(dict))

for value in df.itertuples():
    if(value[1] not in results.keys()):
        results[value[1]]=[]
    elem = {}
    for a,v in enumerate(value):
        if(a!=0):
            elem[column_names[a]] = v 
    results[value[1]].append(elem)

In [76]:
results

defaultdict(<function __main__.<lambda>()>,
            {'Family': [{'genres': 'Family',
               'release_date': '2015-06-17',
               'original_title': 'Minions',
               'overview': 'Minions Stuart, Kevin and Bob are recruited by Scarlet Overkill, a super-villain who, alongside her inventor husband Herb, hatches a plot to take over the world.',
               'popularity': 875.581305,
               'tagline': 'Before Gru, they had a history of bad bosses',
               'vote_average': 6.4,
               'vote_count': 4571},
              {'genres': 'Family',
               'release_date': '1994-06-23',
               'original_title': 'The Lion King',
               'overview': "A young lion cub named Simba can't wait to be king. But his uncle craves the title for himself and will stop at nothing to get it.",
               'popularity': 90.457886,
               'tagline': "Life's greatest adventure is finding your place in the Circle of Life.",
            

In [77]:
import json
with open('most_popular_movies.json', 'w') as fp:
    json.dump(results, fp)