In [1]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval

pd.options.mode.chained_assignment = None

In [2]:
# Directory expected to hold the two TMDB 5000 CSV files.
data_path = "./data"
required_files = ("tmdb_5000_credits.csv", "tmdb_5000_movies.csv")

# Check each required file individually: os.listdir() returns entries in
# arbitrary order and would also report unrelated files (checkpoints, archives),
# so comparing its result to a fixed list is unreliable. os.path.isfile() also
# returns False, rather than raising, when the directory itself is missing.
missing_files = [
    name for name in required_files
    if not os.path.isfile(os.path.join(data_path, name))
]

if missing_files:
    print("[ERROR] Please download and unzip the dataset in a subdirectory './data'.")
else:
    print("[INFO] The dataset is correctly placed.")

[INFO] The dataset is correctly placed.


### Loading the data

In [3]:
# Read the credits table from the data directory and preview its first two rows.
credits_csv = os.path.join(data_path, "tmdb_5000_credits.csv")
tmdb_credits = pd.read_csv(credits_csv)
tmdb_credits.head(n=2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [4]:
# Read the movies table from the data directory and preview its first two rows.
movies_csv = os.path.join(data_path, "tmdb_5000_movies.csv")
tmdb_movies = pd.read_csv(movies_csv)
tmdb_movies.head(n=2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [5]:
def get_first_pc(pcs):
    """Return the name of the first production company in ``pcs``.

    Parameters
    ----------
    pcs : iterable of mappings
        Each element is indexed with the key ``'name'``.

    Returns
    -------
    The ``'name'`` value of the first element, or ``np.nan`` when ``pcs``
    is empty.
    """
    # Only the first entry is needed, so return as soon as one is seen instead
    # of collecting every name into a list first.
    for pc in pcs:
        return pc['name']
    return np.nan

In [6]:
# Columns of the movies table carried into the per-company analysis.
selected_columns = [
    "production_companies", "genres", "id", "release_date", "original_title",
    "overview", "popularity", "tagline", "vote_average", "vote_count",
]

movie_pcs = (
    tmdb_movies[selected_columns]
    # Replace the serialized company list with the name of its first entry.
    .assign(
        production_companies=lambda frame: frame.production_companies.apply(
            lambda raw: get_first_pc(literal_eval(raw))
        )
    )
    # dropna() without arguments removes every row that has a missing value in
    # ANY of the selected columns, not only in production_companies.
    .dropna()
)
movie_pcs.head()

Unnamed: 0,production_companies,genres,id,release_date,original_title,overview,popularity,tagline,vote_average,vote_count
0,Ingenious Film Partners,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,2009-12-10,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,Enter the World of Pandora.,7.2,11800
1,Walt Disney Pictures,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,2007-05-19,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"At the end of the world, the adventure begins.",6.9,4500
2,Columbia Pictures,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,2015-10-26,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,A Plan No One Escapes,6.3,4466
3,Legendary Pictures,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,2012-07-16,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,The Legend Ends,7.6,9106
4,Walt Disney Pictures,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,2012-03-07,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"Lost in our world, found in another.",6.1,2124


In [7]:
def clean_pc(pc):
    """Collapse production-company name variants onto one canonical label.

    Every row whose ``production_companies`` value starts with one of the
    known prefixes is rewritten to the matching canonical name. Rows that
    match no prefix, and rows with a missing value, are left as they are.

    Parameters
    ----------
    pc : pandas.DataFrame
        Frame with a string column named ``production_companies``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    # (prefix, canonical name) pairs, applied in this order.
    prefix_to_name = (
        ("Walt Disney", "Walt Disney"),
        ("Paramount", "Paramount Pictures"),
        ("Universal", "Universal Pictures"),
        ("Twentieth", "20th Century Studios"),
        ("20th", "20th Century Studios"),
        ("Columbia Pictures", "Columbia Pictures"),
        ("New Line", "New Line Cinema"),
        ("MGM", "Metro-Goldwyn-Mayer (MGM)"),
        ("Metro-Goldwyn", "Metro-Goldwyn-Mayer (MGM)"),
        ("Warner", "Warner Bros."),
    )
    for prefix, canonical in prefix_to_name:
        # na=False treats missing labels as "no match"; without it a NaN in the
        # mask makes the boolean .loc selection invalid.
        matches = pc['production_companies'].str.startswith(prefix, na=False)
        pc.loc[matches, "production_companies"] = canonical
    return pc

In [8]:
# clean_pc() edits the frame in place and hands the same object back, so bind
# the result to `movie_pcs`, the name the following cells read.
movie_pcs = clean_pc(movie_pcs)
# This cell originally stored the result under the misspelled name `movie_psc`;
# the alias is kept so any code still referring to it keeps working.
movie_psc = movie_pcs

In [9]:
# Reorder the frame in place so the most popular movies come first.
movie_pcs.sort_values(by="popularity", ascending=False, inplace=True)

In [10]:
# Keep the first five rows of every production-company group
# (five is the default size of GroupBy.head()).
movies_by_company = movie_pcs.groupby('production_companies')
df = movies_by_company.head(5)
df

Unnamed: 0,production_companies,genres,id,release_date,original_title,overview,popularity,tagline,vote_average,vote_count
546,Universal Pictures,"[{""id"": 10751, ""name"": ""Family""}, {""id"": 16, ""...",211672,2015-06-17,Minions,"Minions Stuart, Kevin and Bob are recruited by...",875.581305,"Before Gru, they had a history of bad bosses",6.4,4571
95,Paramount Pictures,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 18, ""...",157336,2014-11-05,Interstellar,Interstellar chronicles the adventures of a gr...,724.247784,Mankind was born on Earth. It was never meant ...,8.1,10867
788,20th Century Studios,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",293660,2016-02-09,Deadpool,Deadpool tells the origin story of former Spec...,514.569956,Witness the beginning of a happy ending,7.4,10995
94,Marvel Studios,"[{""id"": 28, ""name"": ""Action""}, {""id"": 878, ""na...",118340,2014-07-30,Guardians of the Galaxy,"Light years from Earth, 26 years after being a...",481.098624,All heroes start somewhere.,7.9,9742
127,Village Roadshow Pictures,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",76341,2015-05-13,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,434.278564,What a Lovely Day.,7.2,9427
...,...,...,...,...,...,...,...,...,...,...
4774,Strand Releasing,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...",42497,1971-01-01,Pink Narcissus,An erotic poem set in the fantasies of a young...,0.027811,A unique experience in visual fantasy!,6.0,9
4638,Daniel Columbie Films & Productions,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 28, ""name...",378237,2014-12-01,Amidst the Devil's Wings,"Prequel to ""5th of a Degree.""",0.018087,"Prequel to ""5th of a Degree.""",0.0,0
4051,After Eden Pictures,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 18, ""n...",433715,2014-06-15,8 Days,"After sneaking to a party with her friends, 16...",0.015295,She never knew it could happen to her...,0.0,0
3967,Indiana Production Company,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""name...",79587,2009-01-15,Four Single Fathers,A comedy/drama about four Italian single fathe...,0.008263,Comedy,0.0,0


In [11]:
# Lookup from 1-based column position to column name.
column_names = dict(enumerate(df.columns, start=1))
print(column_names)

{1: 'production_companies', 2: 'genres', 3: 'id', 4: 'release_date', 5: 'original_title', 6: 'overview', 7: 'popularity', 8: 'tagline', 9: 'vote_average', 10: 'vote_count'}


In [12]:
from collections import defaultdict

# Production-company name -> list of movie records, one dict per row of `df`
# keyed by column name. The default factory is `list` so it matches what is
# actually stored and no manual "key already present?" check is needed.
results = defaultdict(list)

# index=False yields only the column values, in column order, so each row can
# be zipped directly with the column names.
for row in df.itertuples(index=False):
    record = dict(zip(df.columns, row))
    results[record["production_companies"]].append(record)

In [13]:
# Echo the company -> movie-records mapping built in the previous cell for a visual check.
results

defaultdict(<function __main__.<lambda>()>,
            {'Universal Pictures': [{'production_companies': 'Universal Pictures',
               'genres': '[{"id": 10751, "name": "Family"}, {"id": 16, "name": "Animation"}, {"id": 12, "name": "Adventure"}, {"id": 35, "name": "Comedy"}]',
               'id': 211672,
               'release_date': '2015-06-17',
               'original_title': 'Minions',
               'overview': 'Minions Stuart, Kevin and Bob are recruited by Scarlet Overkill, a super-villain who, alongside her inventor husband Herb, hatches a plot to take over the world.',
               'popularity': 875.581305,
               'tagline': 'Before Gru, they had a history of bad bosses',
               'vote_average': 6.4,
               'vote_count': 4571},
              {'production_companies': 'Universal Pictures',
               'genres': '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 878, "name": "Science Fiction"}, {"id": 53, "name": "Thrille

In [14]:
import json

# Write the company -> movies mapping as JSON into the working directory.
output_name = 'most_popular_movies_per_pc.json'
with open(output_name, mode='w') as out_file:
    json.dump(obj=results, fp=out_file)