In [None]:
# NOTE(review): looks like a deliberate guard cell — raising SystemExit here
# interrupts a top-to-bottom run before any cell below executes, presumably so
# the long TMDb fetches are not re-triggered by accident. Skip this cell to run
# the notebook on purpose.
raise SystemExit("Stop right there!")

In [1]:
import requests
import json
import pandas as pd
from IPython.display import clear_output
from time import sleep
import numpy as np

In [2]:
# Retrieve my TMDb API key from a local text file.
local_file = 'tmdb_key.txt'
with open(local_file, 'r') as api_file:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    # strip() removes the trailing newline plus any stray surrounding whitespace,
    # which would otherwise be concatenated into every request URL.
    my_key = api_file.read().strip()
if not my_key:
    # Fail fast: with an empty key every later request would be sent with a
    # blank `api_key` parameter.
    raise ValueError(f'No API key found in {local_file}')

In [3]:
# Root of the TMDb v3 API: the first part of the URL for every request.
# make_request appends the endpoint path ('movie/<id>') and the query string to it.
base_url = 'https://api.themoviedb.org/3/'

In [4]:
def make_request(_id, prior_attempts=0):
    """Makes request for movie object in TMDb database and returns it as JSON

    Parameters
    ----------
    _id : int or str
        TMDb movie id; it is converted with ``str()`` and placed in the
        ``movie/<id>`` endpoint path.
    prior_attempts : int, optional
        Number of failed attempts already spent on this id (default 0).

    Returns
    -------
    dict
        The decoded JSON body on HTTP 200. An empty dict when the entry is not
        found (HTTP 404) or once three attempts have failed.

    Notes
    -----
    * HTTP 429 (rate limit) waits 10 seconds and retries without consuming an
      attempt. Retries run in a loop instead of through recursion, so a long
      rate-limit streak cannot end in ``RecursionError``.
    * Any other status code, and any ``requests.RequestException`` (connection
      error, timeout, ...), counts as a failed attempt followed by a 1 second
      pause.
    """
    max_attempts = 3
    # `>=` (not `==`) so a caller passing a value above the limit still stops.
    if prior_attempts >= max_attempts:
        return {}

    url = base_url + 'movie/' + str(_id) + '?api_key=' + my_key + '&append_to_response=credits,reviews'
    attempts = prior_attempts
    while attempts < max_attempts:
        try:
            # A timeout keeps one stalled connection from hanging the whole batch.
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            # Network-level failure: treat it like any other failed attempt
            # instead of letting it abort the surrounding fetch loop.
            attempts += 1
            sleep(1)
            continue

        # success
        if response.status_code == 200:
            return response.json()
        # entry not found
        if response.status_code == 404:
            return {}
        # exceed rate limit: wait and retry, without counting it as a failure
        if response.status_code == 429:
            sleep(10)
            continue
        # any other status: count the failure, pause briefly, try again
        attempts += 1
        sleep(1)
    return {}

In [5]:
def get_movies(num, count=100000):
    """Requests a consecutive run of movie objects from TMDb and returns them as a list.

    Parameters
    ----------
    num : int
        Last (highest) movie id of the batch, inclusive.
    count : int, optional
        Number of consecutive ids to request, ending at ``num``. Defaults to
        100,000, i.e. ids ``num - 99999`` through ``num``.

    Returns
    -------
    list of dict
        One entry per requested id, in ascending id order. Each entry is
        whatever ``make_request`` returned for that id, so ids without data are
        kept as empty dicts and list position ``i`` always corresponds to id
        ``num - count + 1 + i``.
    """
    movies_list = []
    for val in range(num - count + 1, num + 1):
        # Show the id currently being fetched; wait=True defers the clear until
        # the next output arrives, so only the latest id stays visible.
        print(val)
        clear_output(wait=True)
        movies_list.append(make_request(_id=val))
    return movies_list

In [6]:
# Fetch TMDb movie ids 1-100000 (one request per id via get_movies) into a DataFrame.
# Ids for which make_request returned {} become all-NaN rows.
# `df` is reused by every batch; it is only persisted if the to_json cell below is run.
df = pd.DataFrame(get_movies(num=100000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  70692 non-null  object 
 1   backdrop_path          26073 non-null  object 
 2   belongs_to_collection  6440 non-null   object 
 3   budget                 70692 non-null  float64
 4   genres                 70692 non-null  object 
 5   homepage               67860 non-null  object 
 6   id                     70692 non-null  float64
 7   imdb_id                70692 non-null  object 
 8   original_language      70692 non-null  object 
 9   original_title         70692 non-null  object 
 10  overview               70692 non-null  object 
 11  popularity             70692 non-null  float64
 12  poster_path            48241 non-null  object 
 13  production_companies   70692 non-null  object 
 14  production_countries   70692 non-null  object 
 15  r

In [7]:
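# One-off save step: uncomment and run after the fetch cell above to write `df` to disk.
# The next cell reads this file back.
#df.to_json('tmdb_movies_100k.json')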

In [8]:
# Reload the saved batch (ids 1-100000) from disk and summarise its columns.
# Note: after the JSON round trip `adult` is read back as float64 (it was object in the fetched frame).
df_100k = pd.read_json('tmdb_movies_100k.json')
df_100k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  70692 non-null  float64
 1   backdrop_path          26073 non-null  object 
 2   belongs_to_collection  6440 non-null   object 
 3   budget                 70692 non-null  float64
 4   genres                 70692 non-null  object 
 5   homepage               67860 non-null  object 
 6   id                     70692 non-null  float64
 7   imdb_id                70692 non-null  object 
 8   original_language      70692 non-null  object 
 9   original_title         70692 non-null  object 
 10  overview               70692 non-null  object 
 11  popularity             70692 non-null  float64
 12  poster_path            48241 non-null  object 
 13  production_companies   70692 non-null  object 
 14  production_countries   70692 non-null  object 
 15  r

In [9]:
# First five rows, transposed so each column prints as a row. Row i holds TMDb id 1 + i;
# an all-NaN row is an id for which make_request returned {}.
df_100k.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,,0,0,,0
backdrop_path,,/kpuTCMw3v2AuKjqGS7383uWbc8V.jpg,/jMmHFm0TcjiN9QDICXY2tJcQsDl.jpg,,/xvjGhJHsArVjCWXb5OARi0PiqvB.jpg
belongs_to_collection,,,,,
budget,,0,0,,4e+06
genres,,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...","[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name..."
homepage,,,,,
id,,2,3,,5
imdb_id,,tt0094675,tt0092149,,tt0113101
original_language,,fi,fi,,en
original_title,,Ariel,Varjoja paratiisissa,,Four Rooms


In [10]:
# Last five rows of the batch (ids 99996-100000).
df_100k.tail()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,reviews
99995,,,,,,,,,,,...,,,,,,,,,,
99996,0.0,/rOIclzR6CpSlgylNsm2gzUC8SyU.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,99997.0,tt0027196,en,West Point of the Air,...,89.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Biggest Romantic Thrill Since ""Hell Divers""",West Point of the Air,0.0,7.0,1.0,"{'cast': [{'cast_id': 4, 'character': 'Sgt. 'B...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99997,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,99998.0,tt0032130,en,Wings of the Navy,...,89.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,FOR ALL THE WORLD TO WITNESS THAT AMERICA WILL...,Wings of the Navy,0.0,7.0,2.0,"{'cast': [{'cast_id': 3, 'character': 'Cass Ha...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99998,0.0,,,0.0,"[{'id': 99, 'name': 'Documentary'}, {'id': 36,...",,99999.0,tt0156078,de,Der Sieg des Glaubens,...,61.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,,Victory of the Faith,0.0,6.8,4.0,"{'cast': [{'cast_id': 4, 'character': 'Himself...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99999,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Fetch TMDb movie ids 100001-200000 (one request per id via get_movies) into a DataFrame.
# Ids for which make_request returned {} become all-NaN rows.
# `df` is overwritten here; it is only persisted if the to_json cell below is run.
df = pd.DataFrame(get_movies(num=200000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  50946 non-null  object 
 1   backdrop_path          8936 non-null   object 
 2   belongs_to_collection  1868 non-null   object 
 3   budget                 50946 non-null  float64
 4   genres                 50946 non-null  object 
 5   homepage               45528 non-null  object 
 6   id                     50946 non-null  float64
 7   imdb_id                50946 non-null  object 
 8   original_language      50946 non-null  object 
 9   original_title         50946 non-null  object 
 10  overview               50946 non-null  object 
 11  popularity             50946 non-null  float64
 12  poster_path            21841 non-null  object 
 13  production_companies   50946 non-null  object 
 14  production_countries   50946 non-null  object 
 15  r

In [12]:
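# One-off save step: uncomment and run after the fetch cell above to write `df` to disk.
# The next cell reads this file back.
#df.to_json('tmdb_movies_200k.json')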

In [13]:
# Reload the saved batch (ids 100001-200000) from disk and summarise its columns.
# Note: after the JSON round trip `adult` is read back as float64 (it was object in the fetched frame).
df_200k = pd.read_json('tmdb_movies_200k.json')
df_200k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  50946 non-null  float64
 1   backdrop_path          8936 non-null   object 
 2   belongs_to_collection  1868 non-null   object 
 3   budget                 50946 non-null  float64
 4   genres                 50946 non-null  object 
 5   homepage               45528 non-null  object 
 6   id                     50946 non-null  float64
 7   imdb_id                50946 non-null  object 
 8   original_language      50946 non-null  object 
 9   original_title         50946 non-null  object 
 10  overview               50946 non-null  object 
 11  popularity             50946 non-null  float64
 12  poster_path            21841 non-null  object 
 13  production_companies   50946 non-null  object 
 14  production_countries   50946 non-null  object 
 15  r

In [14]:
# First five rows, transposed so each column prints as a row. Row i holds TMDb id 100001 + i;
# an all-NaN row is an id for which make_request returned {}.
df_200k.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,,,,,
backdrop_path,,,,,
belongs_to_collection,,,,,
budget,,,,,
genres,,,,,
homepage,,,,,
id,,,,,
imdb_id,,,,,
original_language,,,,,
original_title,,,,,


In [15]:
# Last five rows of the batch (ids 199996-200000).
df_200k.tail()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,reviews
99995,,,,,,,,,,,...,,,,,,,,,,
99996,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,199997.0,tt2609468,hi,सोना Spa,...,102.0,"[{'iso_639_1': 'hi', 'name': 'हिन्दी'}, {'iso_...",Released,You Can Buy Your Sleep Here,Sona Spa,0.0,2.7,3.0,"{'cast': [{'cast_id': 6, 'character': 'Baba', ...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99997,,,,,,,,,,,...,,,,,,,,,,
99998,0.0,/d28wW9zp01Oibs09TQZkBOEjssc.jpg,,0.0,"[{'id': 10751, 'name': 'Family'}, {'id': 18, '...",http://www.thehousethatjackbuiltmovie.com/,199999.0,tt2190492,en,The House That Jack Built,...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The House That Jack Built,0.0,5.3,4.0,"{'cast': [{'cast_id': 4, 'character': 'Jack', ...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99999,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,200000.0,tt1920925,en,Fish n' Chips,...,102.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Fish n' Chips,0.0,5.0,1.0,"{'cast': [], 'crew': [{'credit_id': '57b626759...","{'page': 1, 'results': [], 'total_pages': 0, '..."


In [16]:
# Fetch TMDb movie ids 200001-300000 (one request per id via get_movies) into a DataFrame.
# Ids for which make_request returned {} become all-NaN rows.
# `df` is overwritten here; it is only persisted if the to_json cell below is run.
df = pd.DataFrame(get_movies(num=300000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  69297 non-null  object 
 1   backdrop_path          8467 non-null   object 
 2   belongs_to_collection  1461 non-null   object 
 3   budget                 69297 non-null  float64
 4   genres                 69297 non-null  object 
 5   homepage               52921 non-null  object 
 6   id                     69297 non-null  float64
 7   imdb_id                65044 non-null  object 
 8   original_language      69297 non-null  object 
 9   original_title         69297 non-null  object 
 10  overview               69297 non-null  object 
 11  popularity             69297 non-null  float64
 12  poster_path            23897 non-null  object 
 13  production_companies   69297 non-null  object 
 14  production_countries   69297 non-null  object 
 15  r

In [17]:
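# One-off save step: uncomment and run after the fetch cell above to write `df` to disk.
# The next cell reads this file back.
#df.to_json('tmdb_movies_300k.json')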

In [18]:
# Reload the saved batch (ids 200001-300000) from disk and summarise its columns.
# Note: after the JSON round trip `adult` is read back as float64 (it was object in the fetched frame).
df_300k = pd.read_json('tmdb_movies_300k.json')
df_300k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  69297 non-null  float64
 1   backdrop_path          8467 non-null   object 
 2   belongs_to_collection  1461 non-null   object 
 3   budget                 69297 non-null  float64
 4   genres                 69297 non-null  object 
 5   homepage               52921 non-null  object 
 6   id                     69297 non-null  float64
 7   imdb_id                65044 non-null  object 
 8   original_language      69297 non-null  object 
 9   original_title         69297 non-null  object 
 10  overview               69297 non-null  object 
 11  popularity             69297 non-null  float64
 12  poster_path            23897 non-null  object 
 13  production_companies   69297 non-null  object 
 14  production_countries   69297 non-null  object 
 15  r

In [19]:
# First five rows, transposed so each column prints as a row. Row i holds TMDb id 200001 + i;
# an all-NaN row is an id for which make_request returned {}.
df_300k.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,0,0,,,0
backdrop_path,,,,,
belongs_to_collection,,,,,
budget,0,0,,,0
genres,"[{'id': 99, 'name': 'Documentary'}]","[{'id': 18, 'name': 'Drama'}]",,,[]
homepage,,http://www.imdb.com/title/tt0408295/,,,http://www.go-betweens.net/discography/2005str...
id,200001,200002,,,200005
imdb_id,tt0856245,tt0408295,,,
original_language,de,fr,,,en
original_title,Nach Hollywood - Douglas Sirk erzählt,Une place parmi les vivants,,,That Striped Sunlight Sound


In [20]:
# Last five rows of the batch (ids 299996-300000).
df_300k.tail()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,reviews
99995,0.0,,,0.0,[],,299996.0,,en,Athena,...,,[],Released,,Athena,0.0,0.0,0.0,"{'cast': [], 'crew': []}","{'page': 1, 'results': [], 'total_pages': 0, '..."
99996,0.0,/2LWBbV66Dr1lmj36fQK9YzxfEyo.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,299997.0,,en,Bill Santiago: Pardon My Spanglish,...,0.0,[],Released,,Bill Santiago: Pardon My Spanglish,0.0,0.0,0.0,"{'cast': [], 'crew': []}","{'page': 1, 'results': [], 'total_pages': 0, '..."
99997,0.0,/j0uiRKL1PRurutFtI0Gut8In2lj.jpg,,100000.0,"[{'id': 18, 'name': 'Drama'}]",http://www.maverickentertainment.cc/movies/mov...,299998.0,tt2739792,en,Watch Phoenix Rise,...,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fame Has Its Downsides,Watch Phoenix Rise,0.0,0.0,0.0,"{'cast': [{'cast_id': 0, 'character': 'Phoenix...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99998,0.0,,,0.0,[],,299999.0,tt3087026,en,What We Can't Have,...,,[],Released,,What We Can't Have,0.0,0.0,0.0,"{'cast': [], 'crew': []}","{'page': 1, 'results': [], 'total_pages': 0, '..."
99999,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,300000.0,tt0091711,en,Overnight,...,96.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Overnight,0.0,7.0,1.0,"{'cast': [{'cast_id': 0, 'character': 'Scott L...","{'page': 1, 'results': [], 'total_pages': 0, '..."


In [21]:
# Fetch TMDb movie ids 300001-400000 (one request per id via get_movies) into a DataFrame.
# Ids for which make_request returned {} become all-NaN rows.
# `df` is overwritten here; it is only persisted if the to_json cell below is run.
df = pd.DataFrame(get_movies(num=400000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  87751 non-null  object 
 1   backdrop_path          9189 non-null   object 
 2   belongs_to_collection  1384 non-null   object 
 3   budget                 87751 non-null  float64
 4   genres                 87751 non-null  object 
 5   homepage               59405 non-null  object 
 6   id                     87751 non-null  float64
 7   imdb_id                72977 non-null  object 
 8   original_language      87751 non-null  object 
 9   original_title         87751 non-null  object 
 10  overview               87751 non-null  object 
 11  popularity             87751 non-null  float64
 12  poster_path            25581 non-null  object 
 13  production_companies   87751 non-null  object 
 14  production_countries   87751 non-null  object 
 15  r

In [22]:
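# One-off save step: uncomment and run after the fetch cell above to write `df` to disk.
# The next cell reads this file back.
#df.to_json('tmdb_movies_400k.json')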

In [23]:
# Reload the saved batch (ids 300001-400000) from disk and summarise its columns.
# Note: after the JSON round trip `adult` is read back as float64 (it was object in the fetched frame).
df_400k = pd.read_json('tmdb_movies_400k.json')
df_400k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  87751 non-null  float64
 1   backdrop_path          9189 non-null   object 
 2   belongs_to_collection  1384 non-null   object 
 3   budget                 87751 non-null  float64
 4   genres                 87751 non-null  object 
 5   homepage               59405 non-null  object 
 6   id                     87751 non-null  float64
 7   imdb_id                72977 non-null  object 
 8   original_language      87751 non-null  object 
 9   original_title         87751 non-null  object 
 10  overview               87751 non-null  object 
 11  popularity             87751 non-null  float64
 12  poster_path            25581 non-null  object 
 13  production_companies   87751 non-null  object 
 14  production_countries   87751 non-null  object 
 15  r

In [24]:
# First five rows, transposed so each column prints as a row. Row i holds TMDb id 300001 + i;
# an all-NaN row is an id for which make_request returned {}.
df_400k.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,0,,0,0,0
backdrop_path,,,,,
belongs_to_collection,,,,,
budget,0,,0,0,0
genres,[],,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",[],[]
homepage,,,,,
id,300001,,300003,300004,300005
imdb_id,,,tt3066658,tt2750628,
original_language,en,,en,en,en
original_title,Blackhawk: Greatest Hits Live,,Wild Justice,What Is New Thought?,Born this Way


In [25]:
# Last five rows of the batch (ids 399996-400000).
df_400k.tail()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,reviews
99995,,,,,,,,,,,...,,,,,,,,,,
99996,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,399997.0,tt5530520,en,Kate & Lily,...,15.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Kate & Lily,0.0,0.0,0.0,"{'cast': [{'cast_id': 3, 'character': 'Kate', ...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99997,0.0,,,0.0,"[{'id': 10751, 'name': 'Family'}]",,399998.0,tt0763361,en,Mauro the Gypsy,...,57.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Mauro the Gypsy,0.0,0.0,0.0,"{'cast': [{'cast_id': 1, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99998,0.0,/lWk7kaz6iFYYfWiSAHxcVPOvS2Z.jpg,,0.0,[],,399999.0,,en,The Mine and the Minotaur,...,59.0,[],Released,,The Mine and the Minotaur,0.0,0.0,0.0,"{'cast': [{'cast_id': 0, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99999,0.0,,,0.0,"[{'id': 10751, 'name': 'Family'}]",,400000.0,tt0247526,en,Mischief,...,57.0,[],Released,,Mischief,0.0,0.0,0.0,"{'cast': [{'cast_id': 0, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."


In [26]:
# Fetch TMDb movie ids 400001-500000 (one request per id via get_movies) into a DataFrame.
# Ids for which make_request returned {} become all-NaN rows.
# `df` is overwritten here; it is only persisted if the to_json cell below is run.
df = pd.DataFrame(get_movies(num=500000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  85963 non-null  object 
 1   backdrop_path          18199 non-null  object 
 2   belongs_to_collection  1312 non-null   object 
 3   budget                 85963 non-null  float64
 4   genres                 85963 non-null  object 
 5   homepage               39202 non-null  object 
 6   id                     85963 non-null  float64
 7   imdb_id                65495 non-null  object 
 8   original_language      85963 non-null  object 
 9   original_title         85963 non-null  object 
 10  overview               85963 non-null  object 
 11  popularity             85963 non-null  float64
 12  poster_path            49978 non-null  object 
 13  production_companies   85963 non-null  object 
 14  production_countries   85963 non-null  object 
 15  r

In [27]:
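# One-off save step: uncomment and run after the fetch cell above to write `df` to disk.
# The next cell reads this file back.
#df.to_json('tmdb_movies_500k.json')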

In [28]:
# Reload the saved batch (ids 400001-500000) from disk and summarise its columns.
# Note: after the JSON round trip `adult` is read back as float64 (it was object in the fetched frame).
df_500k = pd.read_json('tmdb_movies_500k.json')
df_500k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  85963 non-null  float64
 1   backdrop_path          18199 non-null  object 
 2   belongs_to_collection  1312 non-null   object 
 3   budget                 85963 non-null  float64
 4   genres                 85963 non-null  object 
 5   homepage               39202 non-null  object 
 6   id                     85963 non-null  float64
 7   imdb_id                65495 non-null  object 
 8   original_language      85963 non-null  object 
 9   original_title         85963 non-null  object 
 10  overview               85963 non-null  object 
 11  popularity             85963 non-null  float64
 12  poster_path            49978 non-null  object 
 13  production_companies   85963 non-null  object 
 14  production_countries   85963 non-null  object 
 15  r

In [29]:
# First five rows, transposed so each column prints as a row. Row i holds TMDb id 400001 + i;
# an all-NaN row is an id for which make_request returned {}.
df_500k.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,0,,,,
backdrop_path,,,,,
belongs_to_collection,,,,,
budget,0,,,,
genres,"[{'id': 10749, 'name': 'Romance'}]",,,,
homepage,http://sic.conaculta.gob.mx/ficha.php?table=pr...,,,,
id,400001,,,,
imdb_id,tt4145318,,,,
original_language,es,,,,
original_title,Trémulo,,,,


In [30]:
# Last five rows of the batch (ids 499996-500000).
df_500k.tail()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,reviews
99995,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 28, 'nam...",,499996.0,tt2091247,tl,Bornebol: Special Agent,...,,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Bornebol: Special Agent,0.0,0.0,0.0,"{'cast': [{'cast_id': 0, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99996,0.0,,,0.0,"[{'id': 28, 'name': 'Action'}]",http://rwblog.id,499997.0,,id,21 Petang Berdarah,...,37.0,"[{'iso_639_1': 'id', 'name': 'Bahasa indonesia'}]",Released,,21 Blood Night,0.0,0.0,0.0,"{'cast': [{'cast_id': 0, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99997,0.0,,,0.0,"[{'id': 10770, 'name': 'TV Movie'}]",,499998.0,tt0115982,en,Crossing the Floor,...,70.0,[],Released,,Crossing the Floor,0.0,6.7,3.0,"{'cast': [{'cast_id': 0, 'character': 'David H...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99998,0.0,,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,499999.0,,id,Bombe',...,92.0,"[{'iso_639_1': 'id', 'name': 'Bahasa indonesia'}]",Released,,Bombe',0.0,0.0,0.0,"{'cast': [{'cast_id': 0, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99999,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,500000.0,,id,Bombe’ 2: Dumba’-Dumba’,...,90.0,"[{'iso_639_1': 'id', 'name': 'Bahasa indonesia'}]",Released,,Bombe’ 2: Dumba’-Dumba’,0.0,0.0,0.0,"{'cast': [{'cast_id': 3, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."


In [31]:
# Fetch TMDb movie ids 500001-600000 (one request per id via get_movies) into a DataFrame.
# Ids for which make_request returned {} become all-NaN rows.
# `df` is overwritten here; it is only persisted if the to_json cell below is run.
df = pd.DataFrame(get_movies(num=600000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  88747 non-null  object 
 1   backdrop_path          20530 non-null  object 
 2   belongs_to_collection  709 non-null    object 
 3   budget                 88747 non-null  float64
 4   genres                 88747 non-null  object 
 5   homepage               19565 non-null  object 
 6   id                     88747 non-null  float64
 7   imdb_id                53314 non-null  object 
 8   original_language      88747 non-null  object 
 9   original_title         88747 non-null  object 
 10  overview               88747 non-null  object 
 11  popularity             88747 non-null  float64
 12  poster_path            66597 non-null  object 
 13  production_companies   88747 non-null  object 
 14  production_countries   88747 non-null  object 
 15  r

In [32]:
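# One-off save step: uncomment and run after the fetch cell above to write `df` to disk.
# The next cell reads this file back.
#df.to_json('tmdb_movies_600k.json')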

In [33]:
# Reload the saved batch (ids 500001-600000) from disk and summarise its columns.
# Note: after the JSON round trip `adult` is read back as float64 (it was object in the fetched frame).
df_600k = pd.read_json('tmdb_movies_600k.json')
df_600k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  88747 non-null  float64
 1   backdrop_path          20530 non-null  object 
 2   belongs_to_collection  709 non-null    object 
 3   budget                 88747 non-null  float64
 4   genres                 88747 non-null  object 
 5   homepage               19565 non-null  object 
 6   id                     88747 non-null  float64
 7   imdb_id                53314 non-null  object 
 8   original_language      88747 non-null  object 
 9   original_title         88747 non-null  object 
 10  overview               88747 non-null  object 
 11  popularity             88747 non-null  float64
 12  poster_path            66597 non-null  object 
 13  production_companies   88747 non-null  object 
 14  production_countries   88747 non-null  object 
 15  r

In [34]:
# First five rows, transposed so each column prints as a row. Row i holds TMDb id 500001 + i;
# an all-NaN row is an id for which make_request returned {}.
df_600k.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,,,0,0,0
backdrop_path,,,/j1gLGTZx9Jy8TDJpR250IAwp3Gg.jpg,/9kg7HWZcBvvxf0b7duGVBsaPCyn.jpg,
belongs_to_collection,,,,,
budget,,,0,0,0
genres,,,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",[],"[{'id': 99, 'name': 'Documentary'}]"
homepage,,,https://www.cinemand.co/,,
id,,,500003,500004,500005
imdb_id,,,tt7422552,tt5522416,tt6507448
original_language,,,en,tr,it
original_title,,,Funny Story,Emicem Hospital,Io sono Valentina Nappi


In [35]:
# Last five rows of the batch (ids 599996-600000).
df_600k.tail()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,reviews
99995,0.0,,,0.0,[],http://prudefilm.com/,599996.0,,en,Prude,...,10.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Prude,0.0,0.0,0.0,"{'cast': [{'cast_id': 4, 'character': 'Violet'...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99996,0.0,,,0.0,[],,599997.0,,en,Σουτζουκάκια,...,15.0,"[{'iso_639_1': 'el', 'name': 'ελληνικά'}]",Released,,Meatballs,0.0,0.0,0.0,"{'cast': [{'cast_id': 1, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99997,0.0,,,0.0,[],,599998.0,,zh,Block and Censor,...,65.0,[],Released,,Block and Censor,0.0,0.0,0.0,"{'cast': [], 'crew': [{'credit_id': '5ccde6b90...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99998,0.0,/iBredN64vmai40N2cNv0FKKz7Fc.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,599999.0,tt6198058,en,Into the Mirror,...,65.0,[],Released,,Into the Mirror,0.0,3.0,3.0,"{'cast': [{'cast_id': 2, 'character': 'Daniel'...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99999,0.0,,,0.0,[],http://www.taliashealevin.com/eta,600000.0,,en,E T A,...,5.0,[],Released,,E T A,0.0,0.0,0.0,"{'cast': [{'cast_id': 3, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."


In [36]:
# Fetch TMDb movie ids 600001-700000 (one request per id via get_movies) into a DataFrame.
# Ids for which make_request returned {} become all-NaN rows.
# `df` is overwritten here; it is only persisted if the to_json cell below is run.
df = pd.DataFrame(get_movies(num=700000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  90827 non-null  object 
 1   backdrop_path          16387 non-null  object 
 2   belongs_to_collection  478 non-null    object 
 3   budget                 90827 non-null  float64
 4   genres                 90827 non-null  object 
 5   homepage               51683 non-null  object 
 6   id                     90827 non-null  float64
 7   imdb_id                44196 non-null  object 
 8   original_language      90827 non-null  object 
 9   original_title         90827 non-null  object 
 10  overview               90827 non-null  object 
 11  popularity             90827 non-null  float64
 12  poster_path            62820 non-null  object 
 13  production_companies   90827 non-null  object 
 14  production_countries   90827 non-null  object 
 15  r

In [37]:
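# One-off save step: uncomment and run after the fetch cell above to write `df` to disk.
# The next cell reads this file back.
#df.to_json('tmdb_movies_700k.json')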

In [38]:
# Reload the saved batch (ids 600001-700000) from disk and summarise its columns.
# Note: after the JSON round trip `adult` is read back as float64 (it was object in the fetched frame).
df_700k = pd.read_json('tmdb_movies_700k.json')
df_700k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  90827 non-null  float64
 1   backdrop_path          16387 non-null  object 
 2   belongs_to_collection  478 non-null    object 
 3   budget                 90827 non-null  float64
 4   genres                 90827 non-null  object 
 5   homepage               51683 non-null  object 
 6   id                     90827 non-null  float64
 7   imdb_id                44196 non-null  object 
 8   original_language      90827 non-null  object 
 9   original_title         90827 non-null  object 
 10  overview               90827 non-null  object 
 11  popularity             90827 non-null  float64
 12  poster_path            62820 non-null  object 
 13  production_companies   90827 non-null  object 
 14  production_countries   90827 non-null  object 
 15  r

In [39]:
# First five rows, transposed so each column prints as a row. Row i holds TMDb id 600001 + i;
# an all-NaN row is an id for which make_request returned {}.
df_700k.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,0,,,,
backdrop_path,,,,,
belongs_to_collection,,,,,
budget,0,,,,
genres,[],,,,
homepage,,,,,
id,600001,,,,
imdb_id,tt2750132,,,,
original_language,de,,,,
original_title,Claire,,,,


In [40]:
# Last five rows of the batch (ids 699996-700000).
df_700k.tail()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,reviews
99995,0.0,,,0.0,[],https://www.yesterway.org/,699996.0,,en,Yesterway,...,107.0,[],Released,,Yesterway,0.0,0.0,0.0,"{'cast': [], 'crew': [{'credit_id': '5eaef39d0...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99996,0.0,,,0.0,[],,699997.0,tt4420000,en,Kalopsia,...,12.0,[],Released,,Kalopsia,0.0,0.0,0.0,"{'cast': [{'cast_id': 1, 'character': 'Alice',...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99997,0.0,,,0.0,[],,699998.0,tt11872724,es,Poderoso Victoria,...,0.0,"[{'iso_639_1': 'es', 'name': 'Español'}]",Post Production,,Mighty Victoria,0.0,0.0,0.0,"{'cast': [{'cast_id': 7, 'character': 'Don Fed...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99998,0.0,,,0.0,[],,699999.0,tt0324364,fr,Is Dead - Portrait incomplet de Gertrude Stein,...,47.0,[],Released,,Is Dead - Portrait incomplet de Gertrude Stein,0.0,0.0,0.0,"{'cast': [{'cast_id': 1, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."
99999,0.0,,,0.0,[],,700000.0,,fr,Degas et moi,...,20.0,[],Released,,Degas et moi,0.0,0.0,0.0,"{'cast': [{'cast_id': 2, 'character': '', 'cre...","{'page': 1, 'results': [], 'total_pages': 0, '..."


In [41]:
# Fetch TMDb movie ids 700001-800000 (one request per id via get_movies) into a DataFrame.
# Ids for which make_request returned {} become all-NaN rows; per the recorded info()
# output only 1592 of the 100000 ids in this range returned data.
# `df` is overwritten here; it is only persisted if the to_json cell below is run.
df = pd.DataFrame(get_movies(num=800000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  1592 non-null   object 
 1   backdrop_path          206 non-null    object 
 2   belongs_to_collection  14 non-null     object 
 3   budget                 1592 non-null   float64
 4   genres                 1592 non-null   object 
 5   homepage               1592 non-null   object 
 6   id                     1592 non-null   float64
 7   imdb_id                627 non-null    object 
 8   original_language      1592 non-null   object 
 9   original_title         1592 non-null   object 
 10  overview               1592 non-null   object 
 11  popularity             1592 non-null   float64
 12  poster_path            846 non-null    object 
 13  production_companies   1592 non-null   object 
 14  production_countries   1592 non-null   object 
 15  r

In [42]:
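# One-off save step: uncomment and run after the fetch cell above to write `df` to disk.
# The next cell reads this file back.
#df.to_json('tmdb_movies_800k.json')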

In [43]:
# Reload the saved batch (ids 700001-800000) from disk and summarise its columns.
# Note: after the JSON round trip `adult` is read back as float64 (it was object in the fetched frame).
df_800k = pd.read_json('tmdb_movies_800k.json')
df_800k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  1592 non-null   float64
 1   backdrop_path          206 non-null    object 
 2   belongs_to_collection  14 non-null     object 
 3   budget                 1592 non-null   float64
 4   genres                 1592 non-null   object 
 5   homepage               1592 non-null   object 
 6   id                     1592 non-null   float64
 7   imdb_id                627 non-null    object 
 8   original_language      1592 non-null   object 
 9   original_title         1592 non-null   object 
 10  overview               1592 non-null   object 
 11  popularity             1592 non-null   float64
 12  poster_path            846 non-null    object 
 13  production_companies   1592 non-null   object 
 14  production_countries   1592 non-null   object 
 15  r

In [44]:
# First five rows, transposed so each column prints as a row. Row i holds TMDb id 700001 + i;
# an all-NaN row is an id for which make_request returned {}.
df_800k.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,0,0,0,,0
backdrop_path,,,,,
belongs_to_collection,,,,,
budget,0,0,0,,0
genres,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",[],"[{'id': 35, 'name': 'Comedy'}, {'id': 12, 'nam...",,"[{'id': 99, 'name': 'Documentary'}]"
homepage,,,,,
id,700001,700002,700003,,700005
imdb_id,tt0279235,,,,
original_language,cs,es,en,,en
original_title,Mnichovo srdce,Chancho en Piedra - Chancho 6,Who Stole My Sofa?,,Birds of a Feather


In [45]:
# Last five rows of the batch (ids 799996-800000); in the recorded output all five are empty.
df_800k.tail()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits,reviews
99995,,,,,,,,,,,...,,,,,,,,,,
99996,,,,,,,,,,,...,,,,,,,,,,
99997,,,,,,,,,,,...,,,,,,,,,,
99998,,,,,,,,,,,...,,,,,,,,,,
99999,,,,,,,,,,,...,,,,,,,,,,


In [46]:
# NOTE(review): second guard cell — raising SystemExit interrupts a top-to-bottom
# run at this point, presumably to mark the end of the finished batches.
raise SystemExit("Stop right there!")

SystemExit: Stop right there!

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
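# latest = 701125
# NOTE(review): presumably the highest TMDb movie id that existed when this note was written — confirm.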