In [None]:
# Guard cell: raises SystemExit as soon as it is executed.
# NOTE(review): presumably placed first to stop an accidental "Run All", since
# each get_people(...) call further down issues 100000 sequential API requests — confirm.
raise SystemExit("Stop right there!")

In [1]:
import requests
import json
import pandas as pd
from IPython.display import clear_output
from time import sleep
import numpy as np

In [2]:
# retrieve my TMDb key from a local file so the key itself never appears in the notebook
local_file = 'tmdb_key.txt'
# the with-block closes the file on exit, so no explicit close() is needed
with open(local_file, 'r') as api_file:
    # strip() drops the trailing newline plus any stray surrounding whitespace,
    # either of which would otherwise end up inside the request URL
    my_key = api_file.read().strip()

In [3]:
# Common prefix of every request URL; make_request appends the
# 'person/<id>' path and the '?api_key=' query parameter to it.
base_url = 'https://api.themoviedb.org/3/'

In [4]:
def make_request(_id, prior_attempts=0):
    """Fetch a single TMDb person record by id.

    Parameters
    ----------
    _id : int or str
        Person id; converted with ``str`` and placed in the
        ``person/<id>`` endpoint URL.
    prior_attempts : int, optional
        Number of failed attempts already charged against the retry
        budget of three (default 0). A value of three or more returns
        an empty dict without issuing any request.

    Returns
    -------
    dict
        The decoded JSON body on HTTP 200. An empty dict when the id is
        not found (HTTP 404) or when the retry budget is exhausted.

    Raises
    ------
    requests.exceptions.RequestException
        Network-level failures (including a timeout after 30 seconds)
        are not caught here and propagate to the caller.

    Notes
    -----
    HTTP 429 (rate limited) waits 10 seconds and retries without
    charging the retry budget. Any other non-200 status waits 1 second
    and counts as one failed attempt.
    """
    max_attempts = 3
    url = base_url + 'person/' + str(_id) + '?api_key=' + my_key
    attempts = prior_attempts
    # Loop instead of recursing: a long run of 429 responses would otherwise
    # add one stack frame per retry and eventually raise RecursionError.
    # Using ``<`` also terminates when a caller passes a count above the budget.
    while attempts < max_attempts:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, timeout=30)
        status = response.status_code
        # success
        if status == 200:
            return response.json()
        # entry not found
        if status == 404:
            return {}
        # exceeded rate limit: back off and retry; the failure count is kept
        # (not reset) so mixed 5xx/429 responses still terminate
        if status == 429:
            sleep(10)
            continue
        # any other status: short pause, then charge one failed attempt
        sleep(1)
        attempts += 1
    return {}

In [5]:
def get_people(num, batch_size=100000):
    """Fetch a contiguous batch of TMDb person records ending at id ``num``.

    Parameters
    ----------
    num : int
        Highest person id to request (inclusive).
    batch_size : int, optional
        How many consecutive ids to request, counting back from ``num``.
        Defaults to 100000, so ``get_people(200000)`` covers ids
        100001 through 200000.

    Returns
    -------
    list of dict
        One entry per requested id, in ascending id order; each entry is
        whatever ``make_request`` returned for that id.
    """
    first_id = num - batch_size + 1
    people_list = []
    for val in range(first_id, num + 1):
        # progress indicator: show only the id currently being fetched
        print(val)
        clear_output(wait=True)
        people_list.append(make_request(_id=val))
    return people_list

In [6]:
# Load a previously saved batch of person records from disk and summarise its columns.
# NOTE(review): no visible cell writes 'tmdb_person_100k.json'; the tail() output below
# shows ids up to 100000, so this is presumably the first 100000-id batch — confirm.
df_100k = pd.read_json('tmdb_person_100k.json')
df_100k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              36541 non-null  object 
 1   known_for_department  84356 non-null  object 
 2   deathday              12317 non-null  object 
 3   id                    84367 non-null  float64
 4   name                  84367 non-null  object 
 5   also_known_as         84367 non-null  object 
 6   gender                84367 non-null  float64
 7   biography             84367 non-null  object 
 8   popularity            84367 non-null  float64
 9   place_of_birth        30895 non-null  object 
 10  profile_path          20826 non-null  object 
 11  adult                 84367 non-null  float64
 12  imdb_id               84367 non-null  object 
 13  homepage              3667 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [7]:
df_100k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,1944-05-14,1951-09-25,1942-07-13,1956-10-21,1913-05-26
known_for_department,Directing,Acting,Acting,Acting,Acting
deathday,,,,2016-12-27,1994-08-11
id,1,2,3,4,5
name,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Peter Cushing
also_known_as,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...","[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",[Peter Wilton Cushing]
gender,2,2,2,1,2
biography,"George Walton Lucas Jr. (born May 14, 1944) is...","Mark Richard Hamill (born September 25, 1951) ...",Legendary Hollywood Icon Harrison Ford was bor...,Carrie Frances Fisher (21 October 1956 - 27 De...,"Peter Wilton Cushing, OBE (26 May 1913 – 11 A..."
popularity,6.761,10.497,20.349,7.228,3.902
place_of_birth,"Modesto, California, USA","Concord, California, USA","Chicago, Illinois, USA","Beverly Hills, Los Angeles, California, USA","Kenley, Surrey, England, UK"


In [8]:
df_100k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Acting,,99996.0,Scooter McCrae,[],0.0,,0.631,,,0.0,nm0566885,
99996,,Directing,,99997.0,Eric Thornett,[],0.0,,0.6,,,0.0,nm1204419,
99997,,Acting,,99998.0,Jason Wauer,[],0.0,,0.6,,,0.0,nm1312267,
99998,,Acting,,99999.0,Mr. ?,[],0.0,,0.6,,,0.0,nm2425757,
99999,,Acting,,100000.0,Demetrius Parker,[],0.0,,0.6,,,0.0,nm1203784,


In [9]:
# Fetch person ids 100001-200000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=200000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              17683 non-null  object 
 1   known_for_department  53328 non-null  object 
 2   deathday              6015 non-null   object 
 3   id                    53342 non-null  float64
 4   name                  53342 non-null  object 
 5   also_known_as         53342 non-null  object 
 6   gender                53342 non-null  float64
 7   biography             53342 non-null  object 
 8   popularity            53342 non-null  float64
 9   place_of_birth        14275 non-null  object 
 10  profile_path          11130 non-null  object 
 11  adult                 53342 non-null  object 
 12  imdb_id               53342 non-null  object 
 13  homepage              1672 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [10]:
#df.to_json('tmdb_person_200k.json')

In [11]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_200k = pd.read_json('tmdb_person_200k.json')
df_200k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              17683 non-null  object 
 1   known_for_department  53328 non-null  object 
 2   deathday              6015 non-null   object 
 3   id                    53342 non-null  float64
 4   name                  53342 non-null  object 
 5   also_known_as         53342 non-null  object 
 6   gender                53342 non-null  float64
 7   biography             53342 non-null  object 
 8   popularity            53342 non-null  float64
 9   place_of_birth        14275 non-null  object 
 10  profile_path          11130 non-null  object 
 11  adult                 53342 non-null  float64
 12  imdb_id               53342 non-null  object 
 13  homepage              1672 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [12]:
df_200k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Acting,Acting,Acting,Acting,Acting
deathday,,,,,
id,100001,100002,100003,100004,100005
name,Peter Smak,Christi Etcher,Clancey McCauley,Theresa Hoyt,Tara Durham
also_known_as,[],[],[Clancy McCauley],[],[]
gender,0,0,1,0,0
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.6
place_of_birth,,,,,


In [13]:
df_200k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,,,,,,,,,,,,,
99996,,Acting,,199997.0,Jean O'Neill,[],0.0,,0.6,,,0.0,nm0642196,
99997,,Production,,199998.0,Jim Milio,[],0.0,,0.6,,,0.0,,
99998,,Acting,,199999.0,Alistair Cooke,[],0.0,,0.6,,,0.0,,
99999,,,,,,,,,,,,,,


In [14]:
# Fetch person ids 200001-300000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=300000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              6169 non-null   object 
 1   known_for_department  20913 non-null  object 
 2   deathday              1398 non-null   object 
 3   id                    20924 non-null  float64
 4   name                  20924 non-null  object 
 5   also_known_as         20924 non-null  object 
 6   gender                20924 non-null  float64
 7   biography             20924 non-null  object 
 8   popularity            20924 non-null  float64
 9   place_of_birth        4969 non-null   object 
 10  profile_path          4342 non-null   object 
 11  adult                 20924 non-null  object 
 12  imdb_id               20924 non-null  object 
 13  homepage              835 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [15]:
#df.to_json('tmdb_person_300k.json')

In [16]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_300k = pd.read_json('tmdb_person_300k.json')
df_300k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              6169 non-null   object 
 1   known_for_department  20913 non-null  object 
 2   deathday              1398 non-null   object 
 3   id                    20924 non-null  float64
 4   name                  20924 non-null  object 
 5   also_known_as         20924 non-null  object 
 6   gender                20924 non-null  float64
 7   biography             20924 non-null  object 
 8   popularity            20924 non-null  float64
 9   place_of_birth        4969 non-null   object 
 10  profile_path          4342 non-null   object 
 11  adult                 20924 non-null  float64
 12  imdb_id               20924 non-null  object 
 13  homepage              835 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [17]:
df_300k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,Acting,,,Acting
deathday,,,,,
id,,200002,,,200005
name,,Nicole Bailey,,,Akeem Smith
also_known_as,,[],,,[]
gender,,0,,,0
biography,,,,,
popularity,,0.6,,,0.6
place_of_birth,,,,,


In [18]:
df_300k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,,,,,,,,,,,,,
99996,,,,,,,,,,,,,,
99997,,,,,,,,,,,,,,
99998,,,,,,,,,,,,,,
99999,,,,,,,,,,,,,,


In [19]:
# Fetch person ids 300001-400000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=400000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              283 non-null    object 
 1   known_for_department  948 non-null    object 
 2   deathday              105 non-null    object 
 3   id                    948 non-null    float64
 4   name                  948 non-null    object 
 5   also_known_as         948 non-null    object 
 6   gender                948 non-null    float64
 7   biography             948 non-null    object 
 8   popularity            948 non-null    float64
 9   place_of_birth        194 non-null    object 
 10  profile_path          166 non-null    object 
 11  adult                 948 non-null    object 
 12  imdb_id               948 non-null    object 
 13  homepage              20 non-null     object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [20]:
#df.to_json('tmdb_person_400k.json')

In [21]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_400k = pd.read_json('tmdb_person_400k.json')
df_400k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              283 non-null    object 
 1   known_for_department  948 non-null    object 
 2   deathday              105 non-null    object 
 3   id                    948 non-null    float64
 4   name                  948 non-null    object 
 5   also_known_as         948 non-null    object 
 6   gender                948 non-null    float64
 7   biography             948 non-null    object 
 8   popularity            948 non-null    float64
 9   place_of_birth        194 non-null    object 
 10  profile_path          166 non-null    object 
 11  adult                 948 non-null    float64
 12  imdb_id               948 non-null    object 
 13  homepage              20 non-null     object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [22]:
df_400k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,1901-07-29,,
known_for_department,,,Acting,,
deathday,,,1975-04-15,,
id,,,300003,,
name,,,Magnus Kesster,,
also_known_as,,,[],,
gender,,,2,,
biography,,,,,
popularity,,,0.6,,
place_of_birth,,,"Stockholm, Stockholms län, Sweden",,


In [23]:
df_400k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,,,,,,,,,,,,,
99996,,,,,,,,,,,,,,
99997,,,,,,,,,,,,,,
99998,,,,,,,,,,,,,,
99999,,,,,,,,,,,,,,


In [24]:
# Fetch person ids 400001-500000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=500000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              405 non-null    object 
 1   known_for_department  1222 non-null   object 
 2   deathday              64 non-null     object 
 3   id                    1222 non-null   float64
 4   name                  1222 non-null   object 
 5   also_known_as         1222 non-null   object 
 6   gender                1222 non-null   float64
 7   biography             1222 non-null   object 
 8   popularity            1222 non-null   float64
 9   place_of_birth        285 non-null    object 
 10  profile_path          231 non-null    object 
 11  adult                 1222 non-null   object 
 12  imdb_id               1222 non-null   object 
 13  homepage              45 non-null     object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [25]:
#df.to_json('tmdb_person_500k.json')

In [26]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_500k = pd.read_json('tmdb_person_500k.json')
df_500k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              405 non-null    object 
 1   known_for_department  1222 non-null   object 
 2   deathday              64 non-null     object 
 3   id                    1222 non-null   float64
 4   name                  1222 non-null   object 
 5   also_known_as         1222 non-null   object 
 6   gender                1222 non-null   float64
 7   biography             1222 non-null   object 
 8   popularity            1222 non-null   float64
 9   place_of_birth        285 non-null    object 
 10  profile_path          231 non-null    object 
 11  adult                 1222 non-null   float64
 12  imdb_id               1222 non-null   object 
 13  homepage              45 non-null     object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [27]:
df_500k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,,,,
deathday,,,,,
id,,,,,
name,,,,,
also_known_as,,,,,
gender,,,,,
biography,,,,,
popularity,,,,,
place_of_birth,,,,,


In [28]:
df_500k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,,,,,,,,,,,,,
99996,,,,,,,,,,,,,,
99997,,,,,,,,,,,,,,
99998,,,,,,,,,,,,,,
99999,,,,,,,,,,,,,,


In [29]:
# Fetch person ids 500001-600000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=600000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              6494 non-null   object 
 1   known_for_department  29241 non-null  object 
 2   deathday              1997 non-null   object 
 3   id                    29246 non-null  float64
 4   name                  29246 non-null  object 
 5   also_known_as         29246 non-null  object 
 6   gender                29246 non-null  float64
 7   biography             29246 non-null  object 
 8   popularity            29246 non-null  float64
 9   place_of_birth        5187 non-null   object 
 10  profile_path          4889 non-null   object 
 11  adult                 29246 non-null  object 
 12  imdb_id               29246 non-null  object 
 13  homepage              713 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [30]:
#df.to_json('tmdb_person_600k.json')

In [31]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_600k = pd.read_json('tmdb_person_600k.json')
df_600k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              6494 non-null   object 
 1   known_for_department  29241 non-null  object 
 2   deathday              1997 non-null   object 
 3   id                    29246 non-null  float64
 4   name                  29246 non-null  object 
 5   also_known_as         29246 non-null  object 
 6   gender                29246 non-null  float64
 7   biography             29246 non-null  object 
 8   popularity            29246 non-null  float64
 9   place_of_birth        5187 non-null   object 
 10  profile_path          4889 non-null   object 
 11  adult                 29246 non-null  float64
 12  imdb_id               29246 non-null  object 
 13  homepage              713 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [32]:
df_600k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,,,,
deathday,,,,,
id,,,,,
name,,,,,
also_known_as,,,,,
gender,,,,,
biography,,,,,
popularity,,,,,
place_of_birth,,,,,


In [33]:
df_600k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,,,,,,,,,,,,,
99996,,,,,,,,,,,,,,
99997,,,,,,,,,,,,,,
99998,,,,,,,,,,,,,,
99999,,,,,,,,,,,,,,


In [34]:
# Fetch person ids 600001-700000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=700000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              143 non-null    object 
 1   known_for_department  440 non-null    object 
 2   deathday              73 non-null     object 
 3   id                    440 non-null    float64
 4   name                  440 non-null    object 
 5   also_known_as         440 non-null    object 
 6   gender                440 non-null    float64
 7   biography             440 non-null    object 
 8   popularity            440 non-null    float64
 9   place_of_birth        103 non-null    object 
 10  profile_path          104 non-null    object 
 11  adult                 440 non-null    object 
 12  imdb_id               440 non-null    object 
 13  homepage              4 non-null      object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [35]:
#df.to_json('tmdb_person_700k.json')

In [36]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_700k = pd.read_json('tmdb_person_700k.json')
df_700k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              143 non-null    object 
 1   known_for_department  440 non-null    object 
 2   deathday              73 non-null     object 
 3   id                    440 non-null    float64
 4   name                  440 non-null    object 
 5   also_known_as         440 non-null    object 
 6   gender                440 non-null    float64
 7   biography             440 non-null    object 
 8   popularity            440 non-null    float64
 9   place_of_birth        103 non-null    object 
 10  profile_path          104 non-null    object 
 11  adult                 440 non-null    float64
 12  imdb_id               440 non-null    object 
 13  homepage              4 non-null      object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [37]:
df_700k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,,,,
deathday,,,,,
id,,,,,
name,,,,,
also_known_as,,,,,
gender,,,,,
biography,,,,,
popularity,,,,,
place_of_birth,,,,,


In [38]:
df_700k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,,,,,,,,,,,,,
99996,,,,,,,,,,,,,,
99997,,,,,,,,,,,,,,
99998,,,,,,,,,,,,,,
99999,,,,,,,,,,,,,,


In [39]:
# Fetch person ids 700001-800000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=800000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              33 non-null     object 
 1   known_for_department  163 non-null    object 
 2   deathday              17 non-null     object 
 3   id                    163 non-null    float64
 4   name                  163 non-null    object 
 5   also_known_as         163 non-null    object 
 6   gender                163 non-null    float64
 7   biography             163 non-null    object 
 8   popularity            163 non-null    float64
 9   place_of_birth        24 non-null     object 
 10  profile_path          26 non-null     object 
 11  adult                 163 non-null    object 
 12  imdb_id               163 non-null    object 
 13  homepage              3 non-null      object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [40]:
#df.to_json('tmdb_person_800k.json')

In [41]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_800k = pd.read_json('tmdb_person_800k.json')
df_800k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              33 non-null     object 
 1   known_for_department  163 non-null    object 
 2   deathday              17 non-null     object 
 3   id                    163 non-null    float64
 4   name                  163 non-null    object 
 5   also_known_as         163 non-null    object 
 6   gender                163 non-null    float64
 7   biography             163 non-null    object 
 8   popularity            163 non-null    float64
 9   place_of_birth        24 non-null     object 
 10  profile_path          26 non-null     object 
 11  adult                 163 non-null    float64
 12  imdb_id               163 non-null    object 
 13  homepage              3 non-null      object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [42]:
df_800k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,,,,
deathday,,,,,
id,,,,,
name,,,,,
also_known_as,,,,,
gender,,,,,
biography,,,,,
popularity,,,,,
place_of_birth,,,,,


In [43]:
df_800k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,,,,,,,,,,,,,
99996,,,,,,,,,,,,,,
99997,,,,,,,,,,,,,,
99998,,,,,,,,,,,,,,
99999,,,,,,,,,,,,,,


In [44]:
# Fetch person ids 800001-900000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=900000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              54 non-null     object 
 1   known_for_department  284 non-null    object 
 2   deathday              4 non-null      object 
 3   id                    284 non-null    float64
 4   name                  284 non-null    object 
 5   also_known_as         284 non-null    object 
 6   gender                284 non-null    float64
 7   biography             284 non-null    object 
 8   popularity            284 non-null    float64
 9   place_of_birth        42 non-null     object 
 10  profile_path          47 non-null     object 
 11  adult                 284 non-null    object 
 12  imdb_id               284 non-null    object 
 13  homepage              8 non-null      object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [45]:
#df.to_json('tmdb_person_900k.json')

In [46]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_900k = pd.read_json('tmdb_person_900k.json')
df_900k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              54 non-null     object 
 1   known_for_department  284 non-null    object 
 2   deathday              4 non-null      object 
 3   id                    284 non-null    float64
 4   name                  284 non-null    object 
 5   also_known_as         284 non-null    object 
 6   gender                284 non-null    float64
 7   biography             284 non-null    object 
 8   popularity            284 non-null    float64
 9   place_of_birth        42 non-null     object 
 10  profile_path          47 non-null     object 
 11  adult                 284 non-null    float64
 12  imdb_id               284 non-null    object 
 13  homepage              8 non-null      object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [47]:
df_900k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,,,,
deathday,,,,,
id,,,,,
name,,,,,
also_known_as,,,,,
gender,,,,,
biography,,,,,
popularity,,,,,
place_of_birth,,,,,


In [48]:
df_900k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,,,,,,,,,,,,,
99996,,,,,,,,,,,,,,
99997,,,,,,,,,,,,,,
99998,,,,,,,,,,,,,,
99999,,,,,,,,,,,,,,


In [49]:
# Fetch person ids 900001-1000000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=1000000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              3925 non-null   object 
 1   known_for_department  18373 non-null  object 
 2   deathday              1176 non-null   object 
 3   id                    18382 non-null  float64
 4   name                  18382 non-null  object 
 5   also_known_as         18382 non-null  object 
 6   gender                18382 non-null  float64
 7   biography             18382 non-null  object 
 8   popularity            18382 non-null  float64
 9   place_of_birth        3046 non-null   object 
 10  profile_path          2845 non-null   object 
 11  adult                 18382 non-null  object 
 12  imdb_id               18382 non-null  object 
 13  homepage              413 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [50]:
#df.to_json('tmdb_person_1000k.json')

In [51]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_1000k = pd.read_json('tmdb_person_1000k.json')
df_1000k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              3925 non-null   object 
 1   known_for_department  18373 non-null  object 
 2   deathday              1176 non-null   object 
 3   id                    18382 non-null  float64
 4   name                  18382 non-null  object 
 5   also_known_as         18382 non-null  object 
 6   gender                18382 non-null  float64
 7   biography             18382 non-null  object 
 8   popularity            18382 non-null  float64
 9   place_of_birth        3046 non-null   object 
 10  profile_path          2845 non-null   object 
 11  adult                 18382 non-null  float64
 12  imdb_id               18382 non-null  object 
 13  homepage              413 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [52]:
df_1000k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,,,,
deathday,,,,,
id,,,,,
name,,,,,
also_known_as,,,,,
gender,,,,,
biography,,,,,
popularity,,,,,
place_of_birth,,,,,


In [53]:
df_1000k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,1922-06-22,Acting,1973-12-17,999996.0,María Douglas,"[Mary Douglas, María Duglas, Claudia Monterde]",1.0,"María Douglas (June 22, 1922 – December 17, 19...",0.6,"Mexico City, Mexico",/y19u8s3ur4xDqLUPHQjb053HWqK.jpg,0.0,nm0235177,
99996,,Acting,,999997.0,Alejandro Lugo,[],0.0,,0.6,,,0.0,nm0525231,
99997,,,,,,,,,,,,,,
99998,,Acting,,999999.0,Osvaldo Bonet,[],0.0,,0.6,,,0.0,nm0094329,
99999,,,,,,,,,,,,,,


In [54]:
# Fetch person ids 1000001-1100000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=1100000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              6914 non-null   object 
 1   known_for_department  45484 non-null  object 
 2   deathday              2351 non-null   object 
 3   id                    45496 non-null  float64
 4   name                  45496 non-null  object 
 5   also_known_as         45496 non-null  object 
 6   gender                45496 non-null  float64
 7   biography             45496 non-null  object 
 8   popularity            45496 non-null  float64
 9   place_of_birth        5544 non-null   object 
 10  profile_path          5417 non-null   object 
 11  adult                 45496 non-null  object 
 12  imdb_id               45496 non-null  object 
 13  homepage              871 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [55]:
#df.to_json('tmdb_person_1100k.json')

In [56]:
# Load the batch from disk; the file name matches the commented-out
# df.to_json(...) cell above, which presumably produced it.
df_1100k = pd.read_json('tmdb_person_1100k.json')
df_1100k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              6914 non-null   object 
 1   known_for_department  45484 non-null  object 
 2   deathday              2351 non-null   object 
 3   id                    45496 non-null  float64
 4   name                  45496 non-null  object 
 5   also_known_as         45496 non-null  object 
 6   gender                45496 non-null  float64
 7   biography             45496 non-null  object 
 8   popularity            45496 non-null  float64
 9   place_of_birth        5544 non-null   object 
 10  profile_path          5417 non-null   object 
 11  adult                 45496 non-null  float64
 12  imdb_id               45496 non-null  object 
 13  homepage              871 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [57]:
df_1100k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,,,,
deathday,,,,,
id,,,,,
name,,,,,
also_known_as,,,,,
gender,,,,,
biography,,,,,
popularity,,,,,
place_of_birth,,,,,


In [58]:
df_1100k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,1986-04-16,Acting,,1099996.0,Paul Michael McIntosh,[],0.0,,0.6,"Dechmont, Scotland, UK",,0.0,nm5120031,
99996,,Acting,,1099997.0,Tom Leon,[],0.0,,0.6,,,0.0,nm5120799,
99997,,,,,,,,,,,,,,
99998,1974-07-11,Writing,,1099999.0,Denis Rodimin,[Денис Родимин],2.0,,1.384,"Moscow, RSFSR, USSR",/gfpEtSLzBWztHiB5ULeTK95EOUQ.jpg,0.0,nm0734840,
99999,1972-03-20,Writing,,1100000.0,Aleksey Kublitskiy,[Алексей Кублицкий],2.0,,0.6,,,0.0,nm1672378,


In [59]:
# Fetch person ids 1100001-1200000 (get_people requests the 100000 ids ending at num).
# Long-running: one HTTP request per id.
df = pd.DataFrame(get_people(num=1200000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              8802 non-null   object 
 1   known_for_department  67362 non-null  object 
 2   deathday              2701 non-null   object 
 3   id                    67371 non-null  float64
 4   name                  67371 non-null  object 
 5   also_known_as         67371 non-null  object 
 6   gender                67371 non-null  float64
 7   biography             67371 non-null  object 
 8   popularity            67371 non-null  float64
 9   place_of_birth        7317 non-null   object 
 10  profile_path          7304 non-null   object 
 11  adult                 67371 non-null  object 
 12  imdb_id               67371 non-null  object 
 13  homepage              1241 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [60]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_1200k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_1200k.json')

In [61]:
# Reload the saved 1,200k batch from disk under its own name.
df_1200k = pd.read_json('tmdb_person_1200k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_1200k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              8802 non-null   object 
 1   known_for_department  67362 non-null  object 
 2   deathday              2701 non-null   object 
 3   id                    67371 non-null  float64
 4   name                  67371 non-null  object 
 5   also_known_as         67371 non-null  object 
 6   gender                67371 non-null  float64
 7   biography             67371 non-null  object 
 8   popularity            67371 non-null  float64
 9   place_of_birth        7317 non-null   object 
 10  profile_path          7304 non-null   object 
 11  adult                 67371 non-null  float64
 12  imdb_id               67371 non-null  object 
 13  homepage              1241 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [62]:
# First five rows of the 1,200k batch, transposed so each field reads as a row.
df_1200k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,1970-12-24,,
known_for_department,Acting,Acting,Acting,Acting,Acting
deathday,,,,,
id,1.1e+06,1.1e+06,1.1e+06,1.1e+06,1.10000e+06
name,Martin Cook,Nick Beggs,Marco Minnemann,Theo Travis,Niko Tsonev
also_known_as,[],[],[],[],[]
gender,0,0,2,0,0
biography,,,,,
popularity,0.6,0.6,0.608,0.6,0.6
place_of_birth,,,"Hannover, German",,


In [63]:
# Last five rows of the 1,200k batch.
df_1200k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Acting,,1199996.0,Amber Taylor,[],1.0,,0.6,,,0.0,nm0851958,
99996,,Writing,,1199997.0,Paul Daza,[],0.0,,0.6,,,0.0,nm2413976,
99997,,,,,,,,,,,,,,
99998,1962-09-09,Acting,,1199999.0,Lehua Reid,[],1.0,,0.6,,,0.0,nm0717332,
99999,,Directing,,1200000.0,Murali Nagavally,[],0.0,,0.6,,,0.0,nm1199770,


In [64]:
# Request TMDb person ids 1,200,001-1,300,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=1300000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              11475 non-null  object 
 1   known_for_department  83903 non-null  object 
 2   deathday              2377 non-null   object 
 3   id                    83969 non-null  float64
 4   name                  83969 non-null  object 
 5   also_known_as         83969 non-null  object 
 6   gender                83969 non-null  float64
 7   biography             83969 non-null  object 
 8   popularity            83969 non-null  float64
 9   place_of_birth        9323 non-null   object 
 10  profile_path          9154 non-null   object 
 11  adult                 83969 non-null  object 
 12  imdb_id               83969 non-null  object 
 13  homepage              1747 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [65]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_1300k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_1300k.json')

In [66]:
# Reload the saved 1,300k batch from disk under its own name.
df_1300k = pd.read_json('tmdb_person_1300k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_1300k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              11475 non-null  object 
 1   known_for_department  83903 non-null  object 
 2   deathday              2377 non-null   object 
 3   id                    83969 non-null  float64
 4   name                  83969 non-null  object 
 5   also_known_as         83969 non-null  object 
 6   gender                83969 non-null  float64
 7   biography             83969 non-null  object 
 8   popularity            83969 non-null  float64
 9   place_of_birth        9323 non-null   object 
 10  profile_path          9154 non-null   object 
 11  adult                 83969 non-null  float64
 12  imdb_id               83969 non-null  object 
 13  homepage              1747 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [67]:
# First five rows of the 1,300k batch, transposed so each field reads as a row.
df_1300k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Writing,,Writing,Acting,Sound
deathday,,,,,
id,1.2e+06,,1.2e+06,1.2e+06,1.20000e+06
name,C. Balachandran,,Martin Prakatt,Vinu Kiriath,Euphoria
also_known_as,[],,[],[],[]
gender,0,,0,0,0
biography,,,,,
popularity,0.6,,0.6,0.6,0.6
place_of_birth,,,,,


In [68]:
# Last five rows of the 1,300k batch.
df_1300k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Acting,,1299996.0,Vítor Ninéu,[],0.0,,0.6,,,0.0,nm2629452,
99996,,Acting,,1299997.0,Laura Ouakil,[],0.0,,0.6,,,0.0,,
99997,,Directing,,1299998.0,Beatriz Sanchís,[],0.0,,0.6,"Spain, Spain",,0.0,nm3808149,
99998,,Writing,,1299999.0,Youssef El Sebai,[],0.0,,0.6,,,0.0,nm0252757,
99999,,Directing,,1300000.0,Dana Ben-Ari,[],0.0,,0.6,,,0.0,nm5125984,


In [69]:
# Request TMDb person ids 1,300,001-1,400,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=1400000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              8570 non-null   object 
 1   known_for_department  91393 non-null  object 
 2   deathday              2590 non-null   object 
 3   id                    91418 non-null  float64
 4   name                  91418 non-null  object 
 5   also_known_as         91418 non-null  object 
 6   gender                91418 non-null  float64
 7   biography             91418 non-null  object 
 8   popularity            91418 non-null  float64
 9   place_of_birth        6818 non-null   object 
 10  profile_path          7194 non-null   object 
 11  adult                 91418 non-null  object 
 12  imdb_id               91418 non-null  object 
 13  homepage              1326 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [70]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_1400k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_1400k.json')

In [71]:
# Reload the saved 1,400k batch from disk under its own name.
df_1400k = pd.read_json('tmdb_person_1400k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_1400k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              8570 non-null   object 
 1   known_for_department  91393 non-null  object 
 2   deathday              2590 non-null   object 
 3   id                    91418 non-null  float64
 4   name                  91418 non-null  object 
 5   also_known_as         91418 non-null  object 
 6   gender                91418 non-null  float64
 7   biography             91418 non-null  object 
 8   popularity            91418 non-null  float64
 9   place_of_birth        6818 non-null   object 
 10  profile_path          7194 non-null   object 
 11  adult                 91418 non-null  float64
 12  imdb_id               91418 non-null  object 
 13  homepage              1326 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [72]:
# First five rows of the 1,400k batch, transposed so each field reads as a row.
df_1400k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Editing,Acting,Sound,,Editing
deathday,,,,,
id,1.3e+06,1.3e+06,1.3e+06,,1.30000e+06
name,João Carlos Gorjão,Manuel Marcelino,Carlos Seixas,,Manuela Gorjão
also_known_as,[],[],[],,[]
gender,2,0,0,,0
biography,,,,,
popularity,0.6,0.6,0.6,,0.6
place_of_birth,,,,,


In [73]:
# Last five rows of the 1,400k batch.
df_1400k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Sound,,1399996.0,Robert J. Litt,[],2.0,,1.38,,,0.0,nm0514448,
99996,,Sound,,1399997.0,Elliot Tyson,[],2.0,,0.98,,,0.0,nm0006525,
99997,,Crew,,1399998.0,Harry Madsen,[],2.0,,0.6,,,0.0,nm0535207,
99998,,Camera,,1399999.0,Michael Levine,[],0.0,,0.6,,,0.0,nm0505898,
99999,1939-07-20,Camera,2000-05-24,1400000.0,Mike Roberts,[],0.0,,0.6,Woking - Surrey - England - UK,,0.0,,


In [74]:
# Request TMDb person ids 1,400,001-1,500,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=1500000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              6597 non-null   object 
 1   known_for_department  93862 non-null  object 
 2   deathday              1444 non-null   object 
 3   id                    93880 non-null  float64
 4   name                  93880 non-null  object 
 5   also_known_as         93880 non-null  object 
 6   gender                93880 non-null  float64
 7   biography             93880 non-null  object 
 8   popularity            93880 non-null  float64
 9   place_of_birth        5880 non-null   object 
 10  profile_path          6571 non-null   object 
 11  adult                 93880 non-null  object 
 12  imdb_id               93879 non-null  object 
 13  homepage              1351 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [75]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_1500k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_1500k.json')

In [76]:
# Reload the saved 1,500k batch from disk under its own name.
df_1500k = pd.read_json('tmdb_person_1500k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_1500k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              6597 non-null   object 
 1   known_for_department  93862 non-null  object 
 2   deathday              1444 non-null   object 
 3   id                    93880 non-null  float64
 4   name                  93880 non-null  object 
 5   also_known_as         93880 non-null  object 
 6   gender                93880 non-null  float64
 7   biography             93880 non-null  object 
 8   popularity            93880 non-null  float64
 9   place_of_birth        5880 non-null   object 
 10  profile_path          6571 non-null   object 
 11  adult                 93880 non-null  float64
 12  imdb_id               93879 non-null  object 
 13  homepage              1351 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [77]:
# First five rows of the 1,500k batch, transposed so each field reads as a row.
df_1500k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,1959-09-15,,,
known_for_department,Camera,Crew,Crew,Production,Production
deathday,,1998-07-17,,,
id,1.4e+06,1.4e+06,1.4e+06,1.4e+06,1.40000e+06
name,George Kontaxis,Alvin Milliken,Louis Falco,Anna Wright,Yuuri Sunohara
also_known_as,[],"[Alvin Milliken Jr., Al Milliken]",[],[],"[Youri Sunohara, 春原悠理]"
gender,0,2,0,0,1
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.6
place_of_birth,,"Dallas, Texas, USA",,,


In [78]:
# Last five rows of the 1,500k batch.
df_1500k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Directing,,1499996.0,Vahe Gabuchian,[],2.0,,0.6,,,0.0,nm4315875,
99996,,Writing,,1499997.0,Jorge Cham,[],0.0,,0.6,,,0.0,nm4687472,
99997,,Production,,1499998.0,Margaret Rosenburg,[],0.0,,0.6,,,0.0,nm5889739,
99998,,Acting,,1499999.0,William Marshall,[],0.0,,0.6,,,0.0,nm6801964,
99999,,Crew,,1500000.0,Lucie Adalind,[],0.0,,0.6,,,0.0,nm1988911,


In [79]:
# Request TMDb person ids 1,500,001-1,600,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=1600000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              5894 non-null   object 
 1   known_for_department  93874 non-null  object 
 2   deathday              1192 non-null   object 
 3   id                    93894 non-null  float64
 4   name                  93894 non-null  object 
 5   also_known_as         93894 non-null  object 
 6   gender                93894 non-null  float64
 7   biography             93894 non-null  object 
 8   popularity            93894 non-null  float64
 9   place_of_birth        5543 non-null   object 
 10  profile_path          6439 non-null   object 
 11  adult                 93894 non-null  object 
 12  imdb_id               93892 non-null  object 
 13  homepage              1310 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [80]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_1600k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_1600k.json')

In [81]:
# Reload the saved 1,600k batch from disk under its own name.
df_1600k = pd.read_json('tmdb_person_1600k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_1600k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              5894 non-null   object 
 1   known_for_department  93874 non-null  object 
 2   deathday              1192 non-null   object 
 3   id                    93894 non-null  float64
 4   name                  93894 non-null  object 
 5   also_known_as         93894 non-null  object 
 6   gender                93894 non-null  float64
 7   biography             93894 non-null  object 
 8   popularity            93894 non-null  float64
 9   place_of_birth        5543 non-null   object 
 10  profile_path          6439 non-null   object 
 11  adult                 93894 non-null  float64
 12  imdb_id               93892 non-null  object 
 13  homepage              1310 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [82]:
# First five rows of the 1,600k batch, transposed so each field reads as a row.
df_1600k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Acting,Acting,Acting,Acting,Acting
deathday,,,,,
id,1.5e+06,1.5e+06,1.5e+06,1.5e+06,1.50000e+06
name,Raj Katti,Emily Abbott,Susanna Boney,Tony Chu,Georgette Kala-Lobé
also_known_as,[],[],[],[],[]
gender,0,0,0,0,0
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.6
place_of_birth,,,,,


In [83]:
# Last five rows of the 1,600k batch.
df_1600k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Acting,,1599996.0,Kazuo Aoyagi,[],0.0,,0.6,,,0.0,,
99996,,Camera,,1599997.0,Edib Ahmetašević,[],0.0,,0.6,,,0.0,,
99997,,Camera,,1599998.0,Mario Delić,[],0.0,,0.6,,,0.0,,
99998,,Acting,,1599999.0,Goro Misaki,[],0.0,,0.6,,,0.0,,
99999,,Sound,,1600000.0,Miroslav Babić,[],0.0,,0.6,,,0.0,,


In [84]:
# Request TMDb person ids 1,600,001-1,700,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=1700000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              5493 non-null   object 
 1   known_for_department  93367 non-null  object 
 2   deathday              999 non-null    object 
 3   id                    93393 non-null  float64
 4   name                  93393 non-null  object 
 5   also_known_as         93393 non-null  object 
 6   gender                93393 non-null  float64
 7   biography             93393 non-null  object 
 8   popularity            93393 non-null  float64
 9   place_of_birth        4863 non-null   object 
 10  profile_path          6085 non-null   object 
 11  adult                 93393 non-null  object 
 12  imdb_id               93393 non-null  object 
 13  homepage              1144 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [85]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_1700k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_1700k.json')

In [86]:
# Reload the saved 1,700k batch from disk under its own name.
df_1700k = pd.read_json('tmdb_person_1700k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_1700k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              5493 non-null   object 
 1   known_for_department  93367 non-null  object 
 2   deathday              999 non-null    object 
 3   id                    93393 non-null  float64
 4   name                  93393 non-null  object 
 5   also_known_as         93393 non-null  object 
 6   gender                93393 non-null  float64
 7   biography             93393 non-null  object 
 8   popularity            93393 non-null  float64
 9   place_of_birth        4863 non-null   object 
 10  profile_path          6085 non-null   object 
 11  adult                 93393 non-null  float64
 12  imdb_id               93393 non-null  object 
 13  homepage              1144 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [87]:
# First five rows of the 1,700k batch, transposed so each field reads as a row.
df_1700k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Acting,Costume & Make-Up,Acting,Camera,Acting
deathday,,,,,
id,1.6e+06,1.6e+06,1.6e+06,1.6e+06,1.60000e+06
name,Yuichi Shima,Jana Schulze,Keiko Nakano,Shuji Sasano,Peter Schütze
also_known_as,[],[],[],[笹野修司],[]
gender,0,0,0,0,0
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.6
place_of_birth,,,,,


In [88]:
# Last five rows of the 1,700k batch.
df_1700k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Acting,,1699996.0,Maria Galant,[],0.0,,0.6,,,0.0,,
99996,,Acting,,1699997.0,Maria Galant,[],0.0,,0.6,,,0.0,,
99997,,Writing,,1699998.0,Uichiro Kitazato,[],0.0,,0.6,,,0.0,,
99998,,Acting,,1699999.0,Fabiana Amorim,[],0.0,,0.6,,,0.0,,
99999,,Camera,,1700000.0,Yasuhiko Mitsui,[],0.0,,0.6,,,0.0,,


In [89]:
# Request TMDb person ids 1,700,001-1,800,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=1800000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4341 non-null   object 
 1   known_for_department  93070 non-null  object 
 2   deathday              778 non-null    object 
 3   id                    93095 non-null  float64
 4   name                  93095 non-null  object 
 5   also_known_as         93095 non-null  object 
 6   gender                93095 non-null  float64
 7   biography             93095 non-null  object 
 8   popularity            93095 non-null  float64
 9   place_of_birth        4132 non-null   object 
 10  profile_path          6543 non-null   object 
 11  adult                 93095 non-null  object 
 12  imdb_id               93095 non-null  object 
 13  homepage              964 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [90]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_1800k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_1800k.json')

In [91]:
# Reload the saved 1,800k batch from disk under its own name.
df_1800k = pd.read_json('tmdb_person_1800k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_1800k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4341 non-null   object 
 1   known_for_department  93070 non-null  object 
 2   deathday              778 non-null    object 
 3   id                    93095 non-null  float64
 4   name                  93095 non-null  object 
 5   also_known_as         93095 non-null  object 
 6   gender                93095 non-null  float64
 7   biography             93095 non-null  object 
 8   popularity            93095 non-null  float64
 9   place_of_birth        4132 non-null   object 
 10  profile_path          6543 non-null   object 
 11  adult                 93095 non-null  float64
 12  imdb_id               93095 non-null  object 
 13  homepage              964 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [92]:
# First five rows of the 1,800k batch, transposed so each field reads as a row.
df_1800k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Directing,Production,Production,Production,Editing
deathday,,,,,
id,1.7e+06,1.7e+06,1.7e+06,1.7e+06,1.70000e+06
name,Dario Albertini,Olaf B. Boorsma,Ger Loogman,Yvonne Dahlhaus,Kees Thies
also_known_as,[],[],[],[],[]
gender,0,0,0,0,0
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.6
place_of_birth,,,,,


In [93]:
# Last five rows of the 1,800k batch.
df_1800k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Acting,,1799996.0,Lonnie Duran Tackett,[],0.0,,0.6,,,0.0,,
99996,,Acting,,1799997.0,Mackie Richerson,[],0.0,,0.6,,,0.0,,
99997,,Acting,,1799998.0,Jack Christopher,[],0.0,,0.6,,,0.0,,
99998,,Acting,,1799999.0,Sean T. James,[],0.0,,1.38,,,0.0,,
99999,,Acting,,1800000.0,Larry Reynosa,[],0.0,,0.6,,,0.0,,


In [94]:
# Request TMDb person ids 1,800,001-1,900,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=1900000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4568 non-null   object 
 1   known_for_department  94358 non-null  object 
 2   deathday              743 non-null    object 
 3   id                    94381 non-null  float64
 4   name                  94381 non-null  object 
 5   also_known_as         94381 non-null  object 
 6   gender                94381 non-null  float64
 7   biography             94381 non-null  object 
 8   popularity            94381 non-null  float64
 9   place_of_birth        4462 non-null   object 
 10  profile_path          9558 non-null   object 
 11  adult                 94381 non-null  object 
 12  imdb_id               94381 non-null  object 
 13  homepage              1040 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [95]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_1900k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_1900k.json')

In [96]:
# Reload the saved 1,900k batch from disk under its own name.
df_1900k = pd.read_json('tmdb_person_1900k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_1900k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4568 non-null   object 
 1   known_for_department  94358 non-null  object 
 2   deathday              743 non-null    object 
 3   id                    94381 non-null  float64
 4   name                  94381 non-null  object 
 5   also_known_as         94381 non-null  object 
 6   gender                94381 non-null  float64
 7   biography             94381 non-null  object 
 8   popularity            94381 non-null  float64
 9   place_of_birth        4462 non-null   object 
 10  profile_path          9558 non-null   object 
 11  adult                 94381 non-null  float64
 12  imdb_id               94381 non-null  object 
 13  homepage              1040 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [97]:
# First five rows of the 1,900k batch, transposed so each field reads as a row.
df_1900k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,1955-06-28,,
known_for_department,Acting,Production,Production,Creator,Acting
deathday,,,,,
id,1.8e+06,1.8e+06,1.8e+06,1.8e+06,1.80000e+06
name,Maridean Mansfield Shepard,Christine Rhodes,Alan Forbes,Michael von Mossner,Rita Silva
also_known_as,[],[],[],[],[]
gender,1,1,0,0,0
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.6
place_of_birth,,,"Rockford, Illinois, USA",,


In [98]:
# Last five rows of the 1,900k batch.
df_1900k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Acting,,1899996.0,Chaiyakorn Dejkajornwut,[],0.0,,0.6,,,0.0,,
99996,,Acting,,1899997.0,Tanachai Kungcharonetanachot,[],0.0,,0.6,,,0.0,,
99997,,Acting,,1899998.0,Anthony Marcacci,[],0.0,,0.6,,,0.0,,
99998,,Acting,,1899999.0,Dep Kirkland,[],2.0,,0.6,,,0.0,nm1625522,
99999,,Acting,,1900000.0,Sawyer Shipman,[],0.0,,0.6,,,0.0,,


In [99]:
# Request TMDb person ids 1,900,001-2,000,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=2000000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4112 non-null   object 
 1   known_for_department  94191 non-null  object 
 2   deathday              650 non-null    object 
 3   id                    94221 non-null  float64
 4   name                  94221 non-null  object 
 5   also_known_as         94221 non-null  object 
 6   gender                94221 non-null  float64
 7   biography             94221 non-null  object 
 8   popularity            94221 non-null  float64
 9   place_of_birth        3804 non-null   object 
 10  profile_path          7619 non-null   object 
 11  adult                 94221 non-null  object 
 12  imdb_id               94221 non-null  object 
 13  homepage              967 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [100]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_2000k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_2000k.json')

In [101]:
# Reload the saved 2,000k batch from disk under its own name.
df_2000k = pd.read_json('tmdb_person_2000k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_2000k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4112 non-null   object 
 1   known_for_department  94191 non-null  object 
 2   deathday              650 non-null    object 
 3   id                    94221 non-null  float64
 4   name                  94221 non-null  object 
 5   also_known_as         94221 non-null  object 
 6   gender                94221 non-null  float64
 7   biography             94221 non-null  object 
 8   popularity            94221 non-null  float64
 9   place_of_birth        3804 non-null   object 
 10  profile_path          7619 non-null   object 
 11  adult                 94221 non-null  float64
 12  imdb_id               94221 non-null  object 
 13  homepage              967 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [102]:
# First five rows of the 2,000k batch, transposed so each field reads as a row.
df_2000k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Acting,,Acting,,Acting
deathday,,,,,
id,1.9e+06,,1.9e+06,,1.90000e+06
name,Mary Stofle,,Hunter Sanchez,,Mary Reber
also_known_as,[],,[],,[]
gender,1,,2,,1
biography,,,,,
popularity,0.6,,0.6,,0.6
place_of_birth,,,,,


In [103]:
# Last five rows of the 2,000k batch.
df_2000k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,1924-02-03,Acting,2011-11-03,1999996.0,Regina Heinonen,[],1.0,,0.6,Finland,/or3rxhVDIUq0X73ifLF0W6CWgOS.jpg,0.0,,
99996,,Sound,,1999997.0,Ashwin Gopakumar,[],0.0,,0.6,,,0.0,,
99997,,Sound,,1999998.0,Niranj Suresh,[],0.0,,0.6,,,0.0,,
99998,,Sound,,1999999.0,Manikandan Ayyappa,[],0.0,,0.6,,,0.0,,
99999,,,,,,,,,,,,,,


In [104]:
# Request TMDb person ids 2,000,001-2,100,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=2100000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4100 non-null   object 
 1   known_for_department  94113 non-null  object 
 2   deathday              668 non-null    object 
 3   id                    94176 non-null  float64
 4   name                  94176 non-null  object 
 5   also_known_as         94176 non-null  object 
 6   gender                94176 non-null  float64
 7   biography             94176 non-null  object 
 8   popularity            94176 non-null  float64
 9   place_of_birth        3644 non-null   object 
 10  profile_path          7369 non-null   object 
 11  adult                 94176 non-null  object 
 12  imdb_id               94176 non-null  object 
 13  homepage              749 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [105]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_2100k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_2100k.json')

In [106]:
# Reload the saved 2,100k batch from disk under its own name.
df_2100k = pd.read_json('tmdb_person_2100k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_2100k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4100 non-null   object 
 1   known_for_department  94113 non-null  object 
 2   deathday              668 non-null    object 
 3   id                    94176 non-null  float64
 4   name                  94176 non-null  object 
 5   also_known_as         94176 non-null  object 
 6   gender                94176 non-null  float64
 7   biography             94176 non-null  object 
 8   popularity            94176 non-null  float64
 9   place_of_birth        3644 non-null   object 
 10  profile_path          7369 non-null   object 
 11  adult                 94176 non-null  float64
 12  imdb_id               94176 non-null  object 
 13  homepage              749 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [107]:
# First five rows of the 2,100k batch, transposed so each field reads as a row.
df_2100k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Sound,Acting,Acting,Acting,Acting
deathday,,,,,
id,2e+06,2e+06,2e+06,2e+06,2.00000e+06
name,Shebin Mathew,Claron McFadden,Bernarda Fink,Christoph Genz,Sherilyn Baird
also_known_as,[],[],[],[],[]
gender,0,0,0,0,0
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.828
place_of_birth,,,,,


In [108]:
# Last five rows of the 2,100k batch.
df_2100k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Editing,,2099996.0,Laura Cardona,[],0.0,,0.6,,,0.0,,
99996,,Acting,,2099997.0,Milana Alrayes,[],0.0,,0.6,,,0.0,,
99997,,Acting,,2099998.0,Rylee Whiteman,[],0.0,,0.6,,,0.0,,
99998,,Acting,,2099999.0,Aubrey Michele Katz,[],0.0,,0.6,,,0.0,,
99999,,Production,,2100000.0,Susan Benaroya,[],0.0,,0.6,,,0.0,,


In [109]:
# Request TMDb person ids 2,100,001-2,200,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=2200000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              3052 non-null   object 
 1   known_for_department  94606 non-null  object 
 2   deathday              415 non-null    object 
 3   id                    94628 non-null  float64
 4   name                  94628 non-null  object 
 5   also_known_as         94628 non-null  object 
 6   gender                94628 non-null  float64
 7   biography             94628 non-null  object 
 8   popularity            94628 non-null  float64
 9   place_of_birth        2879 non-null   object 
 10  profile_path          8393 non-null   object 
 11  adult                 94628 non-null  object 
 12  imdb_id               94628 non-null  object 
 13  homepage              797 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [110]:
# When uncommented, writes the freshly fetched `df` to tmdb_person_2200k.json.
# NOTE(review): presumably left disabled so a re-run cannot overwrite the saved batch - confirm.
#df.to_json('tmdb_person_2200k.json')

In [111]:
# Reload the saved 2,200k batch from disk under its own name.
df_2200k = pd.read_json('tmdb_person_2200k.json')
# Column dtypes and non-null counts after the JSON round trip.
df_2200k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              3052 non-null   object 
 1   known_for_department  94606 non-null  object 
 2   deathday              415 non-null    object 
 3   id                    94628 non-null  float64
 4   name                  94628 non-null  object 
 5   also_known_as         94628 non-null  object 
 6   gender                94628 non-null  float64
 7   biography             94628 non-null  object 
 8   popularity            94628 non-null  float64
 9   place_of_birth        2879 non-null   object 
 10  profile_path          8393 non-null   object 
 11  adult                 94628 non-null  float64
 12  imdb_id               94628 non-null  object 
 13  homepage              797 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [112]:
# First five rows of the 2,200k batch, transposed so each field reads as a row.
df_2200k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,Acting,Production,Production,Acting
deathday,,,,,
id,,2.1e+06,2.1e+06,2.1e+06,2.10000e+06
name,,Park Kyoung-hee,Nicki Holtzman,Sonia Dulay,André Ceccato
also_known_as,,[],[],[Sonia Dulay Ricci],[]
gender,,0,0,0,0
biography,,,,,
popularity,,1.4,0.6,0.6,0.6
place_of_birth,,,,,


In [113]:
# Last five rows of the 2,200k batch.
df_2200k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,1992-06-16,Acting,,2199996.0,Jéssica Ellen,[],1.0,,0.6,"Rio de Janeiro, Rio de Janeiro, Brazil",/klLAOUgXGmNwGfrvmc5rTzxwxDi.jpg,0.0,nm5221691,
99996,,,,,,,,,,,,,,
99997,,Directing,,2199998.0,François-Xavier Destors,[],0.0,,0.6,,,0.0,,
99998,,,,,,,,,,,,,,
99999,,,,,,,,,,,,,,


In [114]:
# Request TMDb person ids 2,200,001-2,300,000 (get_people covers num-99999 through num).
# Ids that return 404 or keep failing come back as {} and show up as all-NaN rows.
df = pd.DataFrame(get_people(num=2300000))
# Column dtypes and non-null counts for the freshly fetched batch.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              3521 non-null   object 
 1   known_for_department  94189 non-null  object 
 2   deathday              581 non-null    object 
 3   id                    94218 non-null  float64
 4   name                  94218 non-null  object 
 5   also_known_as         94218 non-null  object 
 6   gender                94218 non-null  float64
 7   biography             94218 non-null  object 
 8   popularity            94218 non-null  float64
 9   place_of_birth        3362 non-null   object 
 10  profile_path          7134 non-null   object 
 11  adult                 94218 non-null  object 
 12  imdb_id               94218 non-null  object 
 13  homepage              775 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [115]:
#df.to_json('tmdb_person_2300k.json')

In [116]:
df_2300k = pd.read_json('tmdb_person_2300k.json')
df_2300k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              3521 non-null   object 
 1   known_for_department  94189 non-null  object 
 2   deathday              581 non-null    object 
 3   id                    94218 non-null  float64
 4   name                  94218 non-null  object 
 5   also_known_as         94218 non-null  object 
 6   gender                94218 non-null  float64
 7   biography             94218 non-null  object 
 8   popularity            94218 non-null  float64
 9   place_of_birth        3362 non-null   object 
 10  profile_path          7134 non-null   object 
 11  adult                 94218 non-null  float64
 12  imdb_id               94218 non-null  object 
 13  homepage              775 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [117]:
df_2300k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,,Directing,Acting,Costume & Make-Up,Crew
deathday,,,,,
id,,2.2e+06,2.2e+06,2.2e+06,2.20000e+06
name,,Ion Indolean,Yuki Takahashi,"""Scotti"" Scott",Troy Fromin
also_known_as,,[],[],[],[]
gender,,0,0,0,0
biography,,,,,
popularity,,0.6,0.6,0.6,0.6
place_of_birth,,,,,


In [118]:
df_2300k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Production,,2299996.0,Witold Będkowski,[],0.0,,0.6,,,0.0,,
99996,,Production,,2299997.0,Piotr Kielar,[],0.0,,0.6,,,0.0,,
99997,,Production,,2299998.0,Rafał Szymański,[],0.0,,0.6,,,0.0,,
99998,,Sound,,2299999.0,Paweł Drzyzga,[],0.0,,0.6,,,0.0,,
99999,,Sound,,2300000.0,Jakub Wanago,[],0.0,,0.6,,,0.0,,


In [119]:
# Fetch the batch ending at id 2400000. get_people(num) requests every id in
# num - 99999 .. num, so this covers ids 2300001-2400000 (at least one HTTP
# request per id). make_request returns {} for ids that are not found or keep
# failing, and those become all-NaN rows - hence non-null counts below 100000.
df = pd.DataFrame(get_people(num=2400000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4417 non-null   object 
 1   known_for_department  95295 non-null  object 
 2   deathday              741 non-null    object 
 3   id                    95325 non-null  float64
 4   name                  95325 non-null  object 
 5   also_known_as         95325 non-null  object 
 6   gender                95325 non-null  float64
 7   biography             95325 non-null  object 
 8   popularity            95325 non-null  float64
 9   place_of_birth        4105 non-null   object 
 10  profile_path          6441 non-null   object 
 11  adult                 95325 non-null  object 
 12  imdb_id               95325 non-null  object 
 13  homepage              1068 non-null   object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [120]:
#df.to_json('tmdb_person_2400k.json')

In [121]:
df_2400k = pd.read_json('tmdb_person_2400k.json')
df_2400k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              4417 non-null   object 
 1   known_for_department  95295 non-null  object 
 2   deathday              741 non-null    object 
 3   id                    95325 non-null  float64
 4   name                  95325 non-null  object 
 5   also_known_as         95325 non-null  object 
 6   gender                95325 non-null  float64
 7   biography             95325 non-null  object 
 8   popularity            95325 non-null  float64
 9   place_of_birth        4105 non-null   object 
 10  profile_path          6441 non-null   object 
 11  adult                 95325 non-null  float64
 12  imdb_id               95325 non-null  object 
 13  homepage              1068 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [122]:
df_2400k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Costume & Make-Up,Costume & Make-Up,Crew,Crew,
deathday,,,,,
id,2.3e+06,2.3e+06,2.3e+06,2.3e+06,
name,Mayela Serrano,Joy Travis,Matt Emig,Edward Gabree,
also_known_as,[],[],[],[],
gender,0,0,0,0,
biography,,,,,
popularity,0.6,0.6,0.6,0.6,
place_of_birth,,,,,


In [123]:
df_2400k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Production,,2399996.0,Courtney Pritchett,[],0.0,,0.6,,,0.0,,
99996,,Acting,,2399997.0,Ingeborg Christensen,[],0.0,,0.6,,,0.0,,
99997,,Acting,,2399998.0,Mohanlal,[],0.0,,0.6,,,0.0,,
99998,,Acting,,2399999.0,Mirna Menon,[],0.0,,0.6,,,0.0,,
99999,,Acting,,2400000.0,Elijah Trichon,[],2.0,,0.6,,,0.0,nm3136488,


In [124]:
# Fetch the batch ending at id 2500000. get_people(num) requests every id in
# num - 99999 .. num, so this covers ids 2400001-2500000 (at least one HTTP
# request per id). make_request returns {} for ids that are not found or keep
# failing, and those become all-NaN rows - hence non-null counts below 100000.
df = pd.DataFrame(get_people(num=2500000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              2881 non-null   object 
 1   known_for_department  95801 non-null  object 
 2   deathday              394 non-null    object 
 3   id                    95815 non-null  float64
 4   name                  95815 non-null  object 
 5   also_known_as         95815 non-null  object 
 6   gender                95815 non-null  float64
 7   biography             95815 non-null  object 
 8   popularity            95815 non-null  float64
 9   place_of_birth        2888 non-null   object 
 10  profile_path          6134 non-null   object 
 11  adult                 95815 non-null  object 
 12  imdb_id               95815 non-null  object 
 13  homepage              853 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [125]:
#df.to_json('tmdb_person_2500k.json')

In [126]:
df_2500k = pd.read_json('tmdb_person_2500k.json')
df_2500k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              2881 non-null   object 
 1   known_for_department  95801 non-null  object 
 2   deathday              394 non-null    object 
 3   id                    95815 non-null  float64
 4   name                  95815 non-null  object 
 5   also_known_as         95815 non-null  object 
 6   gender                95815 non-null  float64
 7   biography             95815 non-null  object 
 8   popularity            95815 non-null  float64
 9   place_of_birth        2888 non-null   object 
 10  profile_path          6134 non-null   object 
 11  adult                 95815 non-null  float64
 12  imdb_id               95815 non-null  object 
 13  homepage              853 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [127]:
df_2500k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,2001-08-17
known_for_department,Acting,Editing,Acting,Acting,Acting
deathday,,,,,
id,2.4e+06,2.4e+06,2.4e+06,2.4e+06,2.40000e+06
name,Dalton Day,K.R. Gaurishankar,Brandon Hender,Carl Amster,Carsen Warner
also_known_as,[],[],[],[],[]
gender,2,0,2,0,2
biography,,,,,Carsen Warner is an actor and voice over artis...
popularity,0.6,0.6,0.6,0.6,0.703
place_of_birth,"Atlanta, Georgia, USA",,,,"California, USA"


In [128]:
df_2500k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Visual Effects,,2499996.0,Jang Hyo-sun,[],0.0,,0.6,,,0.0,,
99996,,Acting,,2499997.0,Masayashi Kobashi,[],2.0,,0.6,,,0.0,,
99997,,Visual Effects,,2499998.0,Jang Mi-jin,[],0.0,,0.6,,,0.0,,
99998,,Acting,,2499999.0,Takashi Akitane,[],2.0,,0.6,,,0.0,,
99999,,Visual Effects,,2500000.0,Jo Eun-byul,[],0.0,,0.6,,,0.0,,


In [129]:
# Fetch the batch ending at id 2600000. get_people(num) requests every id in
# num - 99999 .. num, so this covers ids 2500001-2600000 (at least one HTTP
# request per id). make_request returns {} for ids that are not found or keep
# failing, and those become all-NaN rows - hence non-null counts below 100000.
df = pd.DataFrame(get_people(num=2600000))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              2344 non-null   object 
 1   known_for_department  96130 non-null  object 
 2   deathday              509 non-null    object 
 3   id                    96155 non-null  float64
 4   name                  96155 non-null  object 
 5   also_known_as         96155 non-null  object 
 6   gender                96155 non-null  float64
 7   biography             96155 non-null  object 
 8   popularity            96155 non-null  float64
 9   place_of_birth        2134 non-null   object 
 10  profile_path          4725 non-null   object 
 11  adult                 96155 non-null  object 
 12  imdb_id               96155 non-null  object 
 13  homepage              690 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [130]:
#df.to_json('tmdb_person_2600k.json')

In [131]:
df_2600k = pd.read_json('tmdb_person_2600k.json')
df_2600k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              2344 non-null   object 
 1   known_for_department  96130 non-null  object 
 2   deathday              509 non-null    object 
 3   id                    96155 non-null  float64
 4   name                  96155 non-null  object 
 5   also_known_as         96155 non-null  object 
 6   gender                96155 non-null  float64
 7   biography             96155 non-null  object 
 8   popularity            96155 non-null  float64
 9   place_of_birth        2134 non-null   object 
 10  profile_path          4725 non-null   object 
 11  adult                 96155 non-null  float64
 12  imdb_id               96155 non-null  object 
 13  homepage              690 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [132]:
df_2600k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,1869-10-22,
known_for_department,Acting,Crew,Visual Effects,Writing,Visual Effects
deathday,,,,1943-11-26,
id,2.5e+06,2.5e+06,2.5e+06,2.5e+06,2.50000e+06
name,Mone Sawada,Jung Seock-hee,Jung Da-som,Josef Hobl,Jung Ji-hyung
also_known_as,[],"[정석희, Seock Hee Joung, Jung Seok-hee]",[],[],[]
gender,0,2,0,2,0
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.6
place_of_birth,,,,,


In [133]:
df_2600k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Acting,,2599996.0,Heiko Röcher,[],0.0,,0.6,,,0.0,,
99996,,Directing,,2599997.0,Mark Row,[],0.0,,0.6,,,0.0,,
99997,,Production,,2599998.0,Tim MacDonald,[],0.0,,0.6,,,0.0,,
99998,,Production,,2599999.0,Iain Row,[],0.0,,0.6,,,0.0,,
99999,,Acting,,2600000.0,Miloslav Vajnar,[],2.0,,0.6,,,0.0,nm7650887,


In [134]:
# Final batch, ending at id 2618075 (presumably the highest person id when this
# was run - see the `last_person` comment in the scratch cell further down).
# NOTE(review): get_people(num) always requests the 100,000 ids num - 99999 .. num,
# so this batch covers 2518076-2618075 and re-fetches 2518076-2600000, which the
# previous batch (num=2600000) already covered. The head() output further down
# starts at id ~2.51808e+06, consistent with that. The batch is stored under the
# name 'tmdb_person_2700k.json' although it stops at 2618075; deduplicate on `id`
# before combining it with the 2600k batch.
df = pd.DataFrame(get_people(num=2618075))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              2230 non-null   object 
 1   known_for_department  96777 non-null  object 
 2   deathday              448 non-null    object 
 3   id                    96803 non-null  float64
 4   name                  96803 non-null  object 
 5   also_known_as         96803 non-null  object 
 6   gender                96803 non-null  float64
 7   biography             96803 non-null  object 
 8   popularity            96803 non-null  float64
 9   place_of_birth        2060 non-null   object 
 10  profile_path          4976 non-null   object 
 11  adult                 96803 non-null  object 
 12  imdb_id               96803 non-null  object 
 13  homepage              824 non-null    object 
dtypes: float64(3), object(11)
memory usage: 10.7+ MB


In [135]:
#df.to_json('tmdb_person_2700k.json')

In [136]:
df_2700k = pd.read_json('tmdb_person_2700k.json')
df_2700k.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              2230 non-null   object 
 1   known_for_department  96777 non-null  object 
 2   deathday              448 non-null    object 
 3   id                    96803 non-null  float64
 4   name                  96803 non-null  object 
 5   also_known_as         96803 non-null  object 
 6   gender                96803 non-null  float64
 7   biography             96803 non-null  object 
 8   popularity            96803 non-null  float64
 9   place_of_birth        2060 non-null   object 
 10  profile_path          4976 non-null   object 
 11  adult                 96803 non-null  float64
 12  imdb_id               96803 non-null  object 
 13  homepage              824 non-null    object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [137]:
df_2700k.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,,,,,
known_for_department,Acting,Acting,Acting,Acting,Acting
deathday,,,,,
id,2.51808e+06,2.51808e+06,2.51808e+06,2.51808e+06,2.51808e+06
name,David Rau,Getenesh Berhe,Leo Kleiber,Finn Mai,Andreas Heermann
also_known_as,[],[],[],[],[]
gender,0,0,0,0,0
biography,,,,,
popularity,0.6,0.6,0.6,0.6,0.6
place_of_birth,,,,,


In [138]:
df_2700k.tail()

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
99995,,Costume & Make-Up,,2618071.0,Daphne Xiaoru Jia,[],0.0,,0.6,,,0.0,,
99996,,Acting,,2618072.0,Emmanuel Camacho,[],0.0,,0.6,,,0.0,,
99997,,Directing,,2618073.0,Matthew Boda,[],0.0,,0.6,,,0.0,,
99998,,Writing,,2618074.0,Alessandro Bilotta,[],0.0,,0.6,,,0.0,,
99999,,Directing,,2618075.0,Jelena Jovčić,[],0.0,,0.6,,,0.0,,


In [142]:
df_100k

Unnamed: 0,birthday,known_for_department,deathday,id,name,also_known_as,gender,biography,popularity,place_of_birth,profile_path,adult,imdb_id,homepage
0,1944-05-14,Directing,,1.0,George Lucas,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...",2.0,"George Walton Lucas Jr. (born May 14, 1944) is...",6.761,"Modesto, California, USA",/mDLDvsx8PaZoEThkBdyaG1JxPdf.jpg,0.0,nm0000184,
1,1951-09-25,Acting,,2.0,Mark Hamill,"[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...",2.0,"Mark Richard Hamill (born September 25, 1951) ...",10.497,"Concord, California, USA",/fk8OfdReNltKZqOk2TZgkofCUFq.jpg,0.0,nm0000434,
2,1942-07-13,Acting,,3.0,Harrison Ford,"[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...",2.0,Legendary Hollywood Icon Harrison Ford was bor...,20.349,"Chicago, Illinois, USA",/5M7oN3sznp99hWYQ9sX0xheswWX.jpg,0.0,nm0000148,
3,1956-10-21,Acting,2016-12-27,4.0,Carrie Fisher,"[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",1.0,Carrie Frances Fisher (21 October 1956 - 27 De...,7.228,"Beverly Hills, Los Angeles, California, USA",/rfJtncHewKVnHjqpIZvjn24ESeC.jpg,0.0,nm0000402,https://carriefisher.com/
4,1913-05-26,Acting,1994-08-11,5.0,Peter Cushing,[Peter Wilton Cushing],2.0,"Peter Wilton Cushing, OBE (26 May 1913 – 11 A...",3.902,"Kenley, Surrey, England, UK",/1qtKVu16REL2YLVrhayjVey4al.jpg,0.0,nm0001088,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,,Acting,,99996.0,Scooter McCrae,[],0.0,,0.631,,,0.0,nm0566885,
99996,,Directing,,99997.0,Eric Thornett,[],0.0,,0.600,,,0.0,nm1204419,
99997,,Acting,,99998.0,Jason Wauer,[],0.0,,0.600,,,0.0,nm1312267,
99998,,Acting,,99999.0,Mr. ?,[],0.0,,0.600,,,0.0,nm2425757,


In [139]:
# Deliberate stop: raises so that execution does not continue into the cells below.
# NOTE(review): presumably a guard against running the scratch cells that follow
# during a top-to-bottom run - confirm intent.
raise SystemExit("Stop right there!")

SystemExit: Stop right there!

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# last_person = 2618075

def get_people(start=11, stop=20):
    """Fetch TMDb person objects for a range of ids and return them as a list.

    Requests ``base_url + 'person/<id>'`` for every id in ``range(start, stop)``
    and keeps the decoded JSON body of each response whose status code is 200.
    Any other status is skipped without retrying, so the returned list may be
    shorter than the id range and carries no placeholder for missing ids.

    NOTE(review): this redefines ``get_people`` and shadows the earlier
    ``get_people(num)`` defined near the top of the notebook; once this cell has
    run, calls such as ``get_people(num=...)`` raise ``TypeError``.

    Parameters
    ----------
    start : int, optional
        First person id to request (inclusive). Defaults to 11.
    stop : int, optional
        End of the id range (exclusive). Defaults to 20.

    Returns
    -------
    list
        Decoded JSON bodies, one per successful (HTTP 200) response, in id order.
    """
    people_list = []
    for val in range(start, stop):
        # Show the id currently being fetched; wait=True defers the clear until
        # the next output arrives, so the counter is replaced rather than stacked.
        print(val)
        clear_output(wait=True)
        url = base_url + 'person/' + str(val) + '?api_key=' + my_key
        response = requests.get(url)
        # Anything other than success is dropped silently (no retry, no backoff).
        if response.status_code != 200:
            continue
        people_list.append(response.json())
    return people_list

In [None]:
#pd.DataFrame(get_people()).to_json('tmdb_person_objects.json')