# Early Data Exploration and API Queries of TMDB.org

In [None]:
import pandas as pd
import numpy as np

import os
import configparser
import requests

import datetime
import time
import json

In [2]:
config = configparser.ConfigParser()
config.read('my_config.cfg', encoding='utf-8-sig')

os.environ['TMDB_API_KEY']=config['TMDB CREDS']['TMDB_API_KEY']

In [314]:
# Set Pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
# pd.set_option('display.precision', 3)

## 1) Downloaded csv Files

In [4]:
links = pd.read_csv('datasets/links.csv')
print(links.shape)
print(links['imdbId'].min())
print(links['imdbId'].max())
links.head()

(58098, 3)
1
9038520


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int64  
 1   imdbId   58098 non-null  int64  
 2   tmdbId   57917 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.3 MB


In [6]:
# Need to drop the tmdbId=NaN rows since we'll be querying TMDB by this id 

links.dropna(axis=0, how='any', inplace=True)
links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57917 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  57917 non-null  int64  
 1   imdbId   57917 non-null  int64  
 2   tmdbId   57917 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.8 MB


In [7]:
# Can also drop the imdbId column; will be picking it up below in the proper format
links = links.drop(columns='imdbId')
links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57917 entries, 0 to 58097
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  57917 non-null  int64  
 1   tmdbId   57917 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.3 MB


In [8]:
movies = pd.read_csv('datasets/movies.csv')
print(movies.shape)
movies.head()

(58098, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Add the links to the movies dataframe

movies = movies.merge(links, how='right', on='movieId')
print(movies.shape)
movies.head()

(57917, 4)


Unnamed: 0,movieId,title,genres,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357.0
4,5,Father of the Bride Part II (1995),Comedy,11862.0


In [10]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57917 entries, 0 to 57916
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  57917 non-null  int64  
 1   title    57917 non-null  object 
 2   genres   57917 non-null  object 
 3   tmdbId   57917 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 2.2+ MB


In [196]:
ratings = pd.read_csv('datasets/ratings.csv')
print(ratings.shape)
ratings = ratings[~ratings.isna().any(axis=1)]  # Drop any NaN rows
print(ratings.shape)
ratings.head()

(27753444, 4)
(27753444, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [197]:
tags = pd.read_csv('datasets/tags.csv')
print(tags.shape)
tags = tags[~tags.isna().any(axis=1)]  # Drop any NaN rows
tags['tag'] = tags['tag'].str.lower()
print(tags.shape)
tags.head()

(1108997, 4)
(1108981, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


In [198]:
# How many movies are in the ratings & tags?

print('{} out of the total {} movies are covered by the {} ratings.'
      .format(ratings['movieId'].unique().shape[0],
              movies['movieId'].unique().shape[0],
              ratings.shape[0]
             ))

print('{} out of the total {} movies are covered by the {} tags.'
      .format(tags['movieId'].unique().shape[0],
              movies['movieId'].unique().shape[0],
              tags.shape[0]
             ))

53889 out of the total 57160 movies are covered by the 27753444 ratings.
45981 out of the total 57160 movies are covered by the 1108981 tags.


## 2) Now, we should add some things to movies via API query to TMDB (Maybe Director, Box Office, Star(s)?, Release Date, etc.)

In [48]:
## Query TMDB for cast/crew of every movie

responses = []
i = 0
total_movies = 0
full_count = len(movies['tmdbId'].unique())

start = time.time()

for movie in movies['tmdbId'].unique():
    response = requests.get('https://api.themoviedb.org/3/movie/' + str(int(movie)) + '/credits?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    responses.append(response.json())
    i += 1
    if i == 1000:
        with open('credits.json', 'a') as outfile:
            json.dump(responses, outfile, indent=2)
        responses = []
        total_movies += i
        i = 0
        so_far = time.time()
        print('{} minutes so far ({} total movies out of {} queried: ~{}% done).'
              .format(
                  round((so_far - start)/60, 1), 
                  total_movies, 
                  full_count, 
                  round(100*total_movies/full_count, 1)
              )
             )

with open('credits.json', 'a') as outfile:
    json.dump(responses, outfile, indent=2)

end = time.time()
print(end - start)

1.5 minutes so far (1000 total movies out of 57877 queried: ~1.7% done).
4.6 minutes so far (2000 total movies out of 57877 queried: ~3.5% done).
7.6 minutes so far (3000 total movies out of 57877 queried: ~5.2% done).
10.6 minutes so far (4000 total movies out of 57877 queried: ~6.9% done).
13.6 minutes so far (5000 total movies out of 57877 queried: ~8.6% done).
16.6 minutes so far (6000 total movies out of 57877 queried: ~10.4% done).
19.6 minutes so far (7000 total movies out of 57877 queried: ~12.1% done).
22.6 minutes so far (8000 total movies out of 57877 queried: ~13.8% done).
25.7 minutes so far (9000 total movies out of 57877 queried: ~15.6% done).
28.8 minutes so far (10000 total movies out of 57877 queried: ~17.3% done).
31.8 minutes so far (11000 total movies out of 57877 queried: ~19.0% done).
34.9 minutes so far (12000 total movies out of 57877 queried: ~20.7% done).
37.9 minutes so far (13000 total movies out of 57877 queried: ~22.5% done).
41.0 minutes so far (14000 to

In [27]:
# Note: the file as created above (by appending every 1000 movies to the file) has the unintended
#       effect of closing the list and starting a new list (i.e. putting '][' into the file where ','
#       would have been appropriate; so, you must find and replace on those patterns:
#       i.e., replace '][' with ',' using SublimeText)

temp = pd.read_json('credits.json')
temp.shape

(57877, 6)

In [28]:
temp.head()

Unnamed: 0,id,cast,crew,success,status_code,status_message
0,862.0,"[{'adult': False, 'gender': 2, 'id': 31, 'know...","[{'adult': False, 'gender': 2, 'id': 7, 'known...",,,
1,8844.0,"[{'adult': False, 'gender': 2, 'id': 2157, 'kn...","[{'adult': False, 'gender': 2, 'id': 511, 'kno...",,,
2,15602.0,"[{'adult': False, 'gender': 2, 'id': 6837, 'kn...","[{'adult': False, 'gender': 2, 'id': 3117, 'kn...",,,
3,31357.0,"[{'adult': False, 'gender': 1, 'id': 8851, 'kn...","[{'adult': False, 'gender': 2, 'id': 2178, 'kn...",,,
4,11862.0,"[{'adult': False, 'gender': 2, 'id': 67773, 'k...","[{'adult': False, 'gender': 2, 'id': 37, 'know...",,,


In [29]:
failed_queries = np.setdiff1d(movies['tmdbId'].unique(), temp['id'].unique()[~(np.isnan(temp['id'].unique()))])
failed_queries.shape

(717,)

In [30]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57877 entries, 0 to 57876
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              57160 non-null  float64
 1   cast            57160 non-null  object 
 2   crew            57160 non-null  object 
 3   success         717 non-null    float64
 4   status_code     717 non-null    float64
 5   status_message  717 non-null    object 
dtypes: float64(3), object(3)
memory usage: 2.6+ MB


In [401]:
# Run query again to get the failed queries (hopefully they all work this time)

responses = []
i = 0

start = time.time()

for movie in failed_queries:
    response = requests.get('https://api.themoviedb.org/3/movie/' + str(int(movie)) + '/credits?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    responses.append(response.json())

with open('credits2.json', 'w') as outfile:
    json.dump(responses, outfile, indent=2)

end = time.time()
print(end - start)

123.21629786491394


In [31]:
temp2 = pd.read_json('credits2.json')
print(temp2.shape)
temp2.info()

(717, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 717 entries, 0 to 716
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   success         717 non-null    bool  
 1   status_code     717 non-null    int64 
 2   status_message  717 non-null    object
dtypes: bool(1), int64(1), object(1)
memory usage: 12.0+ KB


In [32]:
## These all seem to be somehow missing from TMDB (i.e. no query response), so we'll have to delete these 717
#   movies from our movies dataset...
print(temp2.columns)
temp2.describe()

Index(['success', 'status_code', 'status_message'], dtype='object')


Unnamed: 0,status_code
count,717.0
mean,34.0
std,0.0
min,34.0
25%,34.0
50%,34.0
75%,34.0
max,34.0


In [33]:
print(temp.shape)
temp = temp[temp['success'].isnull()].drop(columns=['success', 'status_code', 'status_message'])
temp.reset_index(drop=True, inplace=True)
print(temp.shape)
temp.info()

(57877, 6)
(57160, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57160 entries, 0 to 57159
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      57160 non-null  float64
 1   cast    57160 non-null  object 
 2   crew    57160 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.3+ MB


In [34]:
movies.columns

Index(['movieId', 'title', 'genres', 'tmdbId'], dtype='object')

In [35]:
# Retain only the movies in movies dataset that are contained in temp

print(movies.shape)
movies = movies[movies['tmdbId'].isin(temp['id'])]
movies.reset_index(drop=True, inplace=True)
print(movies.shape)

(57917, 4)
(57199, 4)


In [36]:
# The movies dataset has duplicates! Drop them

print(movies[movies.duplicated(subset=['tmdbId'])].shape)
movies = movies.drop_duplicates(subset=['tmdbId'], keep='first')
movies.reset_index(drop=True, inplace=True)
print(movies.shape)

(39, 4)
(57160, 4)


In [37]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57160 entries, 0 to 57159
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  57160 non-null  int64  
 1   title    57160 non-null  object 
 2   genres   57160 non-null  object 
 3   tmdbId   57160 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.7+ MB


In [38]:
movies.head()

Unnamed: 0,movieId,title,genres,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357.0
4,5,Father of the Bride Part II (1995),Comedy,11862.0


In [39]:
print(temp.shape)
temp.columns

(57160, 3)


Index(['id', 'cast', 'crew'], dtype='object')

In [40]:
# from pandas.io.json import json_normalize

cast = temp['cast'].values.tolist()
len(cast)

57160

In [41]:
crew = temp['crew'].values.tolist()
len(crew)

57160

In [42]:
# Create top-billed actor id columns (5; NaN where needed if not enough actors)

temp['actor1_id'] = np.nan # column 3
temp['actor2_id'] = np.nan
temp['actor3_id'] = np.nan
temp['actor4_id'] = np.nan
temp['actor5_id'] = np.nan
for row in range(temp.shape[0]):
    for i, entry in enumerate(cast[row][0:5]):
        temp.iloc[row, i+3] = entry['id']
temp = temp.drop(columns=['cast'])

In [43]:
temp.head()

Unnamed: 0,id,crew,actor1_id,actor2_id,actor3_id,actor4_id,actor5_id
0,862.0,"[{'adult': False, 'gender': 2, 'id': 7, 'known...",31.0,12898.0,7167.0,12899.0,12900.0
1,8844.0,"[{'adult': False, 'gender': 2, 'id': 511, 'kno...",2157.0,205.0,145151.0,5149.0,8537.0
2,15602.0,"[{'adult': False, 'gender': 2, 'id': 3117, 'kn...",6837.0,3151.0,13567.0,16757.0,589.0
3,31357.0,"[{'adult': False, 'gender': 2, 'id': 2178, 'kn...",8851.0,9780.0,18284.0,51359.0,66804.0
4,11862.0,"[{'adult': False, 'gender': 2, 'id': 37, 'know...",67773.0,3092.0,519.0,70696.0,59222.0


In [44]:
# Create directors and writers id columns (5 for each job; NaN where needed if not enough)
#     Note: a few hundred movies have > 5 writers and/or directors; 
#           we'll just include the first 5 of either/or/both billed

temp['director1_id'] = np.nan # column 7
temp['director2_id'] = np.nan
temp['director3_id'] = np.nan
temp['director4_id'] = np.nan
temp['director5_id'] = np.nan # column 11
temp['writer1_id'] = np.nan   # column 12
temp['writer2_id'] = np.nan
temp['writer3_id'] = np.nan
temp['writer4_id'] = np.nan
temp['writer5_id'] = np.nan   #column 16

for row in range(temp.shape[0]):
    d = w = 0
    for i, entry in enumerate(crew[row]):
        if entry['job'] == 'Director':
            if d < 5:
                temp.iloc[row, d+7] = entry['id']
                d += 1
        elif (entry['job'] == 'Screenplay') | (entry['job'] == 'Writer'):
            if w < 5:
                temp.iloc[row, w+12] = entry['id']
                w += 1
temp = temp.drop(columns=['crew'])

In [45]:
temp.head()

Unnamed: 0,id,actor1_id,actor2_id,actor3_id,actor4_id,actor5_id,director1_id,director2_id,director3_id,director4_id,director5_id,writer1_id,writer2_id,writer3_id,writer4_id,writer5_id
0,862.0,31.0,12898.0,7167.0,12899.0,12900.0,7879.0,,,,,7.0,12891.0,12892.0,12893.0,
1,8844.0,2157.0,205.0,145151.0,5149.0,8537.0,4945.0,,,,,876.0,56520.0,56521.0,,
2,15602.0,6837.0,3151.0,13567.0,16757.0,589.0,26502.0,,,,,16837.0,,,,
3,31357.0,8851.0,9780.0,18284.0,51359.0,66804.0,2178.0,,,,,5144.0,111118.0,,,
4,11862.0,67773.0,3092.0,519.0,70696.0,59222.0,56106.0,,,,,17698.0,26160.0,,,


In [479]:
# temp.drop(columns=['director1_id','director2_id','director3_id','director4_id','director5_id','writer1_id','writer2_id','writer3_id','writer4_id','writer5_id'],
#          inplace=True)
# temp.head()

In [46]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57160 entries, 0 to 57159
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            57160 non-null  float64
 1   actor1_id     55082 non-null  float64
 2   actor2_id     53460 non-null  float64
 3   actor3_id     52462 non-null  float64
 4   actor4_id     51496 non-null  float64
 5   actor5_id     49761 non-null  float64
 6   director1_id  56705 non-null  float64
 7   director2_id  4007 non-null   float64
 8   director3_id  543 non-null    float64
 9   director4_id  240 non-null    float64
 10  director5_id  164 non-null    float64
 11  writer1_id    48257 non-null  float64
 12  writer2_id    20663 non-null  float64
 13  writer3_id    6130 non-null   float64
 14  writer4_id    1838 non-null   float64
 15  writer5_id    586 non-null    float64
dtypes: float64(16)
memory usage: 7.0 MB


In [47]:
## These NaNs seem to check out (some TMDB movies only have a title and lack cast and/or crew info)

temp[temp.isna().any(axis=1)]

Unnamed: 0,id,actor1_id,actor2_id,actor3_id,actor4_id,actor5_id,director1_id,director2_id,director3_id,director4_id,director5_id,writer1_id,writer2_id,writer3_id,writer4_id,writer5_id
0,862.0,31.0,12898.0,7167.0,12899.0,12900.0,7879.0,,,,,7.0,12891.0,12892.0,12893.0,
1,8844.0,2157.0,205.0,145151.0,5149.0,8537.0,4945.0,,,,,876.0,56520.0,56521.0,,
2,15602.0,6837.0,3151.0,13567.0,16757.0,589.0,26502.0,,,,,16837.0,,,,
3,31357.0,8851.0,9780.0,18284.0,51359.0,66804.0,2178.0,,,,,5144.0,111118.0,,,
4,11862.0,67773.0,3092.0,519.0,70696.0,59222.0,56106.0,,,,,17698.0,26160.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57155,78251.0,543944.0,86851.0,240556.0,936092.0,1824768.0,583449.0,,,,,583449.0,,,,
57156,87558.0,54228.0,39195.0,143389.0,17500.0,20235.0,935240.0,,,,,1616581.0,,,,
57157,422666.0,80997.0,1190712.0,584045.0,1699309.0,579742.0,109055.0,,,,,1699310.0,,,,
57158,454439.0,1803735.0,112690.0,1803733.0,1079959.0,1803738.0,1803740.0,,,,,1803740.0,,,,


In [48]:
print(temp.shape)
print(movies.shape)

(57160, 16)
(57160, 4)


In [49]:
## Add the info in temp to the movies dataframe

movies = movies.merge(temp, how='left', left_on='tmdbId', right_on='id').drop(columns=['id'])

In [50]:
print(movies.shape)
movies.head()

(57160, 19)


Unnamed: 0,movieId,title,genres,tmdbId,actor1_id,actor2_id,actor3_id,actor4_id,actor5_id,director1_id,director2_id,director3_id,director4_id,director5_id,writer1_id,writer2_id,writer3_id,writer4_id,writer5_id
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0,31.0,12898.0,7167.0,12899.0,12900.0,7879.0,,,,,7.0,12891.0,12892.0,12893.0,
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844.0,2157.0,205.0,145151.0,5149.0,8537.0,4945.0,,,,,876.0,56520.0,56521.0,,
2,3,Grumpier Old Men (1995),Comedy|Romance,15602.0,6837.0,3151.0,13567.0,16757.0,589.0,26502.0,,,,,16837.0,,,,
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357.0,8851.0,9780.0,18284.0,51359.0,66804.0,2178.0,,,,,5144.0,111118.0,,,
4,5,Father of the Bride Part II (1995),Comedy,11862.0,67773.0,3092.0,519.0,70696.0,59222.0,56106.0,,,,,17698.0,26160.0,,,


In [51]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57160 entries, 0 to 57159
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       57160 non-null  int64  
 1   title         57160 non-null  object 
 2   genres        57160 non-null  object 
 3   tmdbId        57160 non-null  float64
 4   actor1_id     55082 non-null  float64
 5   actor2_id     53460 non-null  float64
 6   actor3_id     52462 non-null  float64
 7   actor4_id     51496 non-null  float64
 8   actor5_id     49761 non-null  float64
 9   director1_id  56705 non-null  float64
 10  director2_id  4007 non-null   float64
 11  director3_id  543 non-null    float64
 12  director4_id  240 non-null    float64
 13  director5_id  164 non-null    float64
 14  writer1_id    48257 non-null  float64
 15  writer2_id    20663 non-null  float64
 16  writer3_id    6130 non-null   float64
 17  writer4_id    1838 non-null   float64
 18  writer5_id    586 non-null

### Add these columns (obtained from a "movie" query): 
####    'release_date', 'runtime', 'vote_average', 'vote_count'

In [181]:
## Query TMDB for the above details of every movie

responses = []
i = 0
total_movies = 0
full_count = len(movies['tmdbId'].unique())

start = time.time()

for movie in movies['tmdbId'].unique():
    response = requests.get('https://api.themoviedb.org/3/movie/' + str(int(movie)) + '?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    responses.append(response.json())
    i += 1
    if i == 1000:
        with open('details.json', 'a') as outfile:
            json.dump(responses, outfile, indent=2)
        responses = []
        total_movies += i
        i = 0
        so_far = time.time()
        print('{} minutes so far ({} total movies out of {} queried: ~{}% done).'
              .format(
                  round((so_far - start)/60, 1), 
                  total_movies, 
                  full_count, 
                  round(100*total_movies/full_count, 1)
              )
             )

with open('details.json', 'a') as outfile:
    json.dump(responses, outfile, indent=2)

end = time.time()
print(str(round((end - start)/60, 1)) + ' minutes total run time.')

3.1 minutes so far (1000 total movies out of 57160 queried: ~1.7% done).
5.9 minutes so far (2000 total movies out of 57160 queried: ~3.5% done).
9.0 minutes so far (3000 total movies out of 57160 queried: ~5.2% done).
12.0 minutes so far (4000 total movies out of 57160 queried: ~7.0% done).
15.0 minutes so far (5000 total movies out of 57160 queried: ~8.7% done).
18.0 minutes so far (6000 total movies out of 57160 queried: ~10.5% done).
21.1 minutes so far (7000 total movies out of 57160 queried: ~12.2% done).
24.1 minutes so far (8000 total movies out of 57160 queried: ~14.0% done).
27.2 minutes so far (9000 total movies out of 57160 queried: ~15.7% done).
30.2 minutes so far (10000 total movies out of 57160 queried: ~17.5% done).
33.2 minutes so far (11000 total movies out of 57160 queried: ~19.2% done).
36.2 minutes so far (12000 total movies out of 57160 queried: ~21.0% done).
39.2 minutes so far (13000 total movies out of 57160 queried: ~22.7% done).
42.3 minutes so far (14000 to

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [182]:
print(str(round((end - start)/60, 1)) + ' minutes total run time.')

178.8 minutes total run time.


In [52]:
# Note: the file as created above (by appending every 1000 movies to the file) has the unintended
#       effect of closing the list and starting a new list (i.e. putting '][' into the file where ','
#       would have been appropriate; so, you must find and replace on those patterns:
#       i.e., replace '][' with ',' using SublimeText)

temp = pd.read_json('details.json')
temp.shape

(57160, 25)

In [53]:
temp.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/XLwjO1NSCIaLznh58OQtmSFl0N.jpg,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",122.95,/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg,"[{'id': 3, 'logo_path': '/1TjvGVDMYsj6JBxOAkUH...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033,81.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Toy Story,False,7.9,14111
1,False,/6w31RRm2s2CK1r3xDLf12WgIaHa.jpg,"{'id': 495527, 'name': 'Jumanji Collection', '...",65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",http://www.sonypictures.com/movies/jumanji/,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,19.955,/vgpXmVaVyUL7GGiDeiK1mKEKzcX.jpg,"[{'id': 559, 'logo_path': '/eC0bWHVjnjUducyA6Y...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249,104.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Roll the dice and unleash the excitement!,Jumanji,False,7.2,8260
2,False,/nh9gYaXHTNT9yylX10L9aGqFehy.jpg,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,6.629,/1FSXpj5e8l4KH6nVFO5SPUeraOt.jpg,"[{'id': 19464, 'logo_path': None, 'name': 'Lan...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,251
3,False,/jZjoEKXMTDoZAGdkjhAdJaKtXSN.jpg,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",6.06,/4wjGMwPsdlvi025ZqR4rXnFDvBz.jpg,"[{'id': 25, 'logo_path': '/qZCc1lty5FzX30aOCVR...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156,127.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.3,95
4,False,/1XUPR3Ki1fvZDCtetcepMoz7oqu.jpg,"{'id': 96871, 'name': 'Father of the Bride (St...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.598,/rj4LBtwQ0uGrpBnCELr716Qo3mw.jpg,"[{'id': 9195, 'logo_path': '/ou5BUbtulr6tIt699...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-08,76594107,106.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,6.2,495


In [54]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57160 entries, 0 to 57159
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  57160 non-null  bool   
 1   backdrop_path          41470 non-null  object 
 2   belongs_to_collection  6020 non-null   object 
 3   budget                 57160 non-null  int64  
 4   genres                 57160 non-null  object 
 5   homepage               56526 non-null  object 
 6   id                     57160 non-null  int64  
 7   imdb_id                57160 non-null  object 
 8   original_language      57160 non-null  object 
 9   original_title         57160 non-null  object 
 10  overview               57160 non-null  object 
 11  popularity             57160 non-null  float64
 12  poster_path            54183 non-null  object 
 13  production_companies   57160 non-null  object 
 14  production_countries   57160 non-null  object 
 15  re

In [55]:
temp = temp.drop(columns=['adult', 'backdrop_path', 'belongs_to_collection', 'homepage',
                          'original_title', 'overview', 'poster_path', 'production_companies',
                          'production_countries', 'spoken_languages', 'status', 'tagline',
                          'title', 'video', 'genres', 'original_language', 'popularity'])

In [56]:
# Set budget and revenue values listed as zero to NaN

temp.loc[temp['budget'] == 0, 'budget'] = np.nan
temp.loc[temp['revenue'] == 0, 'revenue'] = np.nan

In [57]:
# Note that runtime is missing for a handful of movies, but that's okay

temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57160 entries, 0 to 57159
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   budget        11241 non-null  float64
 1   id            57160 non-null  int64  
 2   imdb_id       57160 non-null  object 
 3   release_date  57160 non-null  object 
 4   revenue       9766 non-null   float64
 5   runtime       57072 non-null  float64
 6   vote_average  57160 non-null  float64
 7   vote_count    57160 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 3.5+ MB


In [58]:
# Yikes! Budget and/or revenue are missing for a lot of movies! Better just drop these

temp = temp.drop(columns=['budget', 'revenue'])
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57160 entries, 0 to 57159
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            57160 non-null  int64  
 1   imdb_id       57160 non-null  object 
 2   release_date  57160 non-null  object 
 3   runtime       57072 non-null  float64
 4   vote_average  57160 non-null  float64
 5   vote_count    57160 non-null  int64  
dtypes: float64(2), int64(2), object(2)
memory usage: 2.6+ MB


In [59]:
temp.head()

Unnamed: 0,id,imdb_id,release_date,runtime,vote_average,vote_count
0,862,tt0114709,1995-10-30,81.0,7.9,14111
1,8844,tt0113497,1995-12-15,104.0,7.2,8260
2,15602,tt0113228,1995-12-22,101.0,6.5,251
3,31357,tt0114885,1995-12-22,127.0,6.3,95
4,11862,tt0113041,1995-12-08,106.0,6.2,495


In [60]:
# Join the temp data with the movies dataframe

movies = movies.merge(temp, how='left', left_on='tmdbId', right_on='id')
movies = movies.drop(columns='id')

In [61]:
movies.rename(columns={'vote_average': 'tmdb_rating_avg', 'vote_count': 'tmdb_votes_tot'}, inplace=True)
movies.head()

Unnamed: 0,movieId,title,genres,tmdbId,actor1_id,actor2_id,actor3_id,actor4_id,actor5_id,director1_id,director2_id,director3_id,director4_id,director5_id,writer1_id,writer2_id,writer3_id,writer4_id,writer5_id,imdb_id,release_date,runtime,tmdb_rating_avg,tmdb_votes_tot
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0,31.0,12898.0,7167.0,12899.0,12900.0,7879.0,,,,,7.0,12891.0,12892.0,12893.0,,tt0114709,1995-10-30,81.0,7.9,14111
1,2,Jumanji (1995),Adventure|Children|Fantasy,8844.0,2157.0,205.0,145151.0,5149.0,8537.0,4945.0,,,,,876.0,56520.0,56521.0,,,tt0113497,1995-12-15,104.0,7.2,8260
2,3,Grumpier Old Men (1995),Comedy|Romance,15602.0,6837.0,3151.0,13567.0,16757.0,589.0,26502.0,,,,,16837.0,,,,,tt0113228,1995-12-22,101.0,6.5,251
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357.0,8851.0,9780.0,18284.0,51359.0,66804.0,2178.0,,,,,5144.0,111118.0,,,,tt0114885,1995-12-22,127.0,6.3,95
4,5,Father of the Bride Part II (1995),Comedy,11862.0,67773.0,3092.0,519.0,70696.0,59222.0,56106.0,,,,,17698.0,26160.0,,,,tt0113041,1995-12-08,106.0,6.2,495


In [138]:
movies['release_date'] = pd.to_datetime(movies['release_date'], yearfirst=True, , errors = 'coerce')

In [219]:
# Now rename and reorder columns to match the specified schema

movies.rename(columns={'movieId': 'movie_id', 'tmdbId': 'tmdb_id'}, inplace=True)
movies = movies[['movie_id', 'title', 'tmdb_id', 'imdb_id', 'genres', 'actor1_id', 'actor2_id', \
                 'actor3_id', 'actor4_id', 'actor5_id', 'director1_id', 'director2_id', \
                 'director3_id', 'director4_id', 'director5_id', 'writer1_id', 'writer2_id', \
                 'writer3_id', 'writer4_id', 'writer5_id', 'release_date', 'runtime', \
                 'tmdb_rating_avg', 'tmdb_votes_tot']]
movies.head()

Unnamed: 0,movie_id,title,tmdb_id,imdb_id,genres,actor1_id,actor2_id,actor3_id,actor4_id,actor5_id,director1_id,director2_id,director3_id,director4_id,director5_id,writer1_id,writer2_id,writer3_id,writer4_id,writer5_id,release_date,runtime,tmdb_rating_avg,tmdb_votes_tot
0,1,Toy Story (1995),862.0,tt0114709,Adventure|Animation|Children|Comedy|Fantasy,31.0,12898.0,7167.0,12899.0,12900.0,7879.0,,,,,7.0,12891.0,12892.0,12893.0,,1995-10-30,81.0,7.9,14111
1,2,Jumanji (1995),8844.0,tt0113497,Adventure|Children|Fantasy,2157.0,205.0,145151.0,5149.0,8537.0,4945.0,,,,,876.0,56520.0,56521.0,,,1995-12-15,104.0,7.2,8260
2,3,Grumpier Old Men (1995),15602.0,tt0113228,Comedy|Romance,6837.0,3151.0,13567.0,16757.0,589.0,26502.0,,,,,16837.0,,,,,1995-12-22,101.0,6.5,251
3,4,Waiting to Exhale (1995),31357.0,tt0114885,Comedy|Drama|Romance,8851.0,9780.0,18284.0,51359.0,66804.0,2178.0,,,,,5144.0,111118.0,,,,1995-12-22,127.0,6.3,95
4,5,Father of the Bride Part II (1995),11862.0,tt0113041,Comedy,67773.0,3092.0,519.0,70696.0,59222.0,56106.0,,,,,17698.0,26160.0,,,,1995-12-08,106.0,6.2,495


In [139]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57160 entries, 0 to 57159
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   movieId          57160 non-null  int64         
 1   title            57160 non-null  object        
 2   genres           57160 non-null  object        
 3   tmdbId           57160 non-null  float64       
 4   actor1_id        55082 non-null  float64       
 5   actor2_id        53460 non-null  float64       
 6   actor3_id        52462 non-null  float64       
 7   actor4_id        51496 non-null  float64       
 8   actor5_id        49761 non-null  float64       
 9   director1_id     56705 non-null  float64       
 10  director2_id     4007 non-null   float64       
 11  director3_id     543 non-null    float64       
 12  director4_id     240 non-null    float64       
 13  director5_id     164 non-null    float64       
 14  writer1_id       48257 non-null  float

### Now query TMDB for all actor/director/writer id's in the movies dataframe to create actor/director/writer dfs:

In [63]:
actors = movies[['actor1_id','actor2_id','actor3_id','actor4_id','actor5_id']].values.flatten()
print(actors.shape)
actors = actors[~np.isnan(actors)]
print(actors.shape)
actors = np.unique(actors)
print(actors.shape)

(285800,)
(262261,)
(93345,)


In [64]:
directors = movies[['director1_id','director2_id','director3_id','director4_id','director5_id']].values.flatten()
print(directors.shape)
directors = directors[~np.isnan(directors)]
print(directors.shape)
directors = np.unique(directors)
print(directors.shape)

(285800,)
(61659,)
(24223,)


In [65]:
writers = movies[['writer1_id','writer2_id','writer3_id','writer4_id','writer5_id']].values.flatten()
print(writers.shape)
writers = writers[~np.isnan(writers)]
print(writers.shape)
writers = np.unique(writers)
print(writers.shape)

(285800,)
(77474,)
(35585,)


In [547]:
# Start with all the actors

responses = []
i = 0
total = 0
full_count = len(actors)

start = time.time()

for actor in actors: 
    response = requests.get('https://api.themoviedb.org/3/person/' + str(int(actor)) +'?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    responses.append(response.json())
    i += 1
    if i == 1000:
        with open('actors.json', 'a') as outfile:
            json.dump(responses, outfile, indent=2)
        responses = []
        total += i
        i = 0
        so_far = time.time()
        print('{} minutes so far ({} total pepole out of {} queried: ~{}% done).'
              .format(
                  round((so_far - start)/60, 1), 
                  total, 
                  full_count, 
                  round(100*total/full_count, 1)
              )
             )

with open('actors.json', 'a') as outfile:
    json.dump(responses, outfile, indent=2)

end = time.time()
print(str(round((end - start)/60, 1)) + ' minutes total run time.')    

1.7 minutes so far (1000 total pepole out of 93345 queried: ~1.1% done).
4.8 minutes so far (2000 total pepole out of 93345 queried: ~2.1% done).
8.0 minutes so far (3000 total pepole out of 93345 queried: ~3.2% done).
11.4 minutes so far (4000 total pepole out of 93345 queried: ~4.3% done).
14.8 minutes so far (5000 total pepole out of 93345 queried: ~5.4% done).
18.1 minutes so far (6000 total pepole out of 93345 queried: ~6.4% done).
21.4 minutes so far (7000 total pepole out of 93345 queried: ~7.5% done).
24.5 minutes so far (8000 total pepole out of 93345 queried: ~8.6% done).
27.5 minutes so far (9000 total pepole out of 93345 queried: ~9.6% done).
30.7 minutes so far (10000 total pepole out of 93345 queried: ~10.7% done).
33.5 minutes so far (11000 total pepole out of 93345 queried: ~11.8% done).
36.5 minutes so far (12000 total pepole out of 93345 queried: ~12.9% done).
39.6 minutes so far (13000 total pepole out of 93345 queried: ~13.9% done).
42.6 minutes so far (14000 total 

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [549]:
# Picking up the actors (after the crash above) right after the first 89,000

responses = []
i = 0
total = 0
full_count = len(actors[89000:])

start = time.time()

for actor in actors[89000:]: 
    response = requests.get('https://api.themoviedb.org/3/person/' + str(int(actor)) +'?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    responses.append(response.json())
    i += 1
    if i == 1000:
        with open('actors.json', 'a') as outfile:
            json.dump(responses, outfile, indent=2)
        responses = []
        total += i
        i = 0
        so_far = time.time()
        print('{} minutes so far ({} total pepole out of {} queried: ~{}% done).'
              .format(
                  round((so_far - start)/60, 1), 
                  total, 
                  full_count, 
                  round(100*total/full_count, 1)
              )
             )

with open('actors.json', 'a') as outfile:
    json.dump(responses, outfile, indent=2)

end = time.time()
print(str(round((end - start)/60, 1)) + ' minutes total run time.')    

2.4 minutes so far (1000 total pepole out of 4345 queried: ~23.0% done).
6.7 minutes so far (2000 total pepole out of 4345 queried: ~46.0% done).
10.9 minutes so far (3000 total pepole out of 4345 queried: ~69.0% done).
14.8 minutes so far (4000 total pepole out of 4345 queried: ~92.1% done).
16.2 minutes total run time.


In [66]:
actors_df = pd.read_json('actors.json')
print(actors_df.shape)
actors_df.head()

(93345, 17)


Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path,success,status_code,status_message
0,0.0,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","George Walton Lucas Jr. (born May 14, 1944) is...",1944-05-14,,2.0,,1.0,nm0000184,Directing,George Lucas,"Modesto, California, USA",8.257,/WCSZzWdtPmdRxH9LUCVi2JPCSJ.jpg,,,
1,0.0,"[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","Mark Richard Hamill (born September 25, 1951) ...",1951-09-25,,2.0,,2.0,nm0000434,Acting,Mark Hamill,"Concord, California, USA",8.961,/zMQ93JTLW8KxusKhOlHFZhih3YQ.jpg,,,
2,0.0,"[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...",Legendary Hollywood Icon Harrison Ford was bor...,1942-07-13,,2.0,,3.0,nm0000148,Acting,Harrison Ford,"Chicago, Illinois, USA",8.641,/5M7oN3sznp99hWYQ9sX0xheswWX.jpg,,,
3,0.0,"[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",Carrie Frances Fisher (21 October 1956 - 27 De...,1956-10-21,2016-12-27,1.0,https://carriefisher.com/,4.0,nm0000402,Acting,Carrie Fisher,"Beverly Hills, Los Angeles, California, USA",3.679,/rfJtncHewKVnHjqpIZvjn24ESeC.jpg,,,
4,0.0,[Peter Wilton Cushing],"Peter Wilton Cushing, OBE (26 May 1913 – 11 A...",1913-05-26,1994-08-11,2.0,,5.0,nm0001088,Acting,Peter Cushing,"Kenley, Surrey, England, UK",5.95,/if5g03wn6uvHx7F6FxXHLebKc0q.jpg,,,


In [67]:
print(actors_df[actors_df['success'].notnull()].shape)
actors_df[actors_df['success'].notnull()]

(2, 17)


Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path,success,status_code,status_message
41672,,,,,,,,,,,,,,,0.0,34.0,The resource you requested could not be found.
57022,,,,,,,,,,,,,,,0.0,34.0,The resource you requested could not be found.


In [68]:
## What's going on with those two rows? Find out what the associated actor_ids are:

print(actors[41672])
print(actors[57022])

189129.0
1035856.0


In [69]:
# Let's try re-querying these two:

for actor in ['189129', '1035856']:
    response = requests.get('https://api.themoviedb.org/3/person/' + str(int(actor)) +'?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    print(response.json())

{'success': False, 'status_code': 34, 'status_message': 'The resource you requested could not be found.'}
{'success': False, 'status_code': 34, 'status_message': 'The resource you requested could not be found.'}


In [76]:
# Okay, looks like these two actors don't have any representation in the database
#    Let's delete them from the movies dataframe:

movies.where(movies == 189129).dropna(how='all').dropna(axis=1)

37806
55349


In [80]:
# This is an actor; looking this movie up on TMDB, we can substitute another actor in the cast (147518) for 189129

print(movies.iloc[37806])
movies.iloc[37806, 5] = 147518.0
print(movies.iloc[37806])

movieId                               149330
title              A Cosmic Christmas (1977)
genres                    (no genres listed)
tmdbId                              125464.0
actor1_id                          1081416.0
actor2_id                           189129.0
actor3_id                           209380.0
actor4_id                            94492.0
actor5_id                          1081420.0
director1_id                         68738.0
director2_id                             NaN
director3_id                             NaN
director4_id                             NaN
director5_id                             NaN
writer1_id                               NaN
writer2_id                               NaN
writer3_id                               NaN
writer4_id                               NaN
writer5_id                               NaN
imdb_id                            tt0182015
release_date                      1977-12-04
runtime                                 26.0
tmdb_ratin

In [78]:
# So this is a movieId, not an actor_id, so nothing to fix here

movies.iloc[55349]

movieId                            189129
title              What We Started (2018)
genres                        Documentary
tmdbId                           461118.0
actor1_id                       1569717.0
actor2_id                       1443606.0
actor3_id                         89056.0
actor4_id                         57108.0
actor5_id                        998387.0
director1_id                     968708.0
director2_id                    1830702.0
director3_id                          NaN
director4_id                          NaN
director5_id                          NaN
writer1_id                            NaN
writer2_id                            NaN
writer3_id                            NaN
writer4_id                            NaN
writer5_id                            NaN
imdb_id                         tt6191876
release_date                   2018-03-23
runtime                              70.0
tmdb_rating_avg                       7.2
tmdb_votes_tot                    

In [81]:
movies.where(movies == 1035856).dropna(how='all').dropna(axis=1)

Unnamed: 0,actor5_id
50604,1035856.0


In [85]:
# Looking this movie up on TMDB, we can substitute another actor in the cast (133113) for 1035856

print(movies.iloc[50604, 8])
movies.iloc[50604, 8] = 133113.0
print(movies.iloc[50604, 8])

1035856.0
133113.0


In [90]:
# Check to see if the 2 actors we just subsituted in are already in actors_df:

print(147518.0 in actors_df['id'])
print(133113.0 in actors_df['id'])

False
False


In [91]:
# Okay, so we need to query these two actors and add them to actors_df

for actor in ['147518', '133113']: 
    response = requests.get('https://api.themoviedb.org/3/person/' + str(int(actor)) +'?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    with open('actors.json', 'a') as outfile:
        json.dump(response.json(), outfile, indent=2)

In [92]:
actors_df = pd.read_json('actors.json')
print(actors_df.shape)
actors_df.head()

(93347, 17)


Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path,success,status_code,status_message
0,0.0,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","George Walton Lucas Jr. (born May 14, 1944) is...",1944-05-14,,2.0,,1.0,nm0000184,Directing,George Lucas,"Modesto, California, USA",8.257,/WCSZzWdtPmdRxH9LUCVi2JPCSJ.jpg,,,
1,0.0,"[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","Mark Richard Hamill (born September 25, 1951) ...",1951-09-25,,2.0,,2.0,nm0000434,Acting,Mark Hamill,"Concord, California, USA",8.961,/zMQ93JTLW8KxusKhOlHFZhih3YQ.jpg,,,
2,0.0,"[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...",Legendary Hollywood Icon Harrison Ford was bor...,1942-07-13,,2.0,,3.0,nm0000148,Acting,Harrison Ford,"Chicago, Illinois, USA",8.641,/5M7oN3sznp99hWYQ9sX0xheswWX.jpg,,,
3,0.0,"[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",Carrie Frances Fisher (21 October 1956 - 27 De...,1956-10-21,2016-12-27,1.0,https://carriefisher.com/,4.0,nm0000402,Acting,Carrie Fisher,"Beverly Hills, Los Angeles, California, USA",3.679,/rfJtncHewKVnHjqpIZvjn24ESeC.jpg,,,
4,0.0,[Peter Wilton Cushing],"Peter Wilton Cushing, OBE (26 May 1913 – 11 A...",1913-05-26,1994-08-11,2.0,,5.0,nm0001088,Acting,Peter Cushing,"Kenley, Surrey, England, UK",5.95,/if5g03wn6uvHx7F6FxXHLebKc0q.jpg,,,


In [95]:
actors_df.drop(columns=['success', 'status_code', 'status_message'], inplace=True)
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93347 entries, 0 to 93346
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 93345 non-null  float64
 1   also_known_as         93345 non-null  object 
 2   biography             93345 non-null  object 
 3   birthday              49720 non-null  object 
 4   deathday              14016 non-null  object 
 5   gender                93345 non-null  float64
 6   homepage              6407 non-null   object 
 7   id                    93345 non-null  float64
 8   imdb_id               81166 non-null  object 
 9   known_for_department  93345 non-null  object 
 10  name                  93345 non-null  object 
 11  place_of_birth        45133 non-null  object 
 12  popularity            93345 non-null  float64
 13  profile_path          53968 non-null  object 
dtypes: float64(4), object(10)
memory usage: 10.0+ MB


In [104]:
# Let's drop all but 'birthday', 'deathday', 'id', 'imdb_id', 'name'

actors_df.drop(columns=['adult', 'also_known_as', 'biography', 'gender', 'homepage',
                        'known_for_department', 'place_of_birth', 'popularity', 'profile_path'],
              inplace=True)
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93347 entries, 0 to 93346
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   birthday  49720 non-null  object 
 1   deathday  14016 non-null  object 
 2   id        93345 non-null  float64
 3   imdb_id   81166 non-null  object 
 4   name      93345 non-null  object 
dtypes: float64(1), object(4)
memory usage: 3.6+ MB


In [140]:
actors_df.head()

Unnamed: 0,birthday,deathday,id,imdb_id,name
0,1944-05-14,,1.0,nm0000184,George Lucas
1,1951-09-25,,2.0,nm0000434,Mark Hamill
2,1942-07-13,,3.0,nm0000148,Harrison Ford
3,1956-10-21,2016-12-27,4.0,nm0000402,Carrie Fisher
4,1913-05-26,1994-08-11,5.0,nm0001088,Peter Cushing


In [145]:
actors_df['birthday'] = pd.to_datetime(actors_df['birthday'], yearfirst=True, errors = 'coerce')
actors_df['deathday'] = pd.to_datetime(actors_df['deathday'], yearfirst=True, errors = 'coerce')
actors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93347 entries, 0 to 93346
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   birthday  49718 non-null  datetime64[ns]
 1   deathday  14013 non-null  datetime64[ns]
 2   id        93345 non-null  float64       
 3   imdb_id   81166 non-null  object        
 4   name      93345 non-null  object        
dtypes: datetime64[ns](2), float64(1), object(2)
memory usage: 3.6+ MB


In [214]:
# Finally, rename the column ids & reorder to match the defined schema

actors_df.rename(columns={'id': 'actor_id'}, inplace=True)
actors_df = actors_df[['actor_id', 'name', 'imdb_id', 'birthday', 'deathday']]
actors_df.head()

Unnamed: 0,actor_id,name,imdb_id,birthday,deathday
0,1.0,George Lucas,nm0000184,1944-05-14,NaT
1,2.0,Mark Hamill,nm0000434,1951-09-25,NaT
2,3.0,Harrison Ford,nm0000148,1942-07-13,NaT
3,4.0,Carrie Fisher,nm0000402,1956-10-21,2016-12-27
4,5.0,Peter Cushing,nm0001088,1913-05-26,1994-08-11


In [550]:
# Now all the directors

responses = []
i = 0
total = 0
full_count = len(directors)

start = time.time()

for director in directors: 
    response = requests.get('https://api.themoviedb.org/3/person/' + str(int(director)) +'?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    responses.append(response.json())
    i += 1
    if i == 1000:
        with open('directors.json', 'a') as outfile:
            json.dump(responses, outfile, indent=2)
        responses = []
        total += i
        i = 0
        so_far = time.time()
        print('{} minutes so far ({} total people out of {} queried: ~{}% done).'
              .format(
                  round((so_far - start)/60, 1), 
                  total, 
                  full_count, 
                  round(100*total/full_count, 1)
              )
             )

with open('directors.json', 'a') as outfile:
    json.dump(responses, outfile, indent=2)

end = time.time()
print(str(round((end - start)/60, 1)) + ' minutes total run time.')    

2.2 minutes so far (1000 total people out of 24223 queried: ~4.1% done).
4.7 minutes so far (2000 total people out of 24223 queried: ~8.3% done).
7.4 minutes so far (3000 total people out of 24223 queried: ~12.4% done).
10.1 minutes so far (4000 total people out of 24223 queried: ~16.5% done).
12.8 minutes so far (5000 total people out of 24223 queried: ~20.6% done).
15.6 minutes so far (6000 total people out of 24223 queried: ~24.8% done).
18.3 minutes so far (7000 total people out of 24223 queried: ~28.9% done).
21.1 minutes so far (8000 total people out of 24223 queried: ~33.0% done).
23.9 minutes so far (9000 total people out of 24223 queried: ~37.2% done).
26.6 minutes so far (10000 total people out of 24223 queried: ~41.3% done).
29.4 minutes so far (11000 total people out of 24223 queried: ~45.4% done).
32.3 minutes so far (12000 total people out of 24223 queried: ~49.5% done).
35.1 minutes so far (13000 total people out of 24223 queried: ~53.7% done).
38.0 minutes so far (14000

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [551]:
# Picking up the directors (after the crash above) right after the first 20,000

responses = []
i = 0
total = 0
full_count = len(directors[20000:])

start = time.time()

for director in directors[20000:]: 
    response = requests.get('https://api.themoviedb.org/3/person/' + str(int(director)) +'?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    responses.append(response.json())
    i += 1
    if i == 1000:
        with open('directors.json', 'a') as outfile:
            json.dump(responses, outfile, indent=2)
        responses = []
        total += i
        i = 0
        so_far = time.time()
        print('{} minutes so far ({} total people out of {} queried: ~{}% done).'
              .format(
                  round((so_far - start)/60, 1), 
                  total, 
                  full_count, 
                  round(100*total/full_count, 1)
              )
             )

with open('directors.json', 'a') as outfile:
    json.dump(responses, outfile, indent=2)

end = time.time()
print(str(round((end - start)/60, 1)) + ' minutes total run time.')    

1.9 minutes so far (1000 total people out of 4223 queried: ~23.7% done).
5.0 minutes so far (2000 total people out of 4223 queried: ~47.4% done).
8.1 minutes so far (3000 total people out of 4223 queried: ~71.0% done).
11.2 minutes so far (4000 total people out of 4223 queried: ~94.7% done).
11.9 minutes total run time.


In [105]:
directors_df = pd.read_json('directors.json')
print(directors_df.shape)
directors_df.head()

(24223, 14)


Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,False,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","George Walton Lucas Jr. (born May 14, 1944) is...",1944-05-14,,2,,1,nm0000184,Directing,George Lucas,"Modesto, California, USA",8.257,/WCSZzWdtPmdRxH9LUCVi2JPCSJ.jpg
1,False,"[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","Mark Richard Hamill (born September 25, 1951) ...",1951-09-25,,2,,2,nm0000434,Acting,Mark Hamill,"Concord, California, USA",8.961,/zMQ93JTLW8KxusKhOlHFZhih3YQ.jpg
2,False,[Andrew A. Stanton],"Andrew Stanton (born December 3, 1965) is an A...",1965-12-03,,2,,7,nm0004056,Writing,Andrew Stanton,"Boston, Massachusetts, USA",5.12,/tRwWuo06aN9vuXAPaswMN42x2ii.jpg
3,False,[],Lee Unkrich is an American director and film e...,1967-08-08,,2,,8,nm0881279,Directing,Lee Unkrich,"Cleveland, Ohio, USA",1.4,/oeUkLlak2lqKRYZmJZZv5gI87Ok.jpg
4,False,[],Albert Lawrence Brooks (born Albert Lawrence E...,1947-07-22,,2,http://www.albertbrooks.com/,13,nm0000983,Acting,Albert Brooks,"Beverly Hills, California, USA",3.665,/8iDSGu5l93N7benjf6b3AysBore.jpg


In [107]:
print(len(directors))
directors_df.info()

24223
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24223 entries, 0 to 24222
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 24223 non-null  bool   
 1   also_known_as         24223 non-null  object 
 2   biography             24223 non-null  object 
 3   birthday              10473 non-null  object 
 4   deathday              2865 non-null   object 
 5   gender                24223 non-null  int64  
 6   homepage              1306 non-null   object 
 7   id                    24223 non-null  int64  
 8   imdb_id               21673 non-null  object 
 9   known_for_department  24223 non-null  object 
 10  name                  24223 non-null  object 
 11  place_of_birth        9890 non-null   object 
 12  popularity            24223 non-null  float64
 13  profile_path          8599 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(10)
memory usage: 2.4+ 

In [108]:
# Let's drop all but 'birthday', 'deathday', 'id', 'imdb_id', 'name'

directors_df.drop(columns=['adult', 'also_known_as', 'biography', 'gender', 'homepage',
                        'known_for_department', 'place_of_birth', 'popularity', 'profile_path'],
              inplace=True)
directors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24223 entries, 0 to 24222
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   birthday  10473 non-null  object
 1   deathday  2865 non-null   object
 2   id        24223 non-null  int64 
 3   imdb_id   21673 non-null  object
 4   name      24223 non-null  object
dtypes: int64(1), object(4)
memory usage: 946.3+ KB


In [148]:
directors_df.head()

Unnamed: 0,birthday,deathday,id,imdb_id,name
0,1944-05-14,,1,nm0000184,George Lucas
1,1951-09-25,,2,nm0000434,Mark Hamill
2,1965-12-03,,7,nm0004056,Andrew Stanton
3,1967-08-08,,8,nm0881279,Lee Unkrich
4,1947-07-22,,13,nm0000983,Albert Brooks


In [149]:
directors_df['birthday'] = pd.to_datetime(directors_df['birthday'], yearfirst=True, errors = 'coerce')
directors_df['deathday'] = pd.to_datetime(directors_df['deathday'], yearfirst=True, errors = 'coerce')
directors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24223 entries, 0 to 24222
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   birthday  10473 non-null  datetime64[ns]
 1   deathday  2863 non-null   datetime64[ns]
 2   id        24223 non-null  int64         
 3   imdb_id   21673 non-null  object        
 4   name      24223 non-null  object        
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 946.3+ KB


In [211]:
# Finally, rename the column ids & reorder to match the defined schema

directors_df.rename(columns={'id': 'director_id'}, inplace=True)
directors_df = directors_df[['director_id', 'name', 'imdb_id', 'birthday', 'deathday']]
directors_df.head()

Unnamed: 0,director_id,name,imdb_id,birthday,deathday
0,1,George Lucas,nm0000184,1944-05-14,NaT
1,2,Mark Hamill,nm0000434,1951-09-25,NaT
2,7,Andrew Stanton,nm0004056,1965-12-03,NaT
3,8,Lee Unkrich,nm0881279,1967-08-08,NaT
4,13,Albert Brooks,nm0000983,1947-07-22,NaT


In [552]:
# Finally, all the writers

responses = []
i = 0
total = 0
full_count = len(writers)

start = time.time()

for writer in writers: 
    response = requests.get('https://api.themoviedb.org/3/person/' + str(int(writer)) +'?api_key=' +  os.environ['TMDB_API_KEY'] + '&language=en-US')
    responses.append(response.json())
    i += 1
    if i == 1000:
        with open('writers.json', 'a') as outfile:
            json.dump(responses, outfile, indent=2)
        responses = []
        total += i
        i = 0
        so_far = time.time()
        print('{} minutes so far ({} total movies out of {} queried: ~{}% done).'
              .format(
                  round((so_far - start)/60, 1), 
                  total, 
                  full_count, 
                  round(100*total/full_count, 1)
              )
             )

with open('writers.json', 'a') as outfile:
    json.dump(responses, outfile, indent=2)

end = time.time()
print(str(round((end - start)/60, 1)) + ' minutes total run time.')    

3.0 minutes so far (1000 total movies out of 35585 queried: ~2.8% done).
6.4 minutes so far (2000 total movies out of 35585 queried: ~5.6% done).
9.3 minutes so far (3000 total movies out of 35585 queried: ~8.4% done).
12.0 minutes so far (4000 total movies out of 35585 queried: ~11.2% done).
14.7 minutes so far (5000 total movies out of 35585 queried: ~14.1% done).
17.8 minutes so far (6000 total movies out of 35585 queried: ~16.9% done).
20.8 minutes so far (7000 total movies out of 35585 queried: ~19.7% done).
23.8 minutes so far (8000 total movies out of 35585 queried: ~22.5% done).
26.8 minutes so far (9000 total movies out of 35585 queried: ~25.3% done).
29.6 minutes so far (10000 total movies out of 35585 queried: ~28.1% done).
32.8 minutes so far (11000 total movies out of 35585 queried: ~30.9% done).
35.7 minutes so far (12000 total movies out of 35585 queried: ~33.7% done).
38.7 minutes so far (13000 total movies out of 35585 queried: ~36.5% done).
41.7 minutes so far (14000 

In [116]:
writers_df = pd.read_json('writers.json')
print(writers_df.shape)
writers_df.head()

(35585, 17)


Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path,success,status_code,status_message
0,0.0,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","George Walton Lucas Jr. (born May 14, 1944) is...",1944-05-14,,2.0,,1.0,nm0000184,Directing,George Lucas,"Modesto, California, USA",7.677,/WCSZzWdtPmdRxH9LUCVi2JPCSJ.jpg,,,
1,0.0,"[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",Carrie Frances Fisher (21 October 1956 - 27 De...,1956-10-21,2016-12-27,1.0,https://carriefisher.com/,4.0,nm0000402,Acting,Carrie Fisher,"Beverly Hills, Los Angeles, California, USA",3.679,/rfJtncHewKVnHjqpIZvjn24ESeC.jpg,,,
2,0.0,[Andrew A. Stanton],"Andrew Stanton (born December 3, 1965) is an A...",1965-12-03,,2.0,,7.0,nm0004056,Writing,Andrew Stanton,"Boston, Massachusetts, USA",5.12,/tRwWuo06aN9vuXAPaswMN42x2ii.jpg,,,
3,0.0,[],"Robert ""Bob"" Peterson (born January 1961) is a...",1961-01-18,,2.0,,10.0,nm0677037,Acting,Bob Peterson,"Wooster, Ohio, USA",0.84,/1D5PtC97QwIks6xTjbJ1HNE8kbT.jpg,,,
4,0.0,"[Dave Reynolds, David F. Reynolds]",Dave began his writing career in 1993 as one o...,1966-08-10,,2.0,,11.0,nm0721675,Writing,David Reynolds,,1.176,/5iKtATPbLpv2lT7q9DPX2v2qPS1.jpg,,,


In [117]:
writers_df[writers_df['success'].notnull()]

Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path,success,status_code,status_message
23126,,,,,,,,,,,,,,,0.0,34.0,The resource you requested could not be found.


In [118]:
# This writer is the one that got back the "not in the database" response

writers[23126]

1128385.0

In [119]:
movies.where(movies == 1128385).dropna(how='all').dropna(axis=1)

Unnamed: 0,movieId,title,genres,tmdbId,actor1_id,actor2_id,actor3_id,actor4_id,actor5_id,director1_id,director2_id,director3_id,director4_id,director5_id,writer1_id,writer2_id,writer3_id,writer4_id,writer5_id,imdb_id,release_date,runtime,tmdb_rating_avg,tmdb_votes_tot


In [113]:
# Looking this movie up on TMDB shows only one writer, so let's just drop the second writer

print(movies.iloc[2748, 15])
movies.iloc[2748, 15] = np.nan
print(movies.iloc[2748, 15])

1128385.0
nan


In [120]:
writers_df.drop(columns=['success', 'status_code', 'status_message'], inplace=True)
writers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35585 entries, 0 to 35584
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   adult                 35584 non-null  float64
 1   also_known_as         35584 non-null  object 
 2   biography             35584 non-null  object 
 3   birthday              12313 non-null  object 
 4   deathday              4009 non-null   object 
 5   gender                35584 non-null  float64
 6   homepage              1439 non-null   object 
 7   id                    35584 non-null  float64
 8   imdb_id               30490 non-null  object 
 9   known_for_department  35584 non-null  object 
 10  name                  35584 non-null  object 
 11  place_of_birth        11167 non-null  object 
 12  popularity            35584 non-null  float64
 13  profile_path          9504 non-null   object 
dtypes: float64(4), object(10)
memory usage: 3.8+ MB


In [121]:
# Let's drop all but 'birthday', 'deathday', 'id', 'imdb_id', 'name'

writers_df.drop(columns=['adult', 'also_known_as', 'biography', 'gender', 'homepage',
                        'known_for_department', 'place_of_birth', 'popularity', 'profile_path'],
              inplace=True)
writers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35585 entries, 0 to 35584
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   birthday  12313 non-null  object 
 1   deathday  4009 non-null   object 
 2   id        35584 non-null  float64
 3   imdb_id   30490 non-null  object 
 4   name      35584 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.4+ MB


In [122]:
# Makes sense; this list includes the one writer we dropped from writers_df above

len(writers)

35585

In [151]:
writers_df.head()

Unnamed: 0,birthday,deathday,id,imdb_id,name
0,1944-05-14,,1.0,nm0000184,George Lucas
1,1956-10-21,2016-12-27,4.0,nm0000402,Carrie Fisher
2,1965-12-03,,7.0,nm0004056,Andrew Stanton
3,1961-01-18,,10.0,nm0677037,Bob Peterson
4,1966-08-10,,11.0,nm0721675,David Reynolds


In [152]:
writers_df['birthday'] = pd.to_datetime(writers_df['birthday'], yearfirst=True, errors = 'coerce')
writers_df['deathday'] = pd.to_datetime(writers_df['deathday'], yearfirst=True, errors = 'coerce')
writers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35585 entries, 0 to 35584
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   birthday  12311 non-null  datetime64[ns]
 1   deathday  4006 non-null   datetime64[ns]
 2   id        35584 non-null  float64       
 3   imdb_id   30490 non-null  object        
 4   name      35584 non-null  object        
dtypes: datetime64[ns](2), float64(1), object(2)
memory usage: 1.4+ MB


In [213]:
# Finally, rename the column ids & reorder to match the defined schema

writers_df.rename(columns={'id': 'writer_id'}, inplace=True)
writers_df = writers_df[['writer_id', 'name', 'imdb_id', 'birthday', 'deathday']]
writers_df.head()

Unnamed: 0,writer_id,name,imdb_id,birthday,deathday
0,1.0,George Lucas,nm0000184,1944-05-14,NaT
1,4.0,Carrie Fisher,nm0000402,1956-10-21,2016-12-27
2,7.0,Andrew Stanton,nm0004056,1965-12-03,NaT
3,10.0,Bob Peterson,nm0677037,1961-01-18,NaT
4,11.0,David Reynolds,nm0721675,1966-08-10,NaT


In [127]:
# We can concatenate these 3 dataframes into one called 'creatives_df' (not sure if we should, or keep separate)

print(np.setdiff1d(actors_df.columns, directors_df.columns))
print(np.setdiff1d(directors_df.columns, writers_df.columns))

[]
[]


## 3) Now for User Ratings & Tags (which will become the Fact Table)

In [199]:
# Only include movies that are still in the movies dataframe and drop any rows with NaNs

print(ratings.shape)
ratings = ratings[ratings['movieId'].isin(movies['movieId'].unique())]
print(ratings.shape)

(27753444, 4)
(27674802, 4)


In [200]:
# Do the same for the tags

print(tags.shape)
tags = tags[tags['movieId'].isin(movies['movieId'].unique())]
print(tags.shape)

(1108981, 4)
(1102147, 4)


In [201]:
user_actions = ratings.copy(deep=True)
user_actions['action'] = 'rate'
user_actions['action_val'] = user_actions['rating'].apply(str)
user_actions.drop(columns=['rating'], inplace=True)
user_actions = user_actions[['userId', 'timestamp', 'movieId', 'action', 'action_val']]

In [202]:
user_actions2 = tags.copy(deep=True)
user_actions2['action'] = 'tag'
user_actions2.rename(columns={'tag': 'action_val'}, inplace=True)
user_actions2 = user_actions2[['userId', 'timestamp', 'movieId', 'action', 'action_val']]

In [203]:
# Now concatenate the rating and tagging actions dataframes

print(user_actions.shape)
print(user_actions2.shape)
user_actions = pd.concat([user_actions, user_actions2], axis=0)
user_actions.reset_index(drop=True, inplace=True)
print(user_actions.shape)
user_actions.head()

(27674802, 5)
(1102147, 5)
(28776949, 5)


Unnamed: 0,userId,timestamp,movieId,action,action_val
0,1,1256677221,307,rate,3.5
1,1,1256677456,481,rate,3.5
2,1,1256677471,1091,rate,1.5
3,1,1256677460,1257,rate,4.5
4,1,1256677264,1449,rate,4.5


In [205]:
# Now convert timestamps to datetimes

user_actions.loc[:, 'timestamp'] = \
                user_actions.loc[:, 'timestamp'] \
                            .apply(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))

In [206]:
print(user_actions.info())
user_actions.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28776949 entries, 0 to 28776948
Data columns (total 5 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   userId      int64 
 1   timestamp   object
 2   movieId     int64 
 3   action      object
 4   action_val  object
dtypes: int64(2), object(3)
memory usage: 1.1+ GB
None


Unnamed: 0,userId,timestamp,movieId,action,action_val
0,1,2009-10-27 14:00:21,307,rate,3.5
1,1,2009-10-27 14:04:16,481,rate,3.5
2,1,2009-10-27 14:04:31,1091,rate,1.5
3,1,2009-10-27 14:04:20,1257,rate,4.5
4,1,2009-10-27 14:01:04,1449,rate,4.5


In [207]:
user_actions.tail()

Unnamed: 0,userId,timestamp,movieId,action,action_val
28776944,283206,2010-01-24 16:24:19,73017,tag,fun
28776945,283206,2010-01-24 16:24:18,73017,tag,homoerotic subtext
28776946,283206,2010-01-24 16:24:18,73017,tag,pacing
28776947,283206,2010-01-24 16:24:18,73017,tag,plot
28776948,283221,2007-01-09 04:47:10,49651,tag,sylvester stallone


In [209]:
# Finally, rename the column ids to match the defined schema

user_actions.rename(columns={'userId': 'user_id', 'timestamp': 'action_time', 'movieId': 'movie_id'}, inplace=True)
user_actions.head()

Unnamed: 0,user_id,action_time,movie_id,action,action_val
0,1,2009-10-27 14:00:21,307,rate,3.5
1,1,2009-10-27 14:04:16,481,rate,3.5
2,1,2009-10-27 14:04:31,1091,rate,1.5
3,1,2009-10-27 14:04:20,1257,rate,4.5
4,1,2009-10-27 14:01:04,1449,rate,4.5


## 4) Create dataframe for the "user" dimension table

In [332]:
users_df = pd.DataFrame(data={'user_id': np.sort(user_actions['user_id'].unique())})
print(users_df.shape)
users_df

(283212, 1)


Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5
...,...
283207,283224
283208,283225
283209,283226
283210,283227


In [333]:
user_actions['rated'] = user_actions['action'].apply(lambda x: 1 if (x == 'rate') else 0)
user_actions['tagged'] = user_actions['action'].apply(lambda x: 1 if (x == 'tag') else 0)
user_actions['rating_val'] = user_actions[['action','action_val']] \
                            .apply(lambda x: np.nan if (x['action'] == 'tag') else float(x['action_val']),
                                  axis=1)

user_actions['num_ratings'] = user_actions.groupby('user_id')['rated'].transform('sum')
user_actions['num_tags'] = user_actions.groupby('user_id')['tagged'].transform('sum')
user_actions['avg_rating'] = user_actions.groupby('user_id')['rating_val'].transform(np.mean)

temp = user_actions.groupby('user_id').first().reset_index()

users_df['num_ratings'] = temp['num_ratings']
users_df['num_tags'] = temp['num_tags']
users_df['avg_rating'] = temp['avg_rating']

user_actions.drop(columns=['rated', 'tagged', 'num_ratings', 'num_tags', 'rating_val', 'avg_rating'], 
                  inplace=True)

In [334]:
user_actions.head()

Unnamed: 0,user_id,action_time,movie_id,action,action_val
0,1,2009-10-27 14:00:21,307,rate,3.5
1,1,2009-10-27 14:04:16,481,rate,3.5
2,1,2009-10-27 14:04:31,1091,rate,1.5
3,1,2009-10-27 14:04:20,1257,rate,4.5
4,1,2009-10-27 14:01:04,1449,rate,4.5


In [335]:
users_df

Unnamed: 0,user_id,num_ratings,num_tags,avg_rating
0,1,16,0,3.312500
1,2,15,0,3.666667
2,3,11,0,3.545455
3,4,732,0,3.391393
4,5,72,0,4.263889
...,...,...,...,...
283207,283224,329,0,3.732523
283208,283225,20,0,3.100000
283209,283226,11,0,1.818182
283210,283227,17,0,3.941176


In [336]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283212 entries, 0 to 283211
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      283212 non-null  int64  
 1   num_ratings  283212 non-null  int64  
 2   num_tags     283212 non-null  int64  
 3   avg_rating   283211 non-null  float64
dtypes: float64(1), int64(3)
memory usage: 8.6 MB


In [337]:
users_df[users_df.isnull().any(axis=1)]

Unnamed: 0,user_id,num_ratings,num_tags,avg_rating
74222,74231,0,4,


## 5) Let's add 'user_rating_avg', 'user_votes_tot' & 'unique_user_tags' to movies dataframe

In [338]:
movies.columns

Index(['movie_id', 'title', 'tmdb_id', 'imdb_id', 'genres', 'actor1_id',
       'actor2_id', 'actor3_id', 'actor4_id', 'actor5_id', 'director1_id',
       'director2_id', 'director3_id', 'director4_id', 'director5_id',
       'writer1_id', 'writer2_id', 'writer3_id', 'writer4_id', 'writer5_id',
       'release_date', 'runtime', 'tmdb_rating_avg', 'tmdb_votes_tot'],
      dtype='object')

In [353]:
unique_users = user_actions['user_id'].unique()
full_movie_list = movies['movie_id'].unique()
user_rated_movies = user_actions.loc[user_actions['action'] == 'rate', 'movie_id'].unique()
user_tagged_movies = user_actions.loc[user_actions['action'] == 'tag', 'movie_id'].unique()

avg_rating_list = []
user_votes_tot_list = []
unique_user_tags_list = []

for movie in full_movie_list:
    if movie in user_rated_movies:
        avg_rating = np.mean(user_actions.loc[(user_actions['movie_id'] == movie) \
                                              & (user_actions['action'] == 'rate'), \
                                              'action_val'].astype(float))
        avg_rating_list.append(avg_rating)
        user_votes_tot = user_actions.loc[(user_actions['movie_id'] == movie) \
                                          & (user_actions['action'] == 'rate'), 'action_val'].count()
        user_votes_tot_list.append(user_votes_tot)
    else:
        avg_rating_list.append(np.nan)
        user_votes_tot_list.append(0)
    if movie in user_tagged_movies:
        tag_count = user_actions.loc[(user_actions['movie_id'] == movie) \
                                     & (user_actions['action'] == 'tag'), 'action_val'].unique().shape[0]
        unique_user_tags_list.append(tag_count)
    else:
        unique_user_tags_list.append(0)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [None]:
df = pd.DataFrame(columns={'movie_id': full_movie_list, 
                           'user_rating_avg': avg_rating_list,
                           'user_votes_tot': user_votes_tot_list,
                           'unique_user_tags': unique_user_tags_list})

In [None]:
df.head()

In [None]:
df.to_csv('temp_df.csv')