In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import glob

import os, time, json
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook
import tmdbsimple as tmdb

# API Credentials

In [2]:
# Read the TMDB credentials from ~/.secret/tmdb_api.json.
# Building the path from the current user's home directory (instead of a
# hard-coded C:\Users\<name> path) lets the notebook run on other machines
# and keeps the key file outside the repository.
with open(os.path.join(os.path.expanduser("~"), ".secret", "tmdb_api.json"), 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict (never the values, so the key is not leaked in output)
login.keys()

dict_keys(['api-key'])

In [3]:
# Hand the key loaded from the credentials file to tmdbsimple for the calls below
tmdb.API_KEY =  login['api-key']

# Querying Movies by ID

In [4]:
# Create a Movies object for numeric TMDB id 603 (The Matrix, per the info output below)
movie = tmdb.Movies(603)

In [5]:
# Movie objects have an .info() method; it returns the movie's details as a dictionary
info = movie.info()
# Display the full dictionary to see which fields are available
info

{'adult': False,
 'backdrop_path': '/y9wuhlrqSHvhTLNVNwKMKe6HZzY.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 66.758,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/tpFpsqbleCzEE2p5EgvUq6ozfCA.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 174,
   'logo_path': '/IuAlhI9eVC9Z8UQWOIDdWRKSEJ.png'

There is a lot of information here; however, the certification (MPAA rating) is missing.

In [6]:
# Budget field from the info dictionary
info['budget']

63000000

In [7]:
# Revenue field from the info dictionary
info['revenue']

463517383

## Searching with IMDB_ID

In [8]:
# The info dictionary also carries the IMDb identifier (a 'tt...' string)
info['imdb_id']

'tt0133093'

Try looking up a movie by its IMDb ID (the `tt...` string) instead of the numeric TMDB ID.

In [9]:
# tmdb.Movies is given an IMDb id string here rather than a numeric TMDB id
movie = tmdb.Movies('tt1361336')
# NOTE: this rebinds `movie` and `info`, replacing the values from the cells above
info = movie.info()
info['budget']

50000000

# Saving the movie Certification/MPAA Rating


In [10]:
# Request the release data; after this call the per-country entries are read
# from movie.countries.
response = movie.releases()
for c in movie.countries:
    # Skip every release entry that is not for the United States
    if c['iso_3166_1'] != 'US':
        continue
    print(c['certification'])

PG
PG
PG


In [11]:
def get_movie_with_rating(movie_id):
    """Return a movie's TMDB info dict with its US certification added.

    Parameters
    ----------
    movie_id : int or str
        Identifier passed straight to ``tmdb.Movies`` (the notebook uses both
        numeric TMDB ids and IMDb ``'tt...'`` strings).

    Returns
    -------
    dict
        The dictionary returned by ``movie.info()``. When the release data
        contains at least one entry with ``iso_3166_1 == 'US'``, a
        ``'certification'`` key is added; otherwise the key is absent.

    Notes
    -----
    A movie can have several US release entries. Previously the last entry
    always won, so a trailing entry with an empty certification could
    overwrite a real rating. Now the last *non-empty* US certification is
    used, falling back to the last US value only if all of them are empty.
    """
    ## Get movie and release dates
    movie = tmdb.Movies(movie_id)
    ## Construct output dict
    movie_info = movie.info()
    releases = movie.releases()
    # Collect every US certification in the order the API returned them
    us_certifications = [
        c['certification']
        for c in releases['countries']
        if c['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        # Prefer the last non-empty value; keep the old "last entry" result
        # when every US entry is empty.
        movie_info['certification'] = next(
            (cert for cert in reversed(us_certifications) if cert),
            us_certifications[-1],
        )
    return movie_info

In [12]:
# Try the helper on one IMDb id (The Avengers, per the output below) and inspect the result
test = get_movie_with_rating('tt0848228')
test

{'adult': False,
 'backdrop_path': '/nNmJRkg8wWnRmzQDe2FwKbPIsJV.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 300.587,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

# BEFORE THE LOOPS

In [13]:
# Folder (relative to the working directory) where the per-year result files are written
FOLDER = "Data/"
# Create the folder if it is missing; exist_ok=True makes re-running this cell safe
os.makedirs(FOLDER, exist_ok=True)
# Show which files are already present
os.listdir(FOLDER)

['final_tmdb_data_2001.csv.gz', 'tmdb_api_results_2001.json']

## Define the Years

In [14]:
# Years iterated by the outer loop below; each one gets its own JSON results file
YEARS_TO_GET = [2000,2001]

# OUTER LOOP

Set up a progress bar over the years so we can track progress and confirm that the calls are working.

In [15]:
# NOTE(review): only the two assignments below are inside this loop. Once it
# finishes, YEAR, JSON_FILE and file_exists hold the values for the LAST entry
# of YEARS_TO_GET, so the cells that follow process that single year only.
# To process every year, the following steps would need to live in this loop body.
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS'):
    #Defining the JSON file to store results for year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    # Check if file exists
    file_exists = os.path.isfile(JSON_FILE)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
# Create the results file only when it is not already on disk.
if not file_exists:
    # Seed the new file with a list holding one placeholder record
    # ({'imdb_id': 0}) so it can later be loaded and appended to.
    with open(JSON_FILE, 'w') as f:
        json.dump([{'imdb_id':0}], f)

In [17]:
# Load in the dataframe from project part 1 as basics:
# NOTE(review): this is an absolute path on one user's machine, so the cell will
# fail elsewhere — consider a configurable, relative data directory.
basics = pd.read_csv(r"C:\Users\nbeac\OneDrive\Documents\GitHub\IMDB-Project\Data\title_basics.csv.gz")
# Display the frame to check what was loaded
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0046415,tvSeries,The Count of Monte Cristo,Le comte de Monte-Cristo,0,1954,1954,183,"Adventure,Drama,Romance"
1,tt0071010,tvSeries,The Manhunter,The Manhunter,0,1974,1975,60,Drama
2,tt1286039,tvSeries,Stargate Universe,SG.U Stargate Universe,0,2009,2011,43,"Drama,Sci-Fi"
3,tt1772752,tvSeries,A.N.T. Farm,A.N.T. Farm,0,2011,2014,30,"Comedy,Drama,Family"
4,tt1826071,tvSeries,Adini Feriha Koydum,Adini Feriha Koydum,0,2011,2012,120,"Drama,Romance"
5,tt2224968,tvSeries,Jang Geum's Dream,Jang Geum ieui Kkum,0,2005,2007,30,"Adventure,Animation,Comedy"
6,tt3672132,tvSeries,Anali Ogullu,Anali Ogullu,0,2014,2014,80,Comedy
7,tt4875520,tvSeries,Iliski Durumu: Karisik,Iliski Durumu: Karisik,0,2015,2016,120,"Comedy,Romance"
8,tt8315348,tvSeries,4N1K,4N1K,0,2018,2019,100,"Comedy,Romance"
9,tt8390060,tvSeries,Her sey yolunda merkez,Her sey yolunda merkez,0,2013,2013,90,Comedy


In [18]:
# Keep only the titles whose startYear equals the current YEAR (as an independent copy)
df = basics[basics['startYear'] == YEAR].copy()
# IMDb ids (tconst) of those titles, kept as a pandas Series
movie_ids = df.loc[:, 'tconst'].copy()

In [19]:
# Load existing data from json into a dataframe called "previous_df"
# (a new file holds only the placeholder record {'imdb_id': 0} written at creation)
previous_df = pd.read_json(JSON_FILE)

In [20]:
# filter out any ids that are already in the JSON_FILE
# (isin marks ids already saved; ~ inverts the mask so only unsaved ids remain)
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

# Check for and remove any previously downloaded movie IDs

In [21]:
# Load existing data from json into a dataframe called "previous_df"
# NOTE(review): this repeats the read done in cell In [19]; it is harmless but redundant.
previous_df = pd.read_json(JSON_FILE)

In [22]:
# filter out any ids that are already in the JSON_FILE
# NOTE(review): this repeats the filter done in cell In [20]; it is harmless but redundant.
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

# INNER Loop

In [23]:
def write_json(new_data, filename):
    """Add ``new_data`` to the JSON list stored in ``filename``.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data : list or object
        If both ``new_data`` and the file's content are lists, the file's list
        is extended with the items of ``new_data``; otherwise ``new_data`` is
        appended as a single element.
    filename : str
        Path of an existing JSON file (opened in ``'r+'`` mode, so it must
        already exist and contain valid JSON).

    Returns
    -------
    None
        The file is rewritten in place.
    """
    with open(filename, 'r+') as file:
        # Load what is already stored in the file.
        file_data = json.load(file)
        ## Choose extend or append
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind so the updated content overwrites the old content.
        file.seek(0)
        # Convert back to JSON.
        json.dump(file_data, file)
        # Drop any leftover bytes in case the new content is shorter than the
        # old one; without this the file could end with stale, invalid JSON.
        file.truncate()

## Iterate through the list of Movie IDs and make the calls

In [27]:
# INNER Loop: request every movie id that has not been saved yet.
# Failed ids are recorded in `errors` as [movie_id, exception] so they can be
# inspected afterwards instead of disappearing silently.
errors = []
for movie_id in tqdm_notebook(movie_ids_to_get,
                          desc=f'Movies from {YEAR}',
                          position=0,
                          leave=True):
    # Attempt to retrieve the data for the movie_id
    try:
        temp = get_movie_with_rating(movie_id) # Uses the pre-made function
        # Append/Extend results to existing file by using the pre-made function
        write_json(temp,JSON_FILE)
        # short 20ms sleep to prevent overwhelming server
        time.sleep(0.02)

    # If it fails, remember the id and the error, then move on to the next id
    except Exception as e:
        errors.append([movie_id, e])
        continue

# Report how many ids could not be retrieved
print(f"- Total errors: {len(errors)}")

Movies from 2001: 0it [00:00, ?it/s]

In [25]:
# Reload everything stored for the year from the JSON results file
final_year_df = pd.read_json(JSON_FILE)
# Write it next to the JSON file as a gzip-compressed CSV, without the index column
final_year_df.to_csv(
    f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz",
    index=False,
    compression="gzip",
)

In [26]:
# Display the saved data; a frame holding only imdb_id 0 means just the placeholder record was stored
final_year_df

Unnamed: 0,imdb_id
0,0


# Exploratory Data Analysis