In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import json
from pprint import pprint

# Import API key
from api_keys import tmdb_api_key

In [2]:
#load the csv file from the Kaggle/imdb info
movie_data_df = pd.read_csv('data/movies_sorted_by_year.csv')
movie_data_df.head()


In [3]:
#To build an image URL, you will need 3 pieces of data. The base_url, 
#size and file_path. Simply combine them all and you will have a fully 
#qualified URL.

#create the empty movie name vairable and empty movie list
movie_name = ""
movie_list = []

#the movie list will pull in the column of Row+Labels for the search query
movie_list = movie_data_df['Row+Labels']

#componenet of the url and api key
api_key = f'api_key={tmdb_api_key}'

base_url = 'https://api.themoviedb.org/3/search/movie?'


In [4]:
# Create counters
record_count = 1
set_count = 1
movie_data = []

#loop through the movies and fetch data
for i, movie in enumerate(movie_list):
    # Group movies in sets of 50 for logging purposes
    if (i % 50 == 0 and i >= 50):
        set_count += 1
        record_count = 0

    # Create endpoint URL with each movie
    query_url = f'{base_url}{api_key}&query={movie}'

    # Log the url, record, and set numbers
    print("Processing Record %s of Set %s | %s" % (record_count, set_count, movie))

    # Add 1 to the record count
    record_count += 1

    # Run an API request for each movie
    try:
        # Parse the JSON and retrieve data
        movie_res = requests.get(query_url)
        response = movie_res.json()

        # Parse out original title and ID
        movie_id = response["results"][0]["id"]
        original_title = response['results'][0]['original_title']

        #Append the movie information into movie_data list
        movie_data.append({"ID": movie_id, 
                          "Original Title": original_title, 
                          })

    # If an error is experienced, skip the movie
    except:
        print("Movie not found. Skipping...")
        pass
              
# Indicate that Data Loading is complete 
print("-----------------------------")
print("Data Retrieval Complete      ")
print("-----------------------------")


Processing Record 1 of Set 1 | Cast+Away
Processing Record 2 of Set 1 | Scary+Movie
Processing Record 3 of Set 1 | How+the+Grinch+Stole+Christmas
Processing Record 4 of Set 1 | What+Women+Want
Processing Record 5 of Set 1 | Meet+the+Parents
Processing Record 6 of Set 1 | Mission:+Impossible+II
Processing Record 7 of Set 1 | Big+Momma's+House
Processing Record 8 of Set 1 | Remember+the+Titans
Processing Record 9 of Set 1 | Gladiator
Processing Record 10 of Set 1 | X-Men
-----------------------------
Data Retrieval Complete      
-----------------------------


In [40]:
#create dataframe from dictionary
movie_data_df = pd.DataFrame.from_dict(movie_data)

Unnamed: 0,ID,Original Title,Popularity
0,8358,Cast Away,50.864
1,4247,Scary Movie,134.378
2,8871,How the Grinch Stole Christmas,70.242
3,3981,What Women Want,28.113
4,1597,Meet the Parents,26.05
5,955,Mission: Impossible II,44.754
6,9600,Big Momma's House,26.24
7,10637,Remember the Titans,40.601
8,98,Gladiator,76.795
9,246655,X-Men: Apocalypse,83.917


In [36]:
#second api request based on movie id
#data to get from second api request
#create the list with the movie ids

ret_movie_id = []
ret_movie_id = movie_data_df['ID']

base_url = 'https://api.themoviedb.org/3/movie/'


{'page': 1,
 'results': [{'adult': False,
              'backdrop_path': '/2ex2beZ4ssMeOduLD0ILzXKCiep.jpg',
              'genre_ids': [28, 12, 878, 14],
              'id': 246655,
              'original_language': 'en',
              'original_title': 'X-Men: Apocalypse',
              'overview': "After the re-emergence of the world's first mutant, "
                          'world-destroyer Apocalypse, the X-Men must unite to '
                          'defeat his extinction level plan.',
              'popularity': 83.917,
              'poster_path': '/lRxsDK4exeEgKoXqI4zdr0Vl0yk.jpg',
              'release_date': '2016-05-18',
              'title': 'X-Men: Apocalypse',
              'video': False,
              'vote_average': 6.516,
              'vote_count': 11870},
             {'adult': False,
              'backdrop_path': '/fctQU5MoXgJ5pNMljFzlEFXwfSu.jpg',
              'genre_ids': [28, 12, 878],
              'id': 127585,
              'original_language': 'en'

In [None]:
#loop through the movies and fetch details in second api call

record_count = 1
set_count = 1
movie_details = []

for i, movie in enumerate(ret_movie_id):
    # Group movies in sets of 50 for logging purposes
    if (i % 50 == 0 and i >= 50):
        set_count += 1
        record_count = 0
    
    # Create endpoint URL with each movie
    query_url = f'{base_url}{movie}?{api_key}'
    
    # Log the url, record, and set numbers
    print("Processing Record %s of Set %s | %s" % (record_count, set_count, movie))

    # Add 1 to the record count
    record_count += 1

    # Run an API request for each of the cities
    try:
        # Parse the JSON and retrieve data
        movie_res = requests.get(query_url)
        response = movie_res.json()

        # define the variables to get from json object, except genres
        budget = response['budget']
        movie_id = response['id']
        imdb_id = response['imdb_id']
        popularity = response['popularity']
        release_date = response['release_date']
        revenue = response['revenue']
        runtime = response['runtime']
        vote_avg = response['vote_average']
        vote_count = response['vote_count']

        #Append the movie information into movie_data list
        movie_details.append({'ID': movie_id,
                              'Budget': budget,
                              "Popularity": popularity,
                              "Release Date": release_date,
                              "Revenue": revenue,
                              "Runtime": runtime,
                              "Vote Average": vote_avg,
                              "Vote Count": vote_count,
                              'IMDB': imdb_id,
       })


    # If an error is experienced, skip the movie
    except:
        print("Movie not found. Skipping...")
        pass
              
# Indicate that Data Loading is complete 
print("-----------------------------")
print("Data Retrieval Complete      ")
print("-----------------------------")

In [None]:
#change movie_details dictionary to a dataframe
movie_details_df = pd.DataFrame.from_dict(movie_details)
movie_details_df

In [None]:
#grab the movie genres

ret_movie_id = movie_data_df['ID']

In [None]:
#loop through the movies and fetch movie genres

record_count = 1
set_count = 1
movie_genres = []

for i, movie in enumerate(ret_movie_id):
    # Group movies in sets of 50 for logging purposes
    if (i % 50 == 0 and i >= 50):
        set_count += 1
        record_count = 0
    
    # Create endpoint URL with each movie specific for genres
    query_url = f'{base_url}{movie}?{api_key}&append_to_response=genres'
    
    # Log the url, record, and set numbers
    print("Processing Record %s of Set %s | %s" % (record_count, set_count, movie))

    # Add 1 to the record count
    record_count += 1

    # Run an API request for each of the movies
    try:
        # Parse the JSON and retrieve data
        movie_res = requests.get(query_url)
        response = movie_res.json()
        
        # Parse out budget, popularity,
        genres = response['genres']
        movie_id = response['id']
        
        
        #Append the movie information into genre list
        movie_genres.append({'ID': movie_id, 'Genres': genres, 'Genre List':[]})

    # If an error is experienced, skip the movie
    except:
        print("Movie not found. Skipping...")
        pass



In [None]:
genre_list = []
final_list = []
id_list = []

for i in range(3065):
    final_list = []
    #go through all the movie ids in the movie_id list
    movie_id = movie_genres[i]['ID']
    
    #get the number of genre records for the movie in the list
    x = len(movie_genres[i]['Genres'])
    genre_list = []
    
    #got through the returned dictionary of movie genres for each movie id
    #pull out just the Name and create a temporary genre_list
    for j in range(x):
        genre_list.append(movie_genres[i]['Genres'][j]['name'])
    
    #add the temporary genre_list to the final_list
    final_list.append(genre_list)
    
    #add the final list of genres to the dictionary with the movie ID
    id_list.append({'ID': movie_id, 'Genre List': final_list})

In [None]:
#take the list of movie ids and genres and create a dataframe
movie_genres_df = pd.DataFrame.from_dict(id_list)
movie_genres_df

In [None]:
# Export the tmdb genres info into a csv
movie_genres_df.to_csv("data/tmdb_genres.csv", index_label="tmdb_ID")