In [1]:
# Import Dependencies
import pandas as pd
import numpy as py
import requests
import json
from config import omdb_api_key
from config import tmdb_api_key

## Import and Clean Netflix Data ##

In [2]:
# Import list of Netflix Titles from CSV
# Source:https://www.kaggle.com/shivamb/netflix-shows
csv_file = "Resources/netflix_titles.csv"
netflix_file_df = pd.read_csv(csv_file)

# verify data reads properly
netflix_file_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...


In [3]:
# drop na on file
netflix_file_df_clean = netflix_file_df.dropna()

# only movies from US
netflix_file_df_clean = netflix_file_df_clean.loc[netflix_file_df_clean["country"] == "United States"]

# verify data reads properly
netflix_file_df_clean.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...
20,80060297,Movie,Manhattan Romance,Tom O'Brien,"Tom O'Brien, Katherine Waterston, Caitlin Fitz...",United States,"September 8, 2017",2014,TV-14,98 min,"Comedies, Independent Movies, Romantic Movies",A filmmaker working on a documentary about lov...


In [4]:
# output cleaned data to a CSV
netflix_file_df_clean.to_csv("Resources/netflix_data_clean.csv", header=True)

## Import and Clean OMDB Data ##

In [5]:
# Load the Netflix Data to use when making API Requests
csv_file = "Resources/netflix_data_clean.csv"
netflix_file_df_clean = pd.read_csv(csv_file)
netflix_file_df_clean.head(2)

Unnamed: 0.1,Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...
1,20,80060297,Movie,Manhattan Romance,Tom O'Brien,"Tom O'Brien, Katherine Waterston, Caitlin Fitz...",United States,"September 8, 2017",2014,TV-14,98 min,"Comedies, Independent Movies, Romantic Movies",A filmmaker working on a documentary about lov...


In [6]:
#iterate through the titles and add information into list or dictionary
url = f"http://www.omdbapi.com/?apikey={omdb_api_key}&t="
titles = netflix_file_df_clean["title"]
movie_name = []
genre = []
language = []
awards = []
metascore = []
boxoffice = []
production = []
imdbrating = []
imdbvotes = []
imdbid = []

for title in titles:
    response = requests.get(url + title)
    response_json = response.json()
    
    try:
        try:
            movie_name.append(response_json['Title'])
        except:
            movie_name.append('na')
        
        try:
            genre.append(response_json['Genre'])
        except:
            genre.append('na')
        
        try:
            language.append(response_json['Language'])
        except:
            language.append('na')
        
        try:
            awards.append(response_json['Awards'])
        except:
            awards.append('na')
        
        try:
            metascore.append(response_json['Metascore'])
        except:
            metascore.append('na')
        
        try:
            boxoffice.append(response_json['BoxOffice'])
        except:
            boxoffice.append('na')
        
        try:
            production.append(response_json['Production'])
        except:
            production.append('na')
        
        try:
            imdbrating.append(response_json['imdbRating'])
        except:
            imdbrating.append('na')
        
        try:
            imdbvotes.append(response_json['imdbVotes'])
        except:
            imdbvotes.append('na')
        
        try:
            imdbid.append(response_json['imdbID'])
        except:
            imdbid.append('na')
    except:
        print(f"Error on line {title}")

# Load Data into a single DataFrame
omdb_data = pd.DataFrame({
    'title':movie_name,
    'genre':genre,
    'language':language,
    'awards':awards,
    'metascore':metascore,
    'boxoffice':boxoffice,
    'production':production,
    'imdbrating':imdbrating,
    'imdbvotes':imdbvotes,
    'imdbid':imdbid
})

# Verify Data Loaded Okay
omdb_data.head(2)

Unnamed: 0,title,genre,language,awards,metascore,boxoffice,production,imdbrating,imdbvotes,imdbid
0,na,na,na,na,na,na,na,na,na,na
1,Manhattan Romance,"Comedy, Drama, Romance",English,3 wins.,,,Beacon Films Inc.,5.5,1122,tt2608324


In [7]:
# remove any results where the title was na
omdb_data_clean = omdb_data
omdb_data_clean = omdb_data_clean.loc[omdb_data_clean["title"]!= 'na']
omdb_data_clean.head(2)

Unnamed: 0,title,genre,language,awards,metascore,boxoffice,production,imdbrating,imdbvotes,imdbid
1,Manhattan Romance,"Comedy, Drama, Romance",English,3 wins.,,,Beacon Films Inc.,5.5,1122,tt2608324
2,Stonehearst Asylum,"Drama, Horror, Thriller",English,1 win & 1 nomination.,52.0,,Millenium Entertainment,6.8,48493,tt1772264


In [8]:
# save to csv
omdb_data_clean.to_csv("Resources/omdb_data_clean.csv", index=True)

## Import and Clean TMDB Data##

In [9]:
# Using the same Netflix list from the previous step, use API to pull data from TMBD
url = f"https://api.themoviedb.org/3/search/movie?api_key={tmdb_api_key}&query="
titles = netflix_file_df_clean["title"]
movie_name = []
original_title = []
popularity = []
vote_count = []
vote_average = []
poster_path = []

for title in titles:
    title = title.replace(" ", "+")
    response = requests.get(url + title)
    response_json = response.json()
    #print(response)
        
    try:
           movie_name.append(response_json['results'][0]['title'])
    except:
           movie_name.append('na')
        
    try:
           original_title.append(response_json['results'][0]['original_title'])
    except:
           original_title.append('na')
        
    try:
           popularity.append(response_json['results'][0]['popularity'])
    except:
           popularity.append('na')
       
    try:
           vote_count.append(response_json['results'][0]['vote_count'])
    except:
           vote_count.append('na')

    try:
           vote_average.append(response_json['results'][0]['vote_average'])
    except:
           vote_average.append('na')
            
    try:
           poster_path.append("https://image.tmdb.org/t/p/w500" + response_json['results'][0]['poster_path'])
    except:
           poster_path.append('na')
            
#load into a single dataframe
tmdb_data = pd.DataFrame({
    'title':movie_name,
    'original_title':original_title,
    'popularity':popularity,
    'vote_count':vote_count,
    'vote_average':vote_average,
    'poster_path':poster_path
})

# verify data loaded
tmdb_data.head(2)

Unnamed: 0,title,original_title,popularity,vote_count,vote_average,poster_path
0,na,na,na,na,na,na
1,Manhattan Romance,Manhattan Romance,3.765,24,4.8,na


In [10]:
# remove any results where the title was na
tmdb_data_clean = tmdb_data
tmdb_data_clean = tmdb_data_clean.loc[tmdb_data_clean["title"]!= 'na']
tmdb_data_clean.head(2)

Unnamed: 0,title,original_title,popularity,vote_count,vote_average,poster_path
1,Manhattan Romance,Manhattan Romance,3.765,24,4.8,na
2,Stonehearst Asylum,Stonehearst Asylum,17.405,817,6.7,https://image.tmdb.org/t/p/w500/fZxGCCQ0NAtrae...


In [11]:
# save to csv
tmdb_data_clean.to_csv("Resources/tmdb_data_clean.csv", index=True)