# A. Project Name:  IMDb Successful Movie.
- **Student Name:** Eduardo Galindez.
- **Coding Dojo Bootcamp:** Data Science.
  - **Stack:** Data Enrichment.
- **Date:** September 23rd, 2022.

# B. Project Objective
The objective of this Notebook is to download the remaining data from Part B.

# C. Project Development

## 1.- Libraries & Functions

In [1]:
# Libraries.
import numpy as np
import pandas as pd
import tmdbsimple as tmdb 
import matplotlib.pyplot as plt
import seaborn as sns
import os, time, json
os.makedirs('Data', exist_ok=True)

from tqdm.notebook import tqdm_notebook

In [2]:
# Fetch a movie's TMDB info and attach its US certification.
def get_movie_certification(movie_id):
    """Return the TMDB info for ``movie_id`` with the US certification added.

    Parameters
    ----------
    movie_id
        Identifier passed straight to ``tmdb.Movies`` (the notebook passes
        IMDb ids such as 'tt0848228').

    Returns
    -------
    The object returned by ``movie.info()``. When the release data holds at
    least one entry whose 'iso_3166_1' is "US", the key 'certifcation' is set
    to that entry's 'certification' (the last US entry wins if there are
    several). Without a US entry the key is not added.

    Notes
    -----
    The key is spelled 'certifcation' on purpose: it is kept exactly as the
    original wrote it because the stored data uses that column name.
    """
    movie = tmdb.Movies(movie_id)
    info = movie.info()

    # Collect every US certification in the order the API lists them.
    us_certifications = [
        country['certification']
        for country in movie.releases()['countries']
        if country['iso_3166_1'] == "US"
    ]
    if us_certifications:
        info['certifcation'] = us_certifications[-1]
    return info

In [3]:
# Function to add new records to an existing .json file.
##  Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

def write_json(new_data, filename):
    """Add ``new_data`` to the JSON list stored in ``filename``.

    The file must already exist and contain valid JSON. Its content is
    loaded, updated in memory and written back over the same file.

    Parameters
    ----------
    new_data
        Record(s) to store. When both ``new_data`` and the stored data are
        lists, the stored list is extended with the items of ``new_data``;
        otherwise ``new_data`` is appended as a single item.
    filename
        Path of the JSON file to update.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (the file is opened with 'r+').
    json.JSONDecodeError
        If the file does not contain valid JSON.
    AttributeError
        If the stored JSON value has no ``append`` method (e.g. it is a dict).
    """
    with open(filename, 'r+') as file:
        # Load the existing content.
        file_data = json.load(file)
        # Extend list-with-list, otherwise append the new item as-is.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite the file with the updated content.
        file.seek(0)
        json.dump(file_data, file)
        # Cut off any bytes left over from the previous content; without this
        # a shorter serialisation (e.g. the old file was pretty-printed)
        # leaves trailing garbage and the file is no longer valid JSON.
        file.truncate()

## 2.-  Data & Connection

### 2.1.- API connection


In [4]:
# Load the TMDB API credentials from the current user's home directory
# (~/.secret/tmdb_api.json) rather than a path tied to one user account,
# so the notebook runs unchanged for anyone who keeps their key there.
with open(os.path.join(os.path.expanduser('~'), '.secret', 'tmdb_api.json'), 'r') as file:
    login = json.load(file)
# Display only the key names so the secret value never lands in the output.
login.keys()

dict_keys(['api-key'])

In [5]:
# Hand the key loaded from the credentials file ('api-key' entry) to tmdbsimple.
tmdb.API_KEY =  login['api-key']

In [6]:
# Connection check: request 'The Avengers' by its IMDb id and display the
# 'budget' field of the returned info.
the_avengers_movie = tmdb.Movies('tt0848228')
the_avengers_info = the_avengers_movie.info()
the_avengers_info['budget']

220000000

In [7]:
# Second connection check: request 'The Notebook' by its IMDb id and display
# the 'budget' field of the returned info.
the_notebook_movie = tmdb.Movies('tt0332280')
the_notebook_info = the_notebook_movie.info()
the_notebook_info['budget']

29000000

### 2.2.- Mount and loading data.

In [13]:
# Folder (relative to the notebook) where downloaded data is saved.
# Create it if it is missing, then list what it currently contains.
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['.ipynb_checkpoints',
 'Chunk data per database',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'final_tmdb_data_2002.csv.gz',
 'final_tmdb_data_2003.csv.gz',
 'final_tmdb_data_2004.csv.gz',
 'final_tmdb_data_2006.csv.gz',
 'final_tmdb_data_2007.csv.gz',
 'final_tmdb_data_2008.csv.gz',
 'final_tmdb_data_2009.csv.gz',
 'final_tmdb_data_2010.csv.gz',
 'final_tmdb_data_2011.csv.gz',
 'final_tmdb_data_2012.csv.gz',
 'final_tmdb_data_2013.csv.gz',
 'final_tmdb_data_2014.csv.gz',
 'final_tmdb_data_2015.csv.gz',
 'final_tmdb_data_2016.csv.gz',
 'final_tmdb_data_2017.csv.gz',
 'final_tmdb_data_2018.csv.gz',
 'final_tmdb_data_2019.csv.gz',
 'final_tmdb_data_2020.csv.gz',
 'final_tmdb_data_2021.csv.gz',
 'final_tmdb_data_2022.csv.gz',
 'genres.csv.gz',
 'Original data',
 'title_akas_combined.csv.gz',
 'title_basics.csv.gz',
 'title_basics_combined.csv.gz',
 'title_genres.csv.gz',
 'title_ratings.csv.gz',
 'title_ratings_combined.csv.gz',
 'tmbd_data.csv.gz',
 'tmdb_api_result

In [9]:
# Load the combined title-basics table (per the original note, produced in Part A).
# The download loop reads its 'startYear' and 'tconst' columns.
basics_df = pd.read_csv('./Data/title_basics_combined.csv.gz', low_memory = False)

In [10]:
# Inputs for the download loop:
#   YEARS_TO_GET - years to download, 2000 through 2022 (the range end is exclusive).
#   errors       - filled by the loop with [movie_id, exception] pairs for failed requests.
YEARS_TO_GET = range(2000, 2023)
errors = [ ]

In [11]:
# Download the TMDB data year by year (OUTER loop = years, INNER loop = movies).
#
# A year counts as finished only once its final CSV exists. A year whose JSON
# file was left behind by an interrupted run is therefore resumed (only the
# ids not yet stored are requested) and exported, instead of being skipped.
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # Files for this year: incremental JSON results and the final CSV export.
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    CSV_FILE = f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz"

    # Check if the JSON file exists.
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists:
        # If it does exist: notify me.
        print(f'{YEAR} {JSON_FILE} already exists.')
    else:
        # If it does not exist: create it, seeded with one placeholder record
        # so the file is a valid JSON list that has an 'imdb_id' key.
        with open(JSON_FILE, 'w') as file:
            json.dump([{'imdb_id': 0}], file)

    # JSON and CSV both present -> this year was completed earlier; skip it.
    if file_exists and os.path.isfile(CSV_FILE):
        continue

    # Saving new year as the current df.
    df = basics_df.loc[basics_df['startYear'] == YEAR].copy()
    # Saving movie ids to list.
    movie_ids = df['tconst'].copy()

    # Load existing data from json into a dataframe called "previous_df".
    previous_df = pd.read_json(JSON_FILE)

    # Filter out any ids that are already in the JSON_FILE.
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # Start of INNER loop.
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve the data for the movie id.
            temp = get_movie_certification(movie_id)
            # Append/extend results to existing file using a pre-made function.
            write_json(temp, JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server.
            time.sleep(0.02)

        except Exception as e:
            # Best effort: record the failure and carry on with the next id.
            errors.append([movie_id, e])

    # Export everything collected for the year to its compressed CSV.
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(CSV_FILE, compression="gzip", index=False)

print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/23 [00:00<?, ?it/s]

2000 Data/tmdb_api_results_2000.json already exists.
2001 Data/tmdb_api_results_2001.json already exists.
2002 Data/tmdb_api_results_2002.json already exists.
2003 Data/tmdb_api_results_2003.json already exists.
2004 Data/tmdb_api_results_2004.json already exists.
2005 Data/tmdb_api_results_2005.json already exists.
2006 Data/tmdb_api_results_2006.json already exists.
2007 Data/tmdb_api_results_2007.json already exists.
2008 Data/tmdb_api_results_2008.json already exists.
2009 Data/tmdb_api_results_2009.json already exists.
2010 Data/tmdb_api_results_2010.json already exists.
2011 Data/tmdb_api_results_2011.json already exists.
2012 Data/tmdb_api_results_2012.json already exists.
2013 Data/tmdb_api_results_2013.json already exists.
2014 Data/tmdb_api_results_2014.json already exists.
2015 Data/tmdb_api_results_2015.json already exists.
2016 Data/tmdb_api_results_2016.json already exists.
2017 Data/tmdb_api_results_2017.json already exists.
2018 Data/tmdb_api_results_2018.json already e

In [14]:
# Load the yearly TMDB results for 2000-2004, one DataFrame per year.
# 2005 is not loaded: per the original author's note its data is not available
# ('final_tmdb_data_2005.csv.gz' is absent from the Data/ listing above).
(
    movies_from_2000_df,
    movies_from_2001_df,
    movies_from_2002_df,
    movies_from_2003_df,
    movies_from_2004_df,
) = (
    pd.read_csv(f'./Data/final_tmdb_data_{year}.csv.gz', low_memory=False)
    for year in range(2000, 2005)
)

In [15]:
# Load the yearly TMDB results for 2006-2010, one DataFrame per year.
(
    movies_from_2006_df,
    movies_from_2007_df,
    movies_from_2008_df,
    movies_from_2009_df,
    movies_from_2010_df,
) = (
    pd.read_csv(f'./Data/final_tmdb_data_{year}.csv.gz', low_memory=False)
    for year in range(2006, 2011)
)

In [16]:
# Load the yearly TMDB results for 2011-2015, one DataFrame per year.
(
    movies_from_2011_df,
    movies_from_2012_df,
    movies_from_2013_df,
    movies_from_2014_df,
    movies_from_2015_df,
) = (
    pd.read_csv(f'./Data/final_tmdb_data_{year}.csv.gz', low_memory=False)
    for year in range(2011, 2016)
)

In [17]:
# Load the yearly TMDB results for 2016-2020, one DataFrame per year.
(
    movies_from_2016_df,
    movies_from_2017_df,
    movies_from_2018_df,
    movies_from_2019_df,
    movies_from_2020_df,
) = (
    pd.read_csv(f'./Data/final_tmdb_data_{year}.csv.gz', low_memory=False)
    for year in range(2016, 2021)
)

In [18]:
# Load the yearly TMDB results for 2021 and 2022, one DataFrame per year.
movies_from_2021_df, movies_from_2022_df = (
    pd.read_csv(f'./Data/final_tmdb_data_{year}.csv.gz', low_memory=False)
    for year in (2021, 2022)
)

In [20]:
# Stack the yearly frames (2000-2022, without 2005) into a single DataFrame.
# Every frame keeps its own row labels, so index values repeat across years.
movies_from_2000_and_2022_df = pd.concat(
    objs=[
        movies_from_2000_df,
        movies_from_2001_df,
        movies_from_2002_df,
        movies_from_2003_df,
        movies_from_2004_df,
        movies_from_2006_df,
        movies_from_2007_df,
        movies_from_2008_df,
        movies_from_2009_df,
        movies_from_2010_df,
        movies_from_2011_df,
        movies_from_2012_df,
        movies_from_2013_df,
        movies_from_2014_df,
        movies_from_2015_df,
        movies_from_2016_df,
        movies_from_2017_df,
        movies_from_2018_df,
        movies_from_2019_df,
        movies_from_2020_df,
        movies_from_2021_df,
        movies_from_2022_df,
    ]
)
movies_from_2000_and_2022_df.head(n=5)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certifcation
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.wkw-inthemoodforlove.com/,843.0,cn,花樣年華,...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.11,1984.0,PG


In [22]:
# Show the rows whose 'imdb_id' is the string '0' (invalid ids).
movies_from_2000_and_2022_df[movies_from_2000_and_2022_df['imdb_id'].eq('0')]

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certifcation
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,
0,0,,,,,,,,,,...,,,,,,,,,,


In [23]:
# Remove the placeholder rows (imdb_id == '0').
# Filter on the column value itself instead of dropping by row label: the
# labels repeat across the concatenated yearly frames, so dropping a label
# taken from one yearly frame only works while every placeholder happens to
# carry that same label, and would delete a real movie otherwise.
# Re-assigning (rather than dropping in place) keeps the cell safe to re-run.
movies_from_2000_and_2022_df = movies_from_2000_and_2022_df.loc[
    movies_from_2000_and_2022_df['imdb_id'].astype(str) != '0'
].copy()
movies_from_2000_and_2022_df

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certifcation
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.500,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.100,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.000,0.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.wkw-inthemoodforlove.com/,843.0,cn,花樣年華,...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.110,1984.0,PG
5,tt0118852,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,49511.0,en,Chinese Coffee,...,0.0,99.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's a fine line between friendship and bet...,Chinese Coffee,0.0,6.851,47.0,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3718,tt9895024,0.0,/S1kutYyoyuBJKGS0mXxv2fZNbr.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,834443.0,de,Heikos Welt,...,0.0,118.0,"[{'english_name': 'German', 'iso_639_1': 'de',...",Released,,Heiko's World,0.0,0.000,0.0,
3719,tt9896876,0.0,/hMvRbT6HOqERhh3K8kXbaLz9LlZ.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,821493.0,en,India Sweets and Spices,...,0.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Let the aunties talk.,India Sweets and Spices,0.0,5.000,4.0,PG-13
3720,tt9898844,0.0,/q2KFBGyUSzHDhNqXEYv2LqTWVSz.jpg,,0.0,"[{'id': 27, 'name': 'Horror'}]",,870671.0,en,The Hunting,...,0.0,91.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Hunger runs deep.,The Hunting,0.0,4.893,28.0,
3721,tt9900940,0.0,/3jjWLg5bevWqReyVroYqxwVrH0k.jpg,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,861294.0,en,The Scrapper,...,0.0,87.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,You can't escape your past.,The Scrapper,0.0,6.000,2.0,


In [24]:
# Overview of the combined table: index range, column names, non-null counts and dtypes.
movies_from_2000_and_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59207 entries, 1 to 3722
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                59207 non-null  object 
 1   adult                  59207 non-null  float64
 2   backdrop_path          37385 non-null  object 
 3   belongs_to_collection  3808 non-null   object 
 4   budget                 59207 non-null  float64
 5   genres                 59207 non-null  object 
 6   homepage               14559 non-null  object 
 7   id                     59207 non-null  float64
 8   original_language      59207 non-null  object 
 9   original_title         59207 non-null  object 
 10  overview               57917 non-null  object 
 11  popularity             59207 non-null  float64
 12  poster_path            54089 non-null  object 
 13  production_companies   59207 non-null  object 
 14  production_countries   59207 non-null  object 
 15  rel

In [25]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
movies_from_2000_and_2022_df.describe()

Unnamed: 0,adult,budget,id,popularity,revenue,runtime,video,vote_average,vote_count
count,59207.0,59207.0,59207.0,59207.0,59207.0,59207.0,59207.0,59207.0,59207.0
mean,0.001757,3062569.0,377360.5,5.946111,8152235.0,91.589981,0.007212,4.596523,211.397858
std,0.041875,16023340.0,258074.1,18.721882,61330710.0,31.632223,0.084617,2.642191,1121.20485
min,0.0,0.0,12.0,0.6,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,133970.5,0.848,0.0,84.0,0.0,3.1,1.0
50%,0.0,0.0,370460.0,1.893,0.0,93.0,0.0,5.461,6.0
75%,0.0,0.0,571467.0,4.8565,0.0,105.0,0.0,6.418,34.0
max,1.0,380000000.0,1032061.0,1075.4,2847246000.0,1440.0,1.0,10.0,32369.0


In [26]:
# Save the combined table as a gzip-compressed CSV, leaving out the row index.
movies_from_2000_and_2022_df.to_csv(
    './Data/tmdb_results_combined.csv.gz',
    index=False,
    compression='gzip',
)