# Movies Database Search

by Israel Diaz

### Load Libraries

In [1]:
## General Libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
import os, math, time,json

from tqdm.notebook import tqdm_notebook
warnings.simplefilter('ignore')

##Specific
import tmdbsimple as tmdb

## Data folder: create it when missing, then list what is already stored there
FOLDER = "data/"
os.makedirs(name=FOLDER, exist_ok=True)
os.listdir(path=FOLDER)

['basics.csv.gz',
 'aka.csv.gz',
 'ratings.csv.gz',
 'final_tmdb_data_2001.csv.gz']

### Loading DataFiles
<h1>***Jump to section 2 if you already filtered and saved the data***</h1>

In [2]:
%%time
## IMDb dataset URLs (tab-separated, gzip-compressed files)
title_basics = 'https://datasets.imdbws.com/title.basics.tsv.gz'
title_aka = 'https://datasets.imdbws.com/title.akas.tsv.gz'
title_ratings = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

## loading urls into pandas dataframe
## NOTE: low_memory must be the boolean False. The string 'False' is truthy,
## so passing it leaves chunked type inference switched on.
basics = pd.read_csv(title_basics, sep='\t', low_memory=False)
aka = pd.read_csv(title_aka, sep='\t', low_memory=False)
ratings = pd.read_csv(title_ratings, sep='\t', low_memory=False)


KeyboardInterrupt



In [3]:
## Print the row count and show the first rows of each raw table
for heading, frame in (('basics = ', basics), ('\naka = ', aka), ('\nratings = ', ratings)):
    print(heading, 'len:', len(frame))
    display(frame.head())


KeyboardInterrupt



### Data Cleaning

#### Replacing \N with NaN

The data dictionary reports that the files represent null values as \N. Left as-is, these would be treated as ordinary strings, so they must be replaced with NaN values.

In [None]:
## The files use the literal string \N for missing values; turn it into NaN in place
for frame in (basics, aka, ratings):
    frame.replace({'\\N': np.nan}, inplace=True)

In [None]:
## Show the first rows of each table after the NaN replacement
for heading, frame in (('basics', basics), ('\naka', aka), ('\nratings', ratings)):
    print(heading)
    display(frame.head())

#### Eliminate rows where runtimeMinutes or genres is null

In [None]:
## Keep only the rows that have both a runtime and a genre list
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(axis=0, subset=required_columns)

#### Keeping only titleType = movie

In [None]:
## Keep only the rows whose titleType is exactly 'movie'
is_movie = basics['titleType'].eq('movie')
basics = basics.loc[is_movie]

#### Keeping startYear between 2000-2022

In [None]:
## Compare years as numbers rather than as strings: string comparison is only
## correct by accident for four-digit text and breaks on mixed dtypes.
## Values that cannot be parsed become NaN and are excluded by `between`.
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
## `between` is inclusive on both ends: 2000 <= startYear <= 2022
basics = basics[start_year.between(2000, 2022)]

#### Dropping movies whose genres include Documentary

In [None]:
## Boolean mask: True where the genres text mentions "documentary" in any letter case
is_documentary = basics['genres'].str.contains(pat='documentary', case=False)
## Keep only the rows that were not flagged
basics = basics.loc[~is_documentary]
## Preview the first ten remaining rows
basics.head(n=10)

#### Keeping only US movies

In [None]:
## Keep only the alternate-title rows whose region is 'US'
is_us_region = aka['region'].eq('US')
aka = aka.loc[is_us_region]

Applying to all other sets

In [None]:
# Mask: True for each basics title id that also appears in the US-only aka table
us_title_ids = aka['titleId']
us_movies = basics['tconst'].isin(us_title_ids)
# Show the first ten mask values
us_movies.head(10)

In [None]:
# Restrict basics to the rows flagged by the us_movies mask
basics = basics.loc[us_movies]

In [None]:
## Mask: True for each ratings title id that also appears in the US-only aka table
us_title_ids = aka['titleId']
us_ratings = ratings['tconst'].isin(us_title_ids)
## Show the first ten mask values
us_ratings.head(10)

In [None]:
## Restrict ratings to the rows flagged by the us_ratings mask
ratings = ratings.loc[us_ratings]

## Saving DataFrames

In [None]:
## Make sure the output folder exists before saving, then list its contents.
## Uses FOLDER and the `os` import from the first cell rather than repeating
## the import and hard-coding the path a second time.
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

In [None]:
## Save each filtered table as a gzip-compressed CSV inside FOLDER, the same
## location the notebook later reloads basics from.
## saving basics to compressed file
basics.to_csv(f"{FOLDER}basics.csv.gz", compression='gzip', index=False)
## saving aka to compressed file
aka.to_csv(f"{FOLDER}aka.csv.gz", compression='gzip', index=False)
## saving ratings to compressed file
ratings.to_csv(f"{FOLDER}ratings.csv.gz", compression='gzip', index=False)

<h1>Start here if you have BASICS file in the data folder</h1>

## Complementing budget, revenue, and MPAA Rating (G/PG/PG-13/R)

To add more relevant data such as Budget, Revenue and MPAA Rating, I'll use the TMDB API

### Defining Functions

In [2]:
#@Functions

def write_json(new_data, filename):
    """Append record(s) to the JSON array stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: A single record, which is appended as one element, or a
            list of records, each of which is added as its own element.
        filename: Path of an existing JSON file; it is opened in 'r+' mode,
            so it must already exist.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
        AttributeError: If the decoded top-level value has no ``append``
            method (for example, a JSON object).
    """
    with open(filename, 'r+') as file:
        # Load the current content of the file.
        file_data = json.load(file)
        # A list of records is merged element by element; anything else is
        # appended as a single record.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite the file with the updated content.
        file.seek(0)
        json.dump(file_data, file)
        # Drop any bytes left over from the previous content.
        file.truncate()

def get_movie_with_rating(movie_id):
    """Return a movie's TMDB info dict, with its US certification when listed.

    Adapted from github.com/celiao/tmdbsimple.

    Args:
        movie_id: Identifier passed to ``tmdb.Movies``.

    Returns:
        The dict returned by ``movie.info()``. When the release list has an
        entry whose ``iso_3166_1`` is 'US', its certification is stored under
        the 'certification' key (the last US entry wins). Without a US entry
        this function does not set that key.
    """
    movie = tmdb.Movies(movie_id)

    # Query the general info first, then the per-country release data
    info = movie.info()
    releases = movie.releases()

    # Collect the certification of every US release entry, in listing order
    us_certifications = [
        entry['certification']
        for entry in releases['countries']
        if entry['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        info['certification'] = us_certifications[-1]

    return info



### Loading API KEY

In [3]:
## Locate the credentials file under the current user's home directory instead
## of hard-coding one machine's absolute path. Set the TMDB_API_FILE
## environment variable to point somewhere else.
api_key_file = os.environ.get(
    'TMDB_API_FILE',
    os.path.join(os.path.expanduser('~'), '.secret', 'tmdb_api.json'),
)
with open(api_key_file, 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict (never the values, which are secrets)
login.keys()

dict_keys(['api-key-v3', 'api-key-v4'])

In [4]:
## Register the v3 key from the loaded credentials on the tmdbsimple module
tmdb.API_KEY =  login['api-key-v3']

### Loading basics Dataframe

In [5]:
# Reload the filtered basics table that was saved in the data folder
basics_path = f'{FOLDER}basics.csv.gz'
basics = pd.read_csv(basics_path)

In [6]:
## Distinct start years, sorted from earliest to latest
unique_years = basics['startYear'].drop_duplicates()
YEARS_TO_GET = unique_years.sort_values()
YEARS_TO_GET

8       2000
0       2001
4       2002
37      2003
7       2004
3       2005
6       2006
24      2007
15      2008
14      2009
1132    2010
331     2011
1602    2012
1785    2013
4308    2014
5615    2015
3881    2016
5       2017
2       2018
6325    2019
1       2020
560     2021
28      2022
Name: startYear, dtype: int64

In [7]:
## Collects [movie_id, exception] pairs for the API calls that fail
errors = list()

In [None]:
#@ Loop over Years and save results into json files

for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    ## One results file per year. A new file is seeded with a placeholder
    ## record so that the 'imdb_id' column exists when it is read back below.
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if not file_exists:
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    ## Ids of this year's movies that are not in the results file yet
    df = basics.loc[basics['startYear'] == YEAR].copy()
    movie_ids = df['tconst'].copy()
    previous_df = pd.read_json(JSON_FILE)
    already_saved = movie_ids.isin(previous_df['imdb_id'])
    movie_ids_to_get = movie_ids[~already_saved]

    ## Fetch each missing movie and append it to the year's file right away
    movie_progress = tqdm_notebook(movie_ids_to_get,
                                   desc=f'Movies from {YEAR}',
                                   position=1,
                                   leave=True)
    for movie_id in movie_progress:
        try:
            temp = get_movie_with_rating(movie_id)
            write_json(temp, JSON_FILE)
            ## Short pause between consecutive requests
            time.sleep(0.02)

        except Exception as e:
            ## Record the failure and carry on with the next movie
            errors.append([movie_id, e])

YEARS:   0%|          | 0/23 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1432 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1548 [00:00<?, ?it/s]

Movies from 2002:   0%|          | 0/1550 [00:00<?, ?it/s]

Movies from 2003:   0%|          | 0/1666 [00:00<?, ?it/s]

Movies from 2004:   0%|          | 0/1882 [00:00<?, ?it/s]

Movies from 2005:   0%|          | 0/2167 [00:00<?, ?it/s]

Movies from 2006:   0%|          | 0/2397 [00:00<?, ?it/s]

Movies from 2007:   0%|          | 0/2548 [00:00<?, ?it/s]

Movies from 2008:   0%|          | 0/2880 [00:00<?, ?it/s]

Movies from 2009:   0%|          | 0/3523 [00:00<?, ?it/s]

Movies from 2010:   0%|          | 0/3830 [00:00<?, ?it/s]

Movies from 2011:   0%|          | 0/4199 [00:00<?, ?it/s]

Movies from 2012:   0%|          | 0/4491 [00:00<?, ?it/s]

Movies from 2013:   0%|          | 0/4679 [00:00<?, ?it/s]

Movies from 2014:   0%|          | 0/4883 [00:00<?, ?it/s]

Movies from 2015:   0%|          | 0/5016 [00:00<?, ?it/s]

In [None]:
## Number of failed API calls recorded by the download loop
error_count = len(errors)
print(f"- Total errors: {error_count}")

In [None]:
## Show the first ten [movie_id, exception] pairs
first_errors = errors[:10]
print(first_errors)

# STILL DOWNLOADING DATA FROM TMDB