# Model Inference
Now that we have finished training our respective models, we are ready to craft the inference script that will generate scores from each of them. Once this inference script is complete, we will use it as part of a FastAPI service that will serve a tiny webpage on the Heroku platform!

## Project Setup
Before jumping into the inference script, we need to do some setup: importing libraries such as Pandas, loading the API keys and helper functions, and, of course, loading the serialized model pipelines.

In [1]:
# Importing the necessary Python libraries
import sys
import json
import yaml
import cloudpickle
import numpy as np
import pandas as pd
import tmdbv3api
from imdb import IMDb
from omdb import OMDBClient
from rotten_tomatoes_scraper.rt_scraper import MovieScraper

In [2]:
# Importing the inference helper functions.
# The model-training container directory is placed at the front of sys.path before the import
# (presumably that is where the helpers module lives - confirm).
# NOTE(review): the wildcard import hides which helper names this notebook relies on. None are
# referenced by name in the cells visible to this review, so they may only be needed while the
# pickled pipelines are being loaded - confirm before narrowing this to explicit imports.
sys.path.insert(0, '../container/model-training/')
from helpers import *

In [3]:
# Reading the API keys out of the separate, secret YAML file
with open('../keys/keys.yml', 'r') as keys_file:
    keys_yaml = yaml.safe_load(keys_file)

# Pulling each service's key out of the 'api_keys' section of the loaded YAML
api_keys = keys_yaml['api_keys']
tmdb_key = api_keys['tmdb_key']
omdb_key = api_keys['omdb_key']

In [4]:
# Deserializing the trained pipelines from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only load model files from a trusted source.
def _load_pipeline(pickle_path):
    """Return the object stored in the cloudpickle file at pickle_path."""
    with open(pickle_path, 'rb') as pickle_file:
        return cloudpickle.load(pickle_file)

binary_classification_pipeline = _load_pipeline('../models/binary_classification_pipeline.pkl')
regression_pipeline = _load_pipeline('../models/regression_pipeline.pkl')

In [5]:
# Listing the features kept from each data source, one list per source
TMDB_FEATS = [
    'tmdb_id',
    'imdb_id',
    'budget',
    'primary_genre',
    'secondary_genre',
    'tmdb_popularity',
    'revenue',
    'runtime',
    'tmdb_vote_average',
    'tmdb_vote_count',
]
IMDB_FEATS = ['imdb_rating', 'imdb_votes', 'year']
OMDB_FEATS = ['rt_critic_score', 'metascore']
ROTT_FEATS = ['rt_audience_score']

# Concatenating the per-source lists in TMDb, IMDb, OMDb, Rotten Tomatoes order
ALL_FEATS = [*TMDB_FEATS, *IMDB_FEATS, *OMDB_FEATS, *ROTT_FEATS]

In [6]:
# Creating the TMDb configuration, search, and movie objects, then registering the API key
tmdb, tmdb_search, tmdb_movies = tmdbv3api.TMDb(), tmdbv3api.Search(), tmdbv3api.Movie()
tmdb.api_key = tmdb_key

# Creating the IMDbPY object used for movie lookups
imdb_search = IMDb()

# Creating the OMDb client with its API key
omdb_client = OMDBClient(apikey=omdb_key)

In [7]:
# Building the one-row starting DataFrame that holds the demo movie title
df = pd.DataFrame({'movie_name': ['The Matrix']})

In [8]:
# Reading the demo movie title from the row labelled 0 of df
movie_name = df.loc[0, 'movie_name']

## Data Gathering
Before we can produce an inference on a particular movie, we will need to gather the raw data from the APIs and craft them appropriately in a Pandas DataFrame.

### TMDb Data Gathering

In [9]:
# Searching TMDb for the movie title and keeping the full list of results
tmdb_query = {'query': movie_name}
tmdb_search_results = tmdb_search.movies(tmdb_query)

In [10]:
# Extracting tmdb_id from the first search result.
# An empty search now raises instead of only printing: printing left tmdb_id undefined (or stale
# from an earlier run of this cell), so the following cells either failed with a NameError or
# silently scored the wrong movie.
if len(tmdb_search_results) == 0:
    raise ValueError(f'Results not found for title: {movie_name}.')
tmdb_id = tmdb_search_results[0]['id']

In [11]:
# Fetching the TMDb record for tmdb_id and converting it into a plain dictionary
tmdb_details_response = tmdb_movies.details(tmdb_id)
tmdb_details = dict(tmdb_details_response)

In [12]:
# Storing tmdb_id in the tmdb_details dictionary under the 'tmdb_id' key
tmdb_details.update({'tmdb_id': tmdb_id})

In [13]:
# Counting the TMDb genres so we know whether a primary and a secondary genre exist
tmdb_genres = tmdb_details['genres']
tmdb_genre_length = len(tmdb_genres)

In [14]:
# primary_genre is the name of the first entry in the nested 'genres' list, or null when the list is empty
tmdb_details['primary_genre'] = (
    tmdb_details['genres'][0]['name'] if tmdb_genre_length != 0 else np.nan
)

# secondary_genre is the name of the second entry in 'genres', or null when fewer than two are listed
tmdb_details['secondary_genre'] = (
    np.nan if tmdb_genre_length < 2 else tmdb_details['genres'][1]['name']
)

In [15]:
# Renaming some TMDb columns appropriately.
# Each rename is skipped when its target key already exists, so re-running this cell is a no-op
# instead of raising KeyError on a source key that an earlier run already popped.
# A source key that is missing on the first run still raises KeyError, as before.
tmdb_renames = {
    'popularity': 'tmdb_popularity',
    'vote_average': 'tmdb_vote_average',
    'vote_count': 'tmdb_vote_count',
}
for source_key, feature_name in tmdb_renames.items():
    if feature_name not in tmdb_details:
        tmdb_details[feature_name] = tmdb_details.pop(source_key)

In [16]:
# Copying each retained TMDb feature into its own df column
tmdb_feature_values = {feat: tmdb_details[feat] for feat in TMDB_FEATS}
for column_name, column_value in tmdb_feature_values.items():
    df[column_name] = column_value

In [17]:
# Viewing what df looks like after getting TMDb data
# (the bare expression on the last line is what the notebook renders as this cell's output)
df

Unnamed: 0,movie_name,tmdb_id,imdb_id,budget,primary_genre,secondary_genre,tmdb_popularity,revenue,runtime,tmdb_vote_average,tmdb_vote_count
0,The Matrix,624860,tt10838180,190000000,Science Fiction,Action,975.279,156497322,148,6.8,3117


### IMDb Data Gathering

In [18]:
# Taking the IMDb ID that TMDb returned and dropping its first two characters (the 'tt' prefix)
imdb_id = df.loc[0, 'imdb_id'][2:]
imdb_id

'10838180'

In [19]:
# Looking the movie up in IMDbPY by its IMDb ID and converting the result into a plain dictionary
imdb_movie = imdb_search.get_movie(imdb_id)
imdb_details = dict(imdb_movie)

In [20]:
# Renaming the features appropriately.
# Each rename is skipped when its target key already exists, so re-running this cell is a no-op
# instead of raising KeyError on a source key that an earlier run already popped.
# pop's NaN default keeps the cell from failing when the IMDb record has no 'rating' or 'votes'
# entry, matching the null convention used for the other missing features in this notebook.
# NOTE(review): assumed to happen for titles without ratings - confirm against IMDbPY.
imdb_renames = {
    'rating': 'imdb_rating',
    'votes': 'imdb_votes',
}
for source_key, feature_name in imdb_renames.items():
    if feature_name not in imdb_details:
        imdb_details[feature_name] = imdb_details.pop(source_key, np.nan)

In [21]:
# Copying each retained IMDb feature into its own df column
imdb_feature_values = {feat: imdb_details[feat] for feat in IMDB_FEATS}
for column_name, column_value in imdb_feature_values.items():
    df[column_name] = column_value

In [22]:
# Viewing what df looks like after getting IMDb data
# (the bare expression on the last line is what the notebook renders as this cell's output)
df

Unnamed: 0,movie_name,tmdb_id,imdb_id,budget,primary_genre,secondary_genre,tmdb_popularity,revenue,runtime,tmdb_vote_average,tmdb_vote_count,imdb_rating,imdb_votes,year
0,The Matrix,624860,tt10838180,190000000,Science Fiction,Action,975.279,156497322,148,6.8,3117,5.7,201964,2021


### OMDb Data Gathering

In [23]:
# Querying OMDb by the IMDb ID stored in df (the untrimmed value, not the shortened imdb_id)
omdb_lookup_id = df.loc[0, 'imdb_id']
omdb_details = omdb_client.imdbid(omdb_lookup_id)

In [24]:
# Setting the Rotten Tomatoes critic score based on availability.
# Starting from null guarantees that 'rt_critic_score' always exists afterwards. Previously the
# key was never set when OMDb returned ratings from other sources but none from Rotten Tomatoes,
# which made the next cell fail with a KeyError.
omdb_details['rt_critic_score'] = np.nan
for rater in omdb_details['ratings']:
    if rater['source'] == 'Rotten Tomatoes':
        omdb_details['rt_critic_score'] = rater['value']

In [25]:
# Copying each retained OMDb feature into its own df column
omdb_feature_values = {feat: omdb_details[feat] for feat in OMDB_FEATS}
for column_name, column_value in omdb_feature_values.items():
    df[column_name] = column_value

In [26]:
# Viewing what df looks like after getting OMDb data
# (the bare expression on the last line is what the notebook renders as this cell's output)
df

Unnamed: 0,movie_name,tmdb_id,imdb_id,budget,primary_genre,secondary_genre,tmdb_popularity,revenue,runtime,tmdb_vote_average,tmdb_vote_count,imdb_rating,imdb_votes,year,rt_critic_score,metascore
0,The Matrix,624860,tt10838180,190000000,Science Fiction,Action,975.279,156497322,148,6.8,3117,5.7,201964,2021,63%,63


### Rotten Tomatoes Data Gathering

In [27]:
# Reading the critic score that OMDb reported (a string such as '63%', or null when unavailable)
omdb_critic_score = df['rt_critic_score'][0]

# Setting the Rotten Tomatoes audience score to be null if RT critic score is not present from OMDb output
if pd.isna(omdb_critic_score):
    rt_movie_details = {'rt_audience_score': np.nan}
else:
    # Setting the Rotten Tomatoes audience score appropriately from the RT scraper object if present
    try:
        # Getting the movie metadata from the RT scraper
        movie_name = df['movie_name'][0]
        rt_movie_scraper = MovieScraper(movie_title = movie_name)
        rt_movie_scraper.extract_metadata()

        # Extracting the critic and audience scores from the metadata
        rt_critic_score = rt_movie_scraper.metadata['Score_Rotten']
        rt_audience_score = rt_movie_scraper.metadata['Score_Audience']

        # Comparing the rt_critic_score from the RT scraper to the OMDb output; the audience score
        # is only kept when the two critic scores agree (presumably a check that the scraper
        # resolved the same film as OMDb - confirm).
        # The trailing '%' is stripped rather than slicing the first two characters, which turned
        # '100%' into '10' and left '5%' unchanged, so those scores could never match.
        if rt_critic_score == omdb_critic_score.rstrip('%'):
            rt_movie_details = {'rt_audience_score': rt_audience_score}
        else:
            rt_movie_details = {'rt_audience_score': np.nan}

    # Setting the Rotten Tomatoes audience score to be null if the scraper lookup fails for any
    # reason; catching Exception (not a bare except) lets KeyboardInterrupt still stop the cell
    except Exception:
        rt_movie_details = {'rt_audience_score': np.nan}

In [28]:
# Copying each retained Rotten Tomatoes feature into its own df column
rott_feature_values = {feat: rt_movie_details[feat] for feat in ROTT_FEATS}
for column_name, column_value in rott_feature_values.items():
    df[column_name] = column_value

In [29]:
# Viewing what df looks like after getting Rotten Tomatoes data
# (the bare expression on the last line is what the notebook renders as this cell's output)
df

Unnamed: 0,movie_name,tmdb_id,imdb_id,budget,primary_genre,secondary_genre,tmdb_popularity,revenue,runtime,tmdb_vote_average,tmdb_vote_count,imdb_rating,imdb_votes,year,rt_critic_score,metascore,rt_audience_score
0,The Matrix,624860,tt10838180,190000000,Science Fiction,Action,975.279,156497322,148,6.8,3117,5.7,201964,2021,63%,63,


## Model Inference
Now that we have gotten all our data from the respective APIs, we are finally ready to generate an inference for our movie predictions!

In [30]:
# Running the binary classification pipeline on the gathered features to get the Biehn "yes or no" approval
biehn_yes_or_no_preds = binary_classification_pipeline.predict(df[ALL_FEATS])
df['biehn_yes_or_no'] = biehn_yes_or_no_preds

In [31]:
# Running the regression pipeline on the gathered features to get the Biehn Scale score
biehn_scale_score_preds = regression_pipeline.predict(df[ALL_FEATS])
df['biehn_scale_score'] = biehn_scale_score_preds

In [32]:
# Showing only the movie title and the two model predictions as the final inference output
final_output_cols = ['movie_name', 'biehn_yes_or_no', 'biehn_scale_score']
df[final_output_cols]

Unnamed: 0,movie_name,biehn_yes_or_no,biehn_scale_score
0,The Matrix,Yes,6.44178
