# Importing Libraries

In [1]:
import pandas as pd
from unidecode import unidecode
import re

In [2]:
# Load the pre-processed CSV files that the API functions below read from.
df_movies = pd.read_csv("../processed_data/movies.csv") # used by functions 1, 2, 3, 4 and 6
df_crew = pd.read_csv("../processed_data/crew.csv") # used by function 6
actor_financial = pd.read_csv("../processed_data/actor_financial.csv") # used by function 5
director_financial = pd.read_csv("../processed_data/director_financial.csv") # used by function 6

In [23]:
# Lower-case the calendar columns so they compare equal to the normalized user queries.
# The vectorized .str accessor keeps missing values as NaN, whereas calling x.lower()
# on every element raises AttributeError as soon as one value is a float NaN.

df_movies["release_month"] = df_movies["release_month"].str.lower()
df_movies["release_day"] = df_movies["release_day"].str.lower()

# Functions

In this notebook we build and test the functions behind the API, except for the function that calls the recommendation model; the model has its own notebook.

## API Root

Here we can test the welcome page of our movies API.

###  Using jinja2 library (dynamic)

In [2]:
from fastapi import FastAPI, Request # this represent a HTML request
from fastapi.responses import HTMLResponse # output will be an HTML response
from fastapi.templating import Jinja2Templates # import use custom html templates

app = FastAPI() # application instance the route below is registered on
templates = Jinja2Templates(directory="../templates") # directory that holds the custom HTML templates

@app.get('/', response_class=HTMLResponse)
def welcome(request: Request):
    """Render ``root.html`` through Jinja2 as the response for the API root path."""
    # The template context carries the incoming request under the "request" key.
    # NOTE(review): recent Starlette releases prefer TemplateResponse(request, "root.html") —
    # confirm against the installed version.
    return templates.TemplateResponse("root.html", {"request": request})

import nest_asyncio 
nest_asyncio.apply() # necessary in a notebook, where an event loop is already running


import uvicorn
# Starts the server from this cell; the captured log shows it serving until it is shut down.
# Use http://localhost:8000/ to test.
# NOTE(review): host="0.0.0.0" listens on every network interface — confirm this is wanted for a local test.
uvicorn.run(app, host="0.0.0.0", port=8000)

INFO:     Started server process [17980]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:62261 - "GET / HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [17980]


### Without jinja2 library (static)

In [3]:
app = FastAPI() # new application instance for the static variant

@app.get("/", response_class=HTMLResponse)
def welcome(request: Request):
    """Serve the welcome page by returning the contents of ``root.html`` unchanged.

    No template rendering takes place: the file is read on every request and sent
    back as an HTML response with status 200. ``request`` is accepted but not used.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding
    # (which differs between operating systems); Jinja2's file loader also reads
    # templates as UTF-8 by default, so both variants now decode the file the same way.
    with open("../templates/root.html", "r", encoding="utf-8") as file:
        html_content = file.read()
    return HTMLResponse(content=html_content, status_code=200)

import nest_asyncio
nest_asyncio.apply() # needed in a notebook, where an event loop is already running

import uvicorn
# Starts the server from this cell; use http://localhost:8000/ to test.
# NOTE(review): host="0.0.0.0" listens on every network interface — confirm this is wanted for a local test.
uvicorn.run(app, host="0.0.0.0", port=8000)


INFO:     Started server process [17980]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:62594 - "GET / HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [17980]


## String Transformation

In [3]:
def string_transformation(text):
    """Normalize free text so user queries and stored values can be compared.

    The text is transliterated to ASCII, lower-cased, and stripped of everything
    that is not a letter, digit or underscore (spaces, tabs, newlines, punctuation).

    Args:
        text: The value to normalize.

    Returns:
        The normalized string, or the message "Entered value is not valid." when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return "Entered value is not valid."
    # Transliterate first: unidecode can itself emit capital letters and spaces
    # (for example for CJK characters), so case folding and whitespace removal
    # must run on its output, not before it.
    text = unidecode(text)  # delete accents
    text = text.lower()
    # \W also matches whitespace, so tabs and newlines are removed along with spaces
    return re.sub(r'\W', '', text)  # delete spaces, special characters and punctuation marks

In [14]:
# Sanity check: a capitalized word should come back lower-cased.
string_transformation("June")

'june'

## 1. Number of films by month

In [12]:
def cantidad_filmaciones_mes(month: str):
    """Return how many titles were released in the given month.

    The input is normalized with ``string_transformation`` and compared against the
    values stored in ``df_movies["release_month"]``.

    Returns:
        ``{month: count}``, where ``count`` is the number of non-null titles for that
        month as a built-in ``int``; or an error message string when the normalized
        input matches no value in the column.
    """
    month = string_transformation(month)
    in_month = df_movies["release_month"] == month
    if not in_month.any():
        # returned rather than printed, otherwise the API output would be null
        return "Entered value is not valid."
    # int() gives a built-in int; fastapi doesn't process numpy.int64 objects
    return {month: int(df_movies.loc[in_month, "title"].count())}



In [25]:
# Look up the number of titles released in January.
cantidad_filmaciones_mes("january")

{'january': 5909}

In [26]:
# Another way to get a dictionary with the answer: turn the grouped counts back into
# a regular frame, locate the row for the month, and convert that row to a dict.

count_by_month = df_movies.groupby(["release_month"])["title"].count().reset_index()
# idxmax() on the boolean mask gives the index of the first True, i.e. the "june" row
index = (count_by_month["release_month"] == "june" ).idxmax()
count_by_month.loc[index, :].to_dict()
 

{'release_month': 'june', 'title': 3151}

In [27]:
# Show the title counts for every month; rebinds count_by_month to a Series indexed by month.
count_by_month = df_movies.groupby(["release_month"])["title"].count()
count_by_month

release_month
april        3452
august       3393
december     3781
february     3028
january      5909
july         2638
june         3151
march        3549
may          3336
november     3661
october      4613
september    4834
Name: title, dtype: int64

## 2. Number of films by day of the week.

In [28]:
def cantidad_filmaciones_dia(day: str):
    """Return how many titles were released on the given day of the week.

    The input is normalized with ``string_transformation`` and compared against the
    values stored in ``df_movies["release_day"]``.

    Returns:
        ``{day: count}``, where ``count`` is the number of non-null titles for that
        day as a built-in ``int``; or an error message string when the normalized
        input matches no value in the column.
    """
    day = string_transformation(day)
    on_day = df_movies["release_day"] == day
    if not on_day.any():
        return "Entered value is not valid."
    # convert to a built-in int so the value can be serialized by the API
    titles_on_day = df_movies.loc[on_day, "title"].count()
    return {day: int(titles_on_day)}

In [29]:
# The trailing "-" is punctuation, which string_transformation removes before the lookup.
cantidad_filmaciones_dia("friday-")

{'friday': 13902}

In [30]:
# Show the title counts for every day of the week.
count_by_day = df_movies.groupby(["release_day"])["title"].count()
count_by_day 

release_day
friday       13902
monday        3500
saturday      5149
sunday        3608
thursday      7520
tuesday       4639
wednesday     7027
Name: title, dtype: int64

## 3. Getting the popularity of a movie introducing the title.

There is a situation with the movies that has to be addressed here: some titles appear more than once because the film was released in different languages. This function gets the popularity by using idxmax() to find the index that matches the title; idxmax() only considers the first occurrence.

In [11]:
# Inspect the rows that share the title "12 Angry Men" (the duplicate-title case).
df_movies[df_movies["title"] == "12 Angry Men"]

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,tagline,title,...,collection,genres_list,spoken_languages_list,production_companies_list,production_countries_list,release_year,return,transformed_title,release_month,release_day
1159,350000.0,389,en,The defense and the prosecution have rested an...,16.503959,1957-03-25,1000000.0,96.0,Life is in their hands. Death is on their minds.,12 Angry Men,...,,['Drama'],['en'],"['United Artists', 'Orion-Nova Productions']",['US'],1957,2.86,12angrymen,marzo,lunes
15187,0.0,12219,en,During the trial of a man accused of his fathe...,3.679263,1997-08-17,0.0,117.0,,12 Angry Men,...,,"['Crime', 'Drama']","['en', 'hu', 'ru']",['MGM Television'],['US'],1997,0.0,12angrymen,agosto,domingo


In [11]:
def score_titulo(title: str):
    """Return title, release year and popularity of the first movie matching ``title``.

    Both the input and every stored title are normalized with
    ``string_transformation``. The normalized titles are written to
    ``df_movies["transformed_title"]`` on every call. When several rows share the
    normalized title, only the first one is used.

    NOTE: a later cell defines ``score_titulo`` again; when the notebook is run top
    to bottom, that later definition replaces this one.

    Returns:
        A dict with keys "Title", "Year" and "Popularity" (rounded to 2 decimals),
        or an error message string when no title matches.
    """
    title = string_transformation(title)
    df_movies["transformed_title"] = [string_transformation(raw) for raw in df_movies["title"]]

    matches = df_movies["transformed_title"] == title
    if not matches.any():
        return "Entered value is not valid."

    # there are repeated movies (different language): idxmax() picks the first True
    first = matches.idxmax()
    year = df_movies.at[first, "release_year"]
    popularity = df_movies.at[first, "popularity"]
    return {
        "Title": df_movies.at[first, "title"],
        "Year": year.item(),
        "Popularity": popularity.round(2).item(),
    }



In [12]:
# Mixed case, a comma and a missing space: the normalized input should still match the title.
score_titulo("12angry, MEN")

{'Title': '12 Angry Men', 'Year': 1957, 'Popularity': 16.5}

The function below instead groups the rows by title and sums the popularity of every occurrence.

In [18]:
def score_titulo(title: str):
    """Return title, release year and summed popularity for ``title``.

    Both the input and every stored title are normalized with
    ``string_transformation``; the normalized titles are written to
    ``df_movies["transformed_title"]`` on every call. Popularity is summed over all
    rows sharing the normalized title, while "Title" and "Year" are taken from the
    first of those rows.

    Returns:
        A dict with keys "Title", "Year" and "Popularity" (rounded to 2 decimals),
        or an error message string when no title matches.
    """
    title = string_transformation(title)
    df_movies["transformed_title"] = [string_transformation(raw) for raw in df_movies["title"]]

    # after grouping, the normalized titles become the index of the result
    popularity_by_title = df_movies.groupby("transformed_title")["popularity"].sum()
    if title not in popularity_by_title.index:
        return "Entered value is not valid."

    # label of the first original (ungrouped) row carrying this title
    first = (df_movies["transformed_title"] == title).idxmax()
    total_popularity = popularity_by_title[title]
    return {
        "Title": df_movies.at[first, "title"],
        "Year": df_movies.at[first, "release_year"].item(),
        "Popularity": total_popularity.round(2).item(),
    }



In [19]:
# With the grouped variant, the popularity is the sum over every row sharing this title.
score_titulo("12angry MEN")

{'Title': '12 Angry Men', 'Year': 1957, 'Popularity': 20.18}

## 4. Getting total votes and average vote of a movie.

Here we have to deal with the same situation as in the previous function. This is the solution using idxmax().

In [21]:
def votos_titulo(title: str):
    """Return vote count and average vote of the first movie matching ``title``.

    Both the input and every stored title are normalized with
    ``string_transformation``; the normalized titles are written to
    ``df_movies["transformed_title"]`` on every call. Only the first matching row
    is considered, and it must have at least 2000 votes.

    NOTE: a later cell defines ``votos_titulo`` again; when the notebook is run top
    to bottom, that later definition replaces this one.

    Returns:
        A dict with keys "Title", "Year", "Total votes" and "Average vote"; a
        message string when the movie has fewer than 2000 votes; or an error
        message string when no title matches.
    """
    title = string_transformation(title)
    df_movies["transformed_title"] = [string_transformation(raw) for raw in df_movies["title"]]

    matches = df_movies["transformed_title"] == title
    if not matches.any():
        return "Entered value is not valid."

    # there are repeated movies (different language): idxmax() picks the first True
    first = matches.idxmax()
    votes = df_movies.at[first, "vote_count"]
    if votes < 2000:
        return "Movie must have at least 2000 votes"

    return {
        "Title": df_movies.at[first, "title"],
        "Year": df_movies.at[first, "release_year"].item(),
        "Total votes": votes.item(),
        "Average vote": df_movies.at[first, "vote_average"].item(),
    }

In [22]:
# Accent, hyphen, mixed case and stray spaces are all removed by the normalization.
votos_titulo("12anGRy- mén ")

{'Title': '12 Angry Men',
 'Year': 1957,
 'Total votes': 2130.0,
 'Average vote': 8.2}

In [18]:
# Try a title that is rejected by the 2000-vote minimum.
votos_titulo("moonlight")

'Movie must have at least 2000 votes'

In [19]:
df_movies[df_movies["title"] == "Moonlight"] # show the row to check its vote_count against the threshold

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,tagline,title,...,collection,genres_list,spoken_languages_list,production_companies_list,production_countries_list,release_year,return,transformed_title,release_month,release_day
39877,4000000.0,376867,en,"The tender, heartbreaking story of a young man...",14.171675,2016-10-21,65046687.0,111.0,This is the story of a lifetime,Moonlight,...,,['Drama'],['en'],"['Plan B Entertainment', 'Upload Films', 'A24'...",['US'],2016,16.26,moonlight,octubre,viernes


And this is the solution that considers all the occurrences, using aggregation functions.

In [23]:
def votos_titulo(title: str):
    """Return total votes and mean average vote over every movie matching ``title``.

    Both the input and every stored title are normalized with
    ``string_transformation``; the normalized titles are written to
    ``df_movies["transformed_title"]`` on every call. Vote counts are summed and
    vote averages are averaged (unweighted) over all rows sharing the normalized
    title; "Title" and "Year" come from the first of those rows.

    Returns:
        A dict with keys "Title", "Year", "Total_Votes" and "Average_Vote"; a
        message string when the summed votes are below 2000; or an error message
        string when no title matches.
    """
    title = string_transformation(title)
    df_movies["transformed_title"] = [string_transformation(raw) for raw in df_movies["title"]]

    # after grouping, the normalized titles become the index of each result
    by_title = df_movies.groupby("transformed_title")
    votes_by_title = by_title["vote_count"].sum()
    average_by_title = by_title["vote_average"].mean()

    if title not in average_by_title.index:
        return "Entered value is not valid."
    if votes_by_title[title] < 2000:
        return "Movie must have at least 2000 votes"

    # label of the first original (ungrouped) row carrying this title
    first = (df_movies["transformed_title"] == title).idxmax()
    return {
        "Title": df_movies.at[first, "title"],
        "Year": df_movies.at[first, "release_year"].item(),
        "Total_Votes": votes_by_title[title].item(),
        "Average_Vote": average_by_title[title].item(),
    }



In [24]:
# Try a title whose summed votes are rejected by the 2000-vote minimum.
votos_titulo("moonlight")

'Movie must have at least 2000 votes'

In [25]:
# With the grouped variant, votes are summed and averages averaged over every row sharing this title.
votos_titulo("12 angry, men")

{'Title': '12 Angry Men',
 'Year': 1957,
 'Total Votes': 2189.0,
 'Average Vote': 7.85}

## 5. Getting return and number of movies of an actor.

In [27]:
def get_actor(name):
    """Return the film count and return figures stored for an actor.

    Both the input and every stored actor name are normalized with
    ``string_transformation``; the normalized names are written to
    ``actor_financial["transformed_name"]`` on every call. The first matching row
    is used.

    Returns:
        A dict with keys 'actor', 'films', 'total_return' (rounded to 2 decimals)
        and 'average_return', or an error message string when no name matches.
    """
    name = string_transformation(name)
    actor_financial["transformed_name"] = [string_transformation(raw) for raw in actor_financial["name"]]

    matches = actor_financial["transformed_name"] == name
    if not matches.any():
        return "Entered value is not valid."

    row = matches.idxmax()  # label of the first matching row
    total_return = actor_financial.at[row, "total_return"]
    return {
        'actor': actor_financial.at[row, "name"],
        'films': actor_financial.at[row, "films"].item(),
        'total_return': total_return.round(2).item(),
        'average_return': actor_financial.at[row, "average_return"].item(),
    }

In [28]:
# The missing space and the accent are both removed by the normalization before the lookup.
get_actor("Johnnydépp")

{'actor': 'Johnny Depp',
 'films': 69,
 'total_return': 2.77,
 'average_return': 2.02}

## 6. Getting the return of a director and information about their movies.

Using Movies LEFT JOIN Directors.

In [38]:
def get_director(name):
    """Return a director's total return and the list of their films.

    This variant starts from the movies and LEFT JOINs the director credits onto
    them. Both the input and the stored names are normalized with
    ``string_transformation``; the normalized director names are written to
    ``director_financial["transformed_name"]`` on every call.

    NOTE: a later cell defines ``get_director`` again; when the notebook is run top
    to bottom, that later definition replaces this one.

    Returns:
        A dict with keys 'director', 'total return' (rounded to 2 decimals) and
        'films' (one record per film with title, release_year, return, budget and
        revenue), or an error message string when no director name matches.
    """
    name = string_transformation(name)
    director_financial["transformed_name"] = [string_transformation(raw) for raw in director_financial["name"]]

    known = director_financial["transformed_name"] == name
    if not known.any():
        return "Entered value is not valid."
    index = known.idxmax()  # label of the row holding this director's figures

    # keep only the director credits, then attach each director's name to the movies
    directors_only = df_crew.loc[df_crew["job"] == "Director", ["name", "id"]]
    merged = df_movies.merge(directors_only, how="left", on="id")
    merged = merged.fillna("")  # replace nan with blank strings so every name is a str

    # normalize the director names so they can be compared with the input
    merged["transformed_director"] = [string_transformation(raw) for raw in merged["name"]]

    # keep the films whose normalized director name equals the input
    directed = merged["transformed_director"] == name
    films = merged.loc[directed, ["title", "release_year", "return", "budget", "revenue"]]
    return {
        'director': director_financial.at[index, "name"],
        'total return': director_financial.at[index, "total_return"].round(2).item(),
        'films': films.to_dict(orient='records'),  # orient=records leaves out the index
    }



In [39]:
# Stray spaces and trailing dots are removed by the normalization before the lookup.
get_director("Ste venSpielberg.. ")

{'director': 'Steven Spielberg',
 'total return': 5.33,
 'films': [{'title': 'Jurassic Park',
   'release_year': 1993,
   'return': 14.6,
   'budget': 63000000.0,
   'revenue': 920100000.0},
  {'title': "Schindler's List",
   'release_year': 1993,
   'return': 14.61,
   'budget': 22000000.0,
   'revenue': 321365567.0},
  {'title': 'E.T. the Extra-Terrestrial',
   'release_year': 1982,
   'return': 75.52,
   'budget': 10500000.0,
   'revenue': 792965326.0},
  {'title': 'Raiders of the Lost Ark',
   'release_year': 1981,
   'return': 21.66,
   'budget': 18000000.0,
   'revenue': 389925971.0},
  {'title': 'Indiana Jones and the Last Crusade',
   'release_year': 1989,
   'return': 9.88,
   'budget': 48000000.0,
   'revenue': 474171806.0},
  {'title': 'Jaws',
   'release_year': 1975,
   'return': 67.24,
   'budget': 7000000.0,
   'revenue': 470654000.0},
  {'title': 'The Lost World: Jurassic Park',
   'release_year': 1997,
   'return': 3.14,
   'budget': 73000000.0,
   'revenue': 229074524.

In [31]:
# Cross-check one of the returned films against its row in df_movies.
df_movies[df_movies["title"] == "E.T. the Extra-Terrestrial"]

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,tagline,title,...,collection,genres_list,spoken_languages_list,production_companies_list,production_countries_list,release_year,return,transformed_title,release_month,release_day
1063,10500000.0,601,en,After a gentle alien becomes stranded on Earth...,19.358546,1982-04-03,792965326.0,115.0,He is afraid. He is alone. He is three million...,E.T. the Extra-Terrestrial,...,,"['Science Fiction', 'Adventure', 'Family', 'Fa...",['en'],"['Universal Pictures', 'Amblin Entertainment']",['US'],1982,75.52,ettheextraterrestrial,abril,sábado


Using Directors LEFT JOIN Movies.

In [42]:
def get_director(name):
    """Return a director's total return and the list of their films.

    This variant starts from the director credits and LEFT JOINs the movies onto
    them. Both the input and the stored names are normalized with
    ``string_transformation``; the normalized director names are written to
    ``director_financial["transformed_name"]`` on every call.

    Credits that have no release year after the join (for example a credit whose
    movie id is absent from ``df_movies``) are left out of the film list.

    Returns:
        A dict with keys 'director', 'total return' (rounded to 2 decimals) and
        'films' (one record per film with title, release_year as an int, return,
        budget and revenue), or an error message string when no director name
        matches.
    """
    name = string_transformation(name)
    director_financial["transformed_name"] = [string_transformation(x) for x in director_financial["name"]]
    known = director_financial["transformed_name"] == name
    if not known.any():
        return "Entered value is not valid."
    index = known.idxmax() # label of the row holding this director's figures

    # Keep only director credits, and narrow them to the requested person BEFORE
    # joining, so the join and the name normalization touch only the relevant rows.
    directors = df_crew.loc[df_crew["job"] == "Director", ["name", "id"]]
    # missing names become "" so the normalizer always receives a string
    same_person = directors["name"].fillna("").map(string_transformation) == name
    movies = directors[same_person].merge(df_movies, how="left", on="id") # join the credits with the movies
    movies = movies[["title", "release_year", "return", "budget", "revenue"]]

    # A credit without a matching movie has NaN in every movie column. Blanking the
    # year and then calling int("") on it raised ValueError, so drop those rows.
    movies = movies.dropna(subset=["release_year"])
    movies = movies.fillna("") # remaining gaps are shown as blank strings
    # the left join turns the years into floats; change years like 1990.0 to 1990
    movies["release_year"] = movies["release_year"].astype(int)
    return {
            'director': director_financial["name"][index],
            'total return': director_financial["total_return"][index].round(2).item(),
            'films': movies.to_dict(orient='records') # orient=records leaves out the index
            }

In [43]:
# Same query as before, now answered by the directors-LEFT-JOIN-movies variant.
get_director("Ste venSpielberg.. ")

{'director': 'Steven Spielberg',
 'total return': 5.33,
 'films': [{'title': 'Jurassic Park',
   'release_year': 1993,
   'return': 14.6,
   'budget': 63000000.0,
   'revenue': 920100000.0},
  {'title': "Schindler's List",
   'release_year': 1993,
   'return': 14.61,
   'budget': 22000000.0,
   'revenue': 321365567.0},
  {'title': 'E.T. the Extra-Terrestrial',
   'release_year': 1982,
   'return': 75.52,
   'budget': 10500000.0,
   'revenue': 792965326.0},
  {'title': 'Raiders of the Lost Ark',
   'release_year': 1981,
   'return': 21.66,
   'budget': 18000000.0,
   'revenue': 389925971.0},
  {'title': 'Indiana Jones and the Last Crusade',
   'release_year': 1989,
   'return': 9.88,
   'budget': 48000000.0,
   'revenue': 474171806.0},
  {'title': 'Jaws',
   'release_year': 1975,
   'return': 67.24,
   'budget': 7000000.0,
   'revenue': 470654000.0},
  {'title': 'The Lost World: Jurassic Park',
   'release_year': 1997,
   'return': 3.14,
   'budget': 73000000.0,
   'revenue': 229074524.