# Importing Labraries

In [2]:
import pandas as pd
from unidecode import unidecode
import re
import locale
locale.setlocale(locale.LC_TIME, 'es_ES') # setting a local configuration for dates

'es_ES'

# Functions

In this notebook, we are going to build some functions to test the API.

## Amount of films by month

In [92]:
def cantidad_filmaciones_mes(month: str):
    if type(month) == str:
        month = month.lower().strip().replace(" ", "")
        month = unidecode(month)  # delete accents
        month = re.sub(r'[^\w\s]', '', month)  # delete special characters and punctuation marks
        df_movies = pd.read_csv("../processed_data/movies.csv")
        # '%B' complete name of the month, strftime is string format time, it allows to format date data to a desirable representation
        df_movies["release_month"] = pd.to_datetime(df_movies["release_date"]).dt.strftime('%B')
        if month in df_movies["release_month"].unique():
            count_by_month = df_movies.groupby(["release_month"])["title"].count()
            return {month: count_by_month[month]}
        else:
            print("El valor ingresado no es un mes válido.")
    else:
        print("Por favor, introduzca el nombre del mes.")


In [93]:
cantidad_filmaciones_mes("+áBril")

{'abril': 3453}

In [15]:
df_movies = pd.read_csv("../processed_data/movies.csv")

In [7]:
df_movies["release_month"] = pd.to_datetime(df_movies["release_date"]).dt.strftime('%B') 
count_by_month = df_movies.groupby(["release_month"])["title"].count()
count_by_month

release_month
abril         3453
agosto        3394
diciembre     3784
enero         5910
febrero       3029
julio         2640
junio         3152
marzo         3553
mayo          3338
noviembre     3661
octubre       4613
septiembre    4836
Name: title, dtype: int64

## Amount of films by day of the week.

In [94]:
def cantidad_filmaciones_dia(day: str):
    if type(day) == str:
        day = day.lower().strip().replace(" ", "")
        day = unidecode(day) # delete accents
        day = re.sub(r'[^\w\s]', '', day) # delete special characters and punctuation marks
        df_movies = pd.read_csv("../processed_data/movies.csv")
        # '%A' complete name of the day, strftime is string format time, it allows to format date data to a desirable representation
        df_movies["release_day"] = pd.to_datetime(df_movies["release_date"]).dt.strftime('%A') 
        if day in df_movies["release_day"].unique():
            count_by_month = df_movies.groupby(["release_day"])["title"].count()
            return {day:count_by_month[day]}
        else:
            print("El valor ingresado no es un dia válido.")
    else:
        print("Por favor, introduzca el nombre del dia de la semana.")
    

In [95]:
cantidad_filmaciones_dia("Viérnes-")

{'viernes': 13906}

## Getting the popularity of a movie introducing the title.

There is a situation with the movies that have to be adressed here, there are duplicates for many movies because there are released in different languages. This function will get the popularity using idxmax() to get the index according to the title. idxmax() will consider only the first occurrence.

In [98]:
def score_titulo(title: str):
    if type(title) == str:
        title = title.lower().strip().replace(" ", "")
        title = unidecode(title) # delete accents
        title = re.sub(r'[^\w\s]', '', title) # delete special characters and punctuation marks
        df_movies = pd.read_csv("../processed_data/movies.csv")
        df_movies["transformed_title"] = [x.lower().strip().replace(" ", "") for x in df_movies["title"]] # with pandas series, we need to use comprehension lists or apply()
        df_movies["transformed_title"] = [unidecode(x) for x in df_movies["transformed_title"]]
        df_movies["transformed_title"] = [re.sub(r'[^\w\s]', '', x) for x in df_movies["transformed_title"]]
        if title in df_movies["transformed_title"].unique(): # there are repeated movies (different language)
            index = (df_movies["transformed_title"] == title).idxmax()
            return {"Title":df_movies["title"][index], "Year":df_movies["release_year"][index], "Popularity":df_movies["popularity"][index]} 
        else:
            print("El nombre de la pélicula no es válido.")
    else:
        print("Por favor, introduzca el titulo de la pélicula.")

In [99]:
score_titulo("12angry, MEN")

{'Title': '12 Angry Men', 'Year': 1957, 'Popularity': 16.503959}

This function right here will group the title column and sum the popularity.

In [81]:
def score_titulo(title: str):
    if type(title) == str:
        title = title.lower().strip().replace(" ", "")
        title = unidecode(title)  # delete accents
        title = re.sub(r'[^\w\s]', '', title)  # delete special characters and punctuation marks
        df_movies = pd.read_csv("../processed_data/movies.csv")
        df_movies["transformed_title"] = [x.lower().strip().replace(" ", "") for x in df_movies["title"]]  # with pandas series, we need to use comprehension lists or apply()
        df_movies["transformed_title"] = [unidecode(x) for x in df_movies["transformed_title"]]
        df_movies["transformed_title"] = [re.sub(r'[^\w\s]', '', x) for x in df_movies["transformed_title"]]       
        df_grouped = df_movies.groupby("transformed_title")["popularity"].sum()
        if title in df_grouped.index: # values of the grouped column are the new index in a grouped df
            return df_grouped[title]
        else:
            print("El nombre de la película no es válido.")
    else:
        print("Por favor, introduzca el título de la película.")


In [83]:
score_titulo("12angry MEN")

20.183221999999997

def votos_titulo( titulo_de_la_filmación ): Se ingresa el título de una filmación esperando como respuesta el título, la cantidad de votos y el valor promedio de las votaciones. La misma variable deberá de contar con al menos 2000 valoraciones, caso contrario, debemos contar con un mensaje avisando que no cumple esta condición y que por ende, no se devuelve ningun valor.
                    Ejemplo de retorno: La película X fue estrenada en el año X. La misma cuenta con un total de X valoraciones, con un promedio de X

## Getting total votes and average vote of a movie.

In [90]:
def votos_titulo(title: str):
    if type(title) == str:
        title = title.lower().strip().replace(" ", "")
        title = unidecode(title) # delete accents
        title = re.sub(r'[^\w\s]', '', title) # delete special characters and punctuation marks
        df_movies = pd.read_csv("../processed_data/movies.csv")
        df_movies["transformed_title"] = [x.lower().strip().replace(" ", "") for x in df_movies["title"]] # with pandas series, we need to use comprehension lists or apply()
        df_movies["transformed_title"] = [unidecode(x) for x in df_movies["transformed_title"]]
        df_movies["transformed_title"] = [re.sub(r'[^\w\s]', '', x) for x in df_movies["transformed_title"]]
        if title in df_movies["transformed_title"].unique(): # there are repeated movies (different language)
            index = (df_movies["transformed_title"] == title).idxmax()
            return {"Title": "dasd", "Anio": "dsd", "Total votes": df_movies["vote_count"][index], "Average vote": df_movies["vote_average"][index]}   
        else:
            print("El nombre de la pélicula no es válido.")
    else:
        print("Por favor, introduzca el titulo de la pélicula.")

In [91]:
votos_titulo("- toy stOry")

{'Title': 'dasd', 'Anio': 'dsd', 'Total votes': 5415.0, 'Average vote': 7.7}

In [87]:
df_movies.head(1)

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,status,title,vote_average,vote_count,collection,genres_list,spoken_languages_list,production_companies_list,production_countries_list,release_year,return,transformed_title
0,30000000.0,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,Toy Story,7.7,5415.0,Toy Story Collection,"['Animation', 'Comedy', 'Family']",['en'],['Pixar Animation Studios'],['US'],1995,12.451801,toystory


In [None]:
def votos_titulo(title: str):
    if type(title) == str:
        title = title.lower().strip().replace(" ", "")
        title = unidecode(title)  # delete accents
        title = re.sub(r'[^\w\s]', '', title)  # delete special characters and punctuation marks
        df_movies = pd.read_csv("../processed_data/movies.csv")
        df_movies["transformed_title"] = [x.lower().strip().replace(" ", "") for x in df_movies["title"]]  # with pandas series, we need to use comprehension lists or apply()
        df_movies["transformed_title"] = [unidecode(x) for x in df_movies["transformed_title"]]
        df_movies["transformed_title"] = [re.sub(r'[^\w\s]', '', x) for x in df_movies["transformed_title"]]
        df_grouped_total = df_movies.groupby("transformed_title")["popularity"].sum()
        df_grouped_average = df_movies.groupby("transformed_title")["popularity"].sum()
        if title in df_grouped.index: # values of the grouped column are the new index in a grouped df
            return df_grouped[title]
        else:
            print("El nombre de la película no es válido.")    


    else:
        print("Por favor, introduzca el título de la película.")

def get_actor( nombre_actor ): Se ingresa el nombre de un actor que se encuentre dentro de un dataset debiendo devolver el éxito del mismo medido a través del retorno. Además, la cantidad de películas que en las que ha participado y el promedio de retorno. La definición no deberá considerar directores.
                    Ejemplo de retorno: El actor X ha participado de X cantidad de filmaciones, el mismo ha conseguido un retorno de X con un promedio de X por filmación


def get_director( nombre_director ): Se ingresa el nombre de un director que se encuentre dentro de un dataset debiendo devolver el éxito del mismo medido a través del retorno. Además, deberá devolver el nombre de cada película con la fecha de lanzamiento, retorno individual, costo y ganancia de la misma.