In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from unidecode import unidecode
#import scipy.sparse as sp this is a leftover from one of the attempts at adjusting the tfidf values, too complicated, so left it alone


In [5]:
# Load the dataset CSV into the module-level DataFrame that the functions below query.
df = pd.read_csv(
    'dataset/clean_dataset_list_version.csv',
    encoding="utf-8",
)

The original idea for this part was to use the Spanish locale to get the month names, but getting FastAPI and Render to work with a Spanish locale was not possible.

#@app.get('/peliculas_mes/{mes}')
def peliculas_mes(mes:str):
    '''Return how many movies were historically released in the given month.

    Locale-based variant: the month names come from pandas rendered with the
    'Spanish' locale, and `mes` is compared against them ignoring case.
    A later definition with the same name in this file replaces this one.

    Side effects: the parsed dates are written back to df['release_date'] and
    the month names are stored in df['month'].
    '''
    # Parse the release dates; values that cannot be parsed become NaT.
    parsed_dates = pd.to_datetime(df['release_date'], errors='coerce')
    df['release_date'] = parsed_dates

    # Month name of every release date, rendered with the 'Spanish' locale.
    df['month'] = df['release_date'].dt.month_name(locale='Spanish')

    # Rows whose month name equals the requested one, ignoring case.
    is_requested_month = df['month'].str.lower() == mes.lower()

    # Report the capitalized input together with the number of matching rows.
    return {'mes': mes.capitalize(), 'cantidad': len(df[is_requested_month])}


In [6]:
#@app.get('/peliculas_mes/{mes}')
def peliculas_mes(mes: str):
    '''Count the movies released in a given month, across all years.

    Parameters
    ----------
    mes : str
        Month name in Spanish (e.g. 'enero'). Case and surrounding
        whitespace are ignored.

    Returns
    -------
    dict
        {'mes': <capitalized month>, 'cantidad': <int>} on success, or
        {'error': 'Month not found'} when `mes` is not one of the twelve
        Spanish month names (previously this surfaced as a bare KeyError).

    Side effects
    ------------
    Converts df['release_date'] to datetime and (re)writes df['mes'] with the
    lower-cased English month name of every row, as the original did.
    '''
    #spanish month -> lower-cased english month name as produced by pandas
    meses_espanol = {
        'enero': 'january',
        'febrero': 'february',
        'marzo': 'march',
        'abril': 'april',
        'mayo': 'may',
        'junio': 'june',
        'julio': 'july',
        'agosto': 'august',
        'septiembre': 'september',
        'octubre': 'october',
        'noviembre': 'november',
        'diciembre': 'december'
    }

    #validate the input before touching the dataframe
    mes_limpio = mes.strip()
    mes_ingles = meses_espanol.get(mes_limpio.lower())
    if mes_ingles is None:
        return {'error': 'Month not found'}

    #release_date is read from csv as text, so make sure it is datetime
    df['release_date'] = pd.to_datetime(df['release_date'])

    #month_name() without a locale returns english names
    df['mes'] = df['release_date'].dt.month_name().str.lower()

    #count the rows released in the requested month
    cantidad = int((df['mes'] == mes_ingles).sum())

    #return as dictionary
    return {'mes': mes_limpio.capitalize(), 'cantidad': cantidad}

In [7]:
peliculas_mes('enero')

{'mes': 'Enero', 'cantidad': 5912}

Same issue as with the months: getting FastAPI and Render to work properly with a Spanish locale wasn't possible, so this code is left here for future reference.

#@app.get('/peliculas_dia/{dia}')
def peliculas_dia(dia:str):
    '''Return how many movies were historically released on the given weekday.

    Locale-based variant: the weekday names come from pandas rendered with the
    'Spanish' locale. Both those names and `dia` are passed through unidecode
    and lower-cased before comparing, so accents and case are ignored.
    A later definition with the same name in this file replaces this one.

    Side effects: the parsed dates are written back to df['release_date'] and
    the weekday names are stored in df['day_of_week'].
    '''
    #turning release_date into datetime; unparseable values become NaT
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

    #getting the day name for the day_of_week column
    df['day_of_week'] = df['release_date'].dt.day_name(locale='Spanish')

    #rows with NaT have a missing day name; na_action='ignore' skips them
    #instead of handing a NaN to unidecode (which only accepts strings)
    dias_normalizados = df['day_of_week'].map(
        lambda nombre: unidecode(nombre).lower(), na_action='ignore'
    )

    #filtering to find the day
    df_dia = df[dias_normalizados == unidecode(dia.lower())]

    #getting the amount of movies released on that day
    cantidad_peliculas = len(df_dia)

    #returning it as a dictionary.
    return {'dia_semana': dia.capitalize(), 'cantidad': cantidad_peliculas}

In [8]:
#@app.get('/peliculas_dia/{dia}')
def peliculas_dia(dia: str):
    '''Count the movies released on a given day of the week, across all years.

    Parameters
    ----------
    dia : str
        Weekday name in Spanish (e.g. 'miércoles' or 'miercoles'). Case,
        accents and surrounding whitespace are ignored.

    Returns
    -------
    dict
        {'dia_semana': <capitalized day>, 'cantidad': <int>} on success, or
        {'error': 'Day not found'} when `dia` is not a Spanish weekday name
        (previously this surfaced as a bare KeyError).

    Side effects
    ------------
    Converts df['release_date'] to datetime and (re)writes df['day_of_week']
    with the English weekday name of every row, as the original did.
    '''
    #accent-free spanish day -> english day name as produced by pandas
    dias_espanol = {
        'lunes': 'Monday',
        'martes': 'Tuesday',
        'miercoles': 'Wednesday',
        'jueves': 'Thursday',
        'viernes': 'Friday',
        'sabado': 'Saturday',
        'domingo': 'Sunday'
    }

    #validate the input before touching the dataframe; unidecode drops accents
    dia_limpio = dia.strip()
    dia_ingles = dias_espanol.get(unidecode(dia_limpio.lower()))
    if dia_ingles is None:
        return {'error': 'Day not found'}

    #release_date is read from csv as text, so make sure it is datetime
    df['release_date'] = pd.to_datetime(df['release_date'])

    #day_name() without a locale returns english names
    df['day_of_week'] = df['release_date'].dt.day_name()

    #count the rows released on the requested day
    cantidad = int((df['day_of_week'] == dia_ingles).sum())

    #returning it as a dictionary.
    return {'dia_semana': dia_limpio.capitalize(), 'cantidad': cantidad}


In [9]:
peliculas_dia('miercoles')

{'dia_semana': 'Miercoles', 'cantidad': 7035}

In [10]:
#@app.get('/franquicia/{franquicia}')
def franquicia(franquicia:str):
    '''Get the number of movies, total revenue and average revenue of a collection.

    Parameters
    ----------
    franquicia : str
        Text to look for in the 'belongs_to_collection' column. It is matched
        as a literal, case-insensitive substring (not as a regular expression,
        so characters such as '(' or '.' in the input are safe).

    Returns
    -------
    dict
        {'franquicia': <input>, 'cantidad': <int>,
         'ganancia_total': <str>, 'ganancia_promedio': <str>}
        where both amounts are rounded to 2 decimals and formatted with
        thousands separators. When nothing matches, the count is 0 and the
        average is reported as 0.0 instead of 'nan'.
    '''
    #literal, case-insensitive match; rows without a collection never match
    pertenece = df['belongs_to_collection'].str.contains(
        franquicia, case=False, na=False, regex=False
    )
    movie_collection = df[pertenece]

    #getting the amount of movies in that collection
    mov_quant = len(movie_collection)

    #adding together the total earnings from the movies
    total_earnings = round(movie_collection['revenue'].sum(), 2)

    #the mean of an empty (or all-missing) selection is NaN; report 0.0 instead
    avg_earnings = movie_collection['revenue'].mean()
    avg_earnings = 0.0 if pd.isna(avg_earnings) else round(avg_earnings, 2)

    #returning as dictionary
    return {'franquicia': franquicia, 'cantidad': mov_quant, 'ganancia_total': f'{total_earnings:,}', 'ganancia_promedio': f'{avg_earnings:,}'}


In [11]:
franquicia('chili')

{'franquicia': 'chili',
 'cantidad': 2,
 'ganancia_total': '210,327,738.0',
 'ganancia_promedio': '105,163,869.0'}

In [12]:
#@app.get('/peliculas_pais/{pais}')
def peliculas_pais(pais:str):
    '''Get the amount of movies produced in a certain country.

    Parameters
    ----------
    pais : str
        Text to look for in the 'production_countries' column. It is matched
        as a literal, case-insensitive substring (not as a regular expression,
        so characters such as '(' or '.' in the input are safe).

    Returns
    -------
    dict
        {'pais': <capitalized input>, 'cantidad': <int>}
    '''
    #literal, case-insensitive match; rows without countries never match
    producida_en_pais = df['production_countries'].str.contains(
        pais, case=False, na=False, regex=False
    )

    #getting total movies produced within the country
    quant = int(producida_en_pais.sum())

    #return as dictionary
    return {'pais': pais.capitalize(), 'cantidad': quant}

In [13]:
peliculas_pais('germany')

{'pais': 'Germany', 'cantidad': 2260}

In [14]:
#@app.get('/productoras/{productora}')
def productoras(productora:str):
    '''Get the total revenue and the amount of movies produced by a certain company.

    Parameters
    ----------
    productora : str
        Text to look for in the 'production_companies' column. It is matched
        as a literal, case-insensitive substring (not as a regular expression,
        so characters such as '[' or '.' in the input are safe).

    Returns
    -------
    dict
        {'productora': <title-cased input>, 'ganancia_total': <str>, 'cantidad': <int>}
        where the total is formatted with thousands separators.
    '''
    #literal, case-insensitive match; rows without companies never match
    producida_por = df['production_companies'].str.contains(
        productora, case=False, na=False, regex=False
    )
    filtered_production = df[producida_por]

    #total revenue and number of movies for the matching rows
    total_earnings = filtered_production['revenue'].sum()
    quant = len(filtered_production)

    #return as dictionary
    return {'productora': productora.title(), 'ganancia_total': f'{total_earnings:,}', 'cantidad': quant}

In [15]:
productoras('universal')

{'productora': 'Universal',
 'ganancia_total': '58,548,254,273.0',
 'cantidad': 1103}

In [16]:
#@app.get('/retorno/{pelicula}')
def retorno(pelicula:str):
    '''Get the investment, earnings, return and release year of a certain movie.

    Parameters
    ----------
    pelicula : str
        Text to look for in the 'title' column. It is matched as a literal,
        case-insensitive substring (not as a regular expression). When several
        titles match, the first matching row of the dataframe is used.

    Returns
    -------
    dict
        {'pelicula': <title-cased input>, 'inversion': <str>, 'ganancia': <str>,
         'retorno': <str>, 'anio': <int>} on success, with the amounts
        formatted with thousands separators and the return rounded to
        2 decimals, or {'error': 'Movie not found'} when no title matches
        (previously this surfaced as an IndexError).
    '''
    #filtering df by specified movie (literal match, missing titles never match)
    coincide = df['title'].str.contains(pelicula, case=False, na=False, regex=False)
    filtered_movie = df[coincide]

    #nothing matched: report it instead of failing on the first-row lookup
    if filtered_movie.empty:
        return {'error': 'Movie not found'}

    #reading the values of the first matching row
    investment = filtered_movie['budget'].iloc[0]
    earnings = filtered_movie['revenue'].iloc[0]
    #the precomputed 'return' column is used as-is rather than earnings - investment
    roi = round(filtered_movie['return'].iloc[0], 2)
    year = int(filtered_movie['release_year'].iloc[0])

    #return as dictionary
    return {'pelicula': pelicula.title(), 'inversion': f'{investment:,}', 'ganancia': f'{earnings:,}', 'retorno': f'{roi:,}', 'anio': year}


In [17]:
retorno('shrek')

{'pelicula': 'Shrek',
 'inversion': '60,000,000.0',
 'ganancia': '484,409,218.0',
 'retorno': '8.07',
 'anio': 2001}

In [18]:
#filling the missing text values in the df with empty strings to avoid errors, preprocessing the text, and then building the tf-idf matrix for the recommendation function.

# Replace missing values in the text columns with empty strings, in place.
df.fillna(
    value=dict.fromkeys(['overview', 'tagline', 'genres', 'belongs_to_collection'], ''),
    inplace=True,
)


def preprocess_text(text):
    '''Lower-case `text` and keep only its alphanumeric and whitespace characters.

    Punctuation and other symbols are deleted (not replaced by a space), so
    words joined by them are merged, e.g. "Sci-Fi" becomes "scifi".
    '''
    def _is_kept(char):
        #letters, digits and any kind of whitespace survive
        return char.isalnum() or char.isspace()

    return ''.join(filter(_is_kept, text.lower()))

#joining the overview, tagline and genres of every movie into one text and normalising it;
#the raw joined text is only an intermediate value, so it is never stored as a column
df = df.assign(
    processed_text=(df['overview'] + ' ' + df['tagline'] + ' ' + df['genres']).map(preprocess_text)
)

#calculate the tf-idf matrix (one row per row of df, in the same order).
#TfidfVectorizer's built-in english stop word list saves a manual cleaning step
#and, per the original author's note, keeps memory usage down
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['processed_text'])


In [19]:
# ML
#@app.get('/recomendacion/{titulo}')
def recomendacion(titulo: str):
    '''Recommend up to 5 movies similar to the first movie whose title matches.

    The similarity is the dot product (linear_kernel) between the matched
    movie's row of `tfidf_matrix` and every row of the matrix; the rows of
    `tfidf_matrix` are expected to be in the same order as the rows of `df`.

    Parameters
    ----------
    titulo : str
        Text to look for in the 'title' column. It is matched as a literal,
        case-insensitive substring (not as a regular expression). When several
        titles match, the first matching row of the dataframe is used.

    Returns
    -------
    dict
        {'Lista recomendada': [<title-cased titles>]}, most similar first, or
        {'error': 'Movie not found'} when no title matches.
    '''
    #literal, case-insensitive match; missing titles never match
    coincide = df['title'].str.contains(titulo, case=False, na=False, regex=False)

    #row POSITIONS of the matches: tfidf_matrix and argsort below work with
    #positions, which only equal index labels when df has a default RangeIndex
    posiciones = np.flatnonzero(coincide.to_numpy())

    if posiciones.size == 0:
        return {'error': 'Movie not found'}

    movie_pos = int(posiciones[0])
    movie_vector = tfidf_matrix[movie_pos]

    #calculating the similarity between the input movie and all movies
    cosine_similarities = linear_kernel(movie_vector, tfidf_matrix).flatten()

    #getting the positions of movies sorted by similarity, most similar first
    similar_movie_indices = cosine_similarities.argsort()[::-1]

    #filtering out the input movie itself and keeping the top 5
    similar_movie_indices = similar_movie_indices[similar_movie_indices != movie_pos][:5]
    recommended_movies = df['title'].iloc[similar_movie_indices].str.title().tolist()

    #returning the recommended movies as dictionary
    return {'Lista recomendada': recommended_movies}

In [24]:
recomendacion('lord of the rings')

{'Lista recomendada': ['The Return Of The King',
  'The Lord Of The Rings: The Fellowship Of The Ring',
  'The Lord Of The Rings: The Return Of The King',
  'A Fighting Man',
  'The Hunt For Gollum']}