In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from unidecode import unidecode
#import scipy.sparse as sp this is a leftover from one of the attempts at adjusting the tfidf values, too complicated, so left it alone, but wanted to keep it as reminder


In [3]:
df = pd.read_csv('dataset/clean_dataset.csv', encoding="utf-8")

The main idea for this part was to use the spanish locale to call the days, but getting fastapi and render to work in spanish was impossible.

@app.get('/peliculas_mes/{mes}')
def peliculas_mes(mes:str):
    '''Se ingresa el mes y la funcion retorna la cantidad de peliculas que se estrenaron ese mes historicamente'''

turning release_date into date time so we can put the month in a separate column.
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
getting the months and set them in a column
    df['month'] = df['release_date'].dt.month_name(locale='Spanish')

filter by specified month
    df_mes = df[df['month'].str.lower() == mes.lower()]
get amount of movies for the month.
    cantidad = len(df_mes)
returning as dictionary.
    return {'mes': mes.capitalize(), 'cantidad': cantidad}


In [4]:
#@app.get('/peliculas_mes/{mes}')
def peliculas_mes(mes: str):
    '''get all the movies produced on certain months'''

    #dictionary to map english months to spanish.
    meses_ingles = {
        'January': 'enero',
        'February': 'febrero',
        'March': 'marzo',
        'April': 'abril',
        'May': 'mayo',
        'June': 'junio',
        'July': 'julio',
        'August': 'agosto',
        'September': 'septiembre',
        'October': 'octubre',
        'November': 'noviembre',
        'December': 'diciembre'
    }
    
    #mapping the dictionary
    meses_espanol = {v: k for k, v in meses_ingles.items()}
    
    #converting the column to datetime because for some reason sometimes it breaks
    df['release_date'] = pd.to_datetime(df['release_date'])

    #get month name and force it to lower case
    df['mes'] = df['release_date'].dt.month_name().str.lower()

    #return error if not using a month in spanish
    if mes.lower() not in meses_espanol:
        return {'error': 'Invalid month, enter a valid month in spanish'}
    
    #getting the spanish month, forced to lower case as well
    #filter month to the english month.
    df_mes = df[df['mes'] == meses_espanol[mes.lower()].lower()]
    nombre_mes = mes.capitalize()

    #get amount of movies for the month
    cantidad = len(df_mes)

    #return as dictionary
    return {'mes': nombre_mes, 'cantidad': cantidad}

In [5]:
peliculas_mes('enero')

{'mes': 'Enero', 'cantidad': 5912}

In [6]:
peliculas_mes('january')

{'error': 'Invalid month, enter a valid month in spanish'}

Same issue as the months, since getting fastapi and render to work in spanish properly wasn't possible, this code will be left for future reference.

@app.get('/peliculas_dis/{dis}')
def peliculas_dia(dia:str):
    '''Se ingresa el dia y la funcion retorna la cantidad de peliculas que se estrenaron ese dia historicamente'''

turning release_date into date time so we can put the year in a separate column.
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

getting the day for the day of week column
    df['day_of_week'] = df['release_date'].dt.day_name(locale='Spanish')

filtering to find the day
    df_dia = df[df['day_of_week'].apply(lambda x: unidecode(x).lower()) == unidecode(dia.lower())]

getting the amount of movies released on that day
    cantidad_peliculas = len(df_dia)

returning it as a dictionary.
    return {'dia_semana': dia.capitalize(), 'cantidad': cantidad_peliculas}

In [7]:
#@app.get('/peliculas_dia/{dia}')
def peliculas_dia(dia: str):
    '''get all the movies produced on a certain day'''
    
    #dictionary that maps the english days to their spanish counterparts
    dias_ingles = {
        'Monday': 'lunes',
        'Tuesday': 'martes',
        'Wednesday': 'miércoles',
        'Thursday': 'jueves',
        'Friday': 'viernes',
        'Saturday': 'sábado',
        'Sunday': 'domingo'
    }
    
    #mapping the dictionary
    dias_espanol = {unidecode(v): k for k, v in dias_ingles.items()}
        
    #converting the column to date time, just in case, again, same deal as with month
    df['release_date'] = pd.to_datetime(df['release_date'])
    
    #getting the day of the week from the date
    df['day_of_week'] = df['release_date'].dt.day_name()

    #return error if input is not a day in spanish
    if unidecode(dia.lower()) not in dias_espanol:
        return {'error': 'Invalid day, enter a valid spanish day. Accents can be ignored'}

    # getting the Spanish day, forced to lower case and using unidecode to ignore accents
    df_dia = df[df['day_of_week'] == dias_espanol[unidecode(dia.lower())]]
    nombre_dia = dia.capitalize()



    #getting the amount of movies released on that day
    cantidad = len(df_dia)

    #returning it as a dictionary.
    return {'dia_semana': nombre_dia, 'cantidad': cantidad}


In [8]:
peliculas_dia('sabado')

{'dia_semana': 'Sabado', 'cantidad': 5151}

In [9]:
peliculas_dia('saturday')

{'error': 'Invalid day, enter a valid spanish day. Accents can be ignored'}

In [10]:
#@app.get('/franquicia/{franquicia}')
def franquicia(franquicia:str):
    '''get the average and total earnings from a certain collection'''

    #filtering movies that belong to a collection the case and na exceptions are made to ignore the case from the input and avoid raising errors if there are empty values
    movie_collection = df[df['belongs_to_collection'].str.contains(franquicia, case=False, na=False)]

    if movie_collection.empty:
        return {'error': 'Movie collection not found'}

    #getting the amount of movies in that collection
    mov_quant= len(movie_collection)

    #adding together the total earnings from the movies, and getting the average
    total_earnings = round(movie_collection['revenue'].sum(),2)
    avg_earnings = round(movie_collection['revenue'].mean(),2)

    #returning as dictioanry
    return {'franquicia': franquicia, 'cantidad': mov_quant, 'ganancia_total': f'{total_earnings:,}', 'ganancia_promedio': f'{avg_earnings:,}'}


In [11]:
franquicia('shrek')

{'franquicia': 'shrek',
 'cantidad': 5,
 'ganancia_total': '2,955,807,008.0',
 'ganancia_promedio': '591,161,401.6'}

In [12]:
franquicia('aasdf')

{'error': 'Movie collection not found'}

In [13]:
#@app.get('/peliculas_pais/{pais}')
def peliculas_pais(pais:str):
    '''get the amount of movies produced in a certain country'''
    
    #filtering movies by country
    countries = df[df['production_countries'].str.contains(pais, case=False, na=False)]

    if countries.empty:
        return {'error': 'Country not found'}

    #getting total movies produced within the country
    quant = len(countries)

    #return as dictionary
    return {'pais': pais.capitalize(), 'cantidad': quant}

In [14]:
peliculas_pais('japan')

{'pais': 'Japan', 'cantidad': 1648}

In [15]:
peliculas_pais('no country')

{'error': 'Country not found'}

In [16]:
#@app.get('/productoras/{productora}')
def productoras(productora:str):
    '''get the amount of movies produced by a certain company'''

    #filtering the_production to include only rows where the productora is present in the production_companies column
    filtered_production =df[df['production_companies'].str.contains(productora, case=False, na=False)]

    if filtered_production.empty:
        return {'error': 'Production Company not found'}

    #calculating the total revenue and count the number of movies produced by the productora
    total_earnings = filtered_production['revenue'].sum()
    quant = filtered_production.shape[0]  #number of rows in the filtered_production

    #return as dictionary
    return {'productora': productora.title(), 'ganancia_total': f'{total_earnings:,}', 'cantidad': quant}

In [17]:
productoras('universal')

{'productora': 'Universal',
 'ganancia_total': '58,548,254,273.0',
 'cantidad': 1103}

In [18]:
productoras('test me')

{'error': 'Production Company not found'}

In [19]:
#@app.get('/retorno/{pelicula}')
def retorno(pelicula:str):
    '''get the investment, earnings and return on a certain movie'''

    #filtering df by specified movie
    filtered_movie = df[df['title'].str.contains(pelicula, case=False, na=False)]

    #if no movie is found, returns an error telling you that the movie isn't on the df
    if filtered_movie.empty:
        return {'error': 'Movie not found'}

    #just calling the correct columns for the filter.
    investment = filtered_movie['budget'].values[0]
    earnings = filtered_movie['revenue'].values[0]
    roi = round(filtered_movie['return'].values[0],2) #unclear if what was asked was ganancia - inversion or just to grab the return column, which is the more intuitive answer. 
    year = int(filtered_movie['release_year'].values[0])

    #return as dictionary
    return {'pelicula': pelicula.title(), 'inversion': f'{investment:,}', 'ganancia': f'{earnings:,}', 'retorno': f'{roi:,}', 'anio': year}


In [20]:
retorno('shrek')

{'pelicula': 'Shrek',
 'inversion': '60,000,000.0',
 'ganancia': '484,409,218.0',
 'retorno': '8.07',
 'anio': 2001}

In [21]:
retorno('i hate this')

{'error': 'Movie not found'}

In [22]:
#filling the df with empty spaces to avoid errors, preprocessing the text, and then making the tfidf matrix for the recommendation function.

df.fillna({'overview': '', 'tagline': '', 'genres': '', 'belongs_to_collection': ''}, inplace=True)


def preprocess_text(text):
    '''this function grabs any text, turns it to lower case, removes punctuation and turns numbers to strings'''
    #lowercasing the text
    text = text.lower()
    #removing punctuation
    text = ''.join(c for c in text if c.isalnum() or c.isspace())

    return text

#preprocessing the overview, tagline, and genres columns and dropping the preprocessed column
df['preprocessed_text'] = df['overview'] + ' ' + df['tagline'] + ' ' + df['genres']  + ' ' + df['title'] + ' ' + df['belongs_to_collection']
df['processed_text'] = df['preprocessed_text'].map(preprocess_text)
df = df.drop(columns=['preprocessed_text'])

#calculate ifidf matrix. using TfidfVectorizer's list of stop words lets you skip some annoying processes of cleaning the text data.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['processed_text'])


i have gripes with this part of the code.

df['preprocessed_text'] = df['overview'] + ' ' + df['tagline'] + ' ' + df['genres'] + ' ' + df['title']  + ' ' + df['belongs_to_collection']

overview, tagline, and genres are good enough to get some movies that are close to what should be expected as result. however, it's still not as precise as i would like it to be.
adding title seems to have fixed a lot of others, but still gives random movies when checking some collections.
sinces that was the case, i decided to add the belongs_to_collection to the list, and that seems to have corrected it for the most part.

however i still don't understand why avengers or thor give completely unrelated results when the results should have been pretty intuitive.
this shows that clearly the data available is just not enough to make a reliable recommendation algorithm. if we had the cast or some other additional data, maybe we could make a more precise word soup to feed into the matrix and make it better.

also, tfidf_matrix is just the raw matrix df, with a single dimension, but a ton of data.
i tried limiting the data in that matrix to 30000 words, then passing it through a cosine similarity function, transforming it to a multidimensional one with only the 3000 highest data points to create the model.
the problem is the data quality of what we had available: when it passes through the recomendacion function, it gives garbage recommendations.
so instead, by doing the cosine similarities with the tf-idf matrix of the requested movie, and the unfiltered tfidf_matrix, the results ended up being a lot more accurate.
however, there are still some very questionable cases in the movies that are returned.
for example thor and avengers as search terms dont return avenger related movies, but searching captain america, ultron, and some others does bring up movies that could be very related to them.
my theory is that avengers might be too short of a string to properly give a recommendation, considering the amount of movies we have with that name, and thor could be uncommon enough to not give a good result regardless.
finally, thanks to the research i did for all of this, i learned that you should probably drop the python kernel completely if you want to restart the matrix generation.
this is because because at some point during all the testing, i started getting really good results, but after restarting vscode a few hours later, the results were back to the usual less accurate ones.
i have no idea what, of all the things i did, affected those results so much during testing, but it was a fun learning experience.

In [23]:
# ML
#@app.get('/recomendacion/{titulo}')
def recomendacion(titulo: str):
    '''get recommended movies based on a certain movie. uses tfidf matrix to compare vectors from the dataset and the selected movie to generate recommendations'''
    movie = df[df['title'].str.contains(titulo, case=False, na=False)]

    if movie.empty:
        return {'error': 'Movie not found'}

    movie_index = movie.index[0]
    movie_vector = tfidf_matrix[movie_index]

    #calculating cosine similarity between the input movie and the preloaded tfidf matrix
    cosine_similarities = linear_kernel(movie_vector, tfidf_matrix).flatten()

    #getting the indices of movies sorted by similarity scores
    similar_movie_indices = cosine_similarities.argsort()[::-1]

    #filtering out the input movie itself
    similar_movie_indices = similar_movie_indices[similar_movie_indices != movie_index]

    #getting the top 5 recommendations
    similar_movie_indices = similar_movie_indices[:5]
    recommended_movies = list(df['title'].iloc[similar_movie_indices].str.title())

    #returning the recommended movies as dictionary
    return {'Lista recomendada': recommended_movies}

In [24]:
recomendacion('lord of the rings')

{'Lista recomendada': ['The Return Of The King',
  'The Lord Of The Rings: The Fellowship Of The Ring',
  'The Lord Of The Rings: The Return Of The King',
  'The Lord Of The Rings: The Two Towers',
  'The Ring Thing']}

In [25]:
recomendacion('jumanji')

{'Lista recomendada': ['Game Over',
  'Table No. 21',
  'Liar Game: Reborn',
  'Big Game',
  'Pixels']}

In [26]:
recomendacion('minions')

{'Lista recomendada': ['Despicable Me 2',
  'Minions: Orientation Day',
  'Mower Minions',
  'Banana',
  'Despicable Me 3']}

In [27]:
recomendacion('batman')

{'Lista recomendada': ['Batman: The Dark Knight Returns, Part 1',
  'Batman: The Dark Knight Returns, Part 2',
  'Batman: Bad Blood',
  'Batman: Mask Of The Phantasm',
  'The Dark Knight']}

In [28]:
recomendacion('harry potter')

{'Lista recomendada': ['Harry Potter And The Chamber Of Secrets',
  'Harry Potter And The Prisoner Of Azkaban',
  'Harry Potter And The Order Of The Phoenix',
  'Harry Potter And The Goblet Of Fire',
  'Harry Potter And The Deathly Hallows: Part 2']}

In [29]:
recomendacion('toy story')

{'Lista recomendada': ['Toy Story 2',
  'Toy Story 3',
  'Small Fry',
  'Toy Story Of Terror!',
  'Toy Reanimator']}

In [30]:
recomendacion('pulp fiction')

{'Lista recomendada': ['A Crime',
  'From Mexico With Love',
  'The Fortunes And Misfortunes Of Moll Flanders',
  'Moll Flanders',
  'All At Once']}

In [31]:
recomendacion('the dictator')

{'Lista recomendada': ['Tirano Banderas',
  'Moon Over Parador',
  'The Great Dictator',
  'The President',
  "Ratko: The Dictator'S Son"]}

In [32]:
recomendacion('star wars')

{'Lista recomendada': ['The Empire Strikes Back',
  'The Star Wars Holiday Special',
  'Star Wars: The Force Awakens',
  'Return Of The Jedi',
  'Empire Of Dreams: The Story Of The Star Wars Trilogy']}

In [33]:
recomendacion('avengers')

{'Lista recomendada': ['The Work And The Glory',
  "Sir Arne'S Treasure",
  'The Journey Of August King',
  'Our Beloved Month Of August',
  'Doctor In Distress']}

In [34]:
recomendacion('thor')

{'Lista recomendada': ['The Christmas Candle',
  '8 Days To Premiere',
  'Down With Love',
  'Oh, Hello: On Broadway',
  'Opening Night']}

In [35]:
recomendacion('ultron')

{'Lista recomendada': ['The Avengers',
  'Next Avengers: Heroes Of Tomorrow',
  'Ultimate Avengers',
  'Captain America: Civil War',
  'Ultimate Avengers 2']}

In [36]:
recomendacion('captain america')

{'Lista recomendada': ['Captain America: The First Avenger',
  'Iron Man & Captain America: Heroes United',
  'Captain America',
  'Captain America: The Winter Soldier',
  'Captain America: Civil War']}

In [38]:
recomendacion('spiderman')

{'Lista recomendada': ['Spider-Man 3',
  'Spider-Man 2',
  'Spider-Man',
  'The Amazing Spider-Man 2',
  'The Amazing Spider-Man']}