# Importing libraries and data loading

In [1]:
import pandas as pd
import numpy as np
import ast
from unidecode import unidecode
import re
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [2]:
# Load the processed movies dataset from the project-relative path and preview its first two rows
df_movies = pd.read_csv("../processed_data/movies.csv")
df_movies.head(2)

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,title,vote_average,...,collection,genres_list,spoken_languages_list,production_companies_list,production_countries_list,release_year,return,release_month,release_day,directors
0,30000000.0,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Toy Story,7.7,...,Toy Story Collection,"['Animation', 'Comedy', 'Family']",['en'],['Pixar Animation Studios'],['US'],1995,12.45,octubre,lunes,['John Lasseter']
1,65000000.0,8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Jumanji,6.9,...,,"['Adventure', 'Fantasy', 'Family']","['en', 'fr']","['TriStar Pictures', 'Teitler Film', 'Intersco...",['US'],1995,4.04,diciembre,viernes,['Joe Johnston']


In [3]:
# The list-like columns are stored as strings, so they need to be converted to lists.

type(df_movies["genres_list"][0])

str

In [4]:
def string_transformation(text):
    """Normalize a string for matching purposes.

    The text is lower-cased, stripped, has every space removed, is passed
    through ``unidecode`` (accent removal) and finally loses every character
    that is neither a word character nor whitespace.

    Returns the message "Entered value is not valid." when ``text`` is not
    exactly of type ``str``.
    """
    # Guard clause: anything that is not a plain str is rejected with a message
    if type(text) != str:
        return "Entered value is not valid."

    normalized = text.lower().strip().replace(" ", "")
    normalized = unidecode(normalized)  # delete accents
    # delete special characters and punctuation marks
    return re.sub(r'[^\w\s]', '', normalized)

# Testing TfidfVectorizer

To understand how this library works, a series of different objects are going to be fitted and transformed.

In [5]:
# Candidate inputs for TfidfVectorizer; the trailing notes record whether each one is accepted as-is.
tfidf = TfidfVectorizer(stop_words="english") # stop_words="english" removes common English words
test_list = ['Animation', 'Comedy', 'Family'] # flat list of strings
test_list2 = [ ['Animation', 'Comedy', 'Family'],  ['Animation', 'Comedy', 'Family']] # list of lists: not allowed
test_list3 = df_movies["genres_list"][0:2] # first two rows of the column; only works if they are string type
test_string = "Animation", "Comedy", "Family" # note: the commas make this a tuple of three strings
test_sentence = '''Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene.
                Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate 
                  Buzz and Woody from their owner, the duo eventually learns to put aside their differences.''' # a single raw string: not allowed
test_row = df_movies["overview"][0] # a single overview value: not allowed (presumably a plain string, like test_sentence)
test_combination = df_movies["genres_list"][0:2] + " " + df_movies["overview"][0:2] # element-wise concatenation; only if they are string type

Besides testing these objects, the resulting vocabulary and bag of words are inspected as well.

## List

In [8]:
# TfidfVectorizer can process a flat list of strings (one document per element), but not a list of lists

# Learn the vocabulary and the weights, then vectorize the documents into a sparse matrix (bag of words)
tfidf_matrix = tfidf.fit_transform(test_list)
# Densify with toarray(), which returns a plain ndarray; todense() returns the discouraged np.matrix type
doc_term_matrix = tfidf_matrix.toarray()

In [373]:
# Dense view of the matrix currently stored in tfidf_matrix
tfidf_matrix.toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [10]:
# Label the dense matrix: rows are named after the input documents, columns after the learned terms
df = pd.DataFrame(
    data=doc_term_matrix,
    index=test_list,
    columns=tfidf.get_feature_names_out(),
)
df

Unnamed: 0,animation,comedy,family
Animation,1.0,0.0,0.0
Comedy,0.0,1.0,0.0
Family,0.0,0.0,1.0


## Strings separated

In [374]:
# Refit the shared vectorizer on the tuple of strings; the returned matrix is discarded here
tfidf.fit_transform(test_string)  # calculate parameters like weights, vectorize and transform to a matrix (bag of words)
tfidf.get_feature_names_out()  # terms learned by the fit above

array(['animation', 'comedy', 'family'], dtype=object)

## Rows of a dataframe (only if they are string type)

In [378]:
# Refit the shared vectorizer on the first two rows of "genres_list"; the returned matrix is discarded here
tfidf.fit_transform(test_list3)  # calculate parameters like weights, vectorize and transform to a matrix (bag of words)
tfidf.get_feature_names_out()  # terms learned by the fit above

array(['adventure', 'animation', 'comedy', 'family', 'fantasy'],
      dtype=object)

## An entire column of a dataframe

In [325]:
# A single string is not processed directly, so the whole "overview" column is passed instead
# (missing overviews are replaced by empty strings first).
# The resulting matrix is too big to show as an array.

tfidf_matrix = tfidf.fit_transform(df_movies["overview"].fillna(""))  # calculate parameters like weights, vectorize and transform to a matrix (bag of words)
tfidf.get_feature_names_out()  # terms learned from the whole column

array(['00', '000', '000km', ..., '첫사랑', 'ﬁrst', 'ﬁve'], dtype=object)

In [327]:
len(tfidf.get_feature_names_out()) # number of distinct terms in the learned vocabulary

75765

## Concatenation of two columns of a dataframe (only if they are string type)

In [319]:
# Refit the shared vectorizer on the two concatenated "genres_list + overview" rows
tfidf_matrix = tfidf.fit_transform(test_combination)  # calculate parameters like weights, vectorize and transform to a matrix (bag of words)
tfidf.get_feature_names_out()  # terms learned from both rows

array(['26', 'adult', 'adventure', 'afraid', 'alan', 'andy', 'animation',
       'aside', 'birthday', 'board', 'brings', 'buzz', 'circumstances',
       'comedy', 'creatures', 'differences', 'discover', 'door', 'duo',
       'enchanted', 'eventually', 'evil', 'family', 'fantasy', 'finish',
       'freedom', 'game', 'giant', 'happily', 'heart', 'hope', 'inside',
       'invite', 'judy', 'learns', 'led', 'lightyear', 'live', 'living',
       'losing', 'magical', 'monkeys', 'opens', 'owner', 'peter', 'place',
       'plots', 'proves', 'rhinoceroses', 'risky', 'room', 'running',
       'scene', 'separate', 'siblings', 'terrifying', 'toys', 'trapped',
       'unwittingly', 'woody', 'world', 'years'], dtype=object)

In [324]:
len(tfidf.get_feature_names_out()) # number of distinct terms in the learned vocabulary

62

In [322]:
tfidf_matrix.toarray() # 2 rows (one per document), 62 columns (one per vocabulary term)

array([[0.        , 0.        , 0.        , 0.14001087, 0.        ,
        0.4200326 , 0.14001087, 0.14001087, 0.14001087, 0.        ,
        0.14001087, 0.4200326 , 0.14001087, 0.14001087, 0.        ,
        0.14001087, 0.        , 0.        , 0.14001087, 0.        ,
        0.14001087, 0.        , 0.09961889, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.14001087, 0.14001087,
        0.        , 0.        , 0.        , 0.        , 0.14001087,
        0.14001087, 0.14001087, 0.14001087, 0.        , 0.14001087,
        0.        , 0.        , 0.        , 0.14001087, 0.        ,
        0.14001087, 0.14001087, 0.        , 0.        , 0.        ,
        0.09961889, 0.        , 0.14001087, 0.14001087, 0.        ,
        0.        , 0.14001087, 0.        , 0.        , 0.4200326 ,
        0.        , 0.        ],
       [0.14742195, 0.14742195, 0.14742195, 0.        , 0.29484389,
        0.        , 0.        , 0.        , 0.        , 0.14742195,
        0.     

# Recommendation Model

As mentioned in the exploratory data analysis notebook, this project has to work with a subset of the data, because the free plan offered by Render has limited memory and cannot store very large matrices.

In [150]:
# Keep only the movies with at least 100 votes.
# reset_index() renumbers the rows from 0 and keeps the previous labels in an "index" column.
df_train = df_movies.loc[df_movies["vote_count"].ge(100)].reset_index()
df_train.shape

(6050, 22)

As was already done in the data cleaning phase, the `ast` library makes it possible to convert strings that represent a specific data structure into that data structure (lists, in this case).

In [135]:
# Columns whose cells hold the string representation of a Python list
columns_to_parse = [
    "genres_list",
    "directors",
    "spoken_languages_list",
    "production_countries_list",
    "production_companies_list",
]

# Only cells that are still strings are parsed. Missing values are kept as they are, and cells that
# were already converted are left untouched, so this cell can be re-run safely
# (pd.isnull() on a list returns an array, which cannot be used as an `if` condition).
for column in columns_to_parse:
    df_train[column] = [
        ast.literal_eval(cell) if isinstance(cell, str) else cell
        for cell in df_train[column]
    ]

Now that the elements of these columns are lists, they can be joined into plain strings. TfidfVectorizer() can probably process these columns in their original state as strings, ignoring special characters like square brackets, so these transformations may be redundant, but they are a way to guarantee the correct behaviour of the following functions.

In [136]:
# Columns whose cells are expected to hold lists of strings at this point
columns_to_join = [
    "genres_list",
    "directors",
    "spoken_languages_list",
    "production_countries_list",
    "production_companies_list",
]

# Join each list into one comma-separated string.
# The previous guard `x if None else ...` was always false, so every cell was joined: a missing value
# (NaN) raised a TypeError, and re-running the cell split the already joined strings into characters.
# Only real lists are joined now; anything else (NaN, strings) is kept unchanged.
for column in columns_to_join:
    df_train[column] = [
        ", ".join(cell) if isinstance(cell, list) else cell
        for cell in df_train[column]
    ]

After the extraction, the corpus for the experiment is built from the following columns.

In [58]:
# Demonstration of why fillna() is needed: concatenating NaN with strings yields a null value,
# so every row with a missing value in any of these columns ends up as NaN.

df_train["genres_list"] + df_train["overview"] + df_train["directors"] + df_train["collection"]  

0       Animation, Comedy, FamilyLed by Woody, Andy's ...
1                                                     NaN
2       ComedyJust when George Banks has recovered fro...
3                                                     NaN
4                                                     NaN
                              ...                        
6045    TV Movie, Family, Action, Comedy, Music, Adven...
6046                                                  NaN
6047    ComedyStuck in the corridors of time, Godefroy...
6048                                                  NaN
6049                                                  NaN
Length: 6050, dtype: object

In [137]:
# Build the corpus by chaining the text columns, separated by ", ".
# Missing values are replaced by empty strings first so that no row becomes NaN.
corpus = df_train["title"].fillna("")
for column in ["genres_list", "overview", "directors", "collection"]:
    corpus = corpus + ", " + df_train[column].fillna("")

df_train["corpus"] = corpus
df_train["corpus"]

0       Toy Story, Animation, Comedy, Family, Led by W...
1       Jumanji, Adventure, Fantasy, Family, When sibl...
2       Heat, Action, Crime, Drama, Thriller, Obsessiv...
3       GoldenEye, Adventure, Action, Thriller, James ...
4       Balto, Family, Animation, Adventure, An outcas...
                              ...                        
2705    Dunkirk, Action, Drama, History, Thriller, War...
2706    Transformers: The Last Knight, Action, Science...
2707    Three Men and a Leg, Comedy, Three friends lea...
2708    The Dark Tower, Action, Western, Science Ficti...
2709    Girls Trip, Comedy, Four girlfriends take a tr...
Name: corpus, Length: 2710, dtype: object

In [42]:
# Create bag of words using bigrams only (ngram_range=(2,2)) and dropping English stop words
vectorizer = TfidfVectorizer(ngram_range=(2,2), stop_words="english")
tfidf = vectorizer.fit_transform(df_train['corpus'])

In [57]:

# Apply LSA or LSI: fit a 100-component truncated SVD on the TF-IDF matrix.
# The model is only fitted in this cell; nothing is transformed with it here.
lsa = TruncatedSVD(n_components=100, algorithm='arpack')
lsa.fit(tfidf)

In [72]:
# Create bag of words (unigrams and bigrams, English stop words removed)
vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words="english")
tfidf = vectorizer.fit_transform(df_train['corpus'])

# Number of similar movies to list (the previous slice [1:20] produced 19 results, not 10)
n_recommendations = 19

user_movie = input("Enter a movie title: ")
# Find the row position of the user movie; positions (not index labels) are what the TF-IDF matrix uses.
# An unknown title fails with an explicit message instead of "index 0 is out of bounds".
matching_positions = np.flatnonzero((df_train['title'] == user_movie).to_numpy())
if matching_positions.size == 0:
    raise IndexError("Movie title {!r} was not found in df_train".format(user_movie))
movie_index = int(matching_positions[0])  # first match when several movies share the title

# Compute the cosine similarities between the user movie and all movies.
# The slice keeps the row two-dimensional, as cosine_similarity expects.
similarity_scores = cosine_similarity(tfidf[movie_index:movie_index + 1], tfidf)

# Rank every movie by decreasing similarity and drop the user movie explicitly,
# instead of assuming that it is always the first element of the ranking
similar_movies = list(enumerate(similarity_scores[0]))
sorted_similar_movies = [
    pair
    for pair in sorted(similar_movies, key=lambda x: x[1], reverse=True)
    if pair[0] != movie_index
][:n_recommendations]

# Print the most similar movies as "row position: title"
for i, score in sorted_similar_movies:
    print("{}: {}".format(i, df_train['title'].iloc[i]))

242: The Silence of the Lambs
1919: Red Dragon
3003: Hannibal Rising
1576: Manhunter
2934: Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan
2479: Blade: Trinity
3763: The A-Team
2107: Matchstick Men
621: G.I. Jane
5716: I Am Wrath
700: Mercury Rising
5586: Solace
2654: Kingdom of Heaven
1330: Thelma & Louise
4669: The Counselor
4071: Tinker Tailor Soldier Spy
840: The Dead Zone
594: Donnie Brasco
2443: 1492: Conquest of Paradise


## Using TruncatedSVD LSA or LSI

In [143]:
# TruncatedSVD, TfidfVectorizer and cosine_similarity are already imported in the first cell
# of the notebook, so they are not imported again here.

# Create bag of words (unigrams and bigrams, English stop words removed)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
tfidf = vectorizer.fit_transform(df_train['corpus'])

# Number of similar movies to list (the previous slice [1:20] produced 19 results, not 10)
n_recommendations = 19

user_movie = input("Enter a movie title: ")
# Find the row position of the user movie; positions (not index labels) are what the matrices use.
# An unknown title fails with an explicit message instead of "index 0 is out of bounds".
matching_positions = np.flatnonzero((df_train['title'] == user_movie).to_numpy())
if matching_positions.size == 0:
    raise IndexError("Movie title {!r} was not found in df_train".format(user_movie))
movie_index = int(matching_positions[0])  # first match when several movies share the title

# Apply LSA or LSI; a fixed random_state keeps the factorisation reproducible between runs
lsa = TruncatedSVD(n_components=100, algorithm='arpack', random_state=42)
lsa.fit(tfidf)

# Project the TF-IDF matrix onto the lower-dimensional LSA representation
tfidf_lsa = lsa.transform(tfidf)

# Compute the cosine similarities between the user movie and all movies in the LSA representation.
# The slice keeps the row two-dimensional, as cosine_similarity expects.
similarity_scores = cosine_similarity(tfidf_lsa[movie_index:movie_index + 1], tfidf_lsa)

# Rank every movie by decreasing similarity and drop the user movie explicitly,
# instead of assuming that it is always the first element of the ranking
similar_movies = list(enumerate(similarity_scores[0]))
sorted_similar_movies = [
    pair
    for pair in sorted(similar_movies, key=lambda x: x[1], reverse=True)
    if pair[0] != movie_index
][:n_recommendations]

# Print the most similar movies as "row position: title"
for i, score in sorted_similar_movies:
    print("{}: {}".format(i, df_train['title'].iloc[i]))


2168128
1781: Cars 2
2697: Cars 3
439: A Bug's Life
1733: Rango
4: Balto
1407: Welcome to the Sticks
1931: ParaNorman
1304: The Simpsons Movie
1755: Welcome to the South
627: Blazing Saddles
2288: The Book of Life
800: Mr. Deeds
1590: I Love You Phillip Morris
794: Spirit: Stallion of the Cimarron
565: Easy Rider
1128: The Curse of the Were-Rabbit
1096: Herbie Fully Loaded
2407: Maggie
1271: Meet the Robinsons


## API Function

In [138]:
def get_recommendations(title, n_recommendations=19):
    """Recommend the movies whose TF-IDF corpus is most similar to a given movie.

    Parameters
    ----------
    title : str
        Exact title of a movie present in ``df_train['title']``. When several
        rows share the title, the first one is used.
    n_recommendations : int, default 19
        Maximum number of recommendations to return. The default matches the
        amount produced by the previous ``[1:20]`` slice.

    Returns
    -------
    dict
        ``{row position in df_train: title}``, ordered by decreasing cosine
        similarity. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no movie in ``df_train`` has exactly that title.
    """
    # Create bag of words (unigrams and bigrams, English stop words removed)
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
    tfidf = vectorizer.fit_transform(df_train['corpus'])

    # Row position of the requested movie; positions (not index labels) are what the matrix uses
    matching_positions = np.flatnonzero((df_train['title'] == title).to_numpy())
    if matching_positions.size == 0:
        raise IndexError("Movie title {!r} was not found in df_train".format(title))
    movie_index = int(matching_positions[0])

    # Cosine similarity between the requested movie and every movie in the TF-IDF representation.
    # The slice keeps the row two-dimensional, as cosine_similarity expects.
    similarity_scores = cosine_similarity(tfidf[movie_index:movie_index + 1], tfidf)[0]

    # Rank by decreasing similarity and drop the requested movie explicitly,
    # instead of assuming that it is always the first element of the ranking
    ranked = sorted(enumerate(similarity_scores), key=lambda pair: pair[1], reverse=True)
    top_positions = [position for position, _ in ranked if position != movie_index][:n_recommendations]

    # Build the dictionary of recommended movies
    titles = df_train['title']
    return {position: titles.iloc[position] for position in top_positions}

get_recommendations("Cars")

{1781: 'Cars 2',
 2697: 'Cars 3',
 430: 'Bride of Chucky',
 182: 'Days of Thunder',
 439: "A Bug's Life",
 2288: 'The Book of Life',
 1571: 'Invictus',
 1204: 'Little Miss Sunshine',
 1539: 'The Final Destination',
 1321: 'The Game Plan',
 636: 'Gone in Sixty Seconds',
 0: 'Toy Story',
 794: 'Spirit: Stallion of the Cimarron',
 1733: 'Rango',
 627: 'Blazing Saddles',
 560: 'Toy Story 2',
 2309: 'Penguins of Madagascar',
 2015: 'G.I. Joe: Retaliation',
 2044: 'Epic'}

In [140]:
def get_recommendations(title, n_recommendations=19):
    """Recommend the movies most similar to a given movie in the LSA representation.

    The corpus is vectorized with TF-IDF and reduced to 100 components with
    TruncatedSVD before the cosine similarities are computed.

    Note: this definition shadows the TF-IDF-only ``get_recommendations``
    defined earlier in the notebook.

    Parameters
    ----------
    title : str
        Exact title of a movie present in ``df_train['title']``. When several
        rows share the title, the first one is used.
    n_recommendations : int, default 19
        Maximum number of recommendations to return. The default matches the
        amount produced by the previous ``[1:20]`` slice.

    Returns
    -------
    dict
        ``{row position in df_train: title}``, ordered by decreasing cosine
        similarity. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no movie in ``df_train`` has exactly that title.
    """
    # Create bag of words (unigrams and bigrams, English stop words removed)
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
    tfidf = vectorizer.fit_transform(df_train['corpus'])

    # Row position of the requested movie; positions (not index labels) are what the matrices use
    matching_positions = np.flatnonzero((df_train['title'] == title).to_numpy())
    if matching_positions.size == 0:
        raise IndexError("Movie title {!r} was not found in df_train".format(title))
    movie_index = int(matching_positions[0])

    # Apply LSA or LSI; a fixed random_state keeps the factorisation reproducible between calls
    lsa = TruncatedSVD(n_components=100, algorithm='arpack', random_state=42)
    lsa.fit(tfidf)

    # Project the TF-IDF matrix onto the lower-dimensional LSA representation
    tfidf_lsa = lsa.transform(tfidf)

    # Cosine similarity between the requested movie and every movie in the LSA representation.
    # The slice keeps the row two-dimensional, as cosine_similarity expects.
    similarity_scores = cosine_similarity(tfidf_lsa[movie_index:movie_index + 1], tfidf_lsa)[0]

    # Rank by decreasing similarity and drop the requested movie explicitly,
    # instead of assuming that it is always the first element of the ranking
    ranked = sorted(enumerate(similarity_scores), key=lambda pair: pair[1], reverse=True)
    top_positions = [position for position, _ in ranked if position != movie_index][:n_recommendations]

    # Build the dictionary of recommended movies
    titles = df_train['title']
    return {position: titles.iloc[position] for position in top_positions}

get_recommendations("Cars")


{1781: 'Cars 2',
 2697: 'Cars 3',
 439: "A Bug's Life",
 1733: 'Rango',
 4: 'Balto',
 1407: 'Welcome to the Sticks',
 1931: 'ParaNorman',
 1304: 'The Simpsons Movie',
 1755: 'Welcome to the South',
 627: 'Blazing Saddles',
 2288: 'The Book of Life',
 800: 'Mr. Deeds',
 1590: 'I Love You Phillip Morris',
 794: 'Spirit: Stallion of the Cimarron',
 565: 'Easy Rider',
 1128: 'The Curse of the Were-Rabbit',
 1096: 'Herbie Fully Loaded',
 2407: 'Maggie',
 1271: 'Meet the Robinsons'}

In [141]:
# Scratch cell: neither `sys` nor `my_variable` is used by any other visible cell.
# NOTE(review): presumably leftover debugging — confirm before removing.
import sys

my_variable = "Hola, mundo!"



61
