# Importing libraries and data loading

In [183]:
import pandas as pd
import numpy as np
import ast
from unidecode import unidecode
import re
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [184]:
df_movies = pd.read_csv("../processed_data/movies.csv")
df_movies.head(2)

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,runtime,title,vote_average,...,collection,genres_list,spoken_languages_list,production_companies_list,production_countries_list,release_year,return,release_month,release_day,directors
0,30000000.0,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Toy Story,7.7,...,Toy Story Collection,"['Animation', 'Comedy', 'Family']",['en'],['Pixar Animation Studios'],['US'],1995,12.45,octubre,lunes,['John Lasseter']
1,65000000.0,8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Jumanji,6.9,...,,"['Adventure', 'Fantasy', 'Family']","['en', 'fr']","['TriStar Pictures', 'Teitler Film', 'Intersco...",['US'],1995,4.04,diciembre,viernes,['Joe Johnston']


In [3]:
# the values in these columns are stored as strings and need to be converted to lists

type(df_movies["genres_list"][0])

str

As already mentioned in the data cleaning phase, the AST library makes it possible to convert strings that represent a specific data structure into that data structure, such as lists in this case.

In [185]:
# Parse the string representation stored in each list-like column back into a real
# Python list; missing values (NaN) are passed through unchanged.
for list_column in [
    "genres_list",
    "directors",
    "spoken_languages_list",
    "production_countries_list",
    "production_companies_list",
]:
    df_movies[list_column] = [
        value if pd.isnull(value) else ast.literal_eval(value)
        for value in df_movies[list_column]
    ]


Now that the elements of these columns are lists, they can be joined into strings. TfidfVectorizer() could probably process these columns in their original string form, ignoring special characters like square brackets, so these transformations may be redundant, but they are a way to guarantee the correct behavior of the following functions.

In [186]:

# Join every list into one comma-separated string.
# Only real lists are joined: missing values (NaN) are kept as they are, and values that
# are already strings are left untouched, so re-running this cell does not corrupt them.
# (The previous condition `x if None else ...` was always False, so every value was joined.)
for joined_column in [
    "genres_list",
    "directors",
    "spoken_languages_list",
    "production_countries_list",
    "production_companies_list",
]:
    df_movies[joined_column] = [
        ", ".join(x) if isinstance(x, list) else x
        for x in df_movies[joined_column]
    ]

In [6]:
df_movies["genres_list"]

0         Animation, Comedy, Family
1        Adventure, Fantasy, Family
2                   Romance, Comedy
3            Comedy, Drama, Romance
4                            Comedy
                    ...            
45340        Drama, Action, Romance
45341                         Drama
45342       Action, Drama, Thriller
45343                              
45344                              
Name: genres_list, Length: 45345, dtype: object

This function was already used in the API functions notebook to transform a string into a 'standardized' form.

In [168]:
def string_transformation(text):
    """Normalize a title so that it can be compared regardless of formatting.

    The text is lower-cased, stripped, has its spaces removed, is passed through
    unidecode() and finally loses every character that is neither a word
    character nor whitespace.

    Returns the normalized string, or the message "Entered value is not valid."
    when `text` is not exactly of type str.
    """
    if type(text) != str:
        return "Entered value is not valid."

    compact = text.lower().strip().replace(" ", "")
    without_accents = unidecode(compact)  # delete accents
    # delete special characters and punctuation marks
    return re.sub(r'[^\w\s]', '', without_accents)

# Testing TfidfVectorizer

TfidfVectorizer() is the main tool that is going to be used in the recommendation model. To better understand how it works, a series of different objects are going to be fitted and transformed.

In [8]:
tfidf = TfidfVectorizer(stop_words="english") # stop_words="english" drops common English words
test_list = ['Animation', 'Comedy', 'Family'] # flat list of strings, each element is treated as one document
test_list2 = [ ['Animation', 'Comedy', 'Family'],  ['Animation', 'Comedy', 'Family']] # list of lists not allowed
test_list3 = df_movies["genres_list"][0:2] # rows of a column: only allowed if they hold strings, not lists
test_string = "Animation", "Comedy", "Family" # the commas build a tuple of three strings, not one single string
test_sentence = '''Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene.
                Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate 
                  Buzz and Woody from their owner, the duo eventually learns to put aside their differences.''' # pure string not allowed
test_row = df_movies["overview"][0] # pure string not allowed
test_combination = df_movies["genres_list"][0:2] + " " + df_movies["overview"][0:2] # only allowed if they are string type, not allowed if they are in lists form

Besides testing these objects, the resulting vocabulary and bag of words can be inspected as well.

## List

In [18]:
# fit_transform() can process a flat list of strings, but not a list of lists

# learn the vocabulary and weights, then vectorize the documents into a matrix (bag of words)
tfidf_matrix = tfidf.fit_transform(test_list)
list_matrix = tfidf_matrix.todense()

In [10]:
tfidf_matrix.toarray() # visualization of feature matrix

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [19]:
df = pd.DataFrame(list_matrix, 
                  columns=tfidf.get_feature_names_out() , 
                  index= test_list
                  )
df

Unnamed: 0,animation,comedy,family
Animation,1.0,0.0,0.0
Comedy,0.0,1.0,0.0
Family,0.0,0.0,1.0


## Strings separated

In [12]:
tfidf.fit_transform(test_string)   
tfidf.get_feature_names_out() 

array(['animation', 'comedy', 'family'], dtype=object)

## Rows of a dataframe (only if they are string type)

In [13]:
tfidf.fit_transform(test_list3)  
tfidf.get_feature_names_out() 

array(['adventure', 'animation', 'comedy', 'family', 'fantasy'],
      dtype=object)

## An entire column of a dataframe

In [14]:

# too big to show array

tfidf_matrix = tfidf.fit_transform(df_movies["overview"].fillna(""))   
tfidf.get_feature_names_out() 

array(['00', '000', '000km', ..., '첫사랑', 'ﬁrst', 'ﬁve'], dtype=object)

In [15]:
len(tfidf.get_feature_names_out()) # number of words

75765

## Concatenation of two columns of a dataframe (only if they are string type)

In [20]:
tfidf_matrix = tfidf.fit_transform(test_combination)   
tfidf.get_feature_names_out() 
concat_matrix = tfidf_matrix.todense()

In [21]:
len(tfidf.get_feature_names_out()) # number of words

62

In [22]:
tfidf_matrix.toarray() # 2 rows, 62 words

array([[0.        , 0.        , 0.        , 0.14001087, 0.        ,
        0.4200326 , 0.14001087, 0.14001087, 0.14001087, 0.        ,
        0.14001087, 0.4200326 , 0.14001087, 0.14001087, 0.        ,
        0.14001087, 0.        , 0.        , 0.14001087, 0.        ,
        0.14001087, 0.        , 0.09961889, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.14001087, 0.14001087,
        0.        , 0.        , 0.        , 0.        , 0.14001087,
        0.14001087, 0.14001087, 0.14001087, 0.        , 0.14001087,
        0.        , 0.        , 0.        , 0.14001087, 0.        ,
        0.14001087, 0.14001087, 0.        , 0.        , 0.        ,
        0.09961889, 0.        , 0.14001087, 0.14001087, 0.        ,
        0.        , 0.14001087, 0.        , 0.        , 0.4200326 ,
        0.        , 0.        ],
       [0.14742195, 0.14742195, 0.14742195, 0.        , 0.29484389,
        0.        , 0.        , 0.        , 0.        , 0.14742195,
        0.     

In [23]:
df = pd.DataFrame(concat_matrix, 
                  columns=tfidf.get_feature_names_out() 
                  )
df # two rows, two documents, 62 columns, 62 words in the vocabulary

Unnamed: 0,26,adult,adventure,afraid,alan,andy,animation,aside,birthday,board,...,scene,separate,siblings,terrifying,toys,trapped,unwittingly,woody,world,years
0,0.0,0.0,0.0,0.140011,0.0,0.420033,0.140011,0.140011,0.140011,0.0,...,0.140011,0.140011,0.0,0.0,0.140011,0.0,0.0,0.420033,0.0,0.0
1,0.147422,0.147422,0.147422,0.0,0.294844,0.0,0.0,0.0,0.0,0.147422,...,0.0,0.0,0.147422,0.147422,0.0,0.147422,0.147422,0.0,0.147422,0.147422


# Recommendation Model

In [206]:
# The corpus is the text input of the recommendation model.
# Every field goes through fillna("") first, because concatenating NaN with a string yields NaN.
# genres_list and collection appear twice on purpose, to give those attributes more weight.

df_movies["transformed_title"] = df_movies["title"].map(string_transformation)

corpus_fields = ["title", "genres_list", "overview", "directors", "collection", "genres_list", "collection"]
corpus_text = df_movies[corpus_fields[0]].fillna("")
for corpus_field in corpus_fields[1:]:
    corpus_text = corpus_text + ", " + df_movies[corpus_field].fillna("")

df_movies["corpus"] = corpus_text
df_movies["corpus"]

0        Toy Story, Animation, Comedy, Family, Led by W...
1        Jumanji, Adventure, Fantasy, Family, When sibl...
2        Grumpier Old Men, Romance, Comedy, A family we...
3        Waiting to Exhale, Comedy, Drama, Romance, Che...
4        Father of the Bride Part II, Comedy, Just when...
                               ...                        
45340    Robin Hood, Drama, Action, Romance, Yet anothe...
45341    Century of Birthing, Drama, An artist struggle...
45342    Betrayal, Action, Drama, Thriller, When one of...
45343    Satan Triumphant, , In a small town live two b...
45344    Queerama, , 50 years after decriminalisation o...
Name: corpus, Length: 45345, dtype: object

## Setting training data

As mentioned in the exploratory data analysis notebook, this project must use a fraction of the original dataset, because the free plan offered by Render has limited memory and cannot store very large matrices.

In [200]:
# vote_count is a sensible criterion for picking the subset: the movies usually recommended
# by this type of system are, at the same time, the most popular and most recognized ones.
chosen_columns = ["title", "transformed_title", 'genres_list', "directors", "corpus"]

has_enough_votes = df_movies["vote_count"] >= 100
# reset_index() keeps the original df_movies index as an extra "index" column
df_train = df_movies.loc[has_enough_votes, chosen_columns].reset_index()
df_train.shape

(6050, 6)

## Only using TfidfVectorizer

The first attempt will be the simplest one. This model is a strong candidate to be the one used in the final deployment because of the limitations already mentioned. Advantages: the matrices are calculated only once and it consumes few resources. Disadvantage: the input only accepts movies that are in df_train.

In [224]:
# ngram_range is the amount of n-grams it will consider, in this case it will take unigrams and bigrams (from 1 to 2)
# stop_word to ignore common words in english
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")

def feat_matrix(corpus):
    """Fit the module-level `vectorizer` on `corpus` and return the resulting feature matrix."""
    return vectorizer.fit_transform(corpus)

def sim_matrix(feature_matrix):
    """Return the pairwise cosine similarity between the rows of `feature_matrix`."""
    return cosine_similarity(feature_matrix)

In [225]:
def get_recommendations(title, feature_matrix, similarity_matrix):
    """Return the five movies of df_train most similar to the entered title.

    Parameters
    ----------
    title : str
        Movie title as typed by the user; it is normalized with string_transformation().
    feature_matrix :
        Not used by this function; kept so existing calls keep working.
    similarity_matrix :
        Square array where row i holds the similarity of movie i (row position in
        df_train) against every movie of df_train.

    Returns
    -------
    list of dict
        Up to five dictionaries with the keys 'title', 'genres', 'director' and
        'similarity' (rounded to 4 decimals), from most to least similar.
    str
        "Entered value is not valid." when the title is not a string or is not in df_train.
    """
    invalid_message = "Entered value is not valid."

    # string_transformation() turns any non-string into the error message; reject such
    # input here so that the message itself is never looked up as if it were a title.
    if not isinstance(title, str):
        return invalid_message

    title = string_transformation(title)

    # row positions (not index labels) of the movies whose normalized title matches
    matches = np.flatnonzero((df_train["transformed_title"] == title).to_numpy())
    if matches.size == 0:
        return invalid_message
    movie_index = int(matches[0])  # first match, as before

    # similarity of the entered movie against every movie in df_train
    similarity_scores = np.asarray(similarity_matrix[movie_index])

    # Sort from most to least similar (ties keep their df_train order) and drop the
    # entered movie by its position. Assuming it always ranks first is wrong whenever
    # another movie has exactly the same score.
    ranked = np.argsort(-similarity_scores, kind="stable")
    top_index = ranked[ranked != movie_index][:5]

    recommendations = []
    for index in top_index:
        recommendations.append({
            'title': df_train['title'].iloc[index],
            'genres': df_train['genres_list'].iloc[index],
            'director': df_train['directors'].iloc[index],
            'similarity': similarity_scores[index].round(4)
        })

    return recommendations

In [226]:
feature_matrix = feat_matrix(df_train["corpus"])
similarity_matrix = sim_matrix(feature_matrix)

In [227]:
get_recommendations("toy story", feature_matrix, similarity_matrix)

[{'title': 'Toy Story 2',
  'genres': 'Animation, Comedy, Family',
  'director': 'John Lasseter',
  'similarity': 0.4022},
 {'title': 'Toy Story 3',
  'genres': 'Animation, Family, Comedy',
  'director': 'Lee Unkrich',
  'similarity': 0.3233},
 {'title': 'Toy Story of Terror!',
  'genres': 'Animation, Comedy, Family',
  'director': 'Angus MacLane',
  'similarity': 0.1661},
 {'title': 'Small Fry',
  'genres': 'Animation, Family',
  'director': 'Angus MacLane',
  'similarity': 0.1173},
 {'title': 'The 40 Year Old Virgin',
  'genres': 'Comedy, Romance',
  'director': 'Judd Apatow',
  'similarity': 0.1042}]

#### Testing the function step by step with the movie in position [1] (Jumanji)

In [211]:
feature_matrix = feat_matrix(df_train["corpus"]) # features matrix made of the corpus of training data
similarity_matrix = sim_matrix(feature_matrix) # similarity between movies, every row and columns of this matrix is a movie

In [212]:
print(feature_matrix.shape)
print(similarity_matrix.shape)

(6050, 183580)
(6050, 6050)


In [213]:
# contain the similarity score between an specific movie and all the other movies
# argsort() sort ascending the values and obtain the index
# [::-1] returns all the values but in reverse form, the major value first
# [1:6] gives the first 5 movies with biggest score, ignoring the entered movie which it's in position 0

similarity_matrix[1].argsort()[::-1][1:6]

array([2018, 1947, 1900, 3534, 2245], dtype=int64)

In [214]:
jumanji_scores = similarity_matrix[1]  # similarity of the movie in position 1 against every movie
top_indices = jumanji_scores.argsort()[::-1][1:6]  # five highest scores, skipping position 0 of the ranking

recommendations = []  # empty list for recommendations

for index in top_indices:  # go through the array and collect these attributes
    # store the attributes of each movie in a dictionary and add it to the list
    recommendations.append({
        'title': df_train['title'][index],
        'genres': df_train['genres_list'][index],
        'director': df_train['directors'][index],
        'similarity': jumanji_scores[index]
    })

recommendations

[{'title': 'Journey to the Center of the Earth',
  'genres': 'Adventure, Fantasy, Family',
  'director': 'Henry Levin',
  'similarity': 0.10688441057273533},
 {'title': 'Harry Potter and the Chamber of Secrets',
  'genres': 'Adventure, Fantasy, Family',
  'director': 'Chris Columbus',
  'similarity': 0.09095087598776133},
 {'title': 'Clash of the Titans',
  'genres': 'Adventure, Fantasy, Family',
  'director': 'Desmond Davis',
  'similarity': 0.08670344630780522},
 {'title': 'Harry Potter and the Half-Blood Prince',
  'genres': 'Adventure, Fantasy, Family',
  'director': 'David Yates',
  'similarity': 0.08219363338967213},
 {'title': 'Peter Pan',
  'genres': 'Adventure, Fantasy, Family',
  'director': 'P.J. Hogan',
  'similarity': 0.07830097831515392}]

## Using TruncatedSVD (Singular Value Decomposition)

SVD is used in linear algebra for matrix factorization; in this context it's useful for dimensionality reduction. When applied to a TF-IDF matrix it's also known as LSA (Latent Semantic Analysis) or LSI (Latent Semantic Indexing). This could help the algorithm to get better recommendations. Here is a simple example.

In [215]:
# TfidfVectorizer and TruncatedSVD are already imported in the first cell of the notebook.

# Toy corpus of three short sentences, used only to illustrate the dimension reduction.
corpus = [
    "El perro ladra.",
    "El gato maulla.",
    "El perro y el gato juegan juntos."
]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# dimension reduction
# TruncatedSVD defaults to the "randomized" solver, which is stochastic; fixing
# random_state makes the printed reduced matrix reproducible from run to run.
lsa = TruncatedSVD(n_components=2, random_state=42)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("Shape: ", tfidf_matrix.shape)
print("Reduced Matrix:")
print(lsa_matrix)
print("Shape: ", lsa_matrix.shape)


TF-IDF Matrix:
[[0.42544054 0.         0.         0.         0.72033345 0.
  0.54783215]
 [0.42544054 0.54783215 0.         0.         0.         0.72033345
  0.        ]
 [0.55364194 0.3564574  0.46869865 0.46869865 0.         0.
  0.3564574 ]]
Shape:  (3, 7)
Reduced Matrix:
[[ 6.99496047e-01  6.39922006e-01]
 [ 6.99496047e-01 -6.39922006e-01]
 [ 8.53151391e-01 -1.28558850e-16]]
Shape:  (3, 2)


Now applying it to the df_train dataset.

In [216]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")

def feat_matrix(corpus):
    """Vectorize `corpus` with the module-level `vectorizer` (fit and transform in one step)."""
    return vectorizer.fit_transform(corpus)

def sim_matrix(feature_matrix, n_components=100, algorithm='arpack', random_state=42):
    """Reduce the feature matrix with TruncatedSVD (LSA) and return its cosine similarity.

    Parameters
    ----------
    feature_matrix :
        TF-IDF feature matrix, one row per movie.
    n_components : int, default 100
        Number of columns of the reduced matrix.
    algorithm : {'arpack', 'randomized'}, default 'arpack'
        Solver used by TruncatedSVD to reduce the dimensions.
    random_state : int or None, default 42
        Seed passed to TruncatedSVD so that repeated runs give the same matrix.

    Returns
    -------
    Square similarity matrix between the rows of the reduced matrix.
    """
    lsa = TruncatedSVD(n_components=n_components, algorithm=algorithm, random_state=random_state)
    tfidf_lsa = lsa.fit_transform(feature_matrix)  # applying LSA
    return cosine_similarity(tfidf_lsa)

In [217]:
feature_matrix = feat_matrix(df_train["corpus"])
similarity_matrix = sim_matrix(feature_matrix)

In [220]:
get_recommendations("toystory", feature_matrix, similarity_matrix)

[{'title': 'Toy Story 2',
  'genres': 'Animation, Comedy, Family',
  'director': 'John Lasseter',
  'similarity': 0.9828},
 {'title': 'Toy Story of Terror!',
  'genres': 'Animation, Comedy, Family',
  'director': 'Angus MacLane',
  'similarity': 0.8101},
 {'title': 'Garfield: A Tail of Two Kitties',
  'genres': 'Animation, Comedy, Family',
  'director': 'Tim Hill',
  'similarity': 0.71},
 {'title': 'Hoodwinked!',
  'genres': 'Animation, Comedy, Family',
  'director': 'Cory Edwards, Todd Edwards, Tony Leech',
  'similarity': 0.7025},
 {'title': 'Banana',
  'genres': 'Animation, Comedy, Family',
  'director': 'Kyle Balda, Samuel Tourneux',
  'similarity': 0.7007}]

The inclusion of SVD in the algorithm shows improvement in the results, but it's more demanding in terms of resources.

## Lemmatization and Stemming

Lemmatization and stemming are text normalization techniques. Stemming replaces the words “history” and “historical” with “histori”; lemmatization considers the context and converts a word to its meaningful base form, for example lemmatizing the word ‘Caring’ would return ‘Care’.

In [58]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

### Stemming

In [84]:
# testing the functions with Toy Story, the movie in position 0.
# Only that one document is tokenized; tokenizing the whole corpus just to keep
# the first element gives the same result but is needlessly slow.
test = word_tokenize(df_train["corpus"].iloc[0])
test

['Toy',
 'Story',
 'Animation',
 ',',
 'Comedy',
 ',',
 'Family',
 'Led',
 'by',
 'Woody',
 ',',
 'Andy',
 "'s",
 'toys',
 'live',
 'happily',
 'in',
 'his',
 'room',
 'until',
 'Andy',
 "'s",
 'birthday',
 'brings',
 'Buzz',
 'Lightyear',
 'onto',
 'the',
 'scene',
 '.',
 'Afraid',
 'of',
 'losing',
 'his',
 'place',
 'in',
 'Andy',
 "'s",
 'heart',
 ',',
 'Woody',
 'plots',
 'against',
 'Buzz',
 '.',
 'But',
 'when',
 'circumstances',
 'separate',
 'Buzz',
 'and',
 'Woody',
 'from',
 'their',
 'owner',
 ',',
 'the',
 'duo',
 'eventually',
 'learns',
 'to',
 'put',
 'aside',
 'their',
 'differences',
 '.',
 'John',
 'Lasseter',
 'Toy',
 'Story',
 'Collection',
 'Animation',
 ',',
 'Comedy',
 ',',
 'Family',
 'Toy',
 'Story',
 'Collection']

In [85]:
stemmer = PorterStemmer()
# build the stop-word set once instead of rebuilding it from the corpus for every token
english_stopwords = set(stopwords.words('english'))
[stemmer.stem(word) for word in test if word not in english_stopwords] # excluding stopwords


['toy',
 'stori',
 'anim',
 ',',
 'comedi',
 ',',
 'famili',
 'led',
 'woodi',
 ',',
 'andi',
 "'s",
 'toy',
 'live',
 'happili',
 'room',
 'andi',
 "'s",
 'birthday',
 'bring',
 'buzz',
 'lightyear',
 'onto',
 'scene',
 '.',
 'afraid',
 'lose',
 'place',
 'andi',
 "'s",
 'heart',
 ',',
 'woodi',
 'plot',
 'buzz',
 '.',
 'but',
 'circumst',
 'separ',
 'buzz',
 'woodi',
 'owner',
 ',',
 'duo',
 'eventu',
 'learn',
 'put',
 'asid',
 'differ',
 '.',
 'john',
 'lasset',
 'toy',
 'stori',
 'collect',
 'anim',
 ',',
 'comedi',
 ',',
 'famili',
 'toy',
 'stori',
 'collect']

In [221]:
stemmer = PorterStemmer()
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")

def feat_matrix(corpus):
    """Tokenize and stem every document of `corpus`, then vectorize the result.

    Each document is split with word_tokenize(), tokens found in NLTK's English
    stop-word list are dropped (the comparison is case-sensitive), the remaining
    tokens are stemmed with the module-level `stemmer` and joined back into one
    string. The module-level `vectorizer` is then fitted on those strings.

    Returns the feature matrix produced by vectorizer.fit_transform().
    """
    # Build the stop-word set once; it was previously rebuilt for every single token
    # of every document.
    english_stopwords = set(stopwords.words('english'))

    corpus_tokenized = [word_tokenize(doc) for doc in corpus]
    # the corpus is a pandas Series, i.e. a list of documents, which is why each list of
    # tokens has to be joined back into one string per document
    corpus_stemmed = [
        ' '.join(stemmer.stem(word) for word in doc if word not in english_stopwords)
        for doc in corpus_tokenized
    ]
    return vectorizer.fit_transform(corpus_stemmed)

def sim_matrix(feature_matrix):
    """Compute the cosine similarity between every pair of rows of `feature_matrix`."""
    return cosine_similarity(feature_matrix)

In [222]:
feature_matrix = feat_matrix(df_train["corpus"])
similarity_matrix = sim_matrix(feature_matrix)

In [223]:
get_recommendations("toy story", feature_matrix, similarity_matrix)

[{'title': 'Toy Story 2',
  'genres': 'Animation, Comedy, Family',
  'director': 'John Lasseter',
  'similarity': 0.4335},
 {'title': 'Toy Story 3',
  'genres': 'Animation, Family, Comedy',
  'director': 'Lee Unkrich',
  'similarity': 0.3378},
 {'title': 'Toy Story of Terror!',
  'genres': 'Animation, Comedy, Family',
  'director': 'Angus MacLane',
  'similarity': 0.1968},
 {'title': 'Small Fry',
  'genres': 'Animation, Family',
  'director': 'Angus MacLane',
  'similarity': 0.1196},
 {'title': 'Hawaiian Vacation',
  'genres': 'Animation, Family',
  'director': 'Gary Rydstrom',
  'similarity': 0.1036}]

### Lemmatization

In [87]:
lemmatizer = WordNetLemmatizer()
# build the stop-word set once instead of rebuilding it from the corpus for every token
english_stopwords = set(stopwords.words('english'))
[lemmatizer.lemmatize(word) for word in test if word not in english_stopwords]

['Toy',
 'Story',
 'Animation',
 ',',
 'Comedy',
 ',',
 'Family',
 'Led',
 'Woody',
 ',',
 'Andy',
 "'s",
 'toy',
 'live',
 'happily',
 'room',
 'Andy',
 "'s",
 'birthday',
 'brings',
 'Buzz',
 'Lightyear',
 'onto',
 'scene',
 '.',
 'Afraid',
 'losing',
 'place',
 'Andy',
 "'s",
 'heart',
 ',',
 'Woody',
 'plot',
 'Buzz',
 '.',
 'But',
 'circumstance',
 'separate',
 'Buzz',
 'Woody',
 'owner',
 ',',
 'duo',
 'eventually',
 'learns',
 'put',
 'aside',
 'difference',
 '.',
 'John',
 'Lasseter',
 'Toy',
 'Story',
 'Collection',
 'Animation',
 ',',
 'Comedy',
 ',',
 'Family',
 'Toy',
 'Story',
 'Collection']

In [228]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
lemmatizer = WordNetLemmatizer()

def feat_matrix(corpus):
    """Tokenize and lemmatize every document of `corpus`, then vectorize the result.

    Each document is split with word_tokenize(), tokens found in NLTK's English
    stop-word list are dropped (the comparison is case-sensitive), the remaining
    tokens are lemmatized with the module-level `lemmatizer` and joined back into
    one string. The module-level `vectorizer` is then fitted on those strings.

    Returns the feature matrix produced by vectorizer.fit_transform().
    """
    # Build the stop-word set once; it was previously rebuilt for every single token
    # of every document.
    english_stopwords = set(stopwords.words('english'))

    corpus_tokenized = [word_tokenize(doc) for doc in corpus]
    # the corpus is a pandas Series, i.e. a list of documents, which is why each list of
    # tokens has to be joined back into one string per document
    corpus_lemmatized = [
        ' '.join(lemmatizer.lemmatize(word) for word in doc if word not in english_stopwords)
        for doc in corpus_tokenized
    ]
    return vectorizer.fit_transform(corpus_lemmatized)

def sim_matrix(feature_matrix):
    """Return the cosine similarity matrix computed from `feature_matrix`."""
    return cosine_similarity(feature_matrix)

In [229]:
feature_matrix = feat_matrix(df_train["corpus"])
similarity_matrix = sim_matrix(feature_matrix)

In [230]:
get_recommendations("toy story", feature_matrix, similarity_matrix)

[{'title': 'Toy Story 2',
  'genres': 'Animation, Comedy, Family',
  'director': 'John Lasseter',
  'similarity': 0.4261},
 {'title': 'Toy Story 3',
  'genres': 'Animation, Family, Comedy',
  'director': 'Lee Unkrich',
  'similarity': 0.358},
 {'title': 'Toy Story of Terror!',
  'genres': 'Animation, Comedy, Family',
  'director': 'Angus MacLane',
  'similarity': 0.1958},
 {'title': 'Small Fry',
  'genres': 'Animation, Family',
  'director': 'Angus MacLane',
  'similarity': 0.1226},
 {'title': 'The 40 Year Old Virgin',
  'genres': 'Comedy, Romance',
  'director': 'Judd Apatow',
  'similarity': 0.104}]

The process of stemming and lemmatization is time- and resource-consuming and, in this case, didn't improve the output of the function very much.

## Using the entire dataset

All of the models tested until now have only used the subset df_train. In this attempt, because of the memory problems already mentioned, df_train is still going to be used to fit the model, but a movie that isn't in that subset can now be used as input. Advantages: the recommendation model isn't restricted in terms of movie range like the simplest model. Disadvantage: the similarities need to be calculated on every function execution, but the product is between a matrix and a single vector, so the resource consumption isn't very high.

In [236]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")

def feat_matrix(corpus):
    """Fit the module-level `vectorizer` on `corpus` and vectorize that same corpus.

    Returns a tuple (fitted vectorizer, feature matrix), so the fitted vectorizer
    can later transform documents that were not part of `corpus`.
    """
    fitted_vectorizer = vectorizer.fit(corpus)
    return fitted_vectorizer, fitted_vectorizer.transform(corpus)

In [242]:
def get_recommendations(title, tfidf_fit, feature_matrix):
    """Return the five movies of df_train most similar to any movie of df_movies.

    Parameters
    ----------
    title : str
        Movie title as typed by the user; it is normalized with string_transformation().
    tfidf_fit :
        Vectorizer already fitted on the corpus of df_train (first element returned
        by feat_matrix()).
    feature_matrix :
        Feature matrix of df_train (second element returned by feat_matrix()).

    Returns
    -------
    list of dict
        Up to five dictionaries with the keys 'title', 'genres', 'director' and
        'similarity', from most to least similar.
    str
        "Entered value is not valid." when the title is not a string or is not in df_movies.
    """
    invalid_message = "Entered value is not valid."

    # string_transformation() turns any non-string into the error message, and that same
    # message is what "transformed_title" holds for movies whose title is missing. Reject
    # non-string input here so it can never match one of those rows.
    if not isinstance(title, str):
        return invalid_message

    title = string_transformation(title)
    entered_corpus = df_movies.loc[df_movies["transformed_title"] == title, "corpus"]
    if entered_corpus.empty:
        return invalid_message

    # vectorization of the corpus of the entered movie
    new_movie_vector = tfidf_fit.transform(entered_corpus)

    # row [0] is the similarity between the (first) entered movie and every movie in df_train
    similarity_scores = cosine_similarity(new_movie_vector, feature_matrix)[0]

    # list of tuples (row position in df_train, score), highest score first;
    # sorted() is stable, so equal scores keep their df_train order
    ranked = sorted(enumerate(similarity_scores), key=lambda pair: pair[1], reverse=True)

    # If the entered movie is part of df_train it must not recommend itself. Drop the
    # best-ranked entry that carries the entered title, instead of blindly dropping the
    # first entry: with tied scores the first entry can be a different movie.
    is_entered_movie = (df_train["transformed_title"] == title).to_numpy()
    for rank, (index, _) in enumerate(ranked):
        if is_entered_movie[index]:
            del ranked[rank]
            break

    recommendations = []
    for index, score in ranked[:5]:  # iterate over the list of tuples and store the values
        recommendations.append({
            'title': df_train['title'].iloc[index],
            'genres': df_train['genres_list'].iloc[index],
            'director': df_train['directors'].iloc[index],
            'similarity': score
        })

    return recommendations

In [237]:
# feat_matrix() returns the tuple (fitted vectorizer, feature matrix)
parameters = feat_matrix(df_train["corpus"])
tfidf_fit, feature_matrix = parameters

In [247]:
get_recommendations("thorragnarok", tfidf_fit, feature_matrix)

[{'title': 'Thor',
  'genres': 'Adventure, Fantasy, Action',
  'director': 'Kenneth Branagh',
  'similarity': 0.4224364112212692},
 {'title': 'Thor: The Dark World',
  'genres': 'Action, Adventure, Fantasy',
  'director': 'Alan Taylor',
  'similarity': 0.4115334206026653},
 {'title': "Frank Herbert's Dune",
  'genres': 'Action, Adventure, Fantasy, Science Fiction',
  'director': 'John Harrison',
  'similarity': 0.192063838731912},
 {'title': 'Avatar',
  'genres': 'Action, Adventure, Fantasy, Science Fiction',
  'director': 'James Cameron',
  'similarity': 0.1827991837263023},
 {'title': 'Doctor Strange',
  'genres': 'Action, Adventure, Fantasy, Science Fiction',
  'director': 'Scott Derrickson',
  'similarity': 0.18212850554468438}]

In [None]:
# another way (kept commented out, for reference only)

# similar_movies = list(enumerate(similarity_matrix[0]))
# sorted_indices = np.argsort(-similarity_matrix[0])

# # Get the 5 most similar movies
# top_indices = sorted_indices[1:6]

# recommendations = []
# for i in top_indices:
#     movie_title = df_train.loc[i, 'title']
#     genres = df_train.loc[i, 'genres_list']
#     director = df_train.loc[i, 'directors']
#     similarity_score = similarity_matrix[0, i]

#     recommendation = {
#         'title': movie_title,
#         'genres': genres,
#         'director': director,
#         'similarity': similarity_score
#     }
#     recommendations.append(recommendation)
