## **RecSys Content-based**

Content-based recommendation systems are a widely used approach to personalize user experiences on digital platforms, such as streaming services, e-commerce, and social networks. This methodology recommends items (such as movies, books, products) based on the characteristics of the items themselves and the preferences previously demonstrated by the user.

### **Setup**

In [1]:
!pip install openai -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.8/326.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from openai import AzureOpenAI
import os
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

### **Get Data**

In [3]:
# Read the movie spreadsheet into a DataFrame and preview the first rows
movies_path = "/content/movie data_new.xlsx"
df = pd.read_excel(movies_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Movie Name,Year of Release,Watch Time,Genre,Movie Rating,Metascore of movie,Director,Cast,Votes,Description
0,0,Free Guy,2021,115 min,"\nAction, Comedy, Sci-Fi",7.6,62.0,Shawn Levy,"Ryan Reynolds,Jodie Comer,Taika Waititi",29441.0,A bank teller discovers that he's actually an ...
1,1,The Suicide Squad,2021,132 min,"\nAction, Adventure, Comedy",7.4,72.0,James Gunn,"Margot Robbie,Idris Elba,John Cena",156225.0,"Supervillains Harley Quinn, Bloodsport, Peacem..."
2,2,Reminiscence,2021,116 min,"\nMystery, Romance, Sci-Fi",5.9,46.0,Lisa Joy,"Hugh Jackman,Rebecca Ferguson,Thandiwe Newton",11879.0,"Nick Bannister, a private investigator of the ..."
3,3,Beckett,2021,110 min,"\nAction, Crime, Drama",5.6,52.0,Ferdinando Cito Filomarino,"John David Washington,Boyd Holbrook,Vicky Krieps",14267.0,"Following a tragic car accident in Greece, Bec..."
4,4,Eternals,2021,,"\nAction, Adventure, Drama",,,Chloé Zhao,"Richard Madden,Salma Hayek,Angelina Jolie",,"The saga of the Eternals, a race of immortal b..."


In [4]:
# Give the id and title columns shorter, code-friendly names (done in place on df)
column_renames = {'Unnamed: 0': 'movie_id', 'Movie Name': 'Title'}
df.rename(columns=column_renames, inplace=True)

# Text columns of interest; note this list is not referenced by the visible cells
Columns = ['Cast', 'Director', 'Genre', 'Title', 'Description']

In [5]:
# Combine the important features into a single text column for the embedding step

def get_features_combined(data):
    """Build one text string per row from Title, Director, Genre and Description.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Title', 'Director', 'Genre' and 'Description'.

    Returns
    -------
    list of str
        One entry per row, in row order, with the four fields joined by a
        single space.

    Notes
    -----
    Rows are read by position, so the frame may carry any index (for example
    after filtering). Missing values contribute an empty string and
    non-string values are converted with ``str`` instead of raising.
    """
    feature_columns = ['Title', 'Director', 'Genre', 'Description']
    # Pull each column out once; zip then walks the rows positionally.
    column_values = [data[name].tolist() for name in feature_columns]
    return [
        ' '.join('' if pd.isna(value) else str(value) for value in row)
        for row in zip(*column_values)
    ]

# Store the combined text as a new column on df and preview the first rows
df['features_combined'] = get_features_combined(df)
df.head()

Unnamed: 0,movie_id,Title,Year of Release,Watch Time,Genre,Movie Rating,Metascore of movie,Director,Cast,Votes,Description,features_combined
0,0,Free Guy,2021,115 min,"\nAction, Comedy, Sci-Fi",7.6,62.0,Shawn Levy,"Ryan Reynolds,Jodie Comer,Taika Waititi",29441.0,A bank teller discovers that he's actually an ...,"Free Guy Shawn Levy \nAction, Comedy, Sci-Fi ..."
1,1,The Suicide Squad,2021,132 min,"\nAction, Adventure, Comedy",7.4,72.0,James Gunn,"Margot Robbie,Idris Elba,John Cena",156225.0,"Supervillains Harley Quinn, Bloodsport, Peacem...","The Suicide Squad James Gunn \nAction, Adventu..."
2,2,Reminiscence,2021,116 min,"\nMystery, Romance, Sci-Fi",5.9,46.0,Lisa Joy,"Hugh Jackman,Rebecca Ferguson,Thandiwe Newton",11879.0,"Nick Bannister, a private investigator of the ...","Reminiscence Lisa Joy \nMystery, Romance, Sci-..."
3,3,Beckett,2021,110 min,"\nAction, Crime, Drama",5.6,52.0,Ferdinando Cito Filomarino,"John David Washington,Boyd Holbrook,Vicky Krieps",14267.0,"Following a tragic car accident in Greece, Bec...","Beckett Ferdinando Cito Filomarino \nAction, C..."
4,4,Eternals,2021,,"\nAction, Adventure, Drama",,,Chloé Zhao,"Richard Madden,Salma Hayek,Angelina Jolie",,"The saga of the Eternals, a race of immortal b...","Eternals Chloé Zhao \nAction, Adventure, Drama..."


### **Using TF-IDF**

In [16]:
# Initialise the TF-IDF model and build the movie-to-movie similarity matrix

corpus = df['features_combined']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(corpus)

# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# from linear_kernel is already the cosine similarity. With a single argument
# the kernel is computed between the matrix and itself.
cosine_sim_tfidf = linear_kernel(tfidf_matrix)

In [17]:
movie_title = df['Title']

# Title -> row label lookup used by the recommender.
# Duplicates must be removed on the *index* (the titles): Series.drop_duplicates()
# only compares the values, which are the already-unique row labels, so it would
# leave repeated titles in place. Keep the first row for each title.
indices_tfidf = pd.Series(df.index, index=df['Title'])
indices_tfidf = indices_tfidf[~indices_tfidf.index.duplicated(keep='first')]

### **Using GPT embeddings model**

In [10]:
# Initialise the Azure OpenAI client used by the embedding model.
# Credentials are read from the environment so they are never saved in the
# notebook; a missing variable fails immediately with a KeyError naming it.

client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2023-05-15",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
)

In [None]:
# Define the embedding helper, embed every movie and build the similarity matrix

def get_embeddings_azure(text):
    """Embed `text` with the "text-embedding" model via the module-level `client`.

    Returns the first embedding of the response as a NumPy array.
    """
    result = client.embeddings.create(input=text, model="text-embedding")
    first_item = result.data[0]
    return np.array(first_item.embedding)

# One API call per movie; each returned vector becomes one row of the matrix.
document_embeddings = np.vstack([get_embeddings_azure(doc) for doc in tqdm(df['features_combined'])])

cosine_sim_gpt = cosine_similarity(document_embeddings, document_embeddings)

# Title -> row label lookup. Keep only the first row for each title so that a
# lookup always returns a single label rather than a Series for repeated titles.
indices_gpt = pd.Series(df.index, index=df['Title'])
indices_gpt = indices_gpt[~indices_gpt.index.duplicated(keep='first')]

## **Testing the recommendations**

In [21]:
# Function to make recommendation. You can choose the embedding model (TFIDF or GPT)

def get_recommendations(title, model:str):
    """Return the five movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title as it appears in ``df['Title']``.
    model : str
        ``'tfidf'`` to rank with ``cosine_sim_tfidf`` or ``'gpt'`` to rank
        with ``cosine_sim_gpt``.

    Returns
    -------
    pandas.DataFrame
        A single column ``"Movies"`` holding up to five titles, most similar
        first, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If `model` is neither ``'tfidf'`` nor ``'gpt'``.
    KeyError
        If `title` is not present in the title lookup.

    Notes
    -----
    The row label stored in the lookup is used as a position into the
    similarity matrix, so ``df`` is expected to have its default 0..n-1 index.
    """
    # Only the globals of the requested model are touched, so the TF-IDF path
    # works even when the GPT cells have not been run (and vice versa).
    if model == 'tfidf':
        indices = indices_tfidf
        cosine_sim = cosine_sim_tfidf
    elif model == 'gpt':
        indices = indices_gpt
        cosine_sim = cosine_sim_gpt
    else:
        raise ValueError(f"Unknown model {model!r}; expected 'tfidf' or 'gpt'")

    idx = indices[title]
    # A title shared by several rows yields a Series; use its first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Score every other movie. The query row is dropped explicitly instead of
    # assuming it always sorts into first place.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first, keep the top five
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:5]
    movie_indices = [position for position, _ in sim_scores]

    # Return the recommended titles
    movies = df['Title'].iloc[movie_indices]
    final_df = pd.DataFrame({"Movies": movies})
    final_df.reset_index(drop=True, inplace=True)

    return final_df

# Example: top-5 recommendations for 'Toy Story' using the TF-IDF similarity matrix
recommendations = get_recommendations('Toy Story', 'tfidf')
print(recommendations)

          Movies
0    Toy Story 4
1   Toy Soldiers
2           Cars
3  The Water Man
4           Vivo


In [22]:
# Same query, ranked with the GPT-embedding similarity matrix
get_recommendations('Toy Story', 'gpt')

Unnamed: 0,Movies
0,Toy Story 4
1,Cars
2,Toy Soldiers
3,Space Cowboys
4,Star Wars
