<a href="https://colab.research.google.com/github/donghuna/AI-Expert/blob/main/%EC%9D%B4%EB%8F%99%ED%95%98/recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Recommendation System with Text Embedding

![Alt text](https://repository-images.githubusercontent.com/275336521/20d38e00-6634-11eb-9d1f-6a5232d0f84f)

## Import requirements

In [None]:
! pip install faiss-gpu

In [None]:
! pip install sentence_transformers ## For textual similarity, using pretrained models

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from tqdm.notebook import tqdm
import faiss
from sentence_transformers import SentenceTransformer, util

## Load Dataset and Model

In [None]:
# Read the movie table directly from the hosted CSV and display it.
IMDB_MOVIES_URL = "https://raw.githubusercontent.com/tommyEzreal/0701_samsung/main/imdb_movies_0626.csv"
imdb_movies = pd.read_csv(IMDB_MOVIES_URL)
imdb_movies

# We will encode the 'Description' column for our recommendation system

In [None]:
# Print the title and description of the movie stored under row label k.
k = 0
print("Movie Title:", imdb_movies.loc[k, 'Movie Name'])
print("Movie description:", imdb_movies.loc[k, 'Description'])

In [None]:
# Load the bi-encoder model: the DistilRoBERTa paraphrase checkpoint,
# which was trained on millions of examples.
MODEL_NAME = 'paraphrase-distilroberta-base-v1'
model = SentenceTransformer(MODEL_NAME)

## Query-based Recommendation

![Example Image](https://drive.google.com/uc?id=1biO4IGKqeiyChgOBVjqxdnvitBWImGkn)


- `User Query Input`: The user inputs a description or keywords for the movie they want to watch.
- `Query Embedding`: Convert the user's query into an embedding vector. This involves mapping the text into a high-dimensional vector space using a natural language processing model.
- `Movie Description Embeddings`: Similarly, convert each movie description in the dataframe into embedding vectors.
- `Calculating Similarity`:
Compute the cosine similarity between the embedding vector of the user's query and the embedding vectors of all movie descriptions. Cosine similarity measures how similar two vectors are by calculating the cosine of the angle between them.
- `Recommend the Most Similar Movie`:
Identify the movie with the highest similarity score and recommend it to the user as the best match for their query.



In [None]:
# What is FAISS? A library that reduces the computational cost of embedding similarity search.
# Before any similarity is computed, the index pool is clustered; a query is then assigned to its most similar cluster(s) and similarity is calculated only within them.

def make_faiss_index(df, ncentroids=50, m=16, nbits=8):
    """Embed every movie description in ``df`` and build a FAISS IVF-PQ index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'Description' column (text to embed) and an 'id' column
        (integer ids stored in the index and returned by searches).
    ncentroids : int, default 50
        Number of clusters (inverted lists) the vectors are split into.
    m : int, default 16
        Number of product-quantiser sub-vectors; must divide the embedding
        dimension evenly.
    nbits : int, default 8
        Number of bits used to encode each sub-vector.

    Returns
    -------
    (des_embeddings, index)
        ``des_embeddings`` is a list holding the raw (un-normalised) embedding
        of each row of ``df``; ``index`` is the trained index containing the
        L2-normalised embeddings keyed by ``df['id']``.

    Raises
    ------
    ValueError
        If ``df`` has no rows, or if ``m`` does not divide the embedding
        dimension.

    Notes
    -----
    Encodes with the module-level ``model``. Writes "trained.index" (after
    training) and "block.index" (after the vectors are added) to the current
    working directory.
    """
    descriptions=df['Description'].tolist()
    print("Number of Movie Description ",len(descriptions))
    if not descriptions:
        raise ValueError("df must contain at least one description to index")

    # Encode all descriptions in one batched call instead of one call per row.
    raw_embeddings = model.encode(descriptions, show_progress_bar=True, convert_to_numpy=True)
    des_embeddings = list(raw_embeddings)

    # Work on a float32 copy: normalize_L2 operates in place, and the raw
    # embeddings returned to the caller must stay un-normalised.
    des_embeddings_array = np.array(raw_embeddings, dtype=np.float32)
    print("Shape of the EMbeddings is ",des_embeddings_array.shape)

    faiss.normalize_L2(des_embeddings_array) ## Normalising the Embeddings

    # Read the dimension from the data rather than assuming a specific encoder.
    dim = des_embeddings_array.shape[1]
    if dim % m != 0:
        raise ValueError(f"m={m} must divide the embedding dimension {dim}")

    quantiser = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFPQ(quantiser, dim, ncentroids, m, nbits)
    index.train(des_embeddings_array) # this step does the clustering and creates the clusters
    faiss.write_index(index, "trained.index")

    # Add the embeddings to the trained index; ids are passed explicitly as int64.
    ids = df['id'].to_numpy(dtype=np.int64)
    index.add_with_ids(des_embeddings_array, ids)
    faiss.write_index(index,"block.index")

    return des_embeddings, index

# Build the index over the full movie table, keeping the raw embeddings alongside it.
des_embeddings, index = make_faiss_index(df=imdb_movies)

In [None]:
# calculate cosine similarity
def calculateInnerProduct(L2_score, squared=True):
    """Convert an L2 distance between two unit-length vectors to cosine similarity.

    For unit vectors a and b, ||a - b||^2 = 2 - 2 * cos(a, b), hence
    cos(a, b) = (2 - ||a - b||^2) / 2.

    Parameters
    ----------
    L2_score : float
        Distance reported for a pair of L2-normalised vectors.
    squared : bool, default True
        True when ``L2_score`` is already a *squared* Euclidean distance, which
        is what FAISS L2 indexes report. Pass False for a plain (non-squared)
        Euclidean distance; it is then squared before conversion.

    Returns
    -------
    float
        The cosine similarity. With quantised indexes the reported distance is
        approximate, so the value may fall slightly outside [-1, 1].
    """
    distance = float(L2_score)
    squared_distance = distance if squared else math.pow(distance, 2)
    return (2 - squared_distance) / 2


def searchFAISSIndex(data,id_col_name,query,index,nprobe,model,topk=10):
    """Embed a text query, search ``index`` for its nearest entries and return the matching rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose ``id_col_name`` column is matched against the ids returned by the index.
    id_col_name : str
        Name of the id column in ``data``.
    query : str
        Free-text query to embed with ``model.encode``.
    index : FAISS index
        Index to search; its ``nprobe`` attribute is overwritten with ``nprobe``.
    nprobe : int
        Value assigned to ``index.nprobe`` before searching.
    model : encoder exposing ``encode``
    topk : int, default 10
        Number of neighbours requested from the index.

    Returns
    -------
    (dat, query_embedding)
        ``dat`` holds the rows of ``data`` whose id was returned, with an added
        'cosine_sim' column, sorted by it in descending order.
        ``query_embedding`` is the L2-normalised query as a (1, dim) array.
    """
    # Encode the query and present it to FAISS as a single-row matrix.
    encoded_query = model.encode([query])[0]
    query_embedding = encoded_query.reshape(1, encoded_query.shape[0])
    faiss.normalize_L2(query_embedding)

    index.nprobe = nprobe

    # Only one query was submitted, so the first row of each result matrix is the answer.
    distances, neighbour_ids = index.search(query_embedding, topk)
    ids = neighbour_ids[0]
    similarities = [calculateInnerProduct(score) for score in distances[0]]

    # Attach the similarity scores to the matching rows and rank them.
    hits = pd.DataFrame({id_col_name: ids, 'cosine_sim': similarities})
    matched_rows = data.loc[data[id_col_name].isin(ids)]
    dat = matched_rows.merge(hits, on=id_col_name).sort_values('cosine_sim', ascending=False)

    return dat, query_embedding

In [None]:
# use your own query !

query = "Super Hero movie"
search_result, query_embedding = searchFAISSIndex(
    data=imdb_movies,
    id_col_name="id",
    query=query,
    index=index,
    nprobe=10,
    model=model,
    topk=10,
)
# Keep only the columns worth showing.
search_result = search_result.loc[:, ['id', 'Description', 'Movie Name', 'cosine_sim']]
search_result

In [None]:
# Second example query: same search, different text.
query = "Recommend me a romantic comedy"
search_result, query_embedding = searchFAISSIndex(
    data=imdb_movies,
    id_col_name="id",
    query=query,
    index=index,
    nprobe=10,
    model=model,
    topk=10,
)
search_result = search_result.loc[:, ['id', 'Description', 'Movie Name', 'cosine_sim']]
search_result

In [None]:
# Third example query: same search, different text.
query = "Kids Animation"
search_result, query_embedding = searchFAISSIndex(
    data=imdb_movies,
    id_col_name="id",
    query=query,
    index=index,
    nprobe=10,
    model=model,
    topk=10,
)
search_result = search_result.loc[:, ['id', 'Description', 'Movie Name', 'cosine_sim']]
search_result

### embedding visualization

In [None]:
# Visualize embeddings using T-SNE
def visualize_embeddings_3queries(data1, embeddings1, query_embedding1, data2, embeddings2, query_embedding2, data3, embeddings3, query_embedding3, query1, query2, query3, perplexity=5):
    """Plot a 2-D t-SNE projection of three queries together with their search results.

    For each query i (1..3):

    - ``data<i>``: DataFrame with a 'Movie Name' column, used to label the
      result points; rows are paired with ``embeddings<i>`` in order.
    - ``embeddings<i>``: array with one embedding per result row.
    - ``query_embedding<i>``: the query's embedding; it must contribute
      exactly one row when stacked.
    - ``query<i>``: the query text, used in the legend.

    ``perplexity`` is passed to t-SNE; it is capped at ``n_points - 1``
    because t-SNE needs it to be smaller than the number of points.

    All vectors are L2-normalised before projection so that queries and results
    are compared on the same scale even if only some of them were normalised by
    the caller. The figure is shown with ``plt.show()``; nothing is returned.
    """
    groups = [
        (data1, embeddings1, query_embedding1, query1, 'o'),
        (data2, embeddings2, query_embedding2, query2, '^'),
        (data3, embeddings3, query_embedding3, query3, 's'),
    ]

    # Row layout: [results 1, query 1, results 2, query 2, results 3, query 3]
    combined_embeddings = np.vstack([embeddings1, query_embedding1, embeddings2, query_embedding2, embeddings3, query_embedding3])

    # Put every vector on the unit sphere; zero vectors are left untouched.
    norms = np.linalg.norm(combined_embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    combined_embeddings = combined_embeddings / norms

    # Apply T-SNE (perplexity must stay below the number of points)
    effective_perplexity = min(perplexity, combined_embeddings.shape[0] - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=effective_perplexity)
    tsne_results = tsne.fit_transform(combined_embeddings)

    # One colour per query
    palette = sns.color_palette("muted", 3)

    plt.figure(figsize=(14, 10))

    # Plot and label each query's results, remembering where its query row sits.
    query_points = []
    offset = 0
    for colour, (data, embeddings, _, query, marker) in zip(palette, groups):
        n_results = len(embeddings)
        result_xy = tsne_results[offset:offset + n_results]
        plt.scatter(
            result_xy[:, 0], result_xy[:, 1],
            color=colour, label=f'"{query}" Results', alpha=0.7, marker=marker
        )
        # zip stops at the shorter input, so labels never spill onto another group's points.
        for (x, y), title in zip(result_xy, data['Movie Name']):
            plt.text(x, y, title, fontsize=9, color=colour)

        query_points.append((offset + n_results, colour, query))
        offset += n_results + 1  # skip over this group's query row

    # Highlight the query points (drawn last so the legend lists results first).
    # `color=` is used because a bare RGB tuple passed as `c=` is ambiguous to matplotlib.
    for row, colour, query in query_points:
        plt.scatter(tsne_results[row, 0], tsne_results[row, 1], color=colour, marker='X', s=200, label=query)

    plt.legend()
    plt.title('T-SNE visualization of movie embeddings for three queries')
    plt.show()

# Example usage
query1 = "Super Hero movie"
query2 = "Recommend me a romantic comedy"
query3 = "Kids Animation"


def _search_and_embed(query_text):
    """Search the index for ``query_text`` and embed each returned description.

    Returns (trimmed results, query embedding, array of result embeddings).
    """
    hits, query_vector = searchFAISSIndex(imdb_movies, "id", query_text, index, nprobe=10, model=model, topk=10)
    hits = hits[['id', 'Description', 'Movie Name', 'cosine_sim']]
    # Encode in result order so each embedding row lines up with its movie name.
    hit_vectors = np.array([model.encode([desc])[0] for desc in hits['Description'].tolist()])
    return hits, query_vector, hit_vectors


# Search results for queries 1, 2 and 3
search_result1, query_embedding1, embeddings1 = _search_and_embed(query1)
search_result2, query_embedding2, embeddings2 = _search_and_embed(query2)
search_result3, query_embedding3, embeddings3 = _search_and_embed(query3)

# Visualize with adjusted perplexity
visualize_embeddings_3queries(
    search_result1, embeddings1, query_embedding1,
    search_result2, embeddings2, query_embedding2,
    search_result3, embeddings3, query_embedding3,
    query1, query2, query3,
    perplexity=5,
)


## User History-based Recommendation



![Example Image](https://drive.google.com/uc?id=16dw0Ud2-Wui62WJ4ZHRoYb6XRpR58ecr)

- `User Viewing History Input`:
The system accesses the user's viewing history, which includes descriptions of all the movies the user has watched.

- `Movie Description Embeddings`: Convert each movie description in the user's viewing history into embedding vectors using a pre-trained language model (e.g., BERT).
- `User Embedding`: Calculate the average of all movie description embeddings from the user's viewing history to create a single embedding vector representing the user's overall movie preferences.
- `Calculating Similarity`:
Calculate the cosine similarity between the user's embedding vector and the embedding vectors of all movies in the database. Cosine similarity measures the cosine of the angle between two vectors, indicating how similar the vectors are. A similarity score closer to 1 means the vectors are more similar.
- `Recommend Movies`:
Identify the movies with the highest similarity scores and recommend them to the user as the best matches for their viewing history and preferences.

In [None]:
def make_user_history(df, history_size=50):
    """Build a synthetic viewing history for every value of ``df['cluster']``.

    Each distinct cluster value is treated as one user. The first
    ``history_size`` movies of the cluster (in ``df`` row order) become the
    user's history; the remaining movies of the cluster become the ground truth.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'cluster', 'Description', 'Movie Name' and 'id' columns.
    history_size : int, default 50
        Number of movies per cluster to place in the history.

    Returns
    -------
    dict
        Maps ``'user_<cluster>'`` to a dict with keys:

        - 'Movie Name', 'Description', 'id': lists describing the history;
        - 'pred_pool': ``df`` without the rows whose 'id' is in the history;
        - 'GT': list of ``(description, movie_name)`` tuples for the held-out
          movies of the cluster.
    """
    user_history = {}
    for user_id in sorted(df['cluster'].unique()):
        # Filter once per user, then split into history and held-out parts.
        user_rows = df[df['cluster'] == user_id]
        watched = user_rows.iloc[:history_size]
        held_out = user_rows.iloc[history_size:]

        ids = watched['id'].tolist()
        user_history[f'user_{user_id}'] = {
            'Movie Name': watched['Movie Name'].tolist(),
            'Description': watched['Description'].tolist(),
            'id': ids,
            # Prediction pool: every movie the user has not already seen.
            'pred_pool': df[~df['id'].isin(ids)],
            # Ground truth: the cluster's movies left out of the history.
            'GT': list(zip(held_out['Description'].tolist(), held_out['Movie Name'].tolist())),
        }

    return user_history

# Build one synthetic user per cluster of the movie table.
user_history = make_user_history(df=imdb_movies)

In [None]:
# Display the movie titles that make up user_6's synthetic viewing history.
user_history['user_6']['Movie Name']

In [None]:
# Build the FAISS index over user_6's recommendation pool (every movie outside their history).
#
# The original 'id' values are kept on purpose: make_faiss_index stores df['id'] in the
# index, the search returns those ids, and the results are later joined back on 'id'
# against the recommendation pool, which carries the original ids. Renumbering the ids
# here would make those joins pick the wrong movies.
prediction_pool_df = user_history['user_6']['pred_pool'].reset_index(drop=True)

# NOTE: this rebinds the global `index`, replacing the index built over the full movie table.
pool_embeddings, index = make_faiss_index(prediction_pool_df)

In [None]:
def make_user_embedding(user_history):
    """Average the embeddings of a user's history into one preference vector.

    Parameters
    ----------
    user_history : iterable of str
        Texts (e.g. movie descriptions or keywords) describing what the user
        watched or likes. Each one is encoded with the module-level ``model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the individual embeddings.

    Raises
    ------
    ValueError
        If ``user_history`` is empty, since no average can be computed.

    Prints the shape of a single embedding and of the averaged embedding.
    """
    # Materialise first so emptiness can be checked for any iterable.
    user_history = list(user_history)
    if not user_history:
        raise ValueError("user_history must contain at least one item")

    ## Convert the list of user history into embeddings
    user_embeddings = [model.encode([hist])[0] for hist in tqdm(user_history)]
    print(user_embeddings[0].shape)

    ## Calculate the mean of the embeddings
    avg_user_embedding = np.mean(user_embeddings, axis=0)
    print(avg_user_embedding.shape)

    return avg_user_embedding

In [None]:
# Average the embeddings of user_6's watched-movie descriptions into one vector.
user_6_record = user_history['user_6']
user_history_sample = user_6_record['Description']
sample_embedding = make_user_embedding(user_history=user_history_sample)

In [None]:
def searchFAISSIndex_user_history(data, id_col_name, user_history, index, nprobe, model, topk=50):
    """Search ``index`` with the averaged embedding of a user's history.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose ``id_col_name`` column is matched against the ids returned by the index.
    id_col_name : str
        Name of the id column in ``data``.
    user_history : iterable of str
        Texts passed to ``make_user_embedding`` to obtain the averaged embedding.
    index : FAISS index
        Index to search; its ``nprobe`` attribute is overwritten with ``nprobe``.
    nprobe : int
        Value assigned to ``index.nprobe`` before searching.
    model :
        Accepted for signature symmetry with ``searchFAISSIndex``; it is not
        used directly by this function.
    topk : int, default 50
        Number of neighbours requested from the index.

    Returns
    -------
    (dat, avg_user_embedding)
        ``dat`` holds the rows of ``data`` whose id was returned, with an added
        'cosine_sim' column, sorted by it in descending order.
        ``avg_user_embedding`` is the L2-normalised user vector as a (1, dim) array.
    """
    # Averaged history embedding, presented to FAISS as a single-row matrix.
    mean_vector = make_user_embedding(user_history)
    avg_user_embedding = mean_vector.reshape(1, mean_vector.shape[0])

    # Unit length, in place.
    faiss.normalize_L2(avg_user_embedding)

    index.nprobe = nprobe

    # Only one vector was submitted, so the first row of each result matrix is the answer.
    distances, neighbour_ids = index.search(avg_user_embedding, topk)
    ids = neighbour_ids[0]
    similarities = [calculateInnerProduct(score) for score in distances[0]]

    # Attach the similarity scores to the matching rows and rank them.
    hits = pd.DataFrame({id_col_name: ids, 'cosine_sim': similarities})
    matched_rows = data.loc[data[id_col_name].isin(ids)]
    dat = matched_rows.merge(hits, on=id_col_name).sort_values('cosine_sim', ascending=False)

    return dat, avg_user_embedding


In [None]:
user_history_input = user_history['user_6']['Description']
# Look results up in the very frame the FAISS index was built from, so that the ids
# returned by the search resolve to the right rows.
recommend_pool = prediction_pool_df

search_result, avg_user_embedding=searchFAISSIndex_user_history(recommend_pool,"id",user_history_input,index,nprobe=10,model=model,topk=20)
search_result=search_result[['id','Description','Movie Name','cosine_sim']]
search_result

In [None]:
# Compare the recommendations with user_6's held-out (ground-truth) movies.
gts = user_history['user_6']['GT']
gt_movie_names = [tpl[1] for tpl in gts]  # each GT entry is (description, movie name)
pred_movie_names = search_result['Movie Name'].to_list()

# Set membership avoids rescanning the ground-truth list for every prediction.
gt_name_set = set(gt_movie_names)

count = 0
for pred in pred_movie_names:
    if pred in gt_name_set:
        count += 1
        print(pred)

# Report the number of hits alongside the matched titles printed above.
print(f"{count} of {len(pred_movie_names)} recommended movies are in the ground truth")



### Manual User history

In [None]:
# manually input your movie preferences !
# Each keyword is embedded separately and the embeddings are averaged downstream.
user_history_input = ["animation", "fantasy", "pixar", "ghibli"]


In [None]:
# Recommend from the pool using the manual keyword history; the returned embedding is not needed.
search_result, _ = searchFAISSIndex_user_history(
    data=recommend_pool,
    id_col_name="id",
    user_history=user_history_input,
    index=index,
    nprobe=10,
    model=model,
    topk=20,
)
search_result = search_result.loc[:, ['id', 'Description', 'Movie Name', 'cosine_sim']]
search_result

In [None]:
# A second manual preference list, this time science-fiction themed.
user_history_input = ["science", "alien", "spaceship", "portal"]

In [None]:
# Recommend from the pool using the manual keyword history; the returned embedding is not needed.
search_result, _ = searchFAISSIndex_user_history(
    data=recommend_pool,
    id_col_name="id",
    user_history=user_history_input,
    index=index,
    nprobe=10,
    model=model,
    topk=20,
)
search_result = search_result.loc[:, ['id', 'Description', 'Movie Name', 'cosine_sim']]
search_result