In [1]:
import numpy as np
import pandas as pd


In [2]:
# !pip install -U sentence-transformers

In [3]:
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [4]:
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

def stemSentence_porter(sentence):
    porter = PorterStemmer()
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 


def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 


def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [5]:
movies = pd.read_csv('tmdb_5000/tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000/tmdb_5000_credits.csv') 
movies = movies.merge(credits,on='title')
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]
movies.dropna(inplace=True)
movies['genres'] = movies['genres'].apply(convert)

movies['keywords'] = movies['keywords'].apply(convert)

movies['cast'] = movies['cast'].apply(convert)

movies['cast'] = movies['cast'].apply(lambda x:x[0:3])

movies['crew'] = movies['crew'].apply(fetch_director)


# movies['cast'] = movies['cast'].apply(collapse)
# movies['crew'] = movies['crew'].apply(collapse)
# movies['genres'] = movies['genres'].apply(collapse)
# movies['keywords'] = movies['keywords'].apply(collapse)

movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new = new[(new['tags'].notnull())].reset_index(drop=True)
# new['tags'] = new['tags'].apply(stemSentence_porter)

In [6]:
descriptions = new['tags'].tolist()
# print(descriptions)
des_embeddings = []
for i,des in enumerate(descriptions):
    des_embeddings.append(model.encode(des))

In [13]:
len(des_embeddings[256])

768

In [7]:
print(new[new['title']=='Avatar']['tags'].values)

['In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Sam Worthington Zoe Saldana Sigourney Weaver James Cameron']


In [60]:
import torch
from sentence_transformers import SentenceTransformer, util

def recommend(query):
    #Compute cosine-similarities with all embeddings 
    query_embedd = model.encode(query)
    cosine_scores = util.pytorch_cos_sim(query_embedd, des_embeddings)
    top5_matches = torch.argsort(cosine_scores, dim=-1, descending=True).tolist()[0][0:5]
    return top5_matches

# title = 'Avatar'
# query_show_des = new.loc[new['title'] == title]['tags'].to_list()[0]
query_show_des = 'moon Pandora space colony society Sam Worthington 3d'
recommendded_results = recommend(query_show_des)

for index in recommendded_results:
    print(new.iloc[index,:])

movie_id                                                 9355
title                              Mad Max Beyond Thunderdome
tags        Mad Max becomes a pawn in a decadent oasis of ...
Name: 2999, dtype: object
movie_id                                                 6552
title                                              Idle Hands
tags        Anton is a cheerful but exceedingly non-ambiti...
Name: 2632, dtype: object
movie_id                                                 9549
title                                         The Right Stuff
tags        A chronicle of the original Mercury astronauts...
Name: 1765, dtype: object
movie_id                                                  811
title                                          Silent Running
tags        In a future Earth barren of all flora and faun...
Name: 4336, dtype: object
movie_id                                               323967
title                                                  Walter
tags        A ticket-taker a

In [37]:
print(new[new['title']=='Battleship']['tags'].values)

["When mankind beams a radio signal into space, a reply comes from ‘Planet G’, in the form of several alien crafts that splash down in the waters off Hawaii. Lieutenant Alex Hopper is a weapons officer assigned to the USS John Paul Jones, part of an international naval coalition which becomes the world's last hope for survival as they engage the hostile alien force of unimaginable strength. While taking on the invaders, Hopper must also try to live up to the potential his brother, and his fiancée's father, Admiral Shane, expect of him. Thriller Action Adventure ScienceFiction fight u.s.navy mindreading hongkong soccer scientist fictionalwar naval armada battleship navalcombat jdsmyoko lostcommunication taser buoy communicationsexpert jointchiefsofstaff crashlanding jetfighterpilot navylieutenant permissiontomarry ussjohnpauljones basedonboardgame aftercreditsstinger mightymo ussmissouri TaylorKitsch AlexanderSkarsgård Rihanna PeterBerg"]


In [22]:
import pickle
import openai
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)
openai.api_key = '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'


In [23]:
# establish a cache of embeddings to avoid recomputing
# cache is a dict of tuples (text, engine) -> embedding, saved as a pickle file

# set path to embedding cache
embedding_cache_path_to_load = "https://cdn.openai.com/API/examples/data/example_embeddings_cache.pkl"
embedding_cache_path_to_save = "example_embeddings_cache.pkl"

# load the cache if it exists, and save a copy to disk
try:
    embedding_cache = pd.read_pickle(embedding_cache_path_to_load)
except FileNotFoundError:
    embedding_cache = {}
with open(embedding_cache_path_to_save, "wb") as embedding_cache_file:
    pickle.dump(embedding_cache, embedding_cache_file)

# define a function to retrieve embeddings from the cache if present, and otherwise request via the API
def embedding_from_string(
    string: str,
    engine: str = "text-similarity-babbage-001",
    embedding_cache=embedding_cache
):
    """Return embedding of given string, using a cache to avoid recomputing."""
    if (string, engine) not in embedding_cache.keys():
        embedding_cache[(string, engine)] = get_embedding(string, engine)
        print('NOT FOUND')
        with open(embedding_cache_path_to_save, "wb") as embedding_cache_file:
            pickle.dump(embedding_cache, embedding_cache_file)
    return embedding_cache[(string, engine)]

In [24]:

# as an example, take the first description from the dataset
example_string = new["tags"].values[0]
print(f"\nExample string: {example_string}")

# print the first 10 dimensions of the embedding
example_embedding = embedding_from_string(example_string, engine="text-similarity-babbage-001")
print(f"\nExample embedding: {example_embedding}...")


Example string: In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d Sam Worthington Zoe Saldana Sigourney Weaver James Cameron
NOT FOUND

Example embedding: [-0.00922426488250494, 0.016646193340420723, 0.007269423454999924, -0.019058551639318466, 0.02885586768388748, -0.0056380778551101685, -0.021073471754789352, 0.0006562353228218853, 0.033218447118997574, -0.030464107170701027, 0.015703434124588966, 0.005074270069599152, 0.032811764627695084, 0.009982170537114143, -0.01430777832865715, 0.00754670612514019, 0.013559115119278431, 0.008156727999448776, -0.011220699176192284, 0.03242357075214386, 0.08880434930324554, 0.00327193387784

In [27]:
len(example_embedding)

TypeError: object of type 'float' has no len()