In [72]:
"""
This is what is happening
- Using the Netflix dataset from Kaggle, having features like name, type, genre, cast etc...
- All the information for a movie is converted into a string, and then using a LLM(Nomic-embed in this case), it is embedded into a
  higher dimensional vector
- These vectors are then stored in a FAISS index, which can be queried with the embedding of a new movie to get the movies whose vectors
are closest to that embedding
- This works because similar movies are closer to each other, hence having close values in their respective
  vectors as well

*Embedding - converting textual or other types of data into higher dimensional vectors to capture semantic context 

What happened
1. Pulled the desired model from ollama onto local machine
2. Imported the dataset and created textual representations of all the movies in the dataset as a separate column
3. Stored these representations in a FAISS vector index, for later searching and querying for similar movies
4. Given an embedding of the textual representation of another movie, it gives the k nearest embeddings, i.e. similar movies

Now the problem with newer movies is that they don't have textual representations already, so they need some sort of function to create
these representations, without which we cannot create embeddings and query the vector store
"""

"\nThis is what is happening\n- Using a dataset from Kaggle, having features like name, type, genre, cast etc...\n- All the information for a movie is converted into a string, and then using a LLM(Nomic-embed in this case), it is embedded into a\n  higher dimensional vector\n- These vectors are the stored in a FAISS index, which can then be queried  with an embedding of a new movie which will give movies with vectors\nclosest to that embedding\n- This works because similar movies are closer to each other, hence having close values in their respective\n  vectors as well\n\n*Embedding - converting textual or other types of data into higher dimensional vectors to capture semantic context \n\nWhat happened\n1. Pulled the desired model from ollama onto local machine\n2. Imported the dataset and created textual representations of all the movies in the dataset as a separate column\n3. Stored these representations in a FAISS vector index, for later searching and querying for similar movies\n4.

In [1]:
import os
import random

import faiss
import numpy as np
import ollama
import pandas as pd
import requests

In [2]:
# Load the Netflix titles CSV from the notebook's working directory into a DataFrame.
df = pd.read_csv("./netflix_titles.csv")

In [3]:
# Show the row index and the column labels of the loaded frame.
df.axes

[RangeIndex(start=0, stop=8807, step=1),
 Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
        'release_year', 'rating', 'duration', 'listed_in', 'description'],
       dtype='object')]

In [4]:
textual_representations = [] # NOTE(review): never appended to or read in the visible cells; the representations are stored in df['textual_representation'] instead

def create_textual_repr(row: pd.Series):
    """Render one dataframe row as the multi-line text that gets embedded.

    Each selected column becomes a "<label><value>" entry; entries are
    separated by a comma followed by a newline, with nothing after the
    final entry. Values are interpolated as-is, so they are formatted with
    their default string conversion.
    """
    # (label, column) pairs, in the order they appear in the output.
    labelled_columns = (
        ("Type: ", 'type'),
        ("Title : ", 'title'),
        ("Director: ", 'director'),
        ("Cast: ", 'cast'),
        ("Released: ", 'release_year'),
        ("Genre(s) : ", 'listed_in'),
        ("Description: ", 'description'),
    )
    entries = [f"{label}{row[column]}" for label, column in labelled_columns]
    return ",\n".join(entries)

# Build the text for every row (axis=1 passes each row as a Series) and keep it as a new column.
df['textual_representation'] = df.apply(create_textual_repr, axis=1)

In [6]:
# Build the FAISS index: one embedding per row of df['textual_representation'].
#
# The model and dimension here must match what the query cells use. They embed
# with 'nomic-embed-text' and reshape to (1, 768), so an index built with a
# different model/width cannot be searched by them.
EMBEDDING_MODEL = 'nomic-embed-text'
dim = 768
index = faiss.IndexFlatL2(dim)
X = np.zeros((len(df['textual_representation']), dim), dtype='float32')

for i, text_repr in enumerate(df['textual_representation']):
    if i % 200 == 0:
        print(f"Processed {i} requests")

    res = requests.post("http://localhost:11434/api/embeddings",
                        json={
                            'model': EMBEDDING_MODEL,
                            'prompt': text_repr,
                        },
                        timeout=120)
    # Fail loudly: continuing after a failed request would leave an all-zero
    # row in X, and that bogus vector would end up in the index.
    res.raise_for_status()

    embedding = np.array(res.json()['embedding'], dtype='float32')
    if embedding.shape != (dim,):
        raise ValueError(
            f"Expected a {dim}-dimensional embedding from {EMBEDDING_MODEL!r}, "
            f"got shape {embedding.shape} for row {i}"
        )
    X[i] = embedding

# Row i of X is the embedding of df.iloc[i], so index ids map back to dataframe positions.
index.add(X)

Processed 0 requests


KeyboardInterrupt: 

In [17]:
# Save the index to the file 'vector_index' in the working directory.
faiss.write_index(index, 'vector_index')

In [7]:
# Load the index from the 'vector_index' file, replacing any in-memory `index`.
index = faiss.read_index('vector_index')

In [17]:
# Pick a random movie from the dataset and list the 5 most similar other titles.
movie_idx = random.randint(0, len(df) - 1)
movie = df.iloc[movie_idx]

embedding = np.array(ollama.embeddings(model='nomic-embed-text', prompt=movie['textual_representation']).embedding, dtype='float32')

# FAISS expects a 2-D (n_queries, dim) array; -1 lets the width follow the model's output.
embedding = embedding.reshape(1, -1)
# Ask for one extra neighbour because the queried movie itself is normally among the hits.
D, I = index.search(embedding, 6)
I = I.flatten()

# Drop the queried movie by its row position rather than assuming it is always
# the first hit (exact duplicates or ties could put another row first).
# FAISS pads with -1 when it has fewer results than requested; skip those too.
similar_positions = [pos for pos in I if pos != movie_idx and pos != -1][:5]

print(movie['title'])
print()
for pos in similar_positions:
    print(df.iloc[pos]['title'])

Catwalk: Tales from the Cat Show Circuit

#cats_the_mewvie
Kitty Love: An Homage to Cats
Cat People
Pick of the Litter
Pets United


In [7]:
"""
Now what can we do:
Remove the queried movie from the results if it is there
Taking a movie name from the user, generate the rest of the textual representation using an LLM or the TMDB/OMDb API

'{Mock user prompt}
movie_name 
{LLM prompt}
Here's the format of the textual representation for creating embeddings for movies
f\"""Type: {row['type']},
Title : {row['title']},
Director: {row['director']},
Cast: {row['cast']},
Released: {row['release_year']},
Genre(s) : {row['listed_in']},
Description: {row['description']}\"""

Now for this, we'll use the TMDB/OMDb API for obtaining the movie information and creating the textual representation inside of a function,
which will then be fed to nomic-embed-text to create an embedding. Finally, we query our vector index for the nearest neighbours and insert the movie if it is
not already there
""" 

'\nNow what can we do:\nRemove the queried movie from the results if it is there\nTaking a movie from the user generate the remaining of the textual representation using a LLM or the API from TMDB/OMDB\n\n\'{Mock user prompt}\nmovie_name \n{LLM prompt}\nHere\'s the format of the textual representation for creating embeddings for movies\nf"""Type: {row[\'type\']},\nTitle : {row[\'title\']},\nDirector: {row[\'director\']},\nCast: {row[\'cast\']},\nReleased: {row[\'release_year\']},\nGenre(s) : {row[\'listed_in\']},\nDescription: {row[\'description\']}"""\n\nNow for this, we\'ll use the TMDB/OMDb API for obtaining the movie information and creating the textual repreentation inside of a function\nwhich will then be fed to nomic-embed-text for creating embedding and then finally query our vector index and insert the movie if it is\nnot already there\n'

In [19]:
# Look up a user-supplied title on OMDb and keep the parsed JSON in `omdb_movie`.
user_movie = input().strip().lower()

# The API key is read from the environment so it is never committed with the notebook.
omdb_api_key = os.environ.get("OMDB_API_KEY")
if not omdb_api_key:
    raise RuntimeError("Set the OMDB_API_KEY environment variable before querying OMDb")

params = {
    'apikey': omdb_api_key,
    't': user_movie,
}
res = requests.get(url="https://www.omdbapi.com/", params=params, timeout=10)
res.raise_for_status()

omdb_movie = res.json()
# OMDb reports lookup failures in the body as Response == 'False' plus an 'Error'
# message; stop here instead of letting later cells hit a KeyError.
if omdb_movie.get('Response') != 'True':
    raise ValueError(
        f"OMDb lookup for {user_movie!r} failed: {omdb_movie.get('Error', 'unknown error')}"
    )
omdb_movie

{'Title': 'Irreversible',
 'Year': '2002',
 'Rated': 'Not Rated',
 'Released': '22 May 2002',
 'Runtime': '97 min',
 'Genre': 'Crime, Drama, Mystery',
 'Director': 'Gaspar Noé',
 'Writer': 'Gaspar Noé',
 'Actors': 'Monica Bellucci, Vincent Cassel, Albert Dupontel',
 'Plot': 'Events over the course of one traumatic night in Paris unfold in reverse-chronological order.',
 'Language': 'French, Spanish, Italian, English',
 'Country': 'France',
 'Awards': '3 wins & 13 nominations total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BZTE2MWM1NTgtMGVjMy00ZGU2LWE4YTUtNTRlNWRhZWE2NmM0XkEyXkFqcGc@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '7.3/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '59%'},
  {'Source': 'Metacritic', 'Value': '51/100'}],
 'Metascore': '51',
 'imdbRating': '7.3',
 'imdbVotes': '152,035',
 'imdbID': 'tt0290673',
 'Type': 'movie',
 'DVD': 'N/A',
 'BoxOffice': '$803,491',
 'Production': 'N/A',
 'Website': 'N/A',
 'Response': 'True'}

In [21]:
# Embed the OMDb movie in the same text layout used for the dataset rows,
# then print up to 10 similar titles from the index.
omdb_textual_representation = f"""Type: {omdb_movie['Type']},
Title : {omdb_movie['Title']},
Director: {omdb_movie['Director']},
Cast: {omdb_movie['Actors']},
Released: {omdb_movie['Year']},
Genre(s) : {omdb_movie['Genre']},
Description: {omdb_movie['Plot']}"""

embedding = np.array(ollama.embeddings(model='nomic-embed-text', prompt=omdb_textual_representation).embedding, dtype='float32')
# FAISS expects a 2-D (n_queries, dim) array; -1 lets the width follow the model's output.
embedding = embedding.reshape(1, -1)
# One extra neighbour in case the queried movie is itself in the dataset.
D, I = index.search(embedding, 11)

I = I.flatten()

# The queried movie comes from OMDb and may not be in the index at all, so the
# first hit must not be dropped blindly. Only hits whose title matches the
# queried title are removed. FAISS pads with -1 when results run short.
queried_title = omdb_movie['Title'].strip().lower()
recommended_titles = []
for pos in I:
    if pos == -1:
        continue
    title = df.iloc[pos]['title']
    if str(title).strip().lower() == queried_title:
        continue
    recommended_titles.append(title)

print(user_movie)
print()
for title in recommended_titles[:10]:
    print(title)

irreversible

Tarif de nuit
2 Alone in Paris
A Kind of Murder
The Da Vinci Code
In the Shadow of Iris
Babel
Plaire, aimer et courir vite
Rogue City
Stray Bullet
The World Is Yours


In [14]:
#Creating a vector index using all the movies in OMDB database
def create_OMDB_representation(movie):
    """Format an OMDb movie record as a multi-line text block.

    Each selected field is rendered as ``'<Field>': <value>,`` on its own line.
    The first line starts at column 0, every following line is indented by one
    space, and the text ends with a newline.

    Args:
        movie: mapping holding every key listed in ``fields`` below (the shape
            of a successful OMDb response, as shown earlier in this notebook).

    Returns:
        The formatted text.

    Raises:
        KeyError: if ``movie`` lacks one of the fields.
    """
    # Order matters: it is the order of the lines in the output.
    fields = (
        'Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director',
        'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Ratings',
        'Metascore', 'imdbRating', 'imdbVotes', 'Type',
    )
    # Generating every entry from one template keeps the quoting uniform; the
    # hand-written version had lost the opening quote on the 'Title' line.
    entries = [f"'{field}': {movie[field]}," for field in fields]
    return "\n ".join(entries) + "\n"

# Print the representation of whatever OMDb record `omdb_movie` currently holds.
print(create_OMDB_representation(omdb_movie))

Title': Sonchiriya,
 'Year': 2019,
 'Rated': Not Rated,
 'Released': 01 Mar 2019,
 'Runtime': 143 min,
 'Genre': Action, Crime, Drama,
 'Director': Abhishek Chaubey,
 'Writer': Sudip Sharma, Abhishek Chaubey,
 'Actors': Sushant Singh Rajput, Bhumi Pednekar, Ashutosh Rana,
 'Plot': Set in the Chambal valley, the film follows the story of a legion of dreaded, warring dacoits who once terrorized the Indian heartlands.,
 'Language': Hindi,
 'Country': India,
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '7.9/10'}, {'Source': 'Rotten Tomatoes', 'Value': '82%'}],
 'Metascore': N/A,
 'imdbRating': 7.9,
 'imdbVotes': 13,166,
 'Type': movie,

