In [25]:
import pandas as pd
import requests
from io import StringIO

In [26]:
url="https://raw.githubusercontent.com/datum-oracle/netflix-movie-titles/main/titles.csv"

In [27]:
response=requests.get(url)

In [28]:
csv_data = StringIO(response.text)
df = pd.read_csv(csv_data)

In [29]:
print(df.head())

         id                                title   type  \
0  ts300399  Five Came Back: The Reference Films   SHOW   
1   tm82169                                Rocky  MOVIE   
2   tm17823                               Grease  MOVIE   
3  tm191099                            The Sting  MOVIE   
4   tm69975                             Rocky II  MOVIE   

                                         description  release_year  \
0  This collection includes 12 World War II-era p...          1945   
1  When world heavyweight boxing champion, Apollo...          1976   
2  Australian good girl Sandy and greaser Danny f...          1978   
3  A novice con man teams up with an acknowledged...          1973   
4  After Rocky goes the distance with champ Apoll...          1979   

  age_certification  runtime                                 genres  \
0             TV-MA       51                      ['documentation']   
1                PG      119                     ['drama', 'sport']   
2          

In [30]:
df.shape

(6137, 15)

In [31]:
# Select relevant columns, double-check for typos and existence in df.columns
selected_columns = ['title', 'type', 'description', 'genres', 'production_countries','imdb_score']  # Adjust column names if needed

# Combine these columns into a single text column
df['combined_text'] = df[selected_columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

In [32]:
print(df[['id', 'combined_text']].head())

         id                                      combined_text
0  ts300399  Five Came Back: The Reference Films SHOW This ...
1   tm82169  Rocky MOVIE When world heavyweight boxing cham...
2   tm17823  Grease MOVIE Australian good girl Sandy and gr...
3  tm191099  The Sting MOVIE A novice con man teams up with...
4   tm69975  Rocky II MOVIE After Rocky goes the distance w...


In [33]:
movie_data_df = df

In [34]:
df.isnull().sum()

Unnamed: 0,0
id,0
title,0
type,0
description,23
release_year,0
age_certification,2743
runtime,0
genres,0
production_countries,0
seasons,3831


In [35]:
df.duplicated().value_counts()

Unnamed: 0,count
False,6137


In [36]:
!pip show chromadb
!pip install --upgrade chromadb

Name: chromadb
Version: 0.5.5
Summary: Chroma.
Home-page: https://github.com/chroma-core/chroma
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: bcrypt, build, chroma-hnswlib, fastapi, grpcio, httpx, importlib-resources, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-instrumentation-fastapi, opentelemetry-sdk, orjson, overrides, posthog, pydantic, pypika, PyYAML, tenacity, tokenizers, tqdm, typer, typing-extensions, uvicorn
Required-by: 


In [37]:
pip install sentence_transformers



In [38]:
from transformers import AutoTokenizer, AutoModel
import torch

In [39]:
# Load pre-trained model and tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # A lightweight model for generating embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [40]:
# Function to generate embeddings
def generate_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


In [41]:
# Apply the function to generate embeddings for each row
movie_data_df['embeddings'] = movie_data_df['combined_text'].apply(generate_embeddings)

In [42]:
!pip install --upgrade chromadb  # Upgrade ChromaDB to the latest version

import chromadb
from chromadb.api.types import Embedding

# Function to convert embedding to dictionary (if needed)
def embedding_to_dict(embedding: Embedding) -> dict:
    return {"embedding": embedding}



In [44]:
import chromadb
from chromadb.api.types import Embedding

# Function to convert embedding to dictionary (if needed)
def embedding_to_dict(embedding: Embedding) -> dict:
    return {"embedding": embedding}

# Initialize ChromaDB client
client = chromadb.Client()

# Check if a collection named "movies" already exists and delete it if it does
if client.get_collection("movies"):
    client.delete_collection("movies")

# Create a new collection in ChromaDB
collection = client.create_collection("movies")
# Initialize ChromaDB client
# client = chromadb.Client()

# Create a new collection in ChromaDB
# collection = client.create_collection("movies")

In [45]:
# Index the embeddings along with the corresponding MovieID
for index, row in movie_data_df.iterrows():
    collection.add(
        ids=[str(row['id'])],  # Use MovieID as the document ID
        embeddings=[row['embeddings'].tolist()],  # Convert embeddings to list
        metadatas=[{
            'id': row['id'],
            'title': row['title'],
            'type': row['type'],
            'genres': row['genres'],
            'production_countries': row['production_countries'],
            'combined_text': row['combined_text']
        }]  # Optionally store other metadata
    )

In [46]:
query = "A romantic movie with a twisted lovestory"
query_embedding = generate_embeddings(query)

# Perform the search
# Convert query_embedding to a list before passing to query()
results = collection.query(
    query_embeddings=[query_embedding.tolist()],  # Convert to list
    n_results=5  # Number of results to return
)

# Display the results
for result in results:
    print(result)


ids
distances
metadatas
embeddings
documents
uris
data
included


In [47]:
# Assuming results is the output from the ChromaDB query
for i in range(len(results['distances'])):
    print(f"Result {i + 1}:")
    print(f"  Distance: {results['distances'][i]}")
    print(f"  Title: {results['metadatas'][i][0]['title']}") # Access title from nested dictionary
    print(f"  Type: {results['metadatas'][i][0]['type']}") # Access type from nested dictionary
    print(f"  Genres: {results['metadatas'][i][0]['genres']}") # Access genres from nested dictionary
    print(f"  Countries: {results['metadatas'][i][0]['production_countries']}") # Access production_countries from nested dictionary
    print(f"  Description: {results['metadatas'][i][0]['combined_text']}") # Access combined_text from nested dictionary
    print()  # For better readability

Result 1:
  Distance: [20.284456253051758, 20.732263565063477, 20.936193466186523, 21.225093841552734, 21.524471282958984]
  Title: Love
  Type: MOVIE
  Genres: ['thriller', 'drama']
  Countries: ['IN']
  Description: Love MOVIE The story of a family and the various situations navigated by a husband and wife. ['thriller', 'drama'] ['IN'] 7.0



In [48]:
import pickle

# Save embeddings and metadata to a pickle file
with open('movie_semantic.pkl', 'wb') as f:
    pickle.dump(movie_data_df[['id', 'title', 'type', 'genres', 'production_countries', 'combined_text', 'embeddings']], f)



In [49]:
from google.colab import files

# Download the pickle file
files.download('movie_semantic.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>