Movie Review AI with DataStax and Orchestrate

In [None]:
from dotenv import load_dotenv,find_dotenv

# Look up a `.env` file and load its entries into the process environment,
# so that later cells can read credentials with `os.getenv`.
dotenv_path = find_dotenv(filename=".env")
print(f"Loading env from {dotenv_path}")
load_dotenv(dotenv_path)

Load the data from the filesystem into the Astra database

In [None]:
import pandas as pd

DATA_PATH = "../data/"

# Read both datasets from CSV. Each file has its own `id` column, so each one
# is renamed to a more informative and useful name as part of the load.
reviews_all = pd.read_csv(DATA_PATH + "rotten_tomatoes_movie_reviews.csv").rename(
    columns={"id": "reviewed_movie_id"}
)
movies_all = pd.read_csv(DATA_PATH + "rotten_tomatoes_movies.csv").rename(
    columns={"id": "movie_id"}
)

print("Data is loaded from CSV.")

In [None]:
import os
from langchain_astradb import AstraDBVectorStore
from langchain_ibm import WatsonxEmbeddings

# Embedding model used to vectorize the documents. The API key and project ID
# are read from the environment rather than being hard-coded in the notebook.
embeddings = WatsonxEmbeddings(
    url="https://us-south.ml.cloud.ibm.com",
    model_id="ibm/slate-125m-english-rtrvr",
    project_id=os.environ.get("WATSONX_PROJECT_ID"),
    apikey=os.environ.get("WATSONX_APIKEY"),
)

# Vector store backed by the collection named below.
# NOTE(review): `pre_delete_collection=True` presumably drops any existing
# collection with this name on every run -- confirm before pointing this
# notebook at data you want to keep.
COLLECTION = "movie_reviews_rotten_tomatoes"
vectorstore = AstraDBVectorStore(
    collection_name=COLLECTION,
    embedding=embeddings,
    pre_delete_collection=True,
)

In [None]:
# Here, we limit our dataset to the movies with the most reviews. This is simply
# to save data processing and loading time while testing things in this notebook.
N_TOP_MOVIES = 10

# value_counts() orders by descending count by default, so the first
# N_TOP_MOVIES rows are the most-reviewed movie IDs (index) and their counts.
most_reviewed_movies = (
    reviews_all["reviewed_movie_id"].value_counts().head(N_TOP_MOVIES)
)

# subset the data to only reviews and movies corresponding to the most reviewed movies
reviews_data = reviews_all[
    reviews_all["reviewed_movie_id"].isin(most_reviewed_movies.index)
]
movies_data = movies_all[movies_all["movie_id"].isin(most_reviewed_movies.index)]

# Show the selected movies and their review counts. A notebook only renders
# the LAST expression of a cell, so this has to come after the assignments.
most_reviewed_movies

In [None]:
from langchain_core.documents import Document

# Build one LangChain Document per movie and one per review. Every metadata
# value is stored as a string, with missing values stored as "".
documents = []

# Convert each movie into a LangChain document whose content is its title
for index, row in movies_data.iterrows():
    metadata = row.fillna("").astype(str).to_dict()
    metadata["doc_type"] = "movie_info"
    documents.append(Document(page_content=str(row["title"]), metadata=metadata))

# Convert each movie review into a LangChain document whose content is the
# review text. Reviews with no text are skipped: str() of a missing value is
# the literal "nan", which would otherwise be embedded and stored as if it
# were a real review.
reviews_with_text = reviews_data.dropna(subset=["reviewText"])
for index, row in reviews_with_text.iterrows():
    # the review text is the page content, so keep it out of the metadata
    metadata = row.drop("reviewText").fillna("").astype(str).to_dict()
    metadata["doc_type"] = "movie_review"
    documents.append(
        Document(page_content=str(row["reviewText"]), metadata=metadata)
    )

n_skipped_reviews = len(reviews_data) - len(reviews_with_text)
if n_skipped_reviews:
    print("Skipped", n_skipped_reviews, "reviews with no review text")

# check the total number of documents
print("There are", len(documents), "total Documents")

In [None]:
# Add the documents to the store.
# NOTE: this may take some minutes to load many documents
inserted_ids = vectorstore.add_documents(documents)

# Keep the returned IDs in a variable and report only a count: leaving the
# call as the cell's last expression would print its entire return value.
print("Inserted", len(inserted_ids), "documents into the vector store")

In [None]:
from graph_retriever.strategies import Eager
from langchain_graph_retriever import GraphRetriever

# Retriever that queries the vector store and then follows metadata edges.
# The single edge links a document's `reviewed_movie_id` field to documents
# whose `movie_id` field has the same value (i.e. reviews -> their movie).
# NOTE(review): going by the parameter names, `start_k` is the number of
# initial matches, `adjacent_k` the number of linked documents fetched,
# `select_k` the cap on returned documents and `max_depth` the number of edge
# hops -- confirm against the graph_retriever documentation.
retriever = GraphRetriever(
    strategy=Eager(max_depth=1, start_k=10, adjacent_k=10, select_k=100),
    edges=[("reviewed_movie_id", "movie_id")],
    store=vectorstore,
)

In [None]:
# The question to ask. Swap in one of the commented alternatives to try a
# different query.
INITIAL_PROMPT_TEXT = "What are some good family movies?"
# INITIAL_PROMPT_TEXT = "What are some recommendations of exciting action movies?"
# INITIAL_PROMPT_TEXT = "What are some classic movies with amazing cinematography?"


# run the query through the retriever
query_results = retriever.invoke(INITIAL_PROMPT_TEXT)

# dump every retrieved document: its type and content on one line, its full
# metadata on the next, then a blank separator line
for result in query_results:
    print(result.metadata["doc_type"], result.page_content, sep=" :  ")
    print(result.metadata, end="\n\n")

In [None]:
# Collect the movie info for each film retrieved, keyed by movie ID.
compiled_results = {}
for result in query_results:
    if result.metadata["doc_type"] == "movie_info":
        movie_id = result.metadata["movie_id"]
        compiled_results[movie_id] = {
            "movie_id": movie_id,
            "movie_title": result.metadata["title"],
            "reviews": {},
        }

# Go through the results a second time, collecting the retrieved reviews for
# each of the movies.
for result in query_results:
    if result.metadata["doc_type"] == "movie_review":
        reviewed_movie_id = result.metadata["reviewed_movie_id"]
        # A review can be retrieved without its movie_info document (e.g. when
        # the result set is truncated). Rather than failing with a KeyError,
        # create a placeholder entry that uses the movie ID as the title.
        movie_entry = compiled_results.setdefault(
            reviewed_movie_id,
            {
                "movie_id": reviewed_movie_id,
                "movie_title": reviewed_movie_id,
                "reviews": {},
            },
        )
        movie_entry["reviews"][result.metadata["reviewId"]] = result.page_content


# Compile the retrieved movies and reviews into a string that we can pass to an
# LLM in an augmented prompt. The pieces are gathered in a list and joined
# once instead of repeatedly extending a string.
text_parts = []
for movie in compiled_results.values():
    text_parts.append("\n\n Movie Title: " + movie["movie_title"])
    text_parts.append("\n Movie ID: " + movie["movie_id"])
    text_parts.extend(
        "\n Review: " + review_text for review_text in movie["reviews"].values()
    )
formatted_text = "".join(text_parts)


print(formatted_text)