In [1]:
# Installs: third-party dependencies for this notebook, installed with the
# %pip magic (which targets the running kernel's environment).
# Only langchain-huggingface and ragas are version-pinned below; every other
# package is unpinned, so a fresh run may resolve to different versions.
# NOTE(review): consider pinning the remaining packages for reproducibility.
%pip install langchain-community langchain-core langchain_experimental langchain-google-community langchain-google-genai langchain-huggingface==0.0.3 langchain-openai langchain-qdrant langchain-google-vertexai
%pip install docx2txt
%pip install google-cloud-aiplatform
%pip install google-cloud-discoveryengine
%pip install nltk
%pip install openpyxl
%pip install pymupdf
%pip install python-dotenv
%pip install ragas==0.1.20 
%pip install tqdm

# Verify installed packages have compatible dependencies.
# Each %pip install above runs as a separate invocation, so this final check
# is what reports any conflicts left in the resulting environment.
%pip check

Collecting langchain-google-community
  Downloading langchain_google_community-2.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-google-vertexai
  Downloading langchain_google_vertexai-2.0.1-py3-none-any.whl.metadata (3.8 kB)
Collecting langchain-community
  Using cached langchain_community-0.3.1-py3-none-any.whl.metadata (2.8 kB)
INFO: pip is looking at multiple versions of langchain-experimental to determine which version is compatible with other requirements. This could take a while.
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.1.post1-py3-none-any.whl.metadata (1.7 kB)
  Downloading langchain_experimental-0.3.1-py3-none-any.whl.metadata (1.7 kB)
  Using cached langchain_experimental-0.3.0-py3-none-any.whl.metadata (1.7 kB)
  Using cached langchain_experimental-0.0.65-py3-none-any.whl.metadata (1.7 kB)
Collecting google-api-core<3.

In [5]:
import os
import getpass

# Prompt for the OpenAI key only when it is not already present in the
# environment, so a key exported in the shell is respected and a
# "Restart & Run All" does not stop at an interactive prompt every time.
# getpass does not echo the typed key, which keeps it out of the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")

In [2]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from datetime import datetime, timedelta

documents = []


def _annotate_review(review, movie_number):
    """Enrich one loaded review's metadata in place.

    Sets "Movie_Title" to "John Wick <movie_number>", converts "Rating" to an
    int (an empty rating becomes 0), and stamps "last_accessed_at" so that
    higher-numbered (newer) movies get a more recent timestamp.
    """
    meta = review.metadata

    meta["Movie_Title"] = f"John Wick {movie_number}"

    # A missing rating is read as an empty value; treat it as a 0 rating.
    meta["Rating"] = int(meta["Rating"]) if meta["Rating"] else 0

    # Movie 4 is stamped "now", movie 1 three days earlier.
    meta["last_accessed_at"] = datetime.now() - timedelta(days=4 - movie_number)


# One CSV of reviews per movie: john_wick_1.csv ... john_wick_4.csv.
for movie_number in range(1, 5):
    reviews = CSVLoader(
        file_path=f"john_wick_{movie_number}.csv",
        metadata_columns=["Review_Date", "Review_Title", "Review_Url", "Author", "Rating"],
    ).load()

    for review in reviews:
        _annotate_review(review, movie_number)

    documents.extend(reviews)

In [3]:
# Sanity check: report how many review documents were loaded.
total_documents = len(documents)
print(total_documents)

100


In [6]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Models backing the test set generator. They are passed positionally below as
# (generator LLM, critic LLM, embeddings).
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Share of each question type in the generated set; the weights sum to 1.0.
qa_distribution = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}

# Number of question/answer pairs requested from the generator.
num_qa_pairs = 20

# Long-running step: calls the OpenAI API for embeddings and generation.
synthetic_testset = generator.generate_with_langchain_docs(documents, num_qa_pairs, qa_distribution)

synthetic_testset_df = synthetic_testset.to_pandas()

# ragas may log "max retries exceeded" for an evolution and keep going, so the
# result can come back with fewer rows than requested. Surface that here
# instead of silently writing a short test set.
generated_count = len(synthetic_testset_df)
if generated_count < num_qa_pairs:
    print(
        f"Warning: only {generated_count} of {num_qa_pairs} requested QA pairs were generated"
    )

print("Writing synthetic_testset.csv")
synthetic_testset_df.to_csv("synthetic_testset.csv")

embedding nodes:   0%|          | 0/200 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

max retries exceeded for SimpleEvolution(generator_llm=LangchainLLMWrapper(run_config=RunConfig(timeout=180, max_retries=15, max_wait=90, max_workers=16, exception_types=<class 'openai.RateLimitError'>, log_tenacity=False, seed=42)), docstore=InMemoryDocumentStore(splitter=<langchain_text_splitters.base.TokenTextSplitter object at 0x144d257d0>, nodes=[Node(metadata={'source': 'john_wick_1.csv', 'row': 0, 'Review_Date': '6 May 2015', 'Review_Title': ' Kinetic, concise, and stylish; John Wick kicks ass.\n', 'Review_Url': '/review/rw3233896/?ref_=tt_urv', 'Author': 'lnvicta', 'Rating': 8, 'Movie_Title': 'John Wick 1', 'last_accessed_at': datetime.datetime(2024, 9, 27, 13, 9, 6, 15262)}, page_content=": 0\nReview: The best way I can describe John Wick is to picture Taken but instead of Liam Neeson it's Keanu Reeves and instead of his daughter it's his dog. That's essentially the plot of the movie. John Wick (Reeves) is out to seek revenge on the people who took something he loved from him.

Writing synthetic_testset.csv
