In [1]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, NonNegativeInt
from typing import List
from random import sample 

First, let's create a loader and load reviews from tv-reviews.csv into memory

In [2]:
# Load the TV reviews CSV into memory; the loader yields one Document per row
# (each carries `source` and `row` metadata, as the preview output shows).
from langchain.document_loaders.csv_loader import CSVLoader

reviews_loader = CSVLoader("./data/tv-reviews.csv")
data = reviews_loader.load()

Then, let's initialize our LLM

In [3]:
# Model settings for the review-classification part of the notebook.
model_name, temperature = "gpt-3.5-turbo", 0.0
llm = OpenAI(
    max_tokens=500,
    temperature=temperature,
    model_name=model_name,
)



Now, let's set up our output parser and a prompt template

In [4]:
# Shape of the JSON answer expected from the LLM: two lists of review indices.
# The parser built from this model turns the field descriptions into the
# format instructions that are appended to the prompt.
# NOTE(review): documented with `#` comments rather than a class docstring so
# that no extra text can end up in the generated schema.
class ReviewSentiment(BaseModel):
    # Zero-based positions (within the list of reviews sent to the model) of
    # the reviews judged positive.
    positives: List[NonNegativeInt] = Field(
        description="index of a positive TV review, starting from 0"
    )
    # Zero-based positions of the reviews judged negative.
    negatives: List[NonNegativeInt] = Field(
        description="index of a negative TV review, starting from 0"
    )


# Wrap the schema in an output parser and show the instructions it generates
# for the model.
parser = PydanticOutputParser(pydantic_object=ReviewSentiment)
format_instructions = parser.get_format_instructions()
print(format_instructions)

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"positives": {"title": "Positives", "description": "index of a positive TV review, starting from 0", "type": "array", "items": {"type": "integer", "minimum": 0}}, "negatives": {"title": "Negatives", "description": "index of a negative TV review, starting from 0", "type": "array", "items": {"type": "integer", "minimum": 0}}}, "required": ["positives", "negatives"]}
```


In [5]:
# Prompt layout: the question first, then the parser's format instructions
# (bound once as a partial variable), then the reviews as context.
prompt = PromptTemplate(
    input_variables=["question", "context"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template="{question}\n{format_instructions}\nContext: {context}",
)

Pick 3 sample reviews to classify - LLMs have a limited context window they can work with. In later exercises, we'll see how to deal with that differently

In [6]:
# Preview three randomly drawn reviews. This is display-only: the result is not
# stored, and the classification cell below draws its own random sample.
sample(data, k=3)

[Document(page_content='TV Name: Imagix Pro\nReview Title: Easy Setup and Navigation\nReview Rating: 9\nReview Text: Setting up the Imagix Pro was a breeze. The instructions were clear and the TV guided me through the process smoothly. The interface is intuitive and easy to navigate. I love how seamless it is to switch between different apps and inputs. This TV has made my life so much simpler!', metadata={'source': './data/tv-reviews.csv', 'row': 4}),
 Document(page_content="TV Name: VisionMax Ultra\nReview Title: Disappointing Sound\nReview Rating: 5\nReview Text: While the picture quality of the VisionMax Ultra is exceptional, the sound quality falls short. The built-in speakers lack depth and the audio feels hollow. I had to connect external speakers to enjoy a fulfilling audio experience. It's a letdown considering the overall performance of the TV.", metadata={'source': './data/tv-reviews.csv', 'row': 11}),
 Document(page_content="TV Name: VisionMax Ultra\nReview Title: Immersive

In [7]:
# Pick 3 random reviews to classify. The indices the LLM returns are later used
# to index into this list, so its order must not change afterwards.
reviews_to_classify = sample(data, k=3)

# Instruction text placed ahead of the parser's format instructions.
question = """
    Review TVs provided in the context. 
    Only use the reviews provided in this context, do not make up new reviews or use any existing information you know about these TVs. 
    If there are no positive or negative reviews, output an empty JSON array. 
"""

# Build the context from the sampled reviews *before* formatting the prompt.
# `context` used to be defined only in the following cell, so this cell raised
# NameError when the notebook was run top to bottom on a fresh kernel.
context = "\n".join(review.page_content for review in reviews_to_classify)

# Generate the textual prompt from the prompt template.
query = prompt.format(context=context, question=question)

In [8]:
# Rebuild the prompt from the currently sampled reviews and print it so the
# exact text sent to the model can be inspected.
question = """
    Review TVs provided in the context. 
    Only use the reviews provided in this context, do not make up new reviews or use any existing information you know about these TVs. 
    If there are no positive or negative reviews, output an empty JSON array. 
"""
review_texts = [doc.page_content for doc in reviews_to_classify]
context = "\n".join(review_texts)

query = prompt.format(question=question, context=context)
print(query)


    Review TVs provided in the context. 
    Only use the reviews provided in this context, do not make up new reviews or use any existing information you know about these TVs. 
    If there are no positive or negative reviews, output an empty JSON array. 

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"positives": {"title": "Positives", "description": "index of a positive TV review, starting from 0", "type": "array", "items": {"type": "integer", "minimum": 0}}, "negatives": {"title": "Negatives", "description": "index of a negative TV review, starting from 0", "type": "a

Finally, let's send our query to the LLM and use the parser we set up to parse the output into a Python object

In [9]:
# Send the formatted prompt to the model and print the raw text it returns
# (expected to be JSON matching the ReviewSentiment schema).
output = llm(query)
print(output)


  warn_deprecated(


{
    "positives": [1, 2],
    "negatives": [0]
}


In [10]:
# Parse the model's JSON text into a ReviewSentiment instance.
result = parser.parse(output)
result  # bare expression so the notebook displays the parsed object

ReviewSentiment(positives=[1, 2], negatives=[0])

In [11]:
# Show the sampled reviews that the model labelled as positive.
positive_reviews = [reviews_to_classify[idx].page_content for idx in result.positives]
print("Positives:\n" + "\n".join(positive_reviews))

Positives:
TV Name: Imagix Pro
Review Title: Outstanding Value for Money
Review Rating: 9
Review Text: The Imagix Pro is a fantastic value for money. Considering its high-quality performance, impressive features, and sleek design, it offers more bang for the buck compared to other TVs in the market. I am extremely satisfied with my purchase.
TV Name: Imagix Pro
Review Title: Impressive Features
Review Rating: 8
Review Text: The Imagix Pro is packed with impressive features that enhance my viewing experience. The smart functionality allows me to easily stream my favorite shows and movies. The remote control is user-friendly and has convenient shortcuts. The slim design is sleek and fits perfectly in my living room. The only downside is that the sound could be better, but overall, I'm satisfied.


In [12]:
# Show the sampled reviews that the model labelled as negative.
negative_reviews = [reviews_to_classify[idx].page_content for idx in result.negatives]
print("Negatives:\n" + "\n".join(negative_reviews))

Negatives:
TV Name: VisionMax Ultra
Review Title: Insufficient HDMI Ports
Review Rating: 6
Review Text: One downside of the VisionMax Ultra is the limited number of HDMI ports. With the increasing number of HDMI devices, it's frustrating to constantly switch cables. I wish there were more ports to accommodate all my devices without the need for an HDMI switcher.


# Add Semantic Search Using RAG

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain import LLMChain
from langchain.chains.question_answering import load_qa_chain

First, re-initialize the LLM with a larger output token budget; then use a text splitter to split the documents into chunks

In [19]:
# Re-create the LLM for the RAG section with a larger max_tokens (2000) than
# the classifier above used (500).
model_name, temperature = "gpt-3.5-turbo", 0.0
llm = OpenAI(
    max_tokens=2000,
    temperature=temperature,
    model_name=model_name,
)



In [20]:
# Reload the reviews and split them into chunks of up to 1000 characters with
# no overlap between consecutive chunks.
reviews_loader = CSVLoader("./data/tv-reviews.csv")
data = reviews_loader.load()

text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
documents = text_splitter.split_documents(data)

In [21]:
# Number of chunks produced by the splitter.
len(documents)

20

Initialize your embeddings model

In [22]:
# OpenAI embeddings client, constructed with its default arguments.
underlying_embeddings = OpenAIEmbeddings()

Populate your vector database with the chunks

In [23]:
# Embed the chunks and index them in a Chroma vector store. Reuse the
# embeddings client created in the previous cell instead of constructing a
# second, identical OpenAIEmbeddings instance.
db = Chroma.from_documents(documents, underlying_embeddings)

In [30]:
# Question used both for retrieval and, later, as the instruction to the LLM.
query = """
    Based on the reviews in the context, tell me what people liked about the picture quality.
    Make sure you do not paraphrase the reviews, and only use the information provided in the reviews.
    """
# Retrieve the five chunks most semantically similar to the query.
top_k = 5
docs = db.similarity_search(query, k=top_k)

In [31]:
# Confirm how many documents the similarity search returned.
print(len(docs))

5


In [32]:
# Show the text of the first document returned by the similarity search.
print(docs[0].page_content)

TV Name: Imagix Pro
Review Title: Amazing Picture Quality
Review Rating: 9
Review Text: I recently purchased the Imagix Pro and I am blown away by its picture quality. The colors are vibrant and the images are crystal clear. It feels like I'm watching movies in a theater! The sound is also impressive, creating a truly immersive experience. Highly recommended!


Query your LLM with the query and the top 5 documents

In [33]:
# Prompt for the "stuff" QA chain: the question followed by the retrieved
# reviews. The separator was written as "\C" (an invalid escape sequence that
# put a literal backslash in the prompt and triggers an invalid-escape
# warning); the intended newline is used here, matching the earlier template.
prompt = PromptTemplate(
    template="{query}\nContext: {context}", input_variables=["query", "context"]
)

# The "stuff" chain puts all input documents into the prompt's `context`
# variable; `query` is passed through as the remaining prompt input.
chain = load_qa_chain(llm, prompt=prompt, chain_type="stuff")
print(chain.run(input_documents=docs, query=query))

People liked the vibrant colors, crystal clear images, and unmatched clarity of the picture quality on the Imagix Pro TV. They mentioned that it felt like watching movies in a theater and that every detail was sharp and lifelike, enhancing their overall viewing experience.


Use a RAG chain (`RetrievalQA`) that retrieves the relevant documents and queries the LLM in a single call

In [35]:
# Retrieval + answering in one chain. `as_retriever()` is called without
# arguments, so retrieval uses the retriever's defaults rather than the
# explicit top-5 search performed earlier.
retriever = db.as_retriever()
rag = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
answer = rag.run(query)
print(answer)

People liked the vibrant colors, crystal clear images, and unmatched clarity of the picture quality on the Imagix Pro TV.
