In [None]:
# Redis connection settings.
# Every value can be supplied through an environment variable so that real
# credentials never have to be written into the notebook. When a variable is
# not set, the original placeholder is used, so editing the values in place
# still works exactly as before.
import os

url = os.environ.get("REDIS_URL", "redis_url")  # connection URL passed to the LangChain Redis store

host = os.environ.get("REDIS_HOST", "redis_host")
password = os.environ.get("REDIS_PASSWORD", "redis_password")
port = int(os.environ.get("REDIS_PORT", 8888))  # replace_with_redis_port

import gzip
import json
import pandas as pd
import numpy as np

import redis
from langchain.vectorstores.redis import Redis


# Low-level redis-py client built from the host/port/password settings;
# the notebook uses it as `r` for index creation, document writes and searches.
r = redis.Redis(
  host=host,
  port=port,
  password=password)

In [None]:
def get_data(asin, path='AMAZON_FASHION.json.gz'):
    """Load the reviews of one product from a gzipped JSON-lines file.

    Parameters
    ----------
    asin : str
        Product identifier; only rows whose ``asin`` column equals it are kept.
    path : str, optional
        Location of the gzip-compressed file holding one JSON object per line.
        Defaults to ``'AMAZON_FASHION.json.gz'`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        The rows for ``asin`` that have a non-null ``reviewText``. The index
        keeps each row's position among the parsed records.

    Raises
    ------
    KeyError
        If none of the parsed records provides a ``reviewText`` or ``asin`` field.
    """
    records = []
    # Text mode with an explicit encoding so decoding does not depend on the
    # platform default.
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) is not valid JSON; skip it.
                continue
            records.append(json.loads(line))

    df = pd.DataFrame.from_dict(records)
    df = df[df['reviewText'].notna()]
    df = df.loc[df['asin'] == asin].copy()
    return df

# Reviews of a single product, reduced to the columns used in the rest of the notebook.
# reset_index() renumbers the rows from 0, and later cells use those labels as positions
# into the embedding lists.
df=get_data('B000KPIHQ4').reset_index()[['overall','asin','reviewText','summary','reviewerID']]

max_text_length=400  # maximum number of characters kept per review

def truncate_review(text):
    """Return at most the first ``max_text_length`` characters of ``text``."""
    return text[:max_text_length]

# Column-wise operations instead of DataFrame.apply(axis=1): same values, no per-row
# Series construction, and they also behave correctly when the frame is empty.
df['reviewText']=df['reviewText'].map(truncate_review)

df['overall']=df['overall'].astype(int)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
# Embedding model created with LangChain's defaults (no model name is passed).
# NOTE(review): the index schemas in this notebook declare a vector dimension, so the
# default model is assumed to produce vectors of that size — confirm with len(vectors[0]).
embeddings = HuggingFaceEmbeddings()

In [None]:
# Only the first 100 reviews are embedded and indexed; slice once and reuse the slice.
first_reviews = df.head(100)

# {row label: {column: value}} for the selected reviews
reviews = first_reviews.to_dict(orient='index')

# Review bodies and their embeddings
texts = first_reviews['reviewText'].to_list()
vectors = embeddings.embed_documents(texts)

# Review titles and their embeddings
summaries = first_reviews['summary'].to_list()
summary_vectors = embeddings.embed_documents(summaries)

#vstore = Redis.from_texts(texts, embeddings, redis_url=url,  index_name='reviewidx')

#vstore_meta = Redis.from_texts(texts, embeddings, redis_url=url, index_name='reviewidx_meta',metadatas=metadata)

In [None]:
# Length of one embedding; compare it with the DIM value declared in the index schemas.
len(vectors[0])

In [None]:
# List the search indexes currently defined on the server.
r.execute_command('FT._LIST')

In [None]:
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField
from redis.commands.search.field import NumericField

from redis.commands.search.indexDefinition import IndexDefinition, IndexType

from redis.commands.search.query import Query

In [None]:
# Non-transactional pipeline: commands queued on it are sent when execute() is called.
# The bulk-load cells of this notebook all reuse this one object.
pipe = r.pipeline(transaction=False)

In [None]:
# Field definitions for the hash-backed index.
# "overall" is indexed as text here, whereas the JSON schemas index it as a number.
hash_vector_attributes = {
    "TYPE": "FLOAT32",
    "DIM": 768,
    "DISTANCE_METRIC": "COSINE",
}
schema = (
    TextField("overall"),
    TextField("reviewText"),
    TextField("summary"),
    VectorField("vector", "HNSW", hash_vector_attributes),
)

In [None]:
# Create the hash-backed index; its definition limits it to hashes whose key
# starts with `prefix`.
prefix = "doc:reviewidx_hash"

hash_definition = IndexDefinition(prefix=[prefix], index_type=IndexType.HASH)
r.ft("reviewidx_hashed").create_index(fields=schema, definition=hash_definition)

In [None]:
# Bulk-load the reviews as Redis hashes under the current value of `prefix`.
# A fresh mapping is built for every hash and the entries of `reviews` are left
# untouched, so the bytes vector never leaks into that shared dict and the cell
# can be re-run regardless of what other cells attached to the entries.
# Keys are derived from reviewerID, so two reviews by the same reviewer share a key.
for i, review in reviews.items():
    key = prefix + ':' + review['reviewerID']
    # Keep only the plain review fields; drop any vector data already attached.
    record = {name: value for name, value in review.items() if name not in ('vector', 'vectors')}
    # Hash fields hold raw bytes, so the embedding is stored as packed float32 values.
    record['vector'] = np.array(vectors[i]).astype(np.float32).tobytes()
    pipe.hset(key, mapping=record)
pipe.execute()

In [None]:
# List the indexes again to check that "reviewidx_hashed" is now present.
r.execute_command('FT._LIST')

In [None]:
# Field definitions for the JSON-backed index. Each field is addressed by a JSONPath
# and exposed to queries under the alias given in `as_name`.
prefix = "doc:reviewidx_json"

json_text_fields = tuple(
    TextField("$." + field_name, as_name=field_name)
    for field_name in ("asin", "reviewText", "reviewerID", "summary")
)
json_vector_attributes = {"TYPE": "FLOAT32", "DIM": 768, "DISTANCE_METRIC": "COSINE"}

schema = json_text_fields + (
    NumericField("$.overall", as_name="overall"),
    VectorField("$.vector", "HNSW", json_vector_attributes, as_name="vector"),
)

In [None]:
# Create the JSON-backed index; its definition limits it to JSON documents whose key
# starts with `prefix`.
json_definition = IndexDefinition(prefix=[prefix], index_type=IndexType.JSON)
r.ft("reviewidx_json").create_index(schema, definition=json_definition)

In [None]:
# Bulk-load the reviews as JSON documents under the current value of `prefix`.
# Each document is built from a copy of the review fields, so the entries of `reviews`
# are not modified and the cell gives the same result however often, and in whatever
# order, the load cells are run.
for i, review in reviews.items():
    key = prefix + ':' + review['reviewerID']
    # Keep only the plain review fields; drop any vector data already attached.
    record = {name: value for name, value in review.items() if name not in ('vector', 'vectors')}
    # JSON documents store the embedding as a plain list of floats.
    record['vector'] = vectors[i]
    pipe.json().set(key, '$', record)
pipe.execute()

In [None]:
# List the indexes again to check that "reviewidx_json" is now present.
r.execute_command('FT._LIST')

In [None]:
# Fetch one stored JSON document by its key to inspect what was written.
r.json().get('doc:reviewidx_json:A2YBAAGNFYJHFJ','$')

In [None]:
# Field definitions for the multi-vector JSON index: the vector field points at every
# element of the document's "vectors" array through the path $.vectors[*].
prefix = "doc:reviewidx_json_multi"

multi_text_fields = tuple(
    TextField("$." + field_name, as_name=field_name)
    for field_name in ("asin", "reviewText", "reviewerID", "summary")
)
multi_vector_attributes = {"TYPE": "FLOAT32", "DIM": 768, "DISTANCE_METRIC": "COSINE"}

schema = multi_text_fields + (
    NumericField("$.overall", as_name="overall"),
    VectorField("$.vectors[*]", "HNSW", multi_vector_attributes, as_name="vector"),
)

In [None]:
# Create the multi-vector JSON index; its definition limits it to JSON documents whose
# key starts with `prefix`.
multi_definition = IndexDefinition(prefix=[prefix], index_type=IndexType.JSON)
r.ft("reviewidx_json_multi").create_index(schema, definition=multi_definition)

In [None]:
# Bulk-load the reviews as JSON documents that carry two embeddings each
# (review text and summary) under the current value of `prefix`.
# Each document is built from a copy of the plain review fields, so the entries of
# `reviews` are not modified and no single-vector field left on them by another cell
# ends up in these documents.
for i, review in reviews.items():
    key = prefix + ':' + review['reviewerID']
    # Keep only the plain review fields; drop any vector data already attached.
    record = {name: value for name, value in review.items() if name not in ('vector', 'vectors')}
    # Order matters only for inspection: text embedding first, summary embedding second.
    record['vectors'] = [vectors[i], summary_vectors[i]]
    pipe.json().set(key, '$', record)
pipe.execute()

In [None]:
from redis.commands.json.path import Path

In [None]:
# Fetch one stored multi-vector JSON document by its key to inspect what was written.
r.json().get('doc:reviewidx_json_multi:A1BDZBAMHAN1G3', '$')

In [None]:
# Nearest-neighbour search against the multi-vector JSON index.
query_string = "Very uncomfortable"

# The query text is embedded and handed over as packed float32 bytes via a query parameter.
embedded_query = np.array(embeddings.embed_documents([query_string])).astype(np.float32).tobytes()
params_dict = {"vec_param": embedded_query}

# "*" applies no pre-filter; KNN 5 asks for the five closest vectors and exposes the
# distance under the name vector_score.
query_syntax = "*=>[KNN 5 @vector $vec_param AS vector_score]"
vss_query = (
    Query(query_syntax)
    .return_fields("overall", "vector_score")
    .sort_by("vector_score")
    .dialect(2)
)

vss_results = r.ft('reviewidx_json_multi').search(vss_query, query_params=params_dict)

In [None]:
# Display the search result object returned by the previous cell.
vss_results

In [None]:
# Filtered nearest-neighbour search against the hash-backed index.
query_string = "Very uncomfortable"

# The query text is embedded and handed over as packed float32 bytes via a query parameter.
embedded_query = np.array(embeddings.embed_documents([query_string])).astype(np.float32).tobytes()
params_dict = {"vec_param": embedded_query}

# The part before "=>" is the pre-filter: only documents matching @overall:3 are
# considered for the KNN step.
query_syntax = "(@overall:3)=>[KNN 5 @vector $vec_param AS vector_score]"
vss_query = (
    Query(query_syntax)
    .return_fields("overall", "vector_score")
    .sort_by("vector_score")
    .dialect(2)
)

vss_results = r.ft('reviewidx_hashed').search(vss_query, query_params=params_dict)

In [None]:
# Display the search result object returned by the filtered query.
vss_results

In [None]:
from dotenv import load_dotenv,find_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this provides the API key used by ChatOpenAI — confirm.
load_dotenv()

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI

from langchain.chains import RetrievalQA
from langchain.chains.summarize import load_summarize_chain

# Chat model shared by the question-answering chain and the summarisation chain.
chat = ChatOpenAI(model_name="gpt-4",temperature=0.2)

In [None]:
# Attach the LangChain vector store to an index named 'reviewidx' that is expected to
# exist on the server already.
# NOTE(review): nothing in the visible notebook creates 'reviewidx'; the only call that
# names it (Redis.from_texts(..., index_name='reviewidx')) is commented out in an earlier
# cell, so this cell presumably fails on a fresh database — confirm and restore if needed.
vstore = Redis.from_existing_index(index_name='reviewidx', embedding=embeddings,redis_url=url)

In [None]:
# Question-answering chain: documents come from the vector store's retriever and are
# answered over by the chat model using the "stuff" chain type.
review_chain = RetrievalQA.from_chain_type(llm=chat, chain_type="stuff", retriever=vstore.as_retriever())

In [None]:
# Question sent to the retrieval chain; it names the product so the model has context
# for the retrieved reviews.
q="""
The reviews you see are for a product called 'Powerstep Pinnacle Orthotic Shoe Insoles'.
What is the overall impression of these reviews? Give most prevalent examples in bullets.
What do you suggest we focus on improving?
"""

# Run the chain on the question and keep its answer.
result=review_chain.run(q)

In [None]:
# print() is used so that line breaks in the answer are rendered.
print(result)

In [None]:
from langchain.document_loaders.base import Document

In [None]:
# Nearest-neighbour search against the hash-backed index, this time also returning the
# review text and summary so the hits can be turned into documents.
query_string = "Very uncomfortable"

# The query text is embedded and handed over as packed float32 bytes via a query parameter.
embedded_query = np.array(embeddings.embed_documents([query_string])).astype(np.float32).tobytes()
params_dict = {"vec_param": embedded_query}

query_syntax = "*=>[KNN 5 @vector $vec_param AS vector_score]"
vss_query = (
    Query(query_syntax)
    .return_fields("overall", "vector_score", "reviewText", "summary")
    .dialect(2)
)

vss_results = r.ft('reviewidx_hashed').search(vss_query, query_params=params_dict)

In [None]:
# Display the search result object, including the returned text fields.
vss_results

In [None]:
# Wrap every search hit in a Document whose text has the form
# " review score:<overall> <reviewText> <summary>".
docs = [
    Document(
        page_content=" review score:" + ' '.join((review.overall, review.reviewText, review.summary))
    )
    for review in vss_results.docs
]

In [None]:
# Display the documents that will be summarised.
docs

In [None]:
# Prompt for the summarisation chain; {text} is the placeholder named in input_variables.
prompt_template_summary = """
Write a summary of the reviews:

{text}

The summary should be about five lines long
"""
PROMPT = PromptTemplate(template=prompt_template_summary, input_variables=["text"])
# Summarisation chain built on the shared chat model with the custom prompt.
chain = load_summarize_chain(chat, chain_type="stuff", prompt=PROMPT)
# Summarise the documents built from the search hits.
summary=chain.run(docs)

In [None]:
# print() is used so that line breaks in the summary are rendered.
print(summary)