# Prepare the data, run a simple EDA, upsert to Pinecone, and perform checks

### Import packages

In [12]:
import ast
import os

import pandas as pd
from dotenv import load_dotenv,find_dotenv
from langchain.chains import LLMChain, RetrievalQA
from langchain.chains import StuffDocumentsChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.globals import set_verbose, set_debug
from langchain.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
# Switch on LangChain's global debug and verbose flags for the rest of the notebook.
# With these enabled, the chain runs further down print their inputs step by step.
set_debug(True)
set_verbose(True)

In [2]:
# Locate a dotenv file with find_dotenv() and load it
# (presumably this supplies the API keys read from os.environ below -- confirm).
load_dotenv(find_dotenv())

True

### Pinecone setup

In [3]:
# Pinecone client; the API key is taken from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

index_name = "mekari-test"

# Create the serverless index only when no index with this name exists yet.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=768,  # presumably the vector size of the embedding model used later -- confirm
        metric="euclidean",
        spec=serverless_spec,
    )

# Handle used by the later cells for upserts, stats and queries.
index = pc.Index(index_name)

### Prepare data

In [5]:
# Read the raw reviews and discard the leftover "Unnamed: 0" column in one chained step.
df = pd.read_csv("SPOTIFY_REVIEWS.csv").drop(columns="Unnamed: 0")

### Simple EDA

In [7]:
# Character length of every review (values are cast to str first), summarised with
# the quartiles plus the 99th percentile.
review_lengths = df['review_text'].astype(str).map(len)
length_summary = review_lengths.describe(percentiles=[.25, .5, .75, .99])

# The float format only affects how the summary is displayed, not the numbers themselves.
with pd.option_context('display.float_format', '{:,.0f}'.format):
    print(length_summary)

count   3,377,423
mean           67
std            93
min             1
25%            11
50%            30
75%            81
99%           476
max         3,753
Name: review_text, dtype: float64


### Exception Handling: check and normalize review data types

In [47]:
#check review data type (suspicious having floats or integer instead string)
def count_types(lst):
    """Tally how many elements of ``lst`` are ints, strings and floats.

    Each element is matched against ``int``, then ``str``, then ``float``, and
    only the first matching type is counted. Because the check uses
    ``isinstance``, ``bool`` values count as ints. Elements of any other type
    (for example ``None``) are ignored.

    Args:
        lst: Iterable of arbitrary Python objects.

    Returns:
        Tuple ``(int_count, str_count, float_count)``.
    """
    checked_types = (int, str, float)
    tally = {kind: 0 for kind in checked_types}

    for item in lst:
        # First matching type wins, mirroring an if/elif chain in this order.
        for kind in checked_types:
            if isinstance(item, kind):
                tally[kind] += 1
                break

    return tally[int], tally[str], tally[float]
# Count the Python types found in the review column and report them.
review_values = df['review_text'].to_list()
int_count, str_count, float_count = count_types(review_values)

print(f"Integers: {int_count}, Strings: {str_count}, Floats: {float_count}")

Integers: 0, Strings: 9965, Floats: 35


In [8]:
# 99% of the reviews are shorter than 500 characters, so every review is capped at 500.
max_length = 500

# Cast to str first so non-string values are handled too, then keep the leading
# max_length characters of each review.
df['cut'] = df['review_text'].astype(str).str[:max_length]

In [9]:
#for this example we only keep the first 10000 reviews
# .copy() makes the sample an independent DataFrame. Without it, the later
# column assignment on this slice is a chained assignment, which pandas flags
# with SettingWithCopyWarning.
df=df.head(10000).copy()

In [6]:
#embed using hugging face
embeddings = HuggingFaceEmbeddings()

# Embed all truncated reviews in one batched call rather than calling
# embed_query() once per row through df.apply(), which is far slower.
review_vectors = embeddings.embed_documents(df['cut'].tolist())

# Wrap in a Series aligned on df's index so each row receives its own vector
# (one list of floats per cell).
df['embeddings'] = pd.Series(review_vectors, index=df.index)

  warn_deprecated(


In [5]:
#save embed data (only if not saved yet), then load it back from disk
# The CSV acts as a cache for the expensive embedding step: it is written only
# when the file does not exist, so an existing cache is never overwritten.
embed_cache_path = 'embed_data.csv'
if not os.path.exists(embed_cache_path):
    df.to_csv(embed_cache_path, index=False)

# NOTE: after the CSV round-trip the 'embeddings' column holds each vector as
# text, so it has to be parsed back into a list before use.
df=pd.read_csv(embed_cache_path)

### Upsert

In [14]:
# Prepare the inputs for the upsert.
# The 'embeddings' column is parsed from its text form back into Python objects.
embeddings_list = [ast.literal_eval(serialized) for serialized in df['embeddings']]

# Make sure every truncated review is a string before it is used as metadata.
df['cut'] = df['cut'].astype(str)
documents = df['cut'].tolist()

In [16]:
# Build one (id, vector, metadata) tuple per review; the id is the row position as a string.
upsert_data = [
    (str(row_number), vector, {"text": review})
    for row_number, (vector, review) in enumerate(zip(embeddings_list, documents))
]

# Send the tuples to the index in fixed-size batches, reporting progress per batch.
batch_size = 100
batch_starts = range(0, len(upsert_data), batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    index.upsert(vectors=upsert_data[start:start + batch_size])
    print(f"Upserted batch {batch_number}")

print("All data upserted.")

Upserted batch 1
Upserted batch 2
Upserted batch 3
Upserted batch 4
Upserted batch 5
Upserted batch 6
Upserted batch 7
Upserted batch 8
Upserted batch 9
Upserted batch 10
Upserted batch 11
Upserted batch 12
Upserted batch 13
Upserted batch 14
Upserted batch 15
Upserted batch 16
Upserted batch 17
Upserted batch 18
Upserted batch 19
Upserted batch 20
Upserted batch 21
Upserted batch 22
Upserted batch 23
Upserted batch 24
Upserted batch 25
Upserted batch 26
Upserted batch 27
Upserted batch 28
Upserted batch 29
Upserted batch 30
Upserted batch 31
Upserted batch 32
Upserted batch 33
Upserted batch 34
Upserted batch 35
Upserted batch 36
Upserted batch 37
Upserted batch 38
Upserted batch 39
Upserted batch 40
Upserted batch 41
Upserted batch 42
Upserted batch 43
Upserted batch 44
Upserted batch 45
Upserted batch 46
Upserted batch 47
Upserted batch 48
Upserted batch 49
Upserted batch 50
Upserted batch 51
Upserted batch 52
Upserted batch 53
Upserted batch 54
Upserted batch 55
Upserted batch 56
U

### Perform checks on the upsert, vector store, LLM, and chain

In [26]:
# Check that the vectors were successfully upserted by printing the index statistics.
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}


In [16]:
# Check the index with a similarity search.
# Embed the query text with the same embedding model.
query_text = "good features"
query_embedding = embeddings.embed_query(query_text)

# Ask the index for the three closest vectors in the default namespace.
response = index.query(namespace="", vector=query_embedding, top_k=3, include_values=True)

# Show the id and score of every match.
print("Similarity search results:")
for hit in response['matches']:
    hit_id, hit_score = hit['id'], hit['score']
    print(f"ID: {hit_id}, Score: {hit_score}")

Similarity search results:
ID: id-8397, Score: 0.742148876
ID: id-9351, Score: 0.769828677
ID: id-1217, Score: 0.849857092


In [19]:
# Check the LLM and its API key: build the Gemini chat model used by the QA chain.
# NOTE(review): ChatGoogleGenerativeAI is already imported in the imports cell at the
# top of the notebook; this second import is redundant.
from langchain_google_genai import ChatGoogleGenerativeAI

# temperature=0 is set here; the API key is read from the GOOGLE_API_KEY environment variable.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    google_api_key=os.environ.get("GOOGLE_API_KEY"),
    convert_system_message_to_human=True
)

In [5]:
# Smoke test: send a single prompt to the model to confirm the LLM and its key work.
llm.invoke("what is llm?")

AIMessage(content='**LLM stands for Large Language Model.**\n\n**What is a Large Language Model (LLM)?**\n\nA large language model is a type of artificial intelligence (AI) system that is trained on a massive dataset of text and code. LLMs are capable of understanding and generating human-like text in response to a wide range of prompts and questions.\n\n**Key Characteristics of LLMs:**\n\n* **Massive Size:** LLMs typically have billions or even trillions of parameters, making them significantly larger than traditional language models.\n* **Deep Learning:** They are trained using deep learning algorithms, specifically transformer networks, which allow them to capture complex patterns in language.\n* **Text Generation:** LLMs excel at generating coherent and grammatically correct text, including articles, stories, summaries, and conversations.\n* **Language Understanding:** They can understand the meaning and context of text, enabling them to perform tasks like question answering, senti

In [49]:
# Wrap the existing Pinecone index as a LangChain vector store, using the same
# embedding object that produced the stored vectors.
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)
# Retriever with default settings; it is handed to the RetrievalQA chain below.
retriever = vectorstore.as_retriever()

In [40]:
# Prompt text for the QA chain. It has two placeholders: {document} for the
# retrieved review text and {question} for the user's question.
custom_prompt_template = """
Thees are Document of Google Store reviews for a music streaming application (Spotify) sourced from various users. The management is currently
facing difficulties in extracting actionable insights from these reviews, Please answer this Question.

Document: {document}
Question: {question}
Answer:
"""

In [41]:
# Turn the raw template text into a PromptTemplate with its two declared variables.
prompt_variables = ["document", "question"]
custom_prompt = PromptTemplate(template=custom_prompt_template, input_variables=prompt_variables)

In [42]:
# LLM chain that fills the custom prompt and sends it to the model.
llm_chain = LLMChain(prompt=custom_prompt, llm=llm)

# Chain that places the retrieved documents into the prompt's "document" variable.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="document",
    llm_chain=llm_chain,
)

In [43]:
# Retrieval QA chain: the retriever fetches reviews and the combine-documents
# chain builds the prompt from them and queries the LLM.
qa_chain = RetrievalQA(combine_documents_chain=combine_documents_chain, retriever=retriever)

In [44]:
# Business questions posed to the retrieval QA chain.
# Each literal opens with exactly three quotes, so the question text does not
# begin with a stray '"' character.
query1 = """What are the specific features or aspects that users appreciate the
most in our application?"""
query2 = """In comparison to our application, which music streaming platform are
users most likely to compare ours with"""

In [45]:
def print_query_and_answer(query):
    """Run ``query`` through the notebook-level ``qa_chain`` and print the answer.

    Args:
        query: Question text passed to ``qa_chain.invoke``.

    Prints the value stored under the "result" key of the chain's response
    (``None`` if that key is absent). Returns nothing.
    """
    response = qa_chain.invoke(query)
    print(response.get("result"))

# Run both questions through the QA chain, printing a heading before each answer.
for heading, question in (("Query 1\n", query1), ("\nQuery 2\n", query2)):
    print(heading)
    print_query_and_answer(question)

Query 1

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "\"What are the specific features or aspects that users appreciate the\nmost in our application?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "\"What are the specific features or aspects that users appreciate the\nmost in our application?",
  "document": "Everything I could ever want in an app.\n\nStreaming, personal list, and media ... quality ... best of all worlds\n\nGood app. Thats all\n\nToo many new features not requested by the user's.  Only poor app developers do that.  Listen to us, it will save you money researching, implementing then removing what we don't want."
}
[32;1m[1;3m[llm/start][0m [1m[chain:RetrievalQA > chain:StuffDocu

[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "In comparison to our application, which music streaming platform are\nusers most likely to compare ours with",
  "document": "Best Music Streaming Service\n\nNo better music service\n\nExactly what I've been looking for in a music streaming app\n\nThis is by far the best music app and its hard to say but its better than pandora"
}
[32;1m[1;3m[llm/start][0m [1m[chain:RetrievalQA > chain:StuffDocumentsChain > chain:LLMChain > llm:ChatGoogleGenerativeAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: \nThees are Document of Google Store reviews for a music streaming application (Spotify) sourced from various users. The management is currently\nfacing difficulties in extracting actionable insights