# **Importing the libraries**

In [None]:
# Standard library
import os

# Third-party
import spacy
from groq import Groq
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# No client is built here: credentials must never be hard-coded in a notebook.
# The Pinecone and Groq clients are created in the setup cell from
# environment variables.

In [None]:

# !python -m spacy download en_core_web_sm


#  Setting up
## 1. API keys (Groq & Pinecone). P.S. Since the OpenAI API is paid, I used the Groq API with the Mixtral 8x7B LLM, which is free and has very fast inference.
## 2. Initialising Pinecone, Groq, embeddings and spaCy.
* I have used the all-MiniLM-L6-v2 embedding model because it is lightweight yet performs impressively well, and its embeddings are compatible with storage and retrieval in Pinecone.

* I have used spaCy for sentence segmentation when splitting the text. I have tried to implement Dynamic Chunk Sizing with Semantic Segmentation.

In [None]:
## Setting up API keys
# Credentials are never stored in the notebook. Each key is taken from the
# environment; if it is missing, it is requested interactively (input is not
# echoed) and kept in os.environ for the rest of the session only.
from getpass import getpass

for key_name in ('PINECONE_API_KEY', 'GROQ_API_KEY'):
    if not os.environ.get(key_name):
        os.environ[key_name] = getpass(f'Enter {key_name}: ')

# initialising pinecone
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# setting groq client
groq_client = Groq(api_key=os.environ['GROQ_API_KEY'])

# initialise embeddings (sentence encoder) and the spaCy pipeline used for
# sentence segmentation
model = SentenceTransformer('all-MiniLM-L6-v2')
nlp = spacy.load("en_core_web_sm")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Implementing semantic chunking
* Instead of splitting the text into fixed-size chunks, I tried to implement a dynamic chunking strategy based on semantic coherence. This approach would ensure that each chunk contains complete and related information, potentially improving the relevance of retrieved context.

In [None]:
def semantic_chunking(text, min_chunk_size=100, max_chunk_size=500):
    """
    Splits text into chunks made of whole sentences.

    Sentences are produced by the module-level spaCy pipeline ``nlp`` and are
    accumulated in order. A chunk is emitted as soon as its accumulated length
    reaches ``min_chunk_size``. Before a sentence is added, the pending chunk
    is emitted early if adding the sentence would push it past
    ``max_chunk_size``. Sentences are never split, so a sentence longer than
    ``max_chunk_size`` is kept whole.

    Lengths are measured in characters of the stripped sentences; the single
    spaces used to join sentences are not counted. Sentences that are empty
    after stripping are ignored, so no chunk is ever empty or padded with
    extra spaces.

    Parameters:
    text (str): The text to split.
    min_chunk_size (int): Accumulated length at which a chunk is emitted.
    max_chunk_size (int): Length the pending chunk should not exceed when
        another sentence is appended to it.

    Returns:
    list: A list of strings, each one or more sentences joined by a space.
    """
    chunks = []
    current_chunk = []
    current_chunk_size = 0

    for sentence in nlp(text).sents:
        sentence_text = sentence.text.strip()
        if not sentence_text:
            # Whitespace-only "sentences" carry no content; keeping them
            # would yield double spaces or an empty chunk.
            continue
        sentence_length = len(sentence_text)

        # Close the pending chunk first if this sentence would overflow it.
        if current_chunk and current_chunk_size + sentence_length > max_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_chunk_size = 0

        current_chunk.append(sentence_text)
        current_chunk_size += sentence_length

        # Emit as soon as the chunk is large enough.
        if current_chunk_size >= min_chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_chunk_size = 0

    # Whatever is left is shorter than min_chunk_size but still content.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [None]:
def index_documents(documents, index_name='business-qa', batch_size=100):
    """
    Embeds text chunks and stores them in a Pinecone index.

    The index is created (serverless, AWS us-east-1, cosine metric) if it
    does not exist yet. Documents are encoded with the module-level sentence
    encoder ``model`` and upserted in batches, so the number of encoder calls
    and network round-trips grows with ``len(documents) / batch_size`` rather
    than with the number of documents.

    Parameters:
    documents (iterable of str): The text chunks to index.
    index_name (str): Name of the Pinecone index to create/use.
    batch_size (int): Number of documents encoded and upserted per request.

    Returns:
    None

    Raises:
    ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Create Pinecone index
    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=384,  # must equal the embedding model's output size
            metric='cosine',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )

    index = pc.Index(index_name)

    # Generate embeddings and index documents, one batch at a time.
    documents = list(documents)
    for start in range(0, len(documents), batch_size):
        batch = documents[start:start + batch_size]
        embeddings = model.encode(batch).tolist()
        # Vector ids are the document's position in `documents`, as strings;
        # each vector carries its source text as metadata.
        index.upsert(vectors=[
            (str(start + offset), embedding, {'text': doc})
            for offset, (doc, embedding) in enumerate(zip(batch, embeddings))
        ])

# Function for searching the documents

In [None]:
def search_documents(query, k=3):
    """
    Looks up the stored chunks closest to a query.

    The query is embedded with the module-level encoder ``model`` and sent to
    the 'business-qa' Pinecone index, asking for the ``k`` best matches along
    with their metadata.

    Parameters:
    query (str): The search text.
    k (int): How many matches to request.

    Returns:
    list: The 'text' metadata value of each match, in the order returned by
    the index.
    """
    qa_index = pc.Index('business-qa')
    query_vector = model.encode(query).tolist()
    response = qa_index.query(
        vector=query_vector,
        top_k=k,
        include_metadata=True,
    )

    texts = []
    for match in response.matches:
        texts.append(match.metadata['text'])
    return texts

# Code for generating responses from the LLM (Mixtral 8x7B in this case)

In [None]:
def generate_response(question, context, llm_model="mixtral-8x7b-32768", max_tokens=1024):
    """
    Asks the Groq-hosted chat model to answer a question from given context.

    The context and question are placed in a single user message, preceded by
    a system message that frames the assistant as working for Yardstick.

    Parameters:
    question (str): The user's question.
    context (str): Retrieved text the answer should be based on.
    llm_model (str): Groq model identifier to call.
        NOTE(review): confirm the default id is still served by Groq.
    max_tokens (int): Upper bound on the length of the generated reply.

    Returns:
    str: The content of the first completion choice.
    """
    prompt = f"""
    Context: {context}

    Human: {question}

    Assistant: Based on the context provided, I'll answer the human's question to the best of my ability.
    """

    response = groq_client.chat.completions.create(
        model=llm_model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant for a business company called Yardstick."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )

    return response.choices[0].message.content

In [None]:
def qa_bot(question):
    """
    Answers a question with retrieval-augmented generation.

    Parameters:
    question (str): The user's question.

    Returns:
    The value returned by generate_response for the question and the
    retrieved context.
    """
    # Fetch the matching chunks and flatten them into one space-separated
    # context string for the prompt.
    retrieved_chunks = search_documents(question)
    joined_context = " ".join(retrieved_chunks)

    return generate_response(question, joined_context)

In [None]:
def load_text_chunks(file_path, encoding='utf-8'):
    """
    Loads text from a file and returns a list of semantically chunked text.

    The file is decoded with an explicit encoding so the result does not
    depend on the platform's default locale.

    Parameters:
    file_path (str): The path to the text file.
    encoding (str): Text encoding of the file. Defaults to 'utf-8'.

    Returns:
    list: The chunks produced by semantic_chunking for the file's full text.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()

    return semantic_chunking(text)

In [None]:
# Path of the source text file, relative to the notebook's working directory.
file_path = 'Yardstick_text.txt'
# Read the file and split it into sentence-based chunks.
documents = load_text_chunks(file_path)
# Embed every chunk and upsert it into the Pinecone index.
index_documents(documents)

# The final chatbot

In [None]:
# Interactive chat loop: type "exit" (any case) to stop.
while True:
    try:
        question = input("User: ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the cell was interrupted: end the chat quietly
        break

    if not question:
        # Nothing was typed; do not spend a retrieval + LLM call on it.
        continue
    if question.lower() == 'exit':
        break

    answer = qa_bot(question)
    print(f"Bot: {answer}")

User: hi
Bot: Hello! How can I assist you today regarding Yardstick, Ayodhaya GPT, SEO, Live AI, Sales, or ChatBot? If you have any questions or ideas related to the blog post "The Betrayal of a Vision: Elon Musk's Legal Battle with OpenAI and Sam Altman," I'm here to help. I can also provide information on the related posts, India's New AI Regulation or the competition in AI video generation with Haiper. Please feel free to ask anything!
User: tell me what kind of business do you do?
Bot: Yardstick is a business company that specializes in an interdisciplinary approach, leveraging cutting-edge technology to deliver high-value solutions across various use cases. A significant aspect of their offering is AI Integration, where they seamlessly integrate AI models into their clients' existing systems. This AI integration aims to enhance efficiency and drive business growth, ultimately adding value to their clients' operations.
User: exit
