In [None]:
# Install the required libraries (every package is pinned to an exact version).
# Note: After running this code, the kernel needs to be restarted.

!pip install \
    chromadb==0.5.5 \
    langchain-chroma==0.1.2 \
    langchain==0.2.11 \
    langchain-community==0.2.10 \
    langchain-text-splitters==0.2.2 \
    langchain-groq==0.1.6 \
    transformers==4.43.2 \
    sentence-transformers==3.0.1 \
    unstructured==0.15.0 \
    "unstructured[pdf]==0.15.0"


Collecting chromadb==0.5.5
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-chroma==0.1.2
  Downloading langchain_chroma-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain==0.2.11
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community==0.2.10
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-text-splitters==0.2.2
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langchain-groq==0.1.6
  Downloading langchain_groq-0.1.6-py3-none-any.whl.metadata (2.8 kB)
Collecting transformers==4.43.2
  Downloading transformers-4.43.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers==3.0.1
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting unstructured=

In [None]:
import os
import pandas as pd

# LangChain imports
from langchain.document_loaders import UnstructuredFileLoader
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

# LangChain Community imports
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma

# GROQ and Chroma imports
from langchain_chroma import Chroma
from langchain_groq import ChatGroq

# Set API keys
# Credentials must never be stored in the notebook itself. The Groq key is read
# from the environment; when it is missing, the user is prompted for it without
# echoing the value. Any key that was previously pasted into this cell should be
# considered leaked and revoked with the provider.
from getpass import getpass

if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")




# Initializing the TPU
Initializing the TPU is very important, because running on the CPU would take a very long time.

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU devices it can see and report the list.
visible_gpus = tf.config.list_physical_devices('GPU')
print("GPU Available:", visible_gpus)


GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
# Detect the TPU and report its cluster spec and master address.
# A ValueError from the resolver is turned into a RuntimeError that keeps the
# original error as its cause, so the real reason stays visible in the traceback.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='local')  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict())
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
except ValueError as err:
    # RuntimeError instead of BaseException: BaseException is reserved for
    # interpreter-level exits and bypasses ordinary `except Exception` handlers.
    raise RuntimeError(
        'ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!'
    ) from err

Running on TPU  {}
Running on TPU: 


# Converting to a .txt file
First, we convert the source file to plain text so that it is easy to transfer the data into the vector database. With this step, a process that would otherwise take about 4 hours takes 15-20 minutes or even less.

In [None]:

# Convert a .pdf file to a .txt file
import pdfplumber

# Specify the PDF file path and the text file the extracted content is written to
file_path = "/content/IJOR-024-44535117536-7545-1.pdf"
output_path = "data_output_book.txt"

# Open the PDF and collect the text of every page that yields any text
with pdfplumber.open(file_path) as pdf:
    page_texts = (page.extract_text() for page in pdf.pages)
    # Pages for which extract_text() returns nothing are skipped
    text_chunks = [page_text for page_text in page_texts if page_text]

# Join all text chunks into a single string, one page after another
full_text = "\n".join(text_chunks)

# Write the text with an explicit encoding so the result does not depend on the
# platform default
with open(output_path, "w", encoding="utf-8") as txt_file:
    txt_file.write(full_text)

# Report the file that was actually written (the old message named a different file)
print(f"The text has been successfully written to {output_path}")

In [None]:

# Convert a .csv file to a .txt file with one line of text per CSV row
file_path = "/content/all_data_2001.csv"
output_path = "data_output.txt"
csv_data = pd.read_csv(file_path)

# Combine all columns of each row into one space-separated string.
# itertuples() is used instead of iterrows(): iterrows() builds a Series per row,
# which does not preserve per-column dtypes (integers can be printed as floats)
# and is considerably slower on large frames.
text_chunks = [
    " ".join(str(value) for value in row)
    for row in csv_data.itertuples(index=False, name=None)
]

# Join all text chunks into a single string
full_text = "\n".join(text_chunks)

# Write the text with an explicit encoding so the result does not depend on the
# platform default
with open(output_path, "w", encoding="utf-8") as txt_file:
    txt_file.write(full_text)

print(f"The text has been successfully written to {output_path}")


The text has been successfully written to data_output.txt


In [None]:
"""
from langchain_community.document_loaders.csv_loader import CSVLoader
file_path = "/content/all_data_with_vin.csv"
loader = CSVLoader(file_path)
documents = loader.load_and_split()
"""
file_path = "/content/data_output.txt"
loader = TextLoader(file_path)
documents = loader.load()

In [None]:
# Splitter settings; sizes are measured with len(), i.e. in characters.
# (Method 1, tried earlier: chunk_size=500, chunk_overlap=20)
splitter_options = {
    "chunk_size": 400,
    "chunk_overlap": 80,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents and report how many chunks were produced
text_chunks = text_splitter.split_documents(documents)
chunk_count = len(text_chunks)
print(f"Number of chunks: {chunk_count}")

Number of chunks: 14590


In [None]:
# Directory name handed to the vector store as its persist_directory
persist_directory = "all_data_2001"

In [None]:
# Build the Sentence Transformers embedding function (model comes from Hugging Face)
import torch
from sentence_transformers import SentenceTransformer

# Other models that were tried:
#   'sentence-transformers/all-mpnet-base-v2'
#   'sentence-transformers/average_word_embeddings_komninos'  (Best)
embedding_model_name = 'sentence-transformers/all-MiniLM-L12-v2'
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


  from tqdm.autonotebook import tqdm, trange
  embeddings = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L12-v2')
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Build a Chroma vector store from the text chunks using the embedding function
# defined above; persist_directory is passed so the store is kept on disk there.
vectordb = Chroma.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    persist_directory=persist_directory
)


'\n# Install necessary libraries\n!pip install chromadb==0.5.5 langchain==0.2.11 transformers==4.43.2 sentence-transformers==3.0.1 faiss-cpu\nimport os\nfrom sentence_transformers import SentenceTransformer\nfrom langchain.document_loaders import TextLoader\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\nfrom langchain.vectorstores import FAISS\nfrom langchain.chains import RetrievalQA\nfrom langchain.llms import HuggingFacePipeline\nfrom transformers import pipeline\n\n# Embed documents and initialize FAISS\nvectordb = FAISS.from_documents(\n    documents=text_chunks,\n    embedding=embeddings,\n    #normalize_embeddings=True\n)\nvectordb.save_local(persist_directory)\n\n# Reload FAISS vector store\nvectordb = FAISS.load_local(persist_directory, embeddings,allow_dangerous_deserialization=True)\n'

In [None]:
!zip -r auto_2000_new_embed_full.zip /content/auto_2000_new_embed_full

  adding: content/auto_2000_new_embed_full/ (stored 0%)
  adding: content/auto_2000_new_embed_full/chroma.sqlite3 (deflated 62%)
  adding: content/auto_2000_new_embed_full/5fb77906-11ff-49fa-937b-4aa4d125d5c1/ (stored 0%)
  adding: content/auto_2000_new_embed_full/5fb77906-11ff-49fa-937b-4aa4d125d5c1/header.bin (deflated 54%)
  adding: content/auto_2000_new_embed_full/5fb77906-11ff-49fa-937b-4aa4d125d5c1/data_level0.bin (deflated 14%)
  adding: content/auto_2000_new_embed_full/5fb77906-11ff-49fa-937b-4aa4d125d5c1/index_metadata.pickle (deflated 44%)
  adding: content/auto_2000_new_embed_full/5fb77906-11ff-49fa-937b-4aa4d125d5c1/length.bin (deflated 44%)
  adding: content/auto_2000_new_embed_full/5fb77906-11ff-49fa-937b-4aa4d125d5c1/link_lists.bin (deflated 77%)


# Everything below here is practice

# Pinecone DB

In [None]:
import os
from getpass import getpass

from pinecone import Pinecone, ServerlessSpec

# Credentials must never be stored in the notebook itself. The Pinecone key is
# read from the environment; when it is missing, the user is prompted for it
# without echoing the value. Any key previously pasted here should be revoked.
if not os.environ.get("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass("Enter your Pinecone API key: ")

pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Create the serverless index only when it does not exist yet
PINECONE_INDEX_NAME = 'auto-parts'
if PINECONE_INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        # NOTE(review): the index dimension has to equal the length of the vectors
        # stored in it - confirm 1536 matches the embedding model actually used.
        dimension=1536,
        metric='euclidean',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [None]:
pip install --upgrade "pinecone[grpc]"

Collecting lz4>=3.1.3 (from pinecone[grpc])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting protobuf<5.0,>=4.25 (from pinecone[grpc])
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting protoc-gen-openapiv2<0.0.2,>=0.0.1 (from pinecone[grpc])
  Downloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl.metadata (1.5 kB)
Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading protoc_gen_openapiv2-0.0.1-py3-none-any.whl (7.9 kB)
Installing collected packages: protobuf, lz4, protoc-gen-openapiv2
  Attempting uninstall: prot

In [None]:
# Import the Pinecone library
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time

# `data_output.txt` is a file name, not a Python object, so it cannot be iterated.
# The texts to embed are taken from the chunks produced by the text splitter;
# each chunk is a LangChain document whose text lives in `page_content`.
passage_texts = [chunk.page_content for chunk in text_chunks]

# The result is stored under its own name so the LangChain `embeddings` object
# used by the vector store is not overwritten.
# NOTE(review): confirm that this model name is served by Pinecone Inference and
# that the number of inputs per request is within the service limit.
pinecone_embeddings = pc.inference.embed(
    model="text-embedding-3-large",
    inputs=passage_texts,
    parameters={"input_type": "passage", "truncate": "END"}
)


#print(pinecone_embeddings)

NameError: name 'data_output' is not defined

# Reading the vector database and chatting, all in one


In [None]:
import os
import pandas as pd
from transformers import pipeline

# Langchain modules
from langchain.document_loaders import UnstructuredFileLoader, PyPDFDirectoryLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
    TokenTextSplitter,
)
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq

# Download Sentence Transformers Embedding From Hugging Face
import torch
from sentence_transformers import SentenceTransformer

# Set API Key for Groq
# Credentials must never be stored in the notebook itself. The key is read from
# the environment; when it is missing, the user is prompted for it without
# echoing the value. Any key previously pasted here should be revoked.
from getpass import getpass

if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")



# Embedding function used when querying the persisted store.
# Other models that were tried:
#   'sentence-transformers/all-MiniLM-L12-v2'
#   'sentence-transformers/all-mpnet-base-v2'
# NOTE(review): the query-time model presumably has to be the one the store in
# persist_directory was built with - confirm before relying on the results.
embedding_model_name = 'sentence-transformers/average_word_embeddings_komninos'  # Best
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Open the Chroma store from its directory on disk
persist_directory = "/content/research_paper_new_embed_full"
vectordb = Chroma(embedding_function=embeddings, persist_directory=persist_directory)

# retriever
retriever = vectordb.as_retriever()
# Download Sentence Transformers Embedding From Hugging Face
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
# LLM from Groq
groq_settings = {"model": "llama-3.1-70b-versatile", "temperature": 0}
llm = ChatGroq(**groq_settings)

# Conversation buffer memory. The chain below returns both "result" and
# "source_documents", so output_key names the one the memory records.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="result",
)

# Retrieval QA chain ("stuff" strategy) wired to the retriever and the memory
qa_chain_options = {
    "llm": llm,
    "chain_type": "stuff",
    "retriever": retriever,
    "return_source_documents": True,
    "memory": memory,
}
qa_chain = RetrievalQA.from_chain_type(**qa_chain_options)

# Interactive loop for querying: reads questions until the user types 'exit'
print("Welcome to the chatbot. Type 'exit' to stop.")
while True:
    query = input("You: ")

    # Surrounding whitespace is ignored so " exit " also ends the session
    if query.strip().lower() == 'exit':
        print("Exiting the chatbot. Goodbye!")
        break

    # Nothing to ask: do not spend an LLM call on a blank line
    if not query.strip():
        continue

    # Get the response from the QA chain
    response = qa_chain.invoke({"query": query})

    # Print the result
    print("Bot: ", response["result"])

    # Optionally print the source of the first retrieved document. The metadata
    # is read with .get() because a document is not guaranteed to carry a
    # "source" entry, and a missing key must not abort the chat session.
    source_documents = response.get("source_documents")
    if source_documents:
        print("Source document: ", source_documents[0].metadata.get("source", "Unknown source"))

    # Debug: Show current conversation memory (optional)
    print("\nMemory so far:\n", memory.buffer)

# Chatbot using prompt

In [None]:
# Prompt template for the automotive parts assistant.
# It has two placeholders that are filled with str.format():
#   {chat_history} - the conversation so far, as plain text
#   {user_input}   - the user's current question
chatbot_prompt = """
You are an automotive parts assistant. When a user asks about their vehicle, you will refer to relevant documents and provide guidance in a concise, clear manner. Your goal is to stay under 1,000 tokens for each response, including all necessary details.

Here are the steps to follow:

1. **Identify the Vehicle and Parts**: Determine what part the user is asking about based on their question. If it’s unclear, ask clarifying questions to understand the model, make, and year of the vehicle.

2. **Provide Pricing**: Always provide the price for the part requested. If you don’t have the exact year of the vehicle the user asks about, provide the price for the earliest year available in your database and inform the user that you cannot help with the exact year requested. You should always mention the earliest year available, even if the part is not available for the requested year.

3. **Include Related Subcategories**: If the part requested falls under a category that has subcategories (e.g., "engine parts" has "fuel injectors", "oil filters", etc.), list those subcategories with their prices, if available.

4. **Mention Fluids**: If the part requested is related to fluids (e.g., oil, transmission fluid), also mention the fluids associated with the part and their availability/price.

5. **Maintain Chat Context**: Keep track of previous conversations and refer to them when necessary to provide consistent follow-up answers. For example, if the user has already asked about a part and later asks about another part from the same vehicle, refer back to previous details such as model, year, or part-related info.

6. **Structure the Response**:
   - Begin by acknowledging the vehicle type and confirming details, especially if the model year or make was mentioned.
   - Provide the price for the part.
   - List any relevant subcategories and related fluids.
   - If no exact match for the year is found, state the earliest year available in the database and explain the limitation.

**Keep responses concise, with no more than 1,000 tokens. If the response exceeds the token limit, trim unnecessary details.**

Conversation so far:
{chat_history}

User's question:
{user_input}

"""

# Define the chatbot function
def chatbot():
    """Run an interactive question/answer session until the user types 'exit'.

    For every question the function:
      1. formats ``chatbot_prompt`` with the conversation so far and the question,
      2. retrieves documents for the raw question through ``retriever``,
      3. answers with a "stuff" question-answering chain built on ``llm``, which
         receives the retrieved documents and the formatted prompt,
      4. prints the answer and records the exchange in the chat history.

    Relies on the notebook-level names ``chatbot_prompt``, ``retriever``, ``llm``
    and ``load_qa_chain``. Returns nothing.
    """
    print("Welcome to the Car Issue Chatbot! Type 'exit' to end the conversation.")

    # A document chain accepts exactly the inputs used below (input_documents and
    # question). The retrieval chain used before takes a single "query" input and
    # does its own retrieval, so it could not be called this way.
    doc_chain = load_qa_chain(llm, chain_type="stuff")

    # Initialize chat history
    chat_history = []

    while True:
        # Get user input
        query = input("\n **You**: ")

        # Exit condition (surrounding whitespace is ignored)
        if query.strip().lower() == "exit":
            print("Chatbot: Goodbye!")
            break

        # Build the prompt from the history *before* the current question is
        # recorded, otherwise the question appears twice in the prompt.
        history_text = "\n".join(
            f"{message['role'].capitalize()}: {message['content']}" for message in chat_history
        )
        prompt = chatbot_prompt.format(chat_history=history_text, user_input=query)

        # Retrievers are invoked, not called: retriever(query) raises TypeError
        retrieved_docs = retriever.invoke(query)

        # Pass the retrieved documents and the full prompt to the chain
        response = doc_chain.invoke(
            {"input_documents": retrieved_docs, "question": prompt}
        )["output_text"]

        # Print the result
        print(f"\n **SBG**: {response}")

        # Record both sides of the exchange for the next turn
        chat_history.append({"role": "user", "content": query})
        chat_history.append({"role": "assistant", "content": response})

# Run the chatbot (the call keeps reading input until the user types 'exit')
chatbot()


Welcome to the Car Issue Chatbot! Type 'exit' to end the conversation.

 **You**: 2000 Toyota Rav 4 brakes


TypeError: 'VectorStoreRetriever' object is not callable

# Embedding using BERT Dynamic embedders

In [None]:
!pip install transformers



In [None]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
!matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

/bin/bash: line 1: matplotlib: command not found


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# BERT, FAISS

In [None]:
# Install necessary libraries
!pip install chromadb==0.5.5 langchain==0.2.11 transformers==4.43.2 sentence-transformers==3.0.1 faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:


import os
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

# Names needed by this cell in addition to the imports at its top
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings

# Set paths and initialize variables
file_path = "/content/sbg_text1.txt"
persist_directory = "faiss_bhagavad_geeta_index"

# Load and preprocess documents. load() is used rather than load_and_split() so
# the text is split exactly once, by the splitter configured below.
loader = TextLoader(file_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,  # measured in characters (no token-based length function is set)
    chunk_overlap=50
)
text_chunks = text_splitter.split_documents(documents)

# Use Sentence-BERT through LangChain's embeddings wrapper. FAISS needs an object
# with embed_documents()/embed_query(); a bare SentenceTransformer.encode function
# fails with "'function' object has no attribute 'embed_documents'".
# Normalization is an option of the encoder, so it is passed through
# encode_kwargs instead of being handed to FAISS.
sbert_model_name = "sentence-transformers/all-mpnet-base-v2"
sbert_embeddings = HuggingFaceEmbeddings(
    model_name=sbert_model_name,
    encode_kwargs={"normalize_embeddings": True},
)

# Embed documents and initialize FAISS
faiss_index = FAISS.from_documents(
    documents=text_chunks,
    embedding=sbert_embeddings
)
faiss_index.save_local(persist_directory)

# Reload FAISS vector store.
# SECURITY: load_local() unpickles data from disk, which can execute arbitrary
# code. It is enabled here only because the index was written by this very
# notebook a few lines above; never enable it for files from an untrusted source.
faiss_index = FAISS.load_local(
    persist_directory,
    sbert_embeddings,
    allow_dangerous_deserialization=True
)

# Initialize a transformer-based LLM
hf_pipeline = pipeline(
    "text-generation",
    model="gpt2",  # Replace with a larger LLM if needed
    tokenizer="gpt2",
    # Use the first GPU when one is available, otherwise fall back to the CPU
    device=0 if torch.cuda.is_available() else -1,
    # Bound the answer length explicitly so generation does not depend on the
    # model's default total-length limit, which a retrieval prompt can exceed.
    # NOTE(review): 256 is a starting value - tune it for the chosen model.
    max_new_tokens=256
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Create a RetrievalQA chain
retriever = faiss_index.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

# Chatbot function
def chatbot():
    """Run an interactive question/answer session until the user types 'exit'.

    Each question is sent to the notebook-level ``qa_chain`` under the "query"
    key. The chain's "result" is printed as the answer, followed by the "source"
    metadata of the first returned source document. "Unknown source" is printed
    when no document was returned or the document has no "source" entry.
    Returns nothing.
    """
    print("Welcome to the Enhanced Car Issue Chatbot! Type 'exit' to end the conversation.")

    while True:
        query = input("\n **You**: ")

        # Surrounding whitespace is ignored so " exit " also ends the session
        if query.strip().lower() == "exit":
            print("Chatbot: Goodbye!")
            break

        response = qa_chain.invoke({"query": query})
        answer = response["result"]

        # The retriever may return no documents at all; indexing [0] blindly
        # would abort the session with an IndexError.
        source_documents = response.get("source_documents") or []
        if source_documents:
            source = source_documents[0].metadata.get("source", "Unknown source")
        else:
            source = "Unknown source"

        print(f"\n **Service Advisor** : {answer}")
        print(f"\n **Source**: {source}")

# Run the chatbot (the call keeps reading input until the user types 'exit')
chatbot()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

AttributeError: 'function' object has no attribute 'embed_documents'