In [None]:
%%capture
!pip install -r requirements.txt --quiet

In [None]:
!pip list
# __import__('pysqlite3')
# import pysqlite3
# sys.modules['sqlite3'] = sys.modules["pysqlite3"]
# import chromadb

In [None]:
%%capture
import torch
import torch.nn as nn
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig,
)
from langchain.llms import HuggingFacePipeline

import warnings
from IPython.display import Markdown
import re
import random
import pandas as pd

## Ingest

In [None]:
from langchain.document_loaders import UnstructuredURLLoader

# List of WCA URLs for the loader.
urls = [
    "https://www.worldcubeassociation.org/regulations/",
    "https://www.worldcubeassociation.org/regulations/guidelines.html",
    "https://www.worldcubeassociation.org/regulations/scrambles/"
]

# Defining the URL Loader
loader = UnstructuredURLLoader(urls=urls)

# Loading the data
data = loader.load()

# Pre-processing the data using regex
data[0].page_content = re.sub("\n{3,}", "\n", data[0].page_content)
data[0].page_content = re.sub(" {2,}", " ", data[0].page_content)

In [61]:
from langchain_community.document_loaders import DirectoryLoader

# load from documents directory
loader = DirectoryLoader('./content/', glob="**/*.md")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

texts = text_splitter.split_documents(docs)

len(texts)

vectordb = FAISS.from_documents(documents=texts,
                                 embedding=embeddings)

retriever = vectordb.as_retriever(search_kwargs={'k': 2})

rdocs = retriever.get_relevant_documents("chatbot")
rdocs

[Document(page_content='Inference Server Setup\n\nThe inference server is responsible for performing text inference tasks using Hugging Face\'s Large Language Models. You need to specify the URL of the inference server that the application will communicate with.\n\nINFERENCE_SERVER_URL should be set to the URL of your Hugging Face inference server. If you\'re running the server locally for testing, you can use "http://localhost:3000/". For production or cloud environments, you would replace this with the actual URL of your deployed inference server.\n\nKafka Setup\n\nThis application uses Kafka for message queueing, consuming messages from a chat topic, processing them, and then producing responses to an answer topic.\n\nKAFKA_SERVER specifies the address of your Kafka server. If running locally, it\'s typically set to "localhost:9092". For production, this would be the address of your Kafka cluster.', metadata={'source': 'content/readme.md'}),
 Document(page_content='KAFKA_SERVER spec

## Document Splitters

In [None]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)

# Using the recursive character splitter
recur_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=60,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""],
    is_separator_regex=True,
)

# Performing the splits using the splitter
data_splits = recur_splitter.split_documents(data)

# Printing a random chunk
print(random.choice(data_splits).page_content)

## Vector Stores

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

### Using embeddings by MPNET
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
encode_kwargs = {"normalize_embeddings": False}
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [None]:
# Import vectorstore
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

# Define the location to persist data
persist_directory = "chroma/"
!rm -rf chroma

# Generate and store embeddings
vectordb = FAISS.from_documents(
    documents=data_splits, embedding=hf_embeddings
)

In [None]:
# Query to retrieve similar chunks
query = "Are hand warmers considered as electronic devices?"

# Retrieve similar chunks based on relevance. We only retrieve 'k' most similar chunks
similar_chunks = vectordb.similarity_search_with_relevance_scores(query, k=3)

# Format document to text format
retrieved_text = [chunk[0].page_content for chunk in similar_chunks]
relevance_score = [chunk[1] for chunk in similar_chunks]

# Store and print as a dataframe
retrieved_chunks = pd.DataFrame(
    list(zip(retrieved_text, relevance_score)),
    columns=["Retrieved Chunks", "Relevance Score"],
)
with pd.option_context("display.max_colwidth", None):
    display(retrieved_chunks)

## LLM Chain

In [62]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFaceTextGenInference
from langchain.chains import LLMChain
from langchain.memory import VectorStoreRetrieverMemory

llm = HuggingFaceTextGenInference(
            inference_server_url="https://hf-tgi-server-llms.apps.cluster-45cdc.45cdc.openshift.opentlc.com",
            max_new_tokens=512,
            top_k=10,
            top_p=0.95,
            typical_p=0.95,
            temperature=0.1,
            repetition_penalty=1.175
)

llm_template = """ Answer the question below.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Never Hallucinate.
Keep the answer as concise as possible.

Question: {question}
Answer:
"""

qa_prompt_template = PromptTemplate.from_template(llm_template)

# retriever = vectordb.as_retriever(search_kwargs=dict(k=1))
memory = VectorStoreRetrieverMemory(retriever=retriever)
chain = LLMChain(llm=llm, prompt=qa_prompt_template, verbose=True, memory=memory)

In [63]:
chain.invoke({"question": "Why the application needs Kafka?"})



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m Answer the question below.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Never Hallucinate.
Keep the answer as concise as possible.

Question: Why the application needs Kafka?
Answer:
[0m

[1m> Finished chain.[0m


{'question': 'Why the application needs Kafka?',
 'history': 'KAFKA_SERVER specifies the address of your Kafka server. If running locally, it\'s typically set to "localhost:9092". For production, this would be the address of your Kafka cluster.\n\nCONSUMER_TOPIC is the name of the Kafka topic from which the application will consume messages. This should be set to "chat" or whichever topic you have designated for incoming chat messages.\n\nPRODUCER_TOPIC is the name of the Kafka topic to which the application will produce processed messages. This is set to "answer", or any other topic name where you want the processed messages to be published.\n\nEnsure these settings are correctly configured to match your environment before running the application.\nInference Server Setup\n\nThe inference server is responsible for performing text inference tasks using Hugging Face\'s Large Language Models. You need to specify the URL of the inference server that the application will communicate with.\n