In [None]:
%%capture
!pip install -r requirements.txt --quiet

In [None]:
%%capture
# Importing necessary libraries and handling warnings
import warnings
warnings.filterwarnings('ignore')  # Ignore warnings to keep notebook clean

import torch
import torch.nn as nn
from transformers import (
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig,
)
from langchain.llms import HuggingFacePipeline

from IPython.display import Markdown
import re
import random
import pandas as pd

## Ingest
This section loads the source data for the rest of the notebook. We read every Markdown (`.md`) file found under the local `./content/` directory, including its subdirectories.

In [None]:
from langchain_community.document_loaders import DirectoryLoader

# Load every Markdown file under ./content/; the "**/*.md" glob also matches
# files in nested subdirectories.
loader_md = DirectoryLoader("./content/", glob="**/*.md")
try:
    md_data = loader_md.load()
except Exception as e:
    # Report the failure, then re-raise. Swallowing the error would leave
    # `md_data` undefined and make a later cell fail with an unrelated NameError.
    print("Failed to load markdown files:", e)
    raise
else:
    print("Markdown files loaded successfully:", len(md_data))

## Document Splitters
In this section, we define how the loaded documents are split into smaller, overlapping chunks. Chunking keeps each piece of text small enough to be embedded and retrieved individually, which is crucial for handling large texts efficiently.

In [None]:
import random
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)

# Recursive splitter: separators are tried in the listed order (paragraph break,
# line break, end of sentence, space, then single characters).
recur_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=60,
    # Raw string for the look-behind: "\." in a normal string literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)

# Normalise whitespace in place before splitting.
# NOTE(review): runs of 3+ newlines collapse to a single "\n", which also removes
# the paragraph break ("\n\n") the splitter prefers - confirm "\n" is intended.
for doc in md_data:
    doc.page_content = re.sub("\n{3,}", "\n", doc.page_content)
    doc.page_content = re.sub(" {2,}", " ", doc.page_content)

# Split the cleaned Markdown documents into chunks.
md_data_splits = recur_splitter.split_documents(md_data)

# A bare `len(...)` in the middle of a cell is never displayed, so print it.
print("Number of chunks:", len(md_data_splits))

# Show one random chunk as a sanity check; random.choice raises IndexError on
# an empty list, so guard against an empty result.
if md_data_splits:
    print(random.choice(md_data_splits).page_content)
else:
    print("No chunks were produced; check the contents of ./content/.")

## Vector Stores

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

### Embedding model: MPNet sentence-transformer
model_name = "sentence-transformers/all-mpnet-base-v2"

# Run the embedding model on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    model_kwargs = {"device": "cuda"}
else:
    model_kwargs = {"device": "cpu"}

# Embeddings are passed through without an extra normalisation step.
encode_kwargs = dict(normalize_embeddings=False)

hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [None]:
# Import vectorstore
from langchain.vectorstores import FAISS

# Embed every Markdown chunk with `hf_embeddings` and index the vectors in FAISS.
vectordb = FAISS.from_documents(
    documents=md_data_splits,
    embedding=hf_embeddings,
)

In [None]:
def print_retrieved_documents(documents):
    """Print each retrieved document with its position, source and content.

    Args:
        documents: Iterable of objects exposing a ``metadata`` mapping and a
            ``page_content`` string.

    Returns:
        None. Everything is written to standard output.
    """
    divider = "\n" + "-" * 80 + "\n"  # separator line between documents
    for position, document in enumerate(documents, start=1):
        print(f"Document {position}:")
        print(f"Source: {document.metadata.get('source')}\n")
        # Blank-line-separated paragraphs are printed one per line.
        print(*document.page_content.split("\n\n"), sep="\n")
        print(divider)

# Build a retriever over the vector store that returns the 2 closest chunks.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

# `invoke` replaces the deprecated `get_relevant_documents` and matches the
# `chain.invoke(...)` call style used later in this notebook.
rdocs = retriever.invoke("chatbot")

# Pretty-print the retrieved documents.
print_retrieved_documents(rdocs)

In [None]:
# Text used to search the vector store.
query = "Kafka?"

# Fetch the 3 most similar chunks; each result is a (document, score) pair.
similar_chunks = vectordb.similarity_search_with_relevance_scores(query, k=3)

# Separate the chunk text from its relevance score.
retrieved_text = [document.page_content for document, _ in similar_chunks]
relevance_score = [score for _, score in similar_chunks]

# Show text and score side by side, without truncating long chunk text.
retrieved_chunks = pd.DataFrame(
    {"Retrieved Chunks": retrieved_text, "Relevance Score": relevance_score}
)
with pd.option_context("display.max_colwidth", None):
    display(retrieved_chunks)

## LLM Chain

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFaceTextGenInference
from langchain.chains import LLMChain
from langchain.memory import VectorStoreRetrieverMemory

import os

# Text Generation Inference endpoint. Defaults to the original demo server, but
# can be overridden without editing the notebook by setting
# TGI_INFERENCE_SERVER_URL in the environment.
inference_server_url = os.environ.get(
    "TGI_INFERENCE_SERVER_URL",
    "https://hf-tgi-server-llms.apps.cluster-45cdc.45cdc.openshift.opentlc.com",
)

llm = HuggingFaceTextGenInference(
    inference_server_url=inference_server_url,
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.1,  # low temperature keeps answers close to deterministic
    repetition_penalty=1.175,
)

# The prompt must contain a placeholder for the memory's output key. Without
# `{history}` the chain still runs, but the retrieved chunks are never put in the
# prompt and the LLM answers with no context.
llm_template = """ Answer the question below using the provided context.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Never Hallucinate.
Keep the answer as concise as possible.

Context:
{history}

Question: {question}
Answer:
"""

qa_prompt_template = PromptTemplate.from_template(llm_template)

# Reuse the `retriever` built earlier in the notebook. `memory_key` is set
# explicitly so it visibly matches the `{history}` placeholder above.
# NOTE(review): this memory class is expected to also save each question/answer
# exchange back into the vector store after a call - confirm that is intended.
memory = VectorStoreRetrieverMemory(retriever=retriever, memory_key="history")
chain = LLMChain(llm=llm, prompt=qa_prompt_template, verbose=True, memory=memory)

In [None]:
# Ask the chain a question; the cell output shows the mapping it returns.
chain.invoke(dict(question="Why the application needs Kafka?"))