In [None]:
# install dependencies
%pip install llampip install llama-indexa-index --quiet
%pip install chromadb --quiet
%pip install -U sentence-transformers --quiet
%pip install llama-index-vector-stores-chroma --quiet
%pip install llama-index-embeddings-huggingface --quiet
%pip install llama-index-embeddings-instructor --quiet
%pip install transformers accelerate
%pip install llama-index-readers-web
%pip install -i https://pypi.org/simple/ bitsandbytes --quiet
%pip install llama-index-llms-huggingface --quiet
%pip install llama-index-retrievers-bm25 --quiet

In [1]:
import os
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
import chromadb
from sentence_transformers import SentenceTransformer
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, Settings
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core.retrievers import VectorIndexRetriever
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [12]:
# create documents directory if not existing
if not os.path.exists(os.getcwd() + '/documents'):
    os.makedirs('documents')

# creating document folder if not existing
file_path = os.getcwd() + '/documents'
file_path = r'/kaggle/input/datetime-parser-dataset'
try:
  documents = SimpleDirectoryReader(file_path).load_data()
  parser = SentenceSplitter()
  nodes = parser.get_nodes_from_documents(documents)
except Exception as error:
  print(f'{error}')

In [3]:
embed_model = HuggingFaceEmbedding(model_name="mixedbread-ai/mxbai-embed-large-v1")

In [13]:
chroma_client = chromadb.EphemeralClient()
try:
  chroma_client.delete_collection('embedding_docs')
except Exception:
  pass
chroma_collection = chroma_client.create_collection("embedding_docs")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context, embed_model=embed_model, embed_batch_size=100
)

In [6]:
secret_label = "HF_TOKEN"
hf_token = UserSecretsClient().get_secret(secret_label)

login(token = hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [7]:
selected_model = "mistralai/Mistral-7B-Instruct-v0.2"

SYSTEM_PROMPT = f"""You are a helptul AI assistant, you are tasked to do either one of the two tasks which will be specified in the query given.
Task1 is to generate an answer to the query using the context in step by step method similar to Chain of Thought.
Task2 is to generate a concise and comprehensive answer to the query using the contents of the context retrieved from documents or a codebase.
Perform task1 or task2 depending on the type specified in the query """

query_wrapper_prompt = PromptTemplate(
    "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST] "
)

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

llm = HuggingFaceLLM(
    model_name=selected_model,
    tokenizer_name=selected_model,
    query_wrapper_prompt=PromptTemplate("<s> [INST] {query_str} [/INST] "),
    context_window=8000,
    model_kwargs={"token": hf_token, "quantization_config": quantization_config},
    tokenizer_kwargs={"token": hf_token},
    device_map='auto',
    max_new_tokens=3000
)

Settings.llm = llm

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [14]:
query = f"explain the regex used for timefhuman"

In [15]:
def retrieveNodes(query, nodes, k):
    retriever = BM25Retriever.from_defaults(nodes = nodes, similarity_top_k=k)
    nodes = retriever.retrieve(query) 
    return nodes if nodes else []

cot_nodes = retrieveNodes(query, nodes, 4)
best_nodes = sorted(cot_nodes, key=lambda x: x.get_score(), reverse=True)[:2]

task1_prompt = f"""task1: "Chain of Thought answer" - Use the context from the documents retrieved to give a step by step answer to the query specified.
Focus on incorporating key terms, synonyms, related concepts, and descriptive phrases to enhance the answer's scope and accuracy.
Example:
context: {best_nodes},
query: {query},
answer: Let's think step by step.... generate the answer step by step... max 5-6 steps"""

query_engine = index.as_query_engine(streaming=True)
streaming_response = query_engine.query(task1_prompt)
streaming_response.print_response_stream()
cot_output = streaming_response.response_txt

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


1. The context provided includes a list of regular expression (regex) patterns called `timefhuman_patterns`.
2. One of the patterns in `timefhuman_patterns` is `r'(\:\d{2}:\d{2}:\d{2})'`, which matches a time format with hours, minutes, and seconds.
3. Another pattern in `timefhuman_patterns` is `r'\b(?:last|past|this|next|coming|upcoming)\s*(mon|tue|wed|thu|fri|sat|sun|monday|tuesday|wednesday|thursday|friday|saturday|sunday)\s*(?:\s\d{1,2}(?::\d{2})?(?:\sUTC)?)?(?:\sfrom\s\d{1,2}(?::\d{2})?(?:\sUTC)?)?(?:\sto\s\d{1,2}(?::\d{2})?(?:\sUTC)?)?\b`. This pattern matches phrases like "last Monday", "next Tuesday", "yesterday at 5 PM UTC", etc.
4. The pattern starts by matching one or more words like "last", "past", "this", "next", "coming", or "upcoming".
5. Then it matches a day of the week or its abbreviated form, such as "Monday" or "Mon".
6. Optionally, it matches a time in the format "HH:MM [UTC|UTC]", which can be followed by "from" and another time, or "to" and another time.
7. The 

In [18]:
qa_nodes = retrieveNodes(query+cot_output, nodes, 3)
best_nodes = sorted(qa_nodes, key=lambda x: x.get_score(), reverse=True)[:2]

task2_prompt = f"""task2: "Answer using context" - You are given context: {best_nodes} retrieved using the query: {query}. You have to to use the content
provided in the context and output a concise and comprehensive answer to the query"""

streaming_response = query_engine.query(task2_prompt)
streaming_response.print_response_stream()

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


The regex patterns used for `timefhuman` in the context are defined in the script as follows:

```python
timefhuman_patterns = [
   r'(?:(?:last|past|this|next|coming|upcoming)\s)?(?:mon(?:day)?|tue(?:sday)?|wed(?:nesday)?|thu(?:rsday)?|fri(?:day)?|sat(?:urday)?|sun(?:day)?)(?:\s\d{1,2}(?::\d{2})?(?:\sUTC)?)?(?:\sfrom\s\d{1,2}(?::\d{2})?(?:\sUTC)?)?(?:\sto\s\d{1,2}(?::\d{2})?(?:\sUTC)?)',
   r'\b(?:yesterday|today|tomorrow)(?:\s\d{1,2}(?::\d{2})?(?:\sUTC)?(?:\sto\s\d{1,2}(?::\d{2})?(?:\sUTC)?)?)?\b',
   r'\b((last|past|coming|next|upcoming)\s(mon|tue|wed|thu|fri|sat|sun|monday|tuesday|wednesday|thursday|friday|saturday|sunday)\s(noon|afternoon|midnight))\b'
]
```

The first pattern `timefhuman_patterns[0]` matches various phrases related to days of the week and their relative positions in time, such as "last Monday", "next Tuesday", "yesterday at 3 PM UTC", "tomorrow at 5 PM UTC", and "from yesterday to tomorrow". The pattern also allows for optional hours, minutes, and UTC timezone sp