In [None]:
!pip install torch transformers accelerate bitsandbytes langchain langchain_experimental  sentence-transformers faiss-gpu pypdf ragatouille

Collecting accelerate
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.1.19-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_experimental
  Downloading langchain_experimental-0.0.58-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━

In [None]:
import sys

# Record the interpreter version/build string of the running kernel in the notebook output.
interpreter_version = sys.version
print(interpreter_version)

3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]


In [None]:
# Standard library
import getpass
import os

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import transformers
# for LLM
from torch import cuda, bfloat16
from transformers import BitsAndBytesConfig
# For vector database
from langchain_community.document_loaders import PyPDFDirectoryLoader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from ragatouille import RAGPretrainedModel
from langchain_text_splitters import CharacterTextSplitter
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever

import logging
import warnings

# Keep the notebook output readable: install a blanket "ignore" filter for
# Python warnings and configure the root logger at ERROR level.
warnings.simplefilter('ignore')
logging.basicConfig(level=logging.ERROR)


# Loading, splitting, and vector database creation

In [None]:

def create_knowledgebase(folder_path, embedding_model, chunk_size = 1000, chunk_overlap = 200):
  """Build a FAISS vector store from the PDF files found in ``folder_path``.

  Args:
    folder_path: Directory handed to ``PyPDFDirectoryLoader``.
    embedding_model: Embeddings object passed to ``FAISS.from_documents``. If it
      exposes a ``model_name`` attribute, that name also selects the tokenizer
      used by the splitter, so the splitter and the embedder stay in sync.
    chunk_size: Forwarded to the splitter as ``chunk_size``.
    chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

  Returns:
    The FAISS vector store built from the split documents (cosine distance).

  Raises:
    ValueError: If nothing could be loaded from ``folder_path``.
  """
  # Use the argument, not the notebook-level `data_loc`, so the function really
  # indexes the folder the caller asked for.
  print(f"Loading pdf from {folder_path}...")
  loader = PyPDFDirectoryLoader(folder_path)
  docs = loader.load()
  if not docs:
    # Fail early with a clear message instead of an obscure error while building the index.
    raise ValueError(f"No PDF content could be loaded from {folder_path!r}")

  # Splitter specific to embedding model: prefer the name carried by the
  # embedding object and only fall back to the notebook-level constant.
  tokenizer_name = getattr(embedding_model, "model_name", None) or EMBEDDING_MODEL_NAME
  # NOTE(review): sizes here are presumably counted in tokens of this tokenizer;
  # if the embedding model truncates long inputs (e.g. at 512 tokens), a
  # chunk_size of 1000 would lose the tail of every chunk - confirm.
  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
          AutoTokenizer.from_pretrained(tokenizer_name),
          chunk_size=chunk_size,
          chunk_overlap=chunk_overlap,
          add_start_index=True,
          strip_whitespace=True
      )
  # Alternative splitter kept for reference:
  # text_splitter = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")
  # splits = text_splitter.create_documents([d.page_content for d in docs])
  print("Splitting documents...")
  splits = text_splitter.split_documents(docs)
  print("Creating vector database")
  KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
      splits, embedding_model, distance_strategy=DistanceStrategy.COSINE
  )
  print("Successfully created knowledge base")
  return KNOWLEDGE_VECTOR_DATABASE

# ---- Configuration ----
data_loc = "/content/drive/MyDrive/MS IIITH/MS CourseWork/SMAI/Project/asr_papers"
EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
chunk_size, chunk_overlap = 1000, 200

# Embedding model shared by the vector store and by both embedding-based filters below.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs=dict(device="cuda"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# ---- Knowledge database creation ----
KNOWLEDGE_VECTOR_DATABASE = create_knowledgebase(
    data_loc,
    embedding_model,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# ---- Building blocks shared by both compression pipelines ----
# Base retriever: top-20 hits from the vector store.
retriever = KNOWLEDGE_VECTOR_DATABASE.as_retriever(
    search_kwargs=dict(k=20, include_metadata=True)
)
# Re-split the hits into pieces of at most 500 characters, cutting at sentence ends (". ").
splitter = CharacterTextSplitter(separator=". ", chunk_size=500, chunk_overlap=0)
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding_model)

# ---- Compression only: keep the 5 most relevant pieces ----
# EmbeddingsFilter takes either k=<integer> or similarity_threshold=<float, e.g. 0.76>.
relevant_filter = EmbeddingsFilter(embeddings=embedding_model, k=5)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[splitter, redundant_filter, relevant_filter]
)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=pipeline_compressor,
)

# ---- Compression followed by re-ranking: keep 10 pieces for the re-ranker ----
# EmbeddingsFilter takes either k=<integer> or similarity_threshold=<float, e.g. 0.76>.
relevant_filter_1 = EmbeddingsFilter(embeddings=embedding_model, k=10)
pipeline_compressor_1 = DocumentCompressorPipeline(
    transformers=[splitter, redundant_filter, relevant_filter_1]
)
compression_retriever_1 = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=pipeline_compressor_1,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading pdf from /content/drive/MyDrive/MS IIITH/MS CourseWork/SMAI/Project/asr_papers...
Splitting documents...
Creating vector database
Successfully created knowledge base


# Loading LLMs

In [None]:
# Credentials must never be hard-coded in a notebook: anyone who can read the
# file (or its history) can use the token. Read it from the HF_TOKEN environment
# variable and fall back to an interactive, non-echoing prompt.
# NOTE(review): the token that used to be written on this line should be treated
# as leaked and revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")

# Short aliases for the chat models this notebook can run.
model_id_map = {"llama2":"meta-llama/Llama-2-7b-chat-hf",
                "mistral":"mistralai/Mistral-7B-Instruct-v0.2",
                "gemma":"google/gemma-1.1-7b-it",
                "llama3":"meta-llama/Meta-Llama-3-8B-Instruct"}
model_id = model_id_map["llama3"]

print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)

# 4-bit NF4 quantisation with double quantisation; compute is done in bfloat16.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
print("Loading LLM")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    token=access_token,
    quantization_config=bnb_config,
)
print("Creating pipeline")
# Sampling is enabled, so answers are not deterministic between runs.
llm_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    max_new_tokens=500
)
print("Loading re-ranker")
RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")


Loading tokenizer


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading LLM


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Creating pipeline
Loading re-ranker


artifact.metadata:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Retrieval+Generation

In [None]:
# System instruction: answer from the supplied context plus the model's own knowledge.
SYSTEM_MESSAGE = (
    "Use the information contained in the context along with the knowledge you have "
    "and give a comprehensive answer to the question.\n"
    "Respond only to the question asked. "
    "Provide the number of the source document when relevant."
)

# User turn; `{context}` and `{question}` stay as str.format placeholders that
# are filled in once per query.
USER_MESSAGE = "\n".join([
    "Context:",
    "{context}",
    "---",
    "Now here is the question you need to answer.",
    "",
    "Question: {question}",
])

prompt_in_chat_format = [
    dict(role="system", content=SYSTEM_MESSAGE),
    dict(role="user", content=USER_MESSAGE),
]

# Render the two messages with the loaded model's chat template, as plain text,
# ending with the header that opens the assistant's reply.
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format,
    tokenize=False,
    add_generation_prompt=True,
)
print(RAG_PROMPT_TEMPLATE)




<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Use the information contained in the context along with the knowledge you have and give a comprehensive answer to the question.
Respond only to the question asked. Provide the number of the source document when relevant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context:
{context}
---
Now here is the question you need to answer.

Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [None]:
# user_query = "What is attention based encoder-decoder model in automatic speech recognition?"

# ---- Run-time switches ----
reranking = True
search_type = 'similarity'  # either 'similarity' or 'mmr' --- used whenever the vector store is queried directly
Compression_filter = True
show_answer_with_retieved_content = False
want_to_see_llm_response_without_rag = True

CANDIDATE_K = 20  # passages fetched from the vector store before re-ranking
FINAL_K = 5       # passages finally placed in the prompt


def _vector_search(query, k):
  """Return the text of the ``k`` best vector-store hits, using ``search_type``.

  Raises:
    ValueError: If ``search_type`` is neither 'similarity' nor 'mmr' (previously
      this left the result variable unassigned).
  """
  if search_type == 'similarity':
    # Similarity search
    hits = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=query, k=k)
  elif search_type == 'mmr':
    # Maximum marginal relevance search
    hits = KNOWLEDGE_VECTOR_DATABASE.max_marginal_relevance_search(query=query, k=k)
  else:
    raise ValueError(f"search_type must be 'similarity' or 'mmr', got {search_type!r}")
  return [doc.page_content for doc in hits]


def _rerank(query, passages, k=FINAL_K):
  """Re-rank ``passages`` with RERANKER and return the text of the best ``k``."""
  if not passages:
    return []
  # Never ask for more results than there are passages.
  # NOTE(review): RAGatouille appears to refuse a k larger than the number of
  # passages - confirm; clamping is harmless either way.
  ranked = RERANKER.rerank(query, passages, k=min(k, len(passages)))
  return [doc["content"] for doc in (ranked or [])]


def _retrieve_relevant_docs(query):
  """Pick the retrieval strategy from the two switches and return passage texts."""
  if Compression_filter and reranking:
    compressed_docs = compression_retriever_1.invoke(query)
    return _rerank(query, [doc.page_content for doc in compressed_docs])
  if Compression_filter:
    # Compression filter only
    compressed_docs = compression_retriever.invoke(query)
    return [doc.page_content for doc in compressed_docs]
  if reranking:
    return _rerank(query, _vector_search(query, k=CANDIDATE_K))
  # Both switches off: plain vector search. Previously no branch handled this
  # case, so `relevant_docs` was undefined (or stale from the previous query).
  return _vector_search(query, k=FINAL_K)


while True:
  print("----------------------------------------------------------------------------------------------------------------")
  print("Enter q as input to exit.")
  user_query = input("Please enter the query: ").strip()
  if user_query.lower() == 'q':
    print("Quitting.")
    break
  if not user_query:
    # Nothing to answer; ask again instead of running the whole pipeline on an empty string.
    continue

  relevant_docs = _retrieve_relevant_docs(user_query)

  context = "\nExtracted documents:\n"
  context += "".join([f"\nDocument {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

  final_prompt = RAG_PROMPT_TEMPLATE.format(question=user_query, context=context)
  answer = llm_pipeline(final_prompt)[0]["generated_text"]
  print("####################### Answer wit RAG #######################")
  if show_answer_with_retieved_content:
    print(answer)
  else:
    # The generated text starts with the prompt; keep only what follows the last header marker.
    print(answer.split("<|end_header_id|>")[-1])

  if want_to_see_llm_response_without_rag:
    print("####################### Answer without RAG #######################")
    # Wrap the bare question in the model's chat template. Sending the raw
    # string to an instruction-tuned model makes it continue the text instead of
    # answering it, which is not a fair baseline.
    plain_prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": user_query}], tokenize=False, add_generation_prompt=True
    )
    answer_wo_rag = llm_pipeline(plain_prompt)[0]["generated_text"]
    print(answer_wo_rag.split("<|end_header_id|>")[-1])
  print("----------------------------------------------------------------------------------------------------------------")


----------------------------------------------------------------------------------------------------------------
Enter q as input to exit.
Please enter the query: what is asr?


100%|██████████| 1/1 [00:00<00:00, 23.83it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


####################### Answer wit RAG #######################


Based on the provided context, Automatic Speech Recognition (ASR) is defined as:

* "the process and the related technology for converting the speech signal into its corresponding sequence of words or other linguistic entities by means of algorithms implemented in a machine" (Document 0).
* "an independent, machine-based process of decoding and transcribing oral speech" (Document 1).
* "a technology which converts voice into text transcriptions, and is one of the core techniques in man-to-machine communications" (Document 2).
* "an important technology to enable and improve the human–human and human–computer interactions" (Document 3).

In summary, ASR is the process of converting spoken language into its written or text-based form using algorithms and machines.
####################### Answer without RAG #######################
what is asr? |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |

100%|██████████| 1/1 [00:00<00:00,  6.40it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


####################### Answer wit RAG #######################


According to Document 0 and Document 2, wav2vec is a representative technology in recent advances in end-to-end automatic speech recognition. Specifically, wav2vec 2.0 [10] is mentioned as one of the key technologies in recent influential wav2vec series work [10, 243].

Additionally, Document 2 provides more information about wav2vec 2.0, stating that it uses self-supervised learning (SSL) with unlabeled data and DataAugment, and achieves a WER of 1.8/3.3 on Librispeech.

It appears that wav2vec is a type of self-supervised learning approach for speech recognition, which uses unlabeled data and data augmentation techniques to improve the robustness and accuracy of the model.
####################### Answer without RAG #######################
what is wav2vec?](https://github.com/wav2vec/wav2vec)

wav2vec is a self-supervised speech representation learning approach that uses a contrastive loss function to learn a robust and 