In [1]:
!pip install torch transformers accelerate bitsandbytes langchain langchain_experimental  sentence-transformers faiss-gpu pypdf ragatouille

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting langchain
  Downloading langchain-0.1.19-py3-none-any.whl.metadata (13 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.0.58-py3-none-any.whl.metadata (2.1 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting ragatouille
  Downloading ragatouille-0.0.8.post2-py3-none-any.whl.metadata (15 kB)
Collecting langchain-community<0.1,>=0.0.38 (from langchain)
  Downloading langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2.0,>=0.1.52 (from langchain)
  Downloading langchain_core-0.1.52-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloading langchain_tex

In [2]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import transformers
# for LLM
from torch import cuda, bfloat16
from transformers import BitsAndBytesConfig
# For vector database
from langchain_community.document_loaders import PyPDFDirectoryLoader
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from ragatouille import RAGPretrainedModel
from langchain_text_splitters import CharacterTextSplitter
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers import ContextualCompressionRetriever

import warnings
warnings.filterwarnings('ignore')
import logging

# Set the logging level to suppress warnings
logging.basicConfig(level=logging.ERROR)


In [3]:

def create_knowledgebase(folder_path, embedding_model, chunk_size = 1000, chunk_overlap = 200):
  print(f"Loading pdf from {data_loc}...")
  loader = PyPDFDirectoryLoader(data_loc)
  docs = loader.load()
  # Splitter specific to embedding model
  # EMBEDDING_MODEL_NAME_small = "thenlper/gte-small"

  text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
          AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
          chunk_size=chunk_size,
          chunk_overlap=chunk_overlap,
          add_start_index=True,
          strip_whitespace=True
      )
  # text_splitter = SemanticChunker(embedding_model, breakpoint_threshold_type="percentile")
  # splits = text_splitter.create_documents([d.page_content for d in docs])
  print("Splitting documents...")
  splits = text_splitter.split_documents(docs)
  print("Creating vector database")
  KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
      splits, embedding_model, distance_strategy=DistanceStrategy.COSINE
  )
  print("Successfully created knowledge base")
  return KNOWLEDGE_VECTOR_DATABASE

data_loc = "/kaggle/input/rag-dataset-1/ASR_Papers-20240509T054617Z-001/ASR_Papers"
EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
chunk_size = 1000
chunk_overlap = 200
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},
)
# Knowledge database creation
KNOWLEDGE_VECTOR_DATABASE = create_knowledgebase(data_loc, embedding_model, chunk_size=chunk_size, chunk_overlap=chunk_overlap )

# for Compression filters
retriever = KNOWLEDGE_VECTOR_DATABASE.as_retriever(search_kwargs={"k": 20, "include_metadata": True})
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0, separator=". ")
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding_model)
relevant_filter = EmbeddingsFilter(embeddings=embedding_model,k=5) # either k=ineteger or similarity_threshold=0.76
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[splitter, redundant_filter, relevant_filter]
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever
)

# for both compression and reranking
relevant_filter_1 = EmbeddingsFilter(embeddings=embedding_model,k=10) # either k=ineteger or similarity_threshold=0.76
pipeline_compressor_1 = DocumentCompressorPipeline(
    transformers=[splitter, redundant_filter, relevant_filter_1]
)
compression_retriever_1 = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor_1, base_retriever=retriever
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading pdf from /kaggle/input/rag-dataset-1/ASR_Papers-20240509T054617Z-001/ASR_Papers...
Splitting documents...


Token indices sequence length is longer than the specified maximum sequence length for this model (1438 > 512). Running this sequence through the model will result in indexing errors


Creating vector database
Successfully created knowledge base


In [4]:
access_token = "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"
model_id_map = {"llama2":"meta-llama/Llama-2-7b-chat-hf",
                "mistral":"mistralai/Mistral-7B-Instruct-v0.2",
                "gemma":"google/gemma-1.1-7b-it",
                "llama3":"meta-llama/Meta-Llama-3-8B-Instruct"}
model_id = model_id_map["llama3"]

print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)
print("Loading LLM")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    token=access_token,
    quantization_config=bnb_config,

)
print("Creating pipeline")
llm_pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    max_new_tokens=500

)
print("Loading re-ranker")
RERANKER = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")


Loading tokenizer


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading LLM


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Creating pipeline


2024-05-09 15:00:58.929980: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-09 15:00:58.930086: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-09 15:00:59.043327: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading re-ranker


artifact.metadata:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/405 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
pip install -i https://pypi.org/simple/ bitsandbytes

In [5]:
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """Use the information contained in the context along with the knowledge you have and give a comprehensive answer to the question.
Respond only to the question asked.""",
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
print(RAG_PROMPT_TEMPLATE)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Use the information contained in the context along with the knowledge you have and give a comprehensive answer to the question.
Respond only to the question asked.<|eot_id|><|start_header_id|>user<|end_header_id|>

Context:
{context}
---
Now here is the question you need to answer.

Question: {question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [6]:
import json
import openpyxl

def writing_in_json(reranking , compression_filter, search_type,user_query,want_to_see_llm_response_without_rag):
    relevant_docs = []  # Initialize relevant_docs as an empty list
    retrieved_docs = [] 
    if want_to_see_llm_response_without_rag:
#         print("####################### Answer without RAG #######################")
        answer = llm_pipeline(user_query)[0]["generated_text"]
        # print(answer_wo_rag)
    else :
      if reranking and not compression_filter:
          if search_type == 'similarity':
              # Similarity search
              retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=20)
          elif search_type == 'mmr':
              # Maximum marginal relevance search
              retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.max_marginal_relevance_search(query=user_query, k=20)
          retrieved_docs = [doc.page_content for doc in retrieved_docs]
          # Re-ranking
          relevant_docs = RERANKER.rerank(user_query, retrieved_docs, k=5)
          relevant_docs = [doc["content"] for doc in relevant_docs]

      elif compression_filter and not reranking:
          # Compression filter
          compressed_docs = compression_retriever.invoke(user_query)
          relevant_docs = [doc.page_content for doc in compressed_docs]

      elif compression_filter and reranking:
          compressed_docs = compression_retriever_1.invoke(user_query)
          relevant_docs = [doc.page_content for doc in compressed_docs]
          relevant_docs = RERANKER.rerank(user_query, relevant_docs, k=5)
          relevant_docs = [doc["content"] for doc in relevant_docs]
      context = "\nExtracted documents:\n"
      context += "".join([f"\nDocument {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

      final_prompt = RAG_PROMPT_TEMPLATE.format(question=user_query, context=context)
      answer = llm_pipeline(final_prompt)[0]["generated_text"]
      answer = answer.split("<|end_header_id|>")[-1]
    
        
    return answer


In [None]:
# Open the Excel file
workbook = openpyxl.load_workbook('/kaggle/input/rag-dataset-1/Evaluation_data.xlsx')
sheet_name = "Espnet"
sheet = workbook[sheet_name]
print(sheet)

# Initialize output dictionary
output = {}

# Iterate through rows in the first sheet
for row in sheet.iter_rows(min_row=2, values_only=True):  # Skip header row
#     print(row)
    user_query = row[1]  # Assuming the first column is the user query
    print(user_query)
    chat_gpt_answer = row[2]
#     print(chat_gpt_answer)
    output[user_query] = {
        "chat_gpt_answer": chat_gpt_answer,
        "llm_without_rag": writing_in_json(reranking=True, compression_filter=False, search_type='similarity', user_query=user_query,want_to_see_llm_response_without_rag = True),
        "rag_with_compression": writing_in_json(reranking=False, compression_filter=True, search_type=None, user_query=user_query,want_to_see_llm_response_without_rag = False),
        "rag_with_rerank_similarity": writing_in_json(reranking=True, compression_filter=False, search_type='similarity', user_query=user_query,want_to_see_llm_response_without_rag = False),
        "rag_with_rerank_mmr": writing_in_json(reranking=True, compression_filter=False, search_type='mmr', user_query=user_query,want_to_see_llm_response_without_rag = False),
        "rag_with_compression_rerank": writing_in_json(reranking=True, compression_filter=True, search_type='similarity', user_query=user_query,want_to_see_llm_response_without_rag = False)

        }

    # Write output to JSON file with sheet name as filename
    output_filename = "/kaggle/working/Espnet.json"
    with open(output_filename, "w") as f:
        json.dump(output, f, indent=4)

# Close the workbook
workbook.close()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<Worksheet "Espnet">
What is the main focus of the ESPnet toolkit introduced in the paper?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00, 15.57it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Which neural network toolkits are utilized as the main deep learning engine in ESPnet?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  9.84it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/opt/conda/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  F

How does ESPnet differ from other open-source ASR toolkits, such as Kaldi, in terms of architecture and functionality?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00, 18.27it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What are the key features of ESPnet's end-to-end ASR setup, and how do they contribute to its performance?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.71it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00, 17.93it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What is the significance of ESPnet's adoption of both connectionist temporal classification (CTC) and attention-based encoder-decoder network architectures?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.28it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00, 23.41it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


How does ESPnet handle the training process, particularly regarding multiobjective learning and label smoothing techniques?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.56it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00, 21.43it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What are the advantages of using the warp CTC library in ESPnet, and how does it impact training efficiency?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.43it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  9.80it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Explain the process of joint decoding in ESPnet and its role in improving recognition accuracy.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.52it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00, 20.20it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


How does ESPnet incorporate language models into the decoding process, and what benefits does this provide?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.50it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.71it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00, 18.14it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What experimental results and comparisons are presented in the paper regarding the performance of ESPnet, particularly in tasks such as WSJ, CSJ, and HKUST?


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.63it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:00<00:00,  3.39it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
