In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Sample corpus: four short documents that are ranked against the query in the cells below.
documents = [
    "This is a list which containing sample documents.",
    "Keywords are important ofr keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [None]:
query="keyword-based search"

In [None]:
import re

def preprocess_text(text):
    """Normalize text for keyword matching.

    The input is lowercased, then every character that is not a lowercase
    ASCII letter, a digit, or whitespace is deleted (not replaced by a space),
    so "keyword-based" becomes "keywordbased".

    text: The string to normalize.
    return: The normalized string.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)


In [None]:
[preprocess_text(doc) for doc in documents]

['this is a list which containing sample documents',
 'keywords are important ofr keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [None]:
# Print the query after preprocessing so the output matches its label
# (previously the raw, unprocessed query was printed here).
print("Preprocessed Query:")
print(preprocess_text(query))

Preprocessed Query:
keyword-based search


In [None]:
preprocessed_query = preprocess_text(query)

In [None]:
preprocessed_query

'keywordbased search'

In [None]:
vector=TfidfVectorizer()

In [None]:
# Fit TF-IDF on the same preprocessed text that is applied to the query, so document
# and query tokens line up (e.g. "keyword-based" -> "keywordbased" on both sides).
# Fitting on the raw documents left the query term "keywordbased" out of the vocabulary.
X = vector.fit_transform([preprocess_text(doc) for doc in documents])

In [None]:
X.toarray()

array([[0.        , 0.        , 0.        , 0.37796447, 0.        ,
        0.37796447, 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.        , 0.37796447, 0.        , 0.        ,
        0.37796447, 0.37796447],
       [0.        , 0.42693074, 0.3365971 , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.42693074, 0.        ,
        0.        , 0.3365971 , 0.3365971 , 0.        , 0.42693074,
        0.        , 0.        , 0.        , 0.3365971 , 0.        ,
        0.        , 0.        ],
       [0.46516193, 0.        , 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.46516193, 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.32555709, 0.        , 0.        ,
        0.       

In [None]:
X.toarray()[0]

array([0.        , 0.        , 0.        , 0.37796447, 0.        ,
       0.37796447, 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.        , 0.37796447, 0.        , 0.        ,
       0.37796447, 0.37796447])

In [None]:
query_embedding = vector.transform([preprocessed_query])

In [None]:
query_embedding.toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0.]])

In [None]:
similarities = cosine_similarity(X, query_embedding)

In [None]:
similarities

array([[0.        ],
       [0.3365971 ],
       [0.        ],
       [0.32555709]])

In [None]:
np.argsort(similarities, axis=0)

array([[0],
       [2],
       [3],
       [1]])

In [None]:
# Order document indices from most to least similar to the query, then look up the
# documents. The indices are computed in this cell so it runs on a fresh kernel
# instead of relying on `ranked_indices` having been defined out of order.
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()
ranked_documents = [documents[i] for i in ranked_indices]


In [None]:
# Ranking: argsort yields indices in ascending order of similarity, so reverse them to
# put the most similar document first, then flatten the column of indices to 1-D.
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()


In [None]:
ranked_indices

array([1, 3, 2, 0])

In [None]:
ranked_indices

array([1, 3, 2, 0])

In [None]:
# Output the ranked documents
for i, doc in enumerate(ranked_documents):
  print(f"Rank {i+1}: {doc}")

Rank 1: Keywords are important ofr keyword-based search.
Rank 2: Keyword-based search relies on sparse embeddings.
Rank 3: Document analysis involves extracting keywords.
Rank 4: This is a list which containing sample documents.


In [None]:
query

'keyword-based search'

In [None]:
# Sample document embeddings (dense vectors): one 5-dimensional row per document.
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [None]:
# Sample search query (represented as a dense vector)
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])


In [None]:
# Calculate cosine similarity between the query and each document (one row per document)
similarities = cosine_similarity(document_embeddings, query_embedding)


In [None]:
ranked_indices = np.argsort(similarities, axis=0)[::-1].flatten()

In [None]:
ranked_indices

array([0, 2, 1])

In [None]:
# Output the ranked documents
for i, idx in enumerate(ranked_indices):
  print(f"Rank {i+1}: Document {idx+1}")

Rank 1: Document 1
Rank 2: Document 3
Rank 3: Document 2


In [None]:
doc_path="/content/AI Intern Assignment 2.pdf"

In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.3.1-py3-none-any.whl.metadata (7.4 kB)
Downloading pypdf-4.3.1-py3-none-any.whl (295 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/295.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/295.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-4.3.1


In [None]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.13 (from langchain_community)
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.30 (from langchain_community)
  Downloading langchain_core-0.2.33-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain_community)
  Downloading langsmith-0.1.99-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain_community)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.21.3-py3-none-any.whl.metadata (7.1 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>

In [None]:
from langchain_community.document_loaders import PyPDFLoader

In [None]:
loader = PyPDFLoader(doc_path)

In [None]:
docs = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)


In [None]:
chunks = splitter.split_documents(docs)


In [None]:
chunks

[Document(metadata={'source': '/content/AI Intern Assignment 2.pdf', 'page': 0}, page_content='Retrieval\nAugmented\nGeneration(RAG)\nin\nLLM\nSection\n1:\nTheoretical\nKnowledge\n1.1.\nBasics\nof\nRetrieval-Augmented\nGeneration\n(RAG)\nQuestion:\nExplain\nthe\nconcept\nof\nRetrieval-Augmented\nGeneration\n(RAG).'),
 Document(metadata={'source': '/content/AI Intern Assignment 2.pdf', 'page': 0}, page_content='Generation\n(RAG).\nHow\ndoes\nit\ndiffer\nfrom\nstandard\ntext\ngeneration\nmethods\nin\nNLP?\n1.2.\nInformation\nRetrieval\nTechniques\nQuestion:What\nare\nDense\nPassage\nRetrieval\n(DPR)\nand\nSparse\nRetrieval?\nCompare'),
 Document(metadata={'source': '/content/AI Intern Assignment 2.pdf', 'page': 0}, page_content='Retrieval?\nCompare\ntheir\nstrengths\nand\nweaknesses.\n1.3.\nTransformer\nModels\nin\nNLP\n(10\nPoints)\nQuestion:\nBriefly\ndescribe\nthe\narchitecture\nof\nTransformer\nmodels\nand\ntheir\nrole\nin\nLLMs.\nHow\ndo\nthey'),
 Document(metadata={'source': '/cont

In [None]:
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings


In [None]:
import os
from getpass import getpass

# Never commit API tokens to a notebook: read the Hugging Face token from the
# HF_TOKEN environment variable and fall back to an interactive prompt.
# NOTE(review): the token that was hard-coded here has been exposed and should be revoked.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face API token: ")

In [None]:
embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN, model_name="BAAI/bge-base-en-v1.5")


In [None]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.112.1-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.26.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_pro

In [None]:
from langchain.vectorstores import Chroma

In [None]:
vectorstore = Chroma.from_documents(chunks, embeddings)

In [None]:
vectorstore_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


In [None]:
vectorstore_retriever


VectorStoreRetriever(tags=['Chroma', 'HuggingFaceInferenceAPIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x7938aabdd330>, search_kwargs={'k': 3})

In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever


In [None]:
keyword_retriever = BM25Retriever.from_documents(chunks)

In [None]:
keyword_retriever.k = 3

In [None]:
# Hybrid retriever: combines the dense vector-store retriever with the sparse BM25
# keyword retriever. NOTE(review): the weights presumably pair positionally with
# `retrievers` (0.3 for the vector store, 0.7 for BM25) — confirm against the LangChain docs.
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore_retriever, keyword_retriever],
    weights=[0.3, 0.7]
)

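Conceptually, hybrid search blends the two retrievers' scores: `hybrid_score = (1 - alpha) * sparse_score + alpha * dense_score`, where `alpha` is the weight given to the dense (vector) retriever and `1 - alpha` the weight given to the sparse (keyword) retriever.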

In [None]:
model_name = "HuggingFaceH4/zephyr-7b-beta"

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->bitsandbytes)
 

In [None]:
!pip install accelerate



In [None]:
!pip install transformers



In [None]:
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig)
from langchain.llms import HuggingFacePipeline

In [None]:
# function for loading 4-bit quantized model
def load_quantized_model(model_id: str):
  """
  Load a causal language model with 4-bit quantized weights.

    model_id: Name or path of the model to be loaded.
    return: Loaded quantized model.
  """
  # NF4 4-bit weights with double quantization; computation uses bfloat16.
  quantization_options = {
      "load_in_4bit": True,
      "bnb_4bit_use_double_quant": True,
      "bnb_4bit_quant_type": "nf4",
      "bnb_4bit_compute_dtype": torch.bfloat16,
  }
  return AutoModelForCausalLM.from_pretrained(
      model_id,
      quantization_config=BitsAndBytesConfig(**quantization_options),
  )

In [None]:
# initializing tokenizer
def initialze_tokenizer(model_name: str):
  """
  Create the tokenizer for a model and force its beginning-of-sentence token id to 1.

    model_name: Name or path of the model whose tokenizer is loaded.
    return: The configured tokenizer.
  """
  tok = AutoTokenizer.from_pretrained(model_name, return_token_type_ids=False)
  setattr(tok, "bos_token_id", 1)  # beginning-of-sentence token id
  return tok


In [None]:
tokenizer = initialze_tokenizer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
model = load_quantized_model(model_name)


In [None]:
import transformers

# Build the text-generation pipeline around the 4-bit quantized `model` loaded above.
# Passing the model *name* here made the pipeline load the checkpoint again from the
# hub instead of using the quantized model.
# The factory is called as `transformers.pipeline` so that rebinding the name `pipeline`
# to the resulting object (which later cells rely on) does not break re-running this cell.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",
    max_length=2048,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # Reuse the end-of-sequence token as the padding token.
    pad_token_id=tokenizer.eos_token_id,
)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



In [None]:
llm=HuggingFacePipeline(pipeline=pipeline)

  warn_deprecated(


In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Baseline QA chain: retrieves context with the dense vector-store retriever only and
# returns the retrieved source documents alongside the answer.
normal_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore_retriever,
    return_source_documents=True
)

In [None]:
# Hybrid QA chain: same configuration as the baseline chain, but context is retrieved
# through the ensemble (vector store + BM25) retriever.
hybrid_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=ensemble_retriever,
    return_source_documents=True
)

In [None]:
# `response1` is only created by the `normal_chain.invoke(...)` cell that follows, so
# evaluating it here raised a NameError on a fresh kernel. Its result is printed after
# that cell has run.

NameError: name 'response1' is not defined

In [None]:
response1 = normal_chain.invoke("what is this document all about")


In [None]:
print(response1.get("result"))