### Install requirements and import all necessary packages

In [3]:
! pip install -r requirements.txt

Collecting langchain (from -r requirements.txt (line 5))
  Downloading langchain-0.1.9-py3-none-any.whl (816 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.0/817.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting lancedb (from -r requirements.txt (line 7))
  Downloading lancedb-0.5.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rank_bm25 (from -r requirements.txt (line 8))
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting sentence-transformers (from -r requirements.txt (line 9))
  Downloading sentence_transformers-2.4.0-py3-none-any.whl (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes (from -r requirements.txt (line 10))
  Downloading bitsandbytes-0.42.0-py3-none-any.whl (105.0 MB)
[2K     [90m━━━━━━━━━━━

In [1]:
import pandas as pd
import os
import lancedb
from torch import cuda
import urllib.request

from langchain.retrievers import EnsembleRetriever
from langchain.schema import Document
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain_community.vectorstores.lancedb import LanceDB
from langchain_community.retrievers import BM25Retriever
from langchain_community.llms import LlamaCpp

from langchain_core.documents.base import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnablePick
from langchain_core.prompts import ChatPromptTemplate



ModuleNotFoundError: No module named 'pwd'

In [57]:
# Remove db if something changed in the structure
# !rm -rf /app/db

### Settings to run the solution

In [58]:
# --- Data and storage locations ---
path_to_data_csv = 'master_without_embeddings_first_100.csv'
path_to_database = '/app/db'

# --- Models ---
embedding_model = 'sentence-transformers/all-MiniLM-L6-v2'
model_id = 'llama-2-7b-chat.Q2_K.gguf'

# Optional Hugging Face token taken from the environment (None when unset).
HF_AUTH = os.environ.get('HF_AUTH')
# Keep an already-configured HF_HOME; otherwise fall back to the local "models" directory.
os.environ.setdefault('HF_HOME', 'models')

# --- Chunking ---
chunk_size, chunk_overlap = 400, 50

# --- Retrieval ---
# Number of documents each retriever returns.
retrieve_top_k_docs_bm25 = retrieve_top_k_docs_vector = 1
# Room for every retrieved chunk plus 200 extra; should not be larger than 2048.
context_length_for_llm = 200 + chunk_size * (retrieve_top_k_docs_bm25 + retrieve_top_k_docs_vector)
# Weight of the BM25 retriever in the ensemble; the vector retriever gets 1 - this value.
retrievers_weights_bm25 = 0.4
# Sampling temperature for the LLM (randomness parameter).
llama_temperature = 0.75

### Load the data as `Document` objects

In [59]:
# Read the CSV; every row supplies a 'chunk' text column plus metadata columns.
df = pd.read_csv(path_to_data_csv)

# Columns copied as-is into each Document's metadata.
metadata_columns = ('id', 'title', 'authors', 'sources')

# One Document per CSV row: the chunk text is the content, the rest is metadata.
documents = [
    Document(
        page_content=row['chunk'],
        metadata={column: row[column] for column in metadata_columns},
    )
    for _, row in df.iterrows()
]

print(f'---\n--- Read {len(documents)} documents from {path_to_data_csv}')

---
--- Read 412 documents from master_without_embeddings_first_100.csv


### Create BM25 and LanceDB retrievers

In [60]:
print('---\n--- Creating retrievers...')

# Keyword-based retriever built directly from the loaded documents.
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = retrieve_top_k_docs_bm25

# Use the GPU for embeddings when torch reports that CUDA is available.
device = 'cuda' if cuda.is_available() else 'cpu'

# Create embedding
embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model,
    model_kwargs={'device': device},
    encode_kwargs={'device': device, 'batch_size': 32}
)

# Check whether the LanceDB table exists: if yes, use it; if not, create a new one.
try:
    print("--- Trying to connect to LanceDB")
    db = lancedb.connect(path_to_database)
    table = db.open_table("chatmaja_test")
    docsearch = LanceDB(connection=table, embedding=embed_model)
    print("--- LanceDB found, connected successfully")
except Exception as exc:
    # Catch only ordinary errors (not KeyboardInterrupt/SystemExit) and show the
    # reason, so an unexpected failure is not mistaken for "table does not exist".
    print("--- Error connecting to LanceDB, creating new one")
    print(f"--- Reason: {exc!r}")
    db = lancedb.connect(path_to_database)
    # The table is created with one placeholder row that defines its columns.
    # NOTE(review): this "Hello World" row appears to stay in the table next to
    # the real documents and might be returned by vector search - confirm and
    # consider removing it after indexing.
    table = db.create_table("chatmaja_test", data=[
            {"vector": embed_model.embed_query("Hello World"), "text": "Hello World", "id": "1", "authors": "authoors", "sources": "sourcees", "title": "tiitle"}
        ], mode="overwrite")
    print("--- LanceDB created and connected successfully")
    docsearch = LanceDB.from_documents(documents, embed_model, connection=table)
    print("--- Finished loading documents to LanceDB")

retriever_lancedb = docsearch.as_retriever(search_kwargs={"k": retrieve_top_k_docs_vector})

# Create ensemble retriever: BM25 and vector search results are combined using
# the configured BM25 weight and its complement for the vector retriever.
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever_lancedb],
                                       weights=[retrievers_weights_bm25, 1-retrievers_weights_bm25])

print("---\n--- Created BM25 and vector search retrievers")

---
--- Creating retrievers...
--- Trying to connect to LanceDB
--- Error connecting to LanceDB, creating new one
--- LanceDB created and connected successfully
--- Finished loading documents to LanceDB
---
--- Created BM25 and vector search retrievers


### Get model

In [None]:
# Create directory if it does not exist
os.makedirs(os.getenv('HF_HOME'), exist_ok=True)

# Download model if not exists
path_to_model = os.path.join(os.getenv('HF_HOME'), model_id)
link_to_model = f"https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/{model_id}"

if not os.path.isfile(path_to_model):
    print(f"--- Downloading {model_id}...")
    # Download to a temporary ".part" file and rename it only after the transfer
    # finished, so an interrupted download never leaves a truncated file at
    # path_to_model that a later run would treat as a complete model.
    partial_path = path_to_model + ".part"
    try:
        urllib.request.urlretrieve(link_to_model, partial_path)
        os.replace(partial_path, path_to_model)
    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure or interruption the leftover is removed here.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"--- Downloaded {model_id} successfully.")
else:
    print(f"--- Model {model_id} already downloaded.")


# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Make sure the model path is correct for your system!
# -1 when running on CUDA, 0 on CPU.
# NOTE(review): -1 presumably means "offload all layers to the GPU" - confirm
# against the llama-cpp-python documentation.
n_gpu_layers = -1 if device == 'cuda' else 0
llm = LlamaCpp(
    model_path=path_to_model,
    temperature=llama_temperature,
    # NOTE(review): max_tokens can exceed n_ctx with these formulas - confirm this is intended.
    max_tokens=min(context_length_for_llm*2, 4096),
    n_gpu_layers=n_gpu_layers,
    n_ctx=min(context_length_for_llm, 2048),
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

### Create pipeline of the solution

In [61]:
def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Prompt
# The whole Llama-2 style instruction ([INST] / <<SYS>> markers) is sent as one
# "human" message with {question} and {context} placeholders.
# NOTE(review): when a chat prompt is rendered to plain text for a non-chat LLM,
# a role prefix may be prepended before "[INST]" - confirm the final prompt text.
rag_prompt_llama = ChatPromptTemplate.from_messages([
    ("human", """[INST]<<SYS>> You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise.<</SYS>> \nQuestion: {question} \nContext: {context} \nAnswer: [/INST]"""),
])

# Chain
# Input is a dict with "context" (retrieved documents) and "question".
# The first step replaces "context" with the text produced by format_docs,
# then the prompt is filled in, sent to the LLM and the output parsed to a string.
chain = (
    RunnablePassthrough.assign(context=RunnablePick("context") | format_docs)
    | rag_prompt_llama
    | llm
    | StrOutputParser()
)

def answer_query(question):
    """
    Retrieve documents for a question and generate an answer with the chain.

    The question, the contents of the retrieved documents and the generated
    result are printed along the way.

    Args:
        question (str): question from the user.

    Returns:
        tuple: ``(answer, docs)``. ``answer`` is a string containing the query
        followed by the generated answer; ``docs`` is the list of documents
        returned by the ensemble retriever.
    """
    print(f'- - - Question: {question}')

    # Fetch the supporting documents from the ensemble retriever.
    retrieved = ensemble_retriever.get_relevant_documents(question)
    print(f'- - - Relevant documents: {[d.page_content for d in retrieved]}')

    # Generate the answer from the retrieved documents and the question.
    generated = chain.invoke({"context": retrieved, "question": question})
    print(f'- - - Results: {generated}')

    return f"Query: {question}\n\nAnswer: {generated}", retrieved

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from models/llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32           

--- Model llama-2-7b-chat.Q2_K.gguf already downloaded.


llm_load_tensors: offloading 32 repeating layers to GPU
llm_load_tensors: offloading non-repeating layers to GPU
llm_load_tensors: offloaded 33/33 layers to GPU
llm_load_tensors:        CPU buffer size =    41.02 MiB
llm_load_tensors:      CUDA0 buffer size =  2653.31 MiB
.................................................................................................
llama_new_context_with_model: n_ctx      = 1000
llama_new_context_with_model: freq_base  = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init:      CUDA0 KV buffer size =   500.00 MiB
llama_new_context_with_model: KV self size  =  500.00 MiB, K (f16):  250.00 MiB, V (f16):  250.00 MiB
llama_new_context_with_model:  CUDA_Host input buffer size   =     0.16 MiB
llama_new_context_with_model:      CUDA0 compute buffer size =     1.51 MiB
llama_new_context_with_model:  CUDA_Host compute buffer size =     0.12 MiB
llama_new_context_with_model: graph splits (measure): 2
AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | A

### Sample usage

In [62]:
# Run the retrieval + generation pipeline on an example question.
# `answer` is the formatted "Query: ... Answer: ..." string and `docs` are the
# retrieved documents returned by answer_query.
query = "What is used in brain cancer imaging?"
answer, docs = answer_query(query)

- - - Question: What is used in brain cancer imaging?
- - - Relevant documents: ["Two independent reviewers screened abstracts, titles and full text, resolving differences through discussion. RESULTS: 228 studies met the criteria. XAI publications are increasing, targeting MRI (n = 73), radiography (n = 47), CT (n = 46). Lung (n = 82) and brain (n = 74) pathologies, Covid-19 (n = 48), Alzheimer's disease (n = 25), brain tumors (n = 15) are the main pathologies explained. Explanations are presented visually (n = 186),", 'BACKGROUND: Transformer-based models are gaining popularity in medical imaging and cancer imaging applications. Many recent studies have demonstrated the use of transformer-based models for brain cancer imaging applications such as diagnosis and tumor segmentation. OBJECTIVE: This study aims to review how different vision transformers (ViTs) contributed to advancing brain cancer diagnosis and tumor segmentation using brain image data. This study examines the different a


llama_print_timings:        load time =      84.83 ms
llama_print_timings:      sample time =     144.12 ms /   263 runs   (    0.55 ms per token,  1824.87 tokens per second)
llama_print_timings: prompt eval time =    4236.79 ms /   370 tokens (   11.45 ms per token,    87.33 tokens per second)
llama_print_timings:        eval time =   10972.62 ms /   262 runs   (   41.88 ms per token,    23.88 tokens per second)
llama_print_timings:       total time =   16814.59 ms /   632 tokens


- - - Results:   Based on the provided context, it appears that transformer-based models have been increasingly used in brain cancer imaging for various tasks such as diagnosis and tumor segmentation. Specifically, the study found that:
* 74% of the studies used transformer-based models for brain cancer diagnosis, while 55% used them for tumor segmentation.
* The most common type of transformer used was the ViT, which was employed by 82% of the studies.
* The study found that transformer-based models were particularly useful for analyzing brain tumors, with 70% of the studies using them for this purpose.
* In terms of visual explanations, 186 were provided in the studies reviewed, with the majority (72%) being images.
Overall, the study suggests that transformer-based models have shown promise in advancing brain cancer diagnosis and tumor segmentation using brain image data, and may be a valuable tool in the field of medical imaging. However, it is important to note that the study was 

In [63]:
# Show the 'sources' metadata (link to PubMed) of the first retrieved document.
docs[0].metadata['sources']

'https://pubmed.ncbi.nlm.nih.gov/37976760/'

In [66]:
# Show the 'title' metadata of the first retrieved document.
docs[0].metadata['title']

'A scoping review of interpretability and explainability concerning artificial intelligence methods in medical imaging.'

In [67]:
# Show the 'authors' metadata of the first retrieved document.
docs[0].metadata['authors']

'Champendal M||Muller H||Prior JO||Dos Reis CS'

In [50]:
# Display the string returned by answer_query: the query followed by the answer generated by llama.
answer

'Query: What is used in brain cancer imaging?\n\nAnswer:   Based on the provided context, here is the answer to the question "What is used in brain cancer imaging?"\nVision Transformers (ViTs) have gained popularity in medical imaging and cancer imaging applications, including brain cancer diagnosis and tumor segmentation. Many recent studies have demonstrated the use of transformer-based models for brain cancer imaging applications such as diagnosis and tumor segmentation. These models have contributed significantly to advancing brain cancer diagnosis and tumor segmentation using brain image data. The study examines different architectures developed for enhancing the task of brain tumor segmentation and explores their effectiveness in brain cancer imaging.\nIn summary, ViTs are increasingly being used in brain cancer imaging for tasks such as diagnosis and tumor segmentation, and have shown promising results in improving these tasks.\nSources:\n1. "A Survey on Vision Transformers in M