# Retrieval

In [1]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#### Download scifact.zip dataset and unzip the dataset
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(pathlib.Path().absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)

#### Provide the data_path where scifact has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

#### Load the SBERT model and retrieve using cosine-similarity
model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=16)
retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
results = retriever.retrieve(corpus, queries)

#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] 
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

  from tqdm.autonotebook import tqdm


2025-01-14 18:00:57 - Loading Corpus...


100%|██████████| 5183/5183 [00:00<00:00, 65063.29it/s]

2025-01-14 18:00:57 - Loaded 5183 TEST Documents.
2025-01-14 18:00:57 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers




2025-01-14 18:01:01 - Encoding Queries...


Batches: 100%|██████████| 19/19 [00:00<00:00, 23.95it/s]


2025-01-14 18:01:02 - Sorting Corpus by document length (Longest first)...
2025-01-14 18:01:02 - Scoring Function: Dot Product (dot)
2025-01-14 18:01:02 - Encoding Batch 1/1...


Batches: 100%|██████████| 324/324 [01:19<00:00,  4.06it/s]


2025-01-14 18:02:23 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2025-01-14 18:02:23 - 

2025-01-14 18:02:23 - NDCG@1: 0.5333
2025-01-14 18:02:23 - NDCG@3: 0.5990
2025-01-14 18:02:23 - NDCG@5: 0.6215
2025-01-14 18:02:23 - NDCG@10: 0.6428
2025-01-14 18:02:23 - NDCG@100: 0.6698
2025-01-14 18:02:23 - NDCG@1000: 0.6811
2025-01-14 18:02:23 - 

2025-01-14 18:02:23 - MAP@1: 0.5086
2025-01-14 18:02:23 - MAP@3: 0.5730
2025-01-14 18:02:23 - MAP@5: 0.5892
2025-01-14 18:02:23 - MAP@10: 0.5992
2025-01-14 18:02:23 - MAP@100: 0.6046
2025-01-14 18:02:23 - MAP@1000: 0.6049
2025-01-14 18:02:23 - 

2025-01-14 18:02:23 - Recall@1: 0.5086
2025-01-14 18:02:23 - Recall@3: 0.6473
2025-01-14 18:02:23 - Recall@5: 0.6998
2025-01-14 18:02:23 - Recall@10: 0.7615
2025-01-14 18:02:23 - Recall@100: 0.8910
2025-01-14 18:02:23 - Recall@1000: 0.9833
2025-01-14 18:02:23 - 

2025-01-14 18:02:23 - P@1: 0.5333
2025-01-14 18:02:23

In [2]:
#### Display some queries from the dataset
print("Sample queries from the dataset:")
for i, (query_id, query_text) in enumerate(queries.items()):
    if i >= 5:  # Display only the first 5 queries
        break
    print(f"{i+1}. {query_text}")

#### Display top-3 retrieved documents for a sample query
sample_query = list(queries.values())[0]  # Use the first query from the dataset
query_id = list(queries.keys())[list(queries.values()).index(sample_query)]
top_k = 3
top_results = sorted(results[query_id].items(), key=lambda item: item[1], reverse=True)[:top_k]

print(f"\nQuery: \"{sample_query}\"")
print("Top-3 Retrieved Documents:")
for rank, (doc_id, score) in enumerate(top_results, start=1):
    print(f"{rank}. Document ID: {doc_id}, Score: {score:.2f}, Text: \"{corpus[doc_id]['text']}\"")

Sample queries from the dataset:
1. 0-dimensional biomaterials show inductive properties.
2. 1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.
3. 1/2000 in UK have abnormal PrP positivity.
4. 5% of perinatal mortality is due to low birth weight.
5. A deficiency of vitamin B12 increases blood levels of homocysteine.

Query: "0-dimensional biomaterials show inductive properties."
Top-3 Retrieved Documents:
1. Document ID: 16736872, Score: 94.43, Text: "Optical imaging of the dynamics of living specimens involves tradeoffs between spatial resolution, temporal resolution, and phototoxicity, made more difficult in three dimensions. Here, however, we report that rapid three-dimensional (3D) dynamics can be studied beyond the diffraction limit in thick or densely fluorescent living specimens over many time points by combining ultrathin planar illumination produced by scanned Bessel beams with su

# Generation

In [6]:
# Define the sample query and retrieved documents
query = "0-dimensional biomaterials show inductive properties."
retrieved_docs = {
    16736872: {"score": 94.43, "text": "Optical imaging of the dynamics of living specimens involves tradeoffs between spatial resolution, temporal resolution, and phototoxicity, made more difficult in three dimensions. Here, however, we report that rapid three-dimensional (3D) dynamics can be studied beyond the diffraction limit in thick or densely fluorescent living specimens over many time points by combining ultrathin planar illumination produced by scanned Bessel beams with super-resolution structured illumination microscopy. We demonstrate in vivo karyotyping of chromosomes during mitosis and identify different dynamics for the actin cytoskeleton at the dorsal and ventral surfaces of fibroblasts. Compared to spinning disk confocal microscopy, we demonstrate substantially reduced photodamage when imaging rapid morphological changes in D. discoideum cells, as well as improved contrast and resolution at depth within developing C. elegans embryos. Bessel beam structured plane illumination thus promises new insights into complex biological phenomena that require 4D subcellular spatiotemporal detail in either a single or multicellular context."},
    #4346436: {"score": 92.52, "text": "Unlike most synthetic materials, biological materials often stiffen as they are deformed. This nonlinear elastic response, critical for the physiological function of some tissues, has been documented since at least the 19th century, but the molecular structure and the design principles responsible for it are unknown. Current models for this response require geometrically complex ordered structures unique to each material. In this Article we show that a much simpler molecular theory accounts for strain stiffening in a wide range of molecularly distinct biopolymer gels formed from purified cytoskeletal and extracellular proteins. This theory shows that systems of semi-flexible chains such as filamentous proteins arranged in an open crosslinked meshwork invariably stiffen at low strains without the need for a specific architecture or multiple elements with different intrinsic stiffnesses."},
    #39187170: {"score": 91.91, "text": "Adipose tissue exerts important endocrine and metabolic functions in health and disease. Yet the bioenergetics of this tissue is not characterized in humans and possible regional differences are not elucidated. Using high resolution respirometry, mitochondrial respiration was quantified in human abdominal subcutaneous and intra-abdominal visceral (omentum majus) adipose tissue from biopsies obtained in 20 obese patients undergoing bariatric surgery. Mitochondrial DNA (mtDNA) and genomic DNA (gDNA) were determined by the PCR technique for estimation of mitochondrial density. Adipose tissue samples were permeabilized and respirometric measurements were performed in duplicate at 37 degrees C. Substrates (glutamate (G) + malate (M) + octanoyl carnitine (O) + succinate (S)) were added sequentially to provide electrons to complex I + II. ADP ((D)) for state 3 respiration was added after GM. Uncoupled respiration was measured after addition of FCCP. Visceral fat contained more mitochondria per milligram of tissue than subcutaneous fat, but the cells were smaller. Robust, stable oxygen fluxes were found in both tissues, and coupled state 3 (GMOS(D)) and uncoupled respiration were significantly (P < 0.05) higher in visceral (0.95 +/- 0.05 and 1.15 +/- 0.06 pmol O(2) s(1) mg(1), respectively) compared with subcutaneous (0.76 +/- 0.04 and 0.98 +/- 0.05 pmol O(2) s(1) mg(1), respectively) adipose tissue. Expressed per mtDNA, visceral adipose tissue had significantly (P < 0.05) lower mitochondrial respiration. Substrate control ratios were higher and uncoupling control ratio lower (P < 0.05) in visceral compared with subcutaneous adipose tissue. We conclude that visceral fat is bioenergetically more active and more sensitive to mitochondrial substrate supply than subcutaneous fat. Oxidative phosphorylation has a higher relative activity in visceral compared with subcutaneous adipose tissue."}
}

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load model generasi (T5 atau lainnya)
model_name = "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Function to build dynamic input
def create_input_from_retrieval(query, retrieved_docs, top_k=3):
    # Get top-k documents based on score
    top_docs = sorted(retrieved_docs.items(), key=lambda x: x[1]["score"], reverse=True)[:top_k]

    # Combine query with document texts
    context = " ".join([doc["text"] for _, doc in top_docs])
    input_text = f"context: {context}. based on this, answer the question: {query}"
    print("Input Text:", input_text)
    return input_text

# # Example query and retrieved documents
# query = "What are the functions of adipose tissue?"
# retrieved_docs = {
#     1: {"text": "Adipose tissue serves as a key energy reserve in the human body.", "score": 0.9},
#     2: {"text": "It plays a crucial role in metabolic regulation and hormone secretion.", "score": 0.85},
#     3: {"text": "Subcutaneous and visceral adipose tissues have different physiological roles.", "score": 0.8},
# }

# Build input for generation model
input_text = create_input_from_retrieval(query, retrieved_docs)

# Tokenize input
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate output with repetition_penalty
output_ids = model.generate(
    inputs.input_ids,
    max_length=50,
    num_beams=5,
    early_stopping=True,
    repetition_penalty=2.0  # Adjust penalty value as needed
)

# Decode output
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("Generated Answer:", answer)

Input Text: context: Optical imaging of the dynamics of living specimens involves tradeoffs between spatial resolution, temporal resolution, and phototoxicity, made more difficult in three dimensions. Here, however, we report that rapid three-dimensional (3D) dynamics can be studied beyond the diffraction limit in thick or densely fluorescent living specimens over many time points by combining ultrathin planar illumination produced by scanned Bessel beams with super-resolution structured illumination microscopy. We demonstrate in vivo karyotyping of chromosomes during mitosis and identify different dynamics for the actin cytoskeleton at the dorsal and ventral surfaces of fibroblasts. Compared to spinning disk confocal microscopy, we demonstrate substantially reduced photodamage when imaging rapid morphological changes in D. discoideum cells, as well as improved contrast and resolution at depth within developing C. elegans embryos. Bessel beam structured plane illumination thus promises