In [None]:
!pip install -q -U llama_index
!pip install -q -U google-generativeai
!pip install pypdf
!pip install -q sentence-transformers
!pip install -q transformers bitsandbytes accelerate
!pip install -U bitsandbytes



In [None]:
import json

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode


import os
# Hugging Face access token taken from the HF_TOKEN environment variable;
# os.environ.get returns None when the variable is not set.
hugging_face_token = os.environ.get("HF_TOKEN")

In [None]:
# Source PDFs for the training and validation splits.
TRAIN_FILES = ["/content/train.pdf"]
VAL_FILES = ["/content/val.pdf"]

# JSON paths for the parsed corpora.
# NOTE(review): neither path is referenced by any other cell visible in this
# notebook — confirm whether they are still needed.
TRAIN_CORPUS_FPATH = "/content/train_corpus.json"
VAL_CORPUS_FPATH = "/content/val_corpus.json"

In [None]:
def load_corpus(files, verbose=False):
    """Read ``files`` and split the resulting documents into nodes.

    Args:
        files: Paths handed to ``SimpleDirectoryReader`` as ``input_files``.
        verbose: When true, print progress messages and show the splitter's
            progress bar.

    Returns:
        The nodes produced by ``SentenceSplitter.get_nodes_from_documents``.
    """

    def _report(message):
        # Progress output is opt-in.
        if verbose:
            print(message)

    _report(f"Loading files {files}")
    documents = SimpleDirectoryReader(input_files=files).load_data()
    _report(f"Loaded {len(documents)} docs")

    chunks = SentenceSplitter().get_nodes_from_documents(
        documents, show_progress=verbose
    )
    _report(f"Parsed {len(chunks)} nodes")

    return chunks

In [None]:
# Parse both splits into nodes; verbose=True prints the document/node counts.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['/content/train.pdf']
Loaded 8 docs


Parsing nodes:   0%|          | 0/8 [00:00<?, ?it/s]

Parsed 8 nodes
Loading files ['/content/val.pdf']
Loaded 1 docs


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Parsed 1 nodes


In [None]:
# Show the text of the first training node as a sanity check on the parsing.
train_nodes[0].text

'IntroductionThis document is provided for information and is not guidance. It aims to provide a summary ofinformation on ventilation to mitigate the risk of COVID-19 transmission, with a focus on advice andinformation that has relevance for dental facilities. The information has been compiled from several keypublicly available documents developed by other organisations and expert groups, including the ScientiﬁcAdvisory Group for Emergencies (SAGE) Environmental and Modelling group (EMG) and NHS NationalServices for Scotland (NSS). This resource was developed with input from expert members of the SDCEPAerosol Generating Procedures Working Group, and end-users.While this summary document might not fully address all aspects of queries on ventilation, it reﬂects thecurrently available information. The key sources are listed at the end of the document and have not beenformally appraised for this summary. Direct quotes are shown in italics. For an understanding of the basisfor the informati

In [None]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

In [None]:
import pathlib
import textwrap
import google.generativeai as genai
# Used to securely store your API key
from google.colab import userdata
from IPython.display import display
from IPython.display import Markdown

In [None]:
def to_markdown(text):
    """Wrap ``text`` in a ``Markdown`` object rendered as a block quote.

    Bullet characters ('•') are rewritten as Markdown list markers and every
    line, including blank ones, is prefixed with '> '.
    """
    bulleted = text.replace('•', '  *')
    # The always-true predicate makes textwrap.indent prefix empty lines too.
    quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
    return Markdown(quoted)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig
import torch

# Load the Llama 2 model with quantization using bitsandbytes
model_name = "meta-llama/Llama-2-7b-hf"

# Load the tokenizer.
# The token read from HF_TOKEN is passed explicitly so the download is
# authenticated; when it is None, transformers falls back to any cached login.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hugging_face_token)

# Load the model with 4-bit quantization.
# BitsAndBytesConfig defaults to the "fp4" quantization type, so NF4 has to be
# requested explicitly for the config to match its name.
nf4_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,
    token=hugging_face_token,
)

In [None]:
def complete_query(prompt):
    """Generate a completion for ``prompt`` with the notebook's ``model``.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Full prompt text to send to the model.

    Returns:
        Only the newly generated text (the prompt is excluded), with special
        tokens removed and surrounding whitespace stripped.
    """
    # Move every tensor (input_ids and attention_mask) to the device the model
    # actually lives on instead of assuming 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=250,
        eos_token_id=tokenizer.eos_token_id,
        # Reuse EOS for padding when generating.
        pad_token_id=tokenizer.eos_token_id,
        # temperature/top_p/top_k are only honoured when sampling is enabled;
        # without do_sample=True generation is greedy and they are ignored.
        do_sample=True,
        temperature=0.5,  # Control randomness
        top_p=0.9,        # Nucleus sampling
        top_k=50,         # Top-k sampling
        no_repeat_ngram_size=2,  # Prevent repetition of 2-grams
    )

    # The output sequence starts with the prompt tokens. Slice them off by
    # length rather than comparing decoded strings, which fails whenever
    # decoding does not reproduce the prompt text exactly.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


In [None]:
# Few-shot prompt for question generation: two worked Human/Assistant turns
# followed by an open turn whose excerpt is filled in via {context_str}.
# All three Human turns use identical wording so the model sees one pattern.
DEFAULT_QA_GENERATE_PROMPT_TMPL = """A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.

### Human: Generate 2 exam questions based only on the excerpt from textbook below denoted within tripple backticks:

```It is essential to have a good knowledge of tooth structure in order to understand both the nature of the defects and diseases that can occur and to then make rational decisions on their prevention, treatment, and repair.
Teeth are composed of four different tissues: enamel, dentine, dental pulp, and cementum. Each of these is made up of structural elements found elsewhere in the body, but arranged in unique ways.
In the brief description that follows, a basic knowledge of the embryology and histology of the developing tooth is assumed. Readers interested in further information are referred to the reading list at the end of this chapter.
Tooth Structure
W. R. Hume, G. C. Townsend
```

format your answer by following format ``1) question1? 2) question2?`` make sure to keep the numbers.

### Assistant: 1) What are the four tissues that make up a tooth's structure? 2) Why is understanding tooth structure important for treating dental defects?

### Human: Generate 2 exam questions based only on the excerpt from textbook below denoted within tripple backticks:

```Basic Principles for Cavity Design
G. J. Mount
When a caries lesion has progressed to the point where it is beyond remineralization and healing, it is imperative to remove that part which is broken down and place a restorative material. If the restoration is to be retained for the long term and successfully restore the tooth to its original form and function, there are a number of factors to be taken into account. This chapter discusses those factors that, in particular, relate to the effective retention of the material within the tooth, as well as the problems of protection of remaining tooth structure that may be weakened by the ravages of caries.
No material is universal, and correct selection is important to ensure longevity. In the following chapters, the three principal plastic restorative materials will be discussed in sufficient detail to enable the clinician to make a logical choice as to which material to select for each restorative problem.
```

format your answer by following format ``1) question1? 2) question2?`` make sure to keep the numbers.

### Assistant: 1) What is the purpose of restoring a cavity?  2) Why does the choice of a material matter?

### Human: Generate 2 exam questions based only on the excerpt from textbook below denoted within tripple backticks:

```{context_str}```

format your answer by following format ``1) question1? 2) question2?`` make sure to keep the numbers.

### Assistant:"""

In [None]:
# Smoke-test the prompt on one raw excerpt (line breaks and fused words left
# exactly as extracted) before generating questions for the whole corpus.
text = "Basic Principles for \nCavity Design\nG. J. Mount 10\nWhen a caries lesion has pro-\ngressed to the point whereit is beyond remineralisa-\ntion and healing it is imperative toremove that part which is brokendown and place a restorative materi-al. If the restoration is to be retainedfor the long term and successfullyrestore the tooth to its original formand function there are a number offactors to be taken into account. Thischapter discusses those factors that, inparticular, relate to the effective reten-tion of the material within the tooth aswell as the problems of protection ofremaining tooth structure that may beweakened by the ravages of caries.\nNo material is universal and correct\nselection is important to ensurelongevity. In the following chaptersthe three principle plastic restorativematerials will be discussed in suffi-cient detail to enable the clinician tomake a logical choice as to whichmaterial to select for each restorativeproblem."
# str.format ignores keyword arguments the template has no placeholder for, so
# num_questions_per_chunk has no effect on a template that only contains
# {context_str}.
query = DEFAULT_QA_GENERATE_PROMPT_TMPL.format(
            context_str=text, num_questions_per_chunk=2
        )
result = complete_query(query)
# Bare last expression so the notebook displays the raw model response.
result



'What does this exceprt mean? "When\na carries lesions hasprogress to point beyondremineralsation andhealing"\nWhat does "he" refer to? What\ndoes "it" mean in this sentence? How\nis the sentence structured? Is there\nanything missing? Explain.  1. "No\nmaterial is universa"  What do\nyou think the author is trying to say\nhere? Why do you think it\'\nimportant to know this? Can you\nthink of any examples of when this\nwould be important? Give an\nexample. How does it relate\nto the previous sentence in\nthis paragraph? Does this make\nsense? If not, explain. What would\nhappen if the "broken down" part\nwasn\'t removed? Would the\n"remainning toot structure" be\nprotected? Could the cavities be  "\nhealed? Or would the teeth\nbecome weaker? Do you agree\nwith the authors statement? Reason\nwhy or why not. Explai'

In [None]:
import json
import re
import uuid
from typing import Dict, List, Tuple

from tqdm import tqdm

from llama_index.core.schema import MetadataMode, TextNode

from IPython.display import display,Markdown

# Marker that opens a new turn in the few-shot prompt; anything the model
# writes after it is an invented follow-up turn, not an answer.
_NEXT_TURN_MARKER = "### Human:"
# Leading list numbering such as "1)", "2. " or "3: " in front of a question.
_QUESTION_NUMBER_RE = re.compile(r"^\d+\s*(?:\)|[.:]\s)\s*")


def _parse_questions(response: str, max_questions: int) -> List[str]:
    """Extract at most ``max_questions`` clean questions from a model response."""
    answer = response.split(_NEXT_TURN_MARKER, 1)[0]
    # Only text that is actually terminated by "?" is a question; the piece
    # after the final "?" is an unterminated fragment (e.g. output cut off by
    # the token limit) and is dropped.
    candidates = answer.split("?")[:-1]

    questions = []
    for candidate in candidates:
        question = _QUESTION_NUMBER_RE.sub("", candidate.strip()).strip()
        if question:  # skip pieces that were only whitespace or numbering
            questions.append(question + "?")
    return questions[:max_questions]


def generate_qa_embedding_pairs(
    nodes: List[TextNode],
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
) -> EmbeddingQAFinetuneDataset:
    """Generate (question, relevant node) examples for a set of nodes.

    For every node the prompt template is filled with the node text, sent to
    ``complete_query``, and the response is parsed into questions. Each
    question gets a fresh UUID and is linked to the node it was generated from.

    Args:
        nodes: Nodes whose text (without metadata) forms the corpus.
        qa_generate_prompt_tmpl: ``str.format`` template; it is formatted with
            ``context_str`` and ``num_questions_per_chunk``.
        num_questions_per_chunk: Maximum number of questions kept per node.

    Returns:
        An ``EmbeddingQAFinetuneDataset`` holding the queries, the corpus
        (node id -> text) and the query id -> [node id] mapping.
    """
    node_dict = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in nodes
    }

    queries = {}
    relevant_docs = {}
    for node_id, text in tqdm(node_dict.items()):
        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )

        response = complete_query(query)

        for question in _parse_questions(response, num_questions_per_chunk):
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

    # construct dataset
    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )

In [None]:
# Build the question/node datasets for both splits and write each one to a
# JSON file (relative path, no directory given).
train_dataset = generate_qa_embedding_pairs(train_nodes)
train_dataset.save_json("train_dataset.json")

val_dataset = generate_qa_embedding_pairs(val_nodes)
val_dataset.save_json("val_dataset.json")

100%|██████████| 8/8 [01:12<00:00,  9.05s/it]
100%|██████████| 1/1 [00:07<00:00,  7.84s/it]


In [None]:
from llama_index.legacy.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.legacy.embeddings import resolve_embed_model
import torch

# Base embedding model that the fine-tuning starts from.
# NOTE(review): the "local:" prefix presumably selects a locally run
# Hugging Face model — confirm against the llama_index docs.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en")

# Fine-tuning engine configured with the generated training dataset; training
# itself only starts when finetune() is called.
finetune_engine = EmbeddingAdapterFinetuneEngine(
    train_dataset,
    base_embed_model,
    model_output_path="model_output_test", # saves in ./model_output_test file
    # bias=True,
    epochs=5,
    verbose=True,
    # optimizer_class=torch.optim.SGD,
    # optimizer_params={"lr": 0.01}
)

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Run the configured training; with verbose=True the loss is logged per step
# and the result is saved to the engine's model_output_path when done.
finetune_engine.finetune()

[1;3;34m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

[1;3;34m> [Epoch 0] Current loss: 2.0431697368621826
[0m[1;3;34m> [Epoch 0] Current loss: 1.9553437232971191
[0m

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

[1;3;34m> [Epoch 1] Current loss: 2.0411391258239746
[0m[1;3;34m> [Epoch 1] Current loss: 1.9466005563735962
[0m

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

[1;3;34m> [Epoch 2] Current loss: 2.037214756011963
[0m[1;3;34m> [Epoch 2] Current loss: 1.9396120309829712
[0m

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

[1;3;34m> [Epoch 3] Current loss: 2.0344595909118652
[0m[1;3;34m> [Epoch 3] Current loss: 1.9346957206726074
[0m

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

[1;3;34m> [Epoch 4] Current loss: 2.0327348709106445
[0m[1;3;34m> [Epoch 4] Current loss: 1.931930422782898
[0m[1;3;34m> Finished training, saving to model_output_test
[0m

In [None]:
embed_model = finetune_engine.get_finetuned_model()
# The returned AdapterEmbeddingModel has no state_dict() of its own
# (calling it raises AttributeError), so save the weights of the adapter it
# wraps instead. finetune() has also already written the adapter to
# model_output_test.
# NOTE(review): `_adapter` is a private attribute — confirm it exists in the
# installed llama_index version.
torch.save(embed_model._adapter.state_dict(), "./myembedmodel.torch")

AttributeError: 'AdapterEmbeddingModel' object has no attribute 'state_dict'

In [None]:
# from llama_index import ServiceContext, VectorStoreIndex
# from llama_index.schema import TextNode
# from tqdm.notebook import tqdm
# import pandas as pd

ImportError: cannot import name 'ServiceContext' from 'llama_index' (unknown location)

In [None]:
# def evaluate(
#     dataset,
#     embed_model,
#     top_k=5,
#     verbose=False,
# ):
#     corpus = dataset.corpus
#     queries = dataset.queries
#     relevant_docs = dataset.relevant_docs

#     service_context = ServiceContext.from_defaults(llm = None, embed_model=embed_model)
#     nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
#     index = VectorStoreIndex(
#         nodes,  service_context=service_context, show_progress=True
#     )
#     retriever = index.as_retriever(similarity_top_k=top_k)

#     eval_results = []
#     for query_id, query in tqdm(queries.items()):
#         retrieved_nodes = retriever.retrieve(query)
#         retrieved_ids = [node.node.node_id for node in retrieved_nodes]
#         expected_id = relevant_docs[query_id][0]

#         rank = None
#         for idx, id in enumerate(retrieved_ids):
#             if id == expected_id:
#                 rank = idx + 1
#                 break

#         is_hit = rank is not None  # assume 1 relevant doc
#         mrr = 0 if rank is None else 1 / rank

#         eval_result = {
#             "is_hit": is_hit,
#             "mrr": mrr,
#             "retrieved": retrieved_ids,
#             "expected": expected_id,
#             "query": query_id,
#         }
#         eval_results.append(eval_result)
#     return eval_results

# def display_results(names, results_arr):
#     """Display results from evaluate."""

#     hit_rates = []
#     mrrs = []
#     for name, results in zip(names, results_arr):
#         results_df = pd.DataFrame(results)
#         hit_rate = results_df["is_hit"].mean()
#         mrr = results_df["mrr"].mean()
#         hit_rates.append(hit_rate)
#         mrrs.append(mrr)

#     final_df = pd.DataFrame(
#         {"retrievers": names, "hit_rate": hit_rates, "mrr": mrrs}
#     )
#     display(final_df)

In [None]:
# base_val_results = evaluate(val_dataset, base_embed_model)

In [None]:
# ft_val_results = evaluate(val_dataset, embed_model)

In [None]:
# display_results(
#     [ "base", "fine-tuned"], [ base_val_results, ft_val_results]
# )