In [None]:
import pandas as pd


In [None]:

# Load the combined corpus. Later cells read the 'clean', 'abstract', 'Title',
# 'urls', 'refs' and 'authors' columns from this frame.
df1 = pd.read_csv('combined_data.csv')
# df2 = pd.read_csv('som.csv')

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df1.head()

### Convert DataFrame rows to LlamaIndex Documents

In [None]:
from llama_index.core import Document # type: ignore

# Columns copied verbatim into each Document's metadata, in this order.
_METADATA_COLUMNS = ('Title', 'urls', 'refs', 'authors')


def _row_to_document(row):
    """Build one Document from a DataFrame row: 'clean' is the body text and
    the bibliographic columns become metadata."""
    # NOTE(review): `description` is forwarded exactly as before; confirm that
    # Document actually stores this keyword rather than discarding it.
    return Document(
        text=row['clean'],
        description=row['abstract'],
        metadata={column: row[column] for column in _METADATA_COLUMNS},
    )


train = [_row_to_document(record) for _index, record in df1.iterrows()]
# eval = [
#     Document(
#         text=row['clean'],
#         description= row['abstract'],
#         metadata={
#             'Title': row['Title'],
#             'urls': row['urls'],
#             'refs': row['refs']
#         }
#     )
#     for _, row in df2.iterrows()
# ]

### allow async loops

In [None]:
import nest_asyncio
# Patch asyncio so nested event loops are allowed; presumably needed because the
# notebook kernel already runs a loop when later cells start async work.
nest_asyncio.apply()

### Generate Corpus

In [None]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode

# No separate evaluation document list exists in this notebook (the `eval` list is
# commented out), so the bare name `eval` would resolve to the Python builtin
# function. Hold out the tail of `train` as the validation set instead.
# Note: the sentence-window index cell further down also reads TRAIN_FILES.
VAL_FRACTION = 0.2  # share of documents reserved for validation

_n_docs = len(train)
# Keep at least one validation document whenever there are two or more documents.
_n_val = max(1, round(_n_docs * VAL_FRACTION)) if _n_docs > 1 else 0

TRAIN_FILES = train[:_n_docs - _n_val]
VAL_FILES = train[_n_docs - _n_val:]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser
from llama_index.core.schema import MetadataMode 

def load_corpus(docs, verbose=False):
    """Split documents into nodes using a recursive character text splitter.

    Args:
        docs: Documents to parse; handed to
            ``LangchainNodeParser.get_nodes_from_documents``.
        verbose: When true, print the documents, their count and the number
            of nodes produced.

    Returns:
        The nodes returned by the parser.
    """
    # Progress output is opt-in only; nothing is printed unless verbose is set.
    if verbose:
        print(f"Loading files in {docs}")
        print(f"Loaded {len(docs)} docs")
    parser = LangchainNodeParser(RecursiveCharacterTextSplitter())
    nodes = parser.get_nodes_from_documents(docs)
    if verbose:
        print(f"Parsed {len(nodes)} nodes")
    return nodes

# Parse both document sets into nodes; the QA-pair generation cell consumes
# `train_nodes` and `val_nodes`.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

### Generate synthetic queries

In [1]:
# Install the OpenAI LLM/embedding integrations and the finetuning package
# (the next cell imports from `llama_index.finetuning`).
# NOTE(review): versions are unpinned; consider pinning them for reproducibility.
%pip install llama-index-llms-openai -q
%pip install llama-index-embeddings-openai -q
%pip install llama-index-finetuning -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

### Setting up Llama2

In [None]:
%pip install llama-index-llms-huggingface -q -U
%pip install llama-index-embeddings-huggingface -q -U
!pip install llama-index ipywidgets -q -U

In [None]:
import logging
import sys

# basicConfig already attaches a stdout StreamHandler to the root logger, so no
# second handler is added here; an extra one would print every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)


from IPython.display import Markdown, display

In [None]:
# Presumably required for the 4-bit quantized model load configured in the next
# cell (BitsAndBytesConfig, device_map="auto"); verify against your environment.
%pip install bitsandbytes accelerate -q -U

In [None]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate
from transformers import BitsAndBytesConfig     
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from accelerate import Accelerator


# Model names (make sure you have access on HF)
LLAMA2_7B = "meta-llama/Llama-2-7b-hf"
LLAMA2_7B_CHAT = "meta-llama/Llama-2-7b-chat-hf"
LLAMA2_13B = "meta-llama/Llama-2-13b-hf"
LLAMA2_13B_CHAT = "meta-llama/Llama-2-13b-chat-hf"
LLAMA2_70B = "meta-llama/Llama-2-70b-hf"
LLAMA2_70B_CHAT = "meta-llama/Llama-2-70b-chat-hf"

# Checkpoint used for both the tokenizer and the model below.
selected_model = LLAMA2_7B_CHAT

SYSTEM_PROMPT = """You are an AI assistant that answers questions in a friendly manner, based on the given source documents. Here are some rules you always follow:
- Generate human readable output, avoid creating output with gibberish text.
- Generate only the requested output, don't include any other language before or after the requested output.
- Never say thank you, that you are happy to help, that you are an AI agent, etc. Just answer directly.
- Generate professional language typically used in business documents in North America.
- Never generate offensive or foul language.
"""

# Llama 2 chat format: the system prompt must be wrapped in <<SYS>> ... <</SYS>>
# inside the [INST] block, otherwise the model does not see it as a system prompt.
query_wrapper_prompt = PromptTemplate(
    "[INST]<<SYS>>\n" + SYSTEM_PROMPT + "<</SYS>>\n\n{query_str}[/INST] "
)

# Load the weights in 4-bit and run the compute in bfloat16.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=2048,
    # Sampling is disabled via do_sample=False.
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=selected_model,
    model_name=selected_model,
    device_map="auto",
    model_kwargs={"quantization_config": quantization_config},
)

### dataset generation

In [None]:
def _build_qa_dataset(nodes, json_path):
    """Generate QA pairs for `nodes` with the notebook's `llm`, save them to
    `json_path`, and return the dataset reloaded from that file."""
    # NOTE(review): some llama-index-finetuning releases may checkpoint to a default
    # output file inside generate_qa_embedding_pairs; confirm that the train and
    # validation calls do not end up sharing it.
    generated = generate_qa_embedding_pairs(nodes, llm=llm)
    generated.save_json(json_path)
    return EmbeddingQAFinetuneDataset.from_json(json_path)


train_dataset = _build_qa_dataset(train_nodes, "train_dataset.json")
val_dataset = _build_qa_dataset(val_nodes, "val_dataset.json")

### Fine-tuning BAAI/bge-small-en-v1.5

In [None]:
# sentence_transformers is imported directly by the evaluation cell further down.
# NOTE(review): unpinned and upgraded on every run; consider pinning a version.
%pip install sentence_transformers -q -U

In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

BASE_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"  # HuggingFace id of the base embedding model
FINETUNED_MODEL_DIR = "llama_model_v1"  # directory the fine-tuned model is written to
FINETUNE_EPOCHS = 2  # number of training epochs

finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id=BASE_EMBED_MODEL_ID,
    model_output_path=FINETUNED_MODEL_DIR,
    val_dataset=val_dataset,
    epochs=FINETUNE_EPOCHS,
)

# Train, then fetch the resulting embedding model from the engine.
finetune_engine.finetune()
finetuned_embedding_model = finetune_engine.get_finetuned_model()


### Evaluate embeddings

In [None]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path

def evaluate_st(
    dataset,
    model_id,
    name,
    output_path="results/",
):
    """Run an information-retrieval evaluation of a SentenceTransformer model.

    Args:
        dataset: Object exposing ``queries``, ``corpus`` and ``relevant_docs``
            attributes, which are passed to the evaluator.
        model_id: Model name or local path handed to ``SentenceTransformer``.
        name: Label passed to ``InformationRetrievalEvaluator``.
        output_path: Directory handed to the evaluator call; it is created
            (including parents) if missing. Defaults to ``"results/"``.

    Returns:
        Whatever the evaluator call returns.
    """
    evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)
    # Create the output directory before the evaluator is invoked with it.
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

In [None]:
# Baseline: evaluate the base model (the same id the fine-tuning started from)
# on the validation set.
evaluate_st(val_dataset, "BAAI/bge-small-en-v1.5", name="bge")

In [None]:
# Same evaluation for the fine-tuned model, loaded from the directory the
# fine-tuning engine was configured to write to.
evaluate_st(val_dataset, "llama_model_v1", name="finetuned")

### Advanced Retrieval Method: Sentence Window Retrieval

In [None]:
# NOTE(review): this package is already installed (with -U) in the Llama 2 setup
# section of this notebook, so this line may be redundant.
%pip install llama-index-embeddings-huggingface -q

In [None]:
from llama_index.core import ServiceContext, set_global_service_context
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser

SENTENCE_WINDOW_SIZE = 6  # window_size handed to the sentence-window parser
FINETUNED_EMBED_MODEL_PATH = "llama_model_v1"  # output directory of the fine-tuning step

# Sentence-window parser: the window text is stored under the "window" metadata
# key and the original text under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=SENTENCE_WINDOW_SIZE,
)

# Alternative query LLM (disabled):
# llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# Embeddings come from the fine-tuned model saved on disk.
embed_model = HuggingFaceEmbedding(model_name=FINETUNED_EMBED_MODEL_PATH)

# Baseline embeddings for comparison (disabled):
# embed_model_base = HuggingFaceEmbedding(
#     model_name="BAAI/bge-small-en"
# )

# Service context pairing the notebook's `llm` with the fine-tuned embeddings.
ctx = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Baseline service context (disabled):
# ctx_base = ServiceContext.from_defaults(
#     llm=llm,
#     embed_model=embed_model_base
# )

# Parse the training documents into sentence-window nodes.
nodes = node_parser.get_nodes_from_documents(TRAIN_FILES)

In [None]:
# Show the node count and a small sample instead of rendering every node.
print(f"{len(nodes)} nodes")
nodes[:3]

In [None]:
# Import from `llama_index.core`, matching every other LlamaIndex import in this
# notebook; the bare `llama_index` top-level import is the pre-0.10 layout.
from llama_index.core import VectorStoreIndex

# Build the vector index over the sentence-window nodes using the fine-tuned context.
sentence_index = VectorStoreIndex(nodes, service_context=ctx)

In [None]:
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

# Post-processor configured to use the "window" metadata key, i.e. the key the
# sentence-window parser was set up to write.
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")

query_engine = sentence_index.as_query_engine(
    node_postprocessors=[window_postprocessor],
    similarity_top_k=3,
)

In [None]:
# Run a sample question through the sentence-window query engine.
window_response = query_engine.query("How does shm work?")

In [None]:
# Display only the `response` attribute of the query result.
window_response.response