In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install transformers
!pip install --upgrade transformers
!pip install llama-index
!pip install llama-index-llms-huggingface
!pip install llama-index-embeddings-huggingface
!pip install -q llama-index-embeddings-huggingface
!pip install -q llama-index-vector-stores-chroma
!pip install llama-index-postprocessor-colbert-rerank

In [None]:
import torch
from transformers import AutoTokenizer,BitsAndBytesConfig,pipeline
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core import Settings
from llama_index.core.tools import QueryEngineTool, ToolMetadata
import json
from typing import Sequence, List

from unsloth import FastLanguageModel
from transformers import TextStreamer
import pandas as pd
from google.colab import drive
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

In [None]:
# When True, the index is attached to the vectors already held by the Chroma
# collection; when False, the dataframe is sampled and embedded from scratch.
LOAD_VECTOR_FROM_STORAGE = True

## Model Creation

In [None]:
# https://huggingface.co/unsloth/Meta-Llama-3.1-8B

# Loading options forwarded to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: the load call below names its model directly and does
# not read this list.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Load the 4-bit Llama 3.1 8B Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [None]:
# Switch the model to Unsloth's inference mode before any generation.
# As the cell's last expression, the call's return value (the model) is displayed below.
FastLanguageModel.for_inference(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,),

## Dataset

In [None]:
# Mount Google Drive; the dataset and the Chroma database are read from /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the preprocessed arXiv dataframe from Drive.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load files you produced yourself or otherwise trust.
df = pd.read_pickle("/content/drive/MyDrive/Dataset/arXiv/preprocessed_data.zip")

In [None]:
# Keep a single row per paper id (pandas keeps the first occurrence by default).
df = df.drop_duplicates(subset='id')

In [None]:
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,id,title,authors,abstract,categories,prepared_text
0,704.1267,Text Line Segmentation of Historical Documents...,"Laurence Likforman-Sulem, Abderrazak Zahour, B...",There is a huge amount of historical documents...,Computer Vision and Pattern Recognition,Text Line Segmentation of Historical Documents...
1,704.1274,Parametric Learning and Monte Carlo Optimization,David H. Wolpert and Dev G. Rajnarayan,This paper uncovers and explores the close rel...,Machine Learning,Parametric Learning and Monte Carlo Optimizati...
2,704.1394,Calculating Valid Domains for BDD-Based Intera...,"Tarik Hadzic, Rune Moller Jensen, Henrik Reif ...",In these notes we formally describe the functi...,Artificial Intelligence,Calculating Valid Domains for BDD-Based Intera...
3,704.1409,Preconditioned Temporal Difference Learning,Yao HengShuai,This paper has been withdrawn by the author. T...,"Machine Learning, Artificial Intelligence",Preconditioned Temporal Difference Learning Ma...
4,704.1827,Transaction-Oriented Simulation In Ad Hoc Grids,Gerald Krafft,This paper analyses the possibilities of perfo...,"Distributed, Parallel, and Cluster Computing",Transaction-Oriented Simulation In Ad Hoc Grid...


In [None]:
if not LOAD_VECTOR_FROM_STORAGE:
    from llama_index.core import Document

    # Upper bound on how many papers get embedded, and the seed that makes the
    # sample (and therefore the resulting index) reproducible across runs.
    MAX_INDEXED_PAPERS = 32768
    SAMPLE_SEED = 42

    # Cap the request at the dataframe size: DataFrame.sample raises ValueError
    # when n exceeds the number of rows and replace=False.
    sampled_papers = df.sample(
        n=min(MAX_INDEXED_PAPERS, len(df)),
        random_state=SAMPLE_SEED,
    )

    # One Document per paper; the id is stored as metadata so retrieved nodes
    # can be joined back to their dataframe rows.
    arxiv_documents = [
        Document(text=paper_text, metadata={"id": paper_id})
        for paper_text, paper_id in zip(sampled_papers['prepared_text'], sampled_papers['id'])
    ]

## Vector Store and Reranker

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device_type = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Embedding model handed to the vector index below; downloads are cached under ./models.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_folder="./models", device=device_type)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# https://www.kaggle.com/code/bachngoh/the-ultimate-guide-on-rag-w-gemma-llama-index

# Open (or create) the on-disk Chroma database on Drive and the collection that
# stores the paper embeddings.
chroma_client = chromadb.PersistentClient(path="/content/drive/MyDrive/Dataset/arXiv/DB")
chroma_collection = chroma_client.get_or_create_collection("demo_arxiv")

# Wrap the collection so LlamaIndex can use it as the index's vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Build the index: embed the sampled documents when rebuilding, otherwise attach
# to whatever the Chroma-backed vector store already contains.
if not LOAD_VECTOR_FROM_STORAGE:
    index = VectorStoreIndex.from_documents(
        arxiv_documents,
        storage_context=storage_context,
        embed_model=embed_model,
        show_progress=True,
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        storage_context=storage_context,
        embed_model=embed_model,
    )

In [None]:
# Plain retriever over the index, asking for the 5 most similar nodes (no re-ranking yet).
# NOTE(review): despite its name this is a retriever, and the name is rebound to
# an actual query engine in a later cell. `alpha` presumably only matters for
# hybrid retrieval modes — confirm it has any effect with this vector store.
query_engine = index.as_retriever(
    similarity_top_k = 5,
    alpha=0.5,
)

In [None]:
# Smoke-test retrieval: print the metadata of every node returned for a sample query.
# NOTE(review): "difussion" in the query is presumably a typo for "diffusion".
for res in query_engine.retrieve("What are some papers that are in the field of difussion model?"):
    print(res.metadata)
    print("=============")

{'id': '2408.00315'}
{'id': '2312.01201'}
{'id': '2202.00391'}
{'id': '2210.03312'}
{'id': '2305.16936'}


In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Disable LlamaIndex's global LLM (it falls back to a MockLLM, see the cell
# output); answers are generated separately with the Unsloth model.
Settings.llm = None
# Re-ranker applied to retrieved nodes; only the 3 best-scoring nodes are kept.
rerank_postprocessor = SentenceTransformerRerank(
    model='mixedbread-ai/mxbai-rerank-xsmall-v1',
    top_n=3, # number of nodes after re-ranking,
    keep_retrieval_score=True
)

LLM is explicitly disabled. Using MockLLM.


config.json:   0%|          | 0.00/968 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/142M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/970 [00:00<?, ?B/s]

In [None]:
# Query engine that retrieves 5 candidates and passes them through the re-ranker.
# This rebinds `query_engine`, which previously held the plain retriever.
query_engine = index.as_query_engine(
    similarity_top_k=5,  # Number of nodes before re-ranking
    node_postprocessors=[rerank_postprocessor],
)

In [None]:
# Run a sample query and keep only the metadata dicts attached to the response
# (each one carries the paper 'id', as used in the next cell).
metadata = query_engine.query("What are some papers about video generation using diffusion models?").metadata.values()

In [None]:
# Show the ids of the papers returned by the re-ranked query.
print([meta['id'] for meta in metadata])

['2310.20700', '2211.11743', '2403.13408']


## Prompt Definition

In [None]:
# Defining the prompt
# The template has three positional `{}` slots, filled via str.format in this order:
#   1. the user's question            (### Input)
#   2. the retrieved paper summaries  (### Additional Context)
#   3. the answer — left empty so the model generates it (### Response)
# The section name referenced in the preamble must match the
# "### Additional Context:" header below.
prompt = """
You are a Question and Answer professional, tasked with answering questions based on the provided context.
Below is an instruction that describes specific rules you must obey, paired with an input that provides further context.
Under the Additional Context section there is information that may be useful to finish the input.

### Instruction:
  - You must answer the question short and concise but to the point.
  - Use the additional context the best you can without using prior knowledge, if the additional context is not helpful then answer as usual.
  - If you list titles for a specific paper you must also include their authors, first writer would suffice, for example: XX et al.
  - You only need to answer it once, you will always have a friendly tone.
  - Do not mention about the "provided info", treat it as some info you already know.

### Input:
{}

### Additional Context:
{}

### Response:

{}"""

In [None]:
# Stream only the newly generated text to the cell output: skip_prompt drops the
# echoed prompt, and skip_special_tokens (forwarded to the tokenizer's decode)
# hides markers such as <|begin_of_text|> and <|eot_id|>.
text_streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [None]:
def get_response(input,query_engine=query_engine,df = df,stream = False,text_streamer = text_streamer,max_new_tokens = 4096):
    """Answer a question with the LLM, grounded on papers retrieved from the index.

    Args:
        input: The user's question. (The name shadows the builtin but is kept so
            existing keyword callers keep working.)
        query_engine: Engine whose ``query(input).metadata`` values each carry
            the ``'id'`` of a retrieved paper.
        df: Dataframe with ``id``, ``title``, ``authors`` and ``abstract``
            columns, used to look up the retrieved papers.
        stream: When True, tokens are also written to the output through
            ``text_streamer`` while they are generated.
        text_streamer: Streamer passed to ``model.generate`` when ``stream`` is True.
        max_new_tokens: Upper bound on the number of generated tokens.
            NOTE(review): the default (4096) is larger than the
            ``max_seq_length`` the model was loaded with — consider lowering it.

    Returns:
        The generated answer as a string, without the prompt and without
        special tokens.
    """
    # Ids of the papers selected by the query engine for this question.
    retrieved_ids = [meta['id'] for meta in query_engine.query(input).metadata.values()]
    matched_papers = df[df['id'].isin(retrieved_ids)]

    # One "title / author / abstract" paragraph per retrieved paper.
    additional_context = "".join(
        f"Paper title: {row['title']}\n"
        f"Author: {row['authors']}\n"
        f"Abstract: {row['abstract']}\n"
        "\n"
        for _, row in matched_papers.iterrows()
    )

    # The third slot (the response) is left blank so the model fills it in.
    # Tensors follow the model's device instead of assuming "cuda".
    inputs = tokenizer(
        [prompt.format(input, additional_context, "")],
        return_tensors = "pt",
    ).to(model.device)

    generate_kwargs = {"max_new_tokens": max_new_tokens}
    if stream:
        generate_kwargs["streamer"] = text_streamer
    outputs = model.generate(**inputs, **generate_kwargs)

    # generate() returns prompt + continuation; decode only the continuation.
    # This avoids splitting on a text marker (which breaks if the question or
    # the answer contains "Response:") and slicing off a fixed number of
    # trailing characters to remove the end-of-turn token.
    prompt_length = inputs["input_ids"].shape[1]
    answer_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(answer_tokens, skip_special_tokens = True).strip()

In [None]:
# Non-streaming call: the answer is returned as a string and printed once.
print(get_response("What is a diffusion model?"))


A diffusion model is a type of generative model that uses a Markov chain to gradually refine a noise signal into a target data distribution. It is typically used for tasks such as image and video generation, as well as data imputation and anomaly detection. Diffusion models have gained popularity in recent years due to their ability to produce high-quality samples and their flexibility in handling complex data distributions. They are often used in conjunction with other machine learning techniques, such as reinforcement learning and self-supervised learning, to improve performance and efficiency. In the context of the provided papers, diffusion models are used for tasks such as time series forecasting, anomaly detection, and image classification, among others. The papers also explore the application of diffusion models in various domains, including healthcare, recommendation, and climate analysis.<|eot_id|>


In [None]:
# With stream=True the text is already written to the output by the streamer, so
# the returned string is discarded (trailing `;`) instead of being printed again.
get_response("Give me some paper about using neural networks in database systems.",stream = True);

<|begin_of_text|>
You are a Question and Answer professional, tasked with answering questions based on the provided context.
Below is an instruction that describes specific rules you must obey, paired with an input that provides further context.
Under the Addotional Context section there are information that may be useful to finish the input.

### Instruction:
  - You must answer the question short and concise but to the point.
  - Use the additional context the best you can without using prior knowledge, if the additional context is not helpful then answer as usual.
  - If you list titles for a specific paper you must also include their authors, first writer would suffice, for example: XX et al.
  - You only need to answer it once, you will always have a friendly tone.
  - Do not mention about the "provided info", treat it as some info you already know.

### Input:
Give me some paper about using neural networks in database systems.

### Additional Context:
Paper title: Plan-Structured

# Work In Progress

In [None]:
def task_plan(input, tools = None, max_new_tokens = 4096):
    """Ask the model to plan (and possibly call tools for) a task, ReAct-style.

    Work in progress: the tool description is hard-coded in the prompt.

    Args:
        input: The conversation / question inserted at the end of the prompt.
            (The name shadows the builtin but is kept for existing callers.)
        tools: Currently unused; kept so the signature stays stable until the
            tool list is injected into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
            NOTE(review): the default (4096) is larger than the
            ``max_seq_length`` the model was loaded with — consider lowering it.

    Returns:
        The generated continuation as a string, without the prompt and without
        special tokens. Generation is greedy (``do_sample=False``) and is also
        streamed to the output through ``text_streamer``.
    """
    init_prompt = f"""
    You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

    ## Tools
    You have access to a wide variety of tools. You are responsible for using
    the tools in any sequence you deem appropriate to complete the task at hand.
    This may require breaking the task into subtasks and using different tools
    to complete each subtask.

    You have access to the following functions:

    Use the function 'search_paper' to 'search for related information within a set of reasearch papers using queries'

    If you choose to call a function ONLY reply in the following format after "Action:" with no prefix or suffix:

    Action: <function=example_function_name>{{\"example_name\": \"example_value\"}}</function>

    Reminder:
    - Function calls MUST follow the specified format, start with <function= and end with </function>
    - Required parameters MUST be specified
    - Only call one function at a time
    - Put the entire function call reply on one line

    ## Output Format
    To answer the question, please use the following format.

    ```
    Thought: I need to use a tool to help me answer the question.
    Action: <function=example_function_name>{{\"example_name\": \"example_value\"}}</function>
    ```

    Please ALWAYS start with a Thought.

    Please use a valid JSON format for the Action Input. Do NOT do this {{'input': 'hello world', 'num_beams': 5}}.

    If this format is used, the user will respond in the following format:

    ```
    Observation: tool response
    ```

    You should keep repeating the above format until you have enough information
    to answer the question without using any more tools. At that point, you MUST respond
    in the one of the following two formats:

    ```
    Thought: I can answer without using any more tools.
    Answer: [your answer here]
    ```

    ```
    Thought: I cannot answer the question with the provided tools.
    Answer: Sorry, I cannot answer your query.
    ```

    ## Additional Rules
    - You MUST obey the function signature of each tool. Do NOT pass in no arguments if the function expects arguments.

    ## Current Conversation
    Below is the current conversation consisting of interleaving human and assistant messages.
    {input}
    """
    # Tensors follow the model's device instead of assuming "cuda".
    inputs = tokenizer([init_prompt], return_tensors = "pt").to(model.device)

    outputs = model.generate(
        **inputs,
        streamer = text_streamer,
        max_new_tokens = max_new_tokens,
        do_sample = False,
    )

    # generate() returns prompt + continuation. This prompt contains no
    # "Response:" marker to split on, so slice the prompt tokens off and decode
    # only what the model produced.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens = True).strip()



In [None]:
# task_plan always streams its generation to the output, so the returned string
# is discarded (trailing `;`) instead of being printed a second time.
task_plan("What is a diffusion model?");

is<|begin_of_text|>
    You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

    ## Tools
    You have access to a wide variety of tools. You are responsible for using
    the tools in any sequence you deem appropriate to complete the task at hand.
    This may require breaking the task into subtasks and using different tools
    to complete each subtask.

    You have access to the following functions:

    Use the function'search_paper' to'search for related information within a set of reasearch papers using queries'

    If you choose to call a function ONLY reply in the following format after "Action:" with no prefix or suffix:

    Action: <function=example_function_name>{"example_name": "example_value"}</function>

    Reminder:
    - Function calls MUST follow the specified format, start with <function= and end with </function>
    - Required parameters MUST be specified
    - Only call one function at a t

KeyboardInterrupt: 