In [26]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
import chromadb
from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

## Initialize model, tokenizer, query pipeline

Define the model, the device, and the bitsandbytes configuration.

In [2]:
# Hugging Face Hub id of the chat model loaded further down.
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Target the currently selected CUDA device when one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = f'cuda:{torch.cuda.current_device()}'
else:
    device = 'cpu'

print(device)

# Quantization settings used when loading the model so it needs less GPU
# memory: 4-bit NF4 weights, double quantization, bfloat16 compute.
# This requires the `bitsandbytes` library.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

cuda:0


Prepare the model and the tokenizer.

In [3]:


# Load the model config, the tokenizer, and the quantized model, timing the whole step.
time_1 = time()
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# The model is placed on `device` (chosen in the configuration cell) instead of
# a hard-coded 'cuda:0', so the notebook has a single source of truth for the
# target device.
# `trust_remote_code=True` is no longer passed: it permits executing Python
# code shipped in the model repository, and Llama 2 is implemented inside
# `transformers` itself, so it is not needed here.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map=device,
)

time_2 = time()
print(f"Prepare model, tokenizer: {round(time_2-time_1, 3)} sec.")

bin c:\Users\User\miniconda3\envs\llama_chroma\lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


Loading checkpoint shards: 100%|██████████| 2/2 [01:07<00:00, 33.53s/it]


Prepare model, tokenizer: 81.787 sec.


In [4]:
# Build the text-generation pipeline around the already-loaded model and
# tokenizer, and report how long the construction takes.
time_1 = time()
query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # NOTE(review): the quantization config computes in bfloat16 while
        # float16 is requested here — confirm which dtype is intended.
        torch_dtype=torch.float16,
        # Same device string as the configuration cell, rather than a second
        # hard-coded "cuda:0".
        device_map=device,)
time_2 = time()
print(f"Prepare pipeline: {round(time_2-time_1, 3)} sec.")

Prepare pipeline: 1.187 sec.


In [5]:
def test_model(tokenizer, pipeline, prompt_to_test):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        prompt_to_test: the prompt
    Returns
        None
    """
    # adapted from https://huggingface.co/blog/llama2#using-transformers
    time_1 = time()
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,)
    time_2 = time()
    print(f"Test inference: {round(time_2-time_1, 3)} sec.")
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

## Test the query pipeline

We test the pipeline with a query about the meaning of State of the Union (SOTU).

In [6]:
# Smoke-test plain generation (no retrieval yet) with a short definition-style prompt.
test_model(
    tokenizer=tokenizer,
    pipeline=query_pipeline,
    prompt_to_test="Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words.",
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Test inference: 48.851 sec.
Result: Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words.
The State of the Union address is an annual speech delivered by the President of the United States to Congress, in which they report on the current state of the union and outline their legislative agenda for the upcoming year. The speech is typically held in the House Chamber of the United States Capitol and is broadcast live on television.


# Retrieval Augmented Generation

### Check the model with a HuggingFace pipeline
We check the model through LangChain's `HuggingFacePipeline` wrapper, using the same query about the meaning of State of the Union (SOTU).

In [7]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can be used as the chain's LLM.
llm = HuggingFacePipeline(pipeline=query_pipeline)
# checking again that everything is working fine
# (as the cell's last expression, the returned text is displayed as the cell output)
llm(prompt="Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words.")

'\nThe State of the Union address is an annual speech given by the President of the United States to a joint session of Congress, typically in January, in which the President reports on the state of the union and outlines legislative priorities for the upcoming year.'

### Ingestion of data using a PDF directory loader
We ingest the newest annual report (PDF) with `PyPDFDirectoryLoader`.


In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from langchain.document_loaders import PyPDFDirectoryLoader

# NOTE(review): absolute, machine-specific folder — point this at your own data directory.
pdf_dir = "C:/Users/User/OneDrive/Desktop/LLAMA-Chroma/data/"
loader = PyPDFDirectoryLoader(pdf_dir)

# Load the contents of that folder into `docs`, the input of the splitting step.
docs = loader.load()

### Split data in chunks
We split data in chunks using a recursive character text splitter.

In [9]:
# Splitter configured for chunks of up to 1000 characters with a 20-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)
# Chunk the loaded documents; `all_splits` is what gets embedded and stored.
all_splits = text_splitter.split_documents(docs)

### Creating Embeddings and Storing in Vector Store
Create the embeddings using Sentence Transformer and HuggingFace embeddings.

In [10]:
# Sentence-transformers model used to embed the chunks and the queries.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Reuse the device selected in the configuration cell instead of a hard-coded
# "cuda:0", so the embedding model follows the same GPU/CPU choice (including
# the 'cpu' fallback) as the rest of the notebook.
model_kwargs = {"device": device}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [11]:
# Embed every chunk in `all_splits` and store the vectors in a Chroma store under "chroma_db".
# NOTE(review): presumably re-running this cell adds the same chunks to the
# persisted collection again — confirm, and clear "chroma_db" before a re-run if so.
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

## Initialize chain

In [12]:
# Expose the vector store as a retriever with its default settings.
retriever = vectordb.as_retriever()

# Question-answering chain: the retriever supplies context and `llm` writes
# the answer; verbose=True makes the chain log when it starts and finishes.
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="stuff", verbose=True)

### Test the Retrieval-Augmented Generation
We define a test function that runs the query and times it.

In [14]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

Let's check a few queries.

In [15]:
# First RAG question: overall performance of the Asia Pacific region.
query = "What is the performance of Asia Pacific region? Summarize. Keep it under 200 words."
test_rag(qa=qa, query=query)

Query: What is the performance of Asia Pacific region? Summarize. Keep it under 200 words.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Inference time: 222.758 sec.

Result:   The Asia Pacific region achieved total sales of CHF 572.2 million in the 2022/23 financial year, a decrease of 0.3% compared to the previous year. The organic sales growth was 2.2%, driven by good growth in the Pacific market and airport projects in China and India. However, the Asian OEM business recorded a decrease in orders in the second half-year, impacting regional sales. The adjusted EBITDA (operating profit before depreciation and amortization) was CHF 92.6 million, a decrease of 8.5% compared to the previous year.


In [17]:
# Second RAG question; `query` keeps this value for the retrieval cells that follow.
query = "What is the main product segment that contributed in Asia Pacific? Summarize. Keep it under 200 words."
test_rag(qa=qa, query=query)

Query: What is the main product segment that contributed in Asia Pacific? Summarize. Keep it under 200 words.



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Inference time: 192.002 sec.

Result:   Based on the provided information, the main product segment that contributed to the Asia Pacific region was the Pacific market, which includes growth in the RELBDA business. This segment achieved organic sales growth of CHF 12.1 million, a 2.2% year-on-year increase. The project business in India also showed strong sales in the first half of the financial year, while the Asian OEM business recorded a decrease in orders in the second half-year, impacting regional sales.


## Document sources and similarity scores

In [18]:
# Look up the chunks most similar to the current `query` and show where each one came from.
# The hits get their own name so this cell does not overwrite `docs`, the
# loaded PDF documents that the splitting cell reads.
retrieved_docs = vectordb.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(retrieved_docs)}")
for doc in retrieved_docs:
    # Read the Document fields directly instead of going through to_json()['kwargs'].
    print("Source: ", doc.metadata['source'])
    print("Text: ", doc.page_content, "\n")

Query: What is the main product segment that contributed in Asia Pacific? Summarize. Keep it under 200 words.
Retrieved documents: 4
Source:  C:\Users\User\OneDrive\Desktop\LLAMA-Chroma\data\Annual-Report-2022-_-2023-en.pdf
Text:  Region Asia Pacific
Region Asia Pacific comprises the former segment Access Solutions Asia Pacific (AS APAC) 
and the Market Middle East which used to be a part of the former segment Access Solution 
Europe Middle East & Africa (AS EMEA). All figures were consolidated according to the 
operating model valid until and including 30 June 2023.
Region Asia Pacific achieved total sales of CHF  572.2 million in the 2022/23 financial year 
compared to CHF 574.0 million in the previous year. Organic sales growth was CHF 12.1 
million, a 2.2% year-on-year increase. Organic growth was secured by good growth in the 
Pacific market (including growth in the RELBDA business); additionally, airport projects in 
China and India supported overall sales. The project business i

In [19]:
# Retrieve the chunks closest to `query` together with their scores.
# NOTE(review): this rebinds `docs`, which the splitting cell reads as the
# loaded PDF documents — re-run the loader cell before re-running the split.
docs = vectordb.similarity_search_with_score(query)

In [22]:
# Display the raw search results currently held in `docs`.
docs

[(Document(page_content='Region Asia Pacific\nRegion Asia Pacific comprises the former segment Access Solutions Asia Pacific (AS APAC) \nand the Market Middle East which used to be a part of the former segment Access Solution \nEurope Middle East & Africa (AS EMEA). All figures were consolidated according to the \noperating model valid until and including 30 June 2023.\nRegion Asia Pacific achieved total sales of CHF \u202f572.2 million in the 2022/23 financial year \ncompared to CHF 574.0 million in the previous year. Organic sales growth was CHF 12.1 \nmillion, a 2.2% year-on-year increase. Organic growth was secured by good growth in the \nPacific market (including growth in the RELBDA business); additionally, airport projects in \nChina and India supported overall sales. The project business in India showed strong sales in \nthe first half of the financial year. The Asian OEM business recorded a decrease in orders in \nthe second half-year, impacting regional sales. Overall, the se

In [25]:
# Retrieve the closest chunks together with their scores and print both.
# The hits get their own name so this cell does not overwrite `docs`, the
# loaded PDF documents that the splitting cell reads.
scored_docs = vectordb.similarity_search_with_score(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(scored_docs)}")

for doc, score in scored_docs:  # each hit is a (document, score) tuple
    # Read the Document fields directly instead of going through to_json()['kwargs'].
    print("Source: ", doc.metadata['source'])
    print("Text: ", doc.page_content, "\n")
    # NOTE(review): for Chroma this value is presumably a distance (lower = closer) — confirm before interpreting it.
    print("Similarity Score: ", score)  # Printing the similarity score for each document


Query: What is the main product segment that contributed in Asia Pacific? Summarize. Keep it under 200 words.
Retrieved documents: 4
Source:  C:\Users\User\OneDrive\Desktop\LLAMA-Chroma\data\Annual-Report-2022-_-2023-en.pdf
Text:  Region Asia Pacific
Region Asia Pacific comprises the former segment Access Solutions Asia Pacific (AS APAC) 
and the Market Middle East which used to be a part of the former segment Access Solution 
Europe Middle East & Africa (AS EMEA). All figures were consolidated according to the 
operating model valid until and including 30 June 2023.
Region Asia Pacific achieved total sales of CHF  572.2 million in the 2022/23 financial year 
compared to CHF 574.0 million in the previous year. Organic sales growth was CHF 12.1 
million, a 2.2% year-on-year increase. Organic growth was secured by good growth in the 
Pacific market (including growth in the RELBDA business); additionally, airport projects in 
China and India supported overall sales. The project business i