In [None]:
# !ollama run llama2
!pip install ollama
!pip install langchain
!pip install chromadb # vector storage
!pip install pypdf # pdf reader
!pip install pytest # unit test
!pip install langchain_openai
!pip install azure-identity # azure authentication
!pip install pymupdf # pdf reader

In [1]:
import ollama
response = ollama.chat(model='llama2', messages=[
    {
        'role':'user',
        'content':'tell me a joke',
    },
])
print(response['message']['content'])

Sure, here's one:

Why don't scientists trust atoms?
Because they make up everything!


In [2]:
from langchain_community.llms import Ollama

llm = Ollama(model = 'llama2')
result = llm.invoke('tell me a joke')
print(result)


Why don't scientists trust atoms? Because they make up everything! 😂


In [3]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

# https://python.langchain.com/docs/modules/data_connection/document_loaders/
def load_documents():
    document_loader = PyPDFDirectoryLoader("D:\Workshop\Open Source LLMs\docs")
    return document_loader.load()

# see the loaded documents
documents = load_documents()
if documents: 
    print(documents[0]) 
else: 
    print("No documents loaded.") 

page_content='Contoso Electronics \nEmployee Handbook  \n \n \n \n \n \n \n  \n' metadata={'source': 'D:\\Workshop\\Open Source LLMs\\docs\\employee_handbook.pdf', 'page': 0}


In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

# see the chunked documents
documents = load_documents()
chunks = split_documents(documents)
for chunk in chunks:
    print(chunk.page_content)

Contoso Electronics 
Employee Handbook
This document contains information generated using a language model (Azure OpenAI). The 
information contained in this document is only for demonstration purposes and does not 
reflect the opinions or beliefs of Microsoft. Microsoft makes no representations or 
warranties of any kind, express or implied, about the completeness, accuracy, reliability, 
suitability or availability with respect to the information contained in this document.  
All rights reserved to Microsoft
Contoso Electronics Employee Handbook  
Last Updated: 2023 -03-05 
 
Contoso Electronics is a leader in the aerospace industry, providing advanced electronic 
components for both commercial and military aircraft. We specialize in creating cutting -
edge systems that are both reliable and efficient. Our mission is to provide the highest 
quality aircraft components to our customers, while maintaining a commitment to safety 
and excellence. We are proud to have  built a strong repu

In [14]:
# embedding with Azure OpenAI
import os
from openai import AzureOpenAI

def get_openai_client():
    return AzureOpenAI(
        api_key=os.getenv("AZURE_OPENAI_KEY"),
        api_version="2024-02-01",
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
    )

def get_embeddings(client, texts):
    embeddings = []
    for text in texts:
        response = client.embeddings.create(
            input=text,
            model="ada"
        )
        embeddings.append(response.model_dump_json(indent=2))
    return embeddings

client = get_openai_client()
response = get_embeddings(client, ["My test text string"])

print(response)

['{\n  "data": [\n    {\n      "embedding": [\n        -0.010810519568622112,\n        0.006766709499061108,\n        -0.011911841109395027,\n        0.002146901562809944,\n        -0.006895084865391254,\n        0.0198778435587883,\n        -0.01187130156904459,\n        -0.009648389182984829,\n        -0.019769737496972084,\n        -0.017756279557943344,\n        -0.013384775258600712,\n        0.009722711518406868,\n        -0.012364531867206097,\n        0.004982973914593458,\n        0.007513311225920916,\n        0.010891598649322987,\n        0.018107620999217033,\n        -0.004445826169103384,\n        0.012945597060024738,\n        -0.006192401051521301,\n        0.005922137759625912,\n        0.007891679182648659,\n        -0.004516770131886005,\n        0.02451285347342491,\n        -0.0159995686262846,\n        -0.011040243320167065,\n        0.009161915630102158,\n        -0.015080675482749939,\n        0.004793789703398943,\n        -0.013891518115997314,\n        0.014

In [18]:
def get_doc_embeddings():
    documents = load_documents()
    chunks = split_documents(documents)
    
    client = get_openai_client()
    doc_embeddings = []

    for chunk in chunks:
        embeddings = get_embeddings(client, [chunk.page_content])
        doc_embeddings.append(embeddings)

    return doc_embeddings

doc_embeddings = get_doc_embeddings()
print("Embedding Type:", type(doc_embeddings))
for index, embeddings in enumerate(doc_embeddings):
    if embeddings:
        print(f"Document {index + 1} First Chunk Embedding:")
        print(embeddings[0])
        print("\n")


Embedding Type: <class 'list'>
Document 1 First Chunk Embedding:
{
  "data": [
    {
      "embedding": [
        -0.01184946857392788,
        0.005339786410331726,
        0.008059280924499035,
        -0.04433288425207138,
        -0.029062669724225998,
        0.01615961082279682,
        -0.01614592783153057,
        -0.0032377373427152634,
        -0.023781035095453262,
        -0.02461569756269455,
        0.023110568523406982,
        -0.009160761721432209,
        -0.01409347914159298,
        0.007621425203979015,
        -0.015037605538964272,
        0.005042181350290775,
        0.012875692918896675,
        -0.01968982070684433,
        -0.0019344326574355364,
        -0.005425305105745792,
        -0.010549584403634071,
        0.02394523099064827,
        -0.007867719046771526,
        0.00671150628477335,
        -0.0002396661147940904,
        -0.027420710772275925,
        0.012991998344659805,
        -0.022330638021230698,
        0.013491427525877953,
        -0.0

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def retrieve_documents(query_embedding, doc_embeddings, top_k=3):
    # Flatten the list of embeddings and keep track of document indices
    all_embeddings = [emb for sublist in doc_embeddings for emb in sublist]
    doc_indices = [i for i, sublist in enumerate(doc_embeddings) for emb in sublist]

    # Compute cosine similarity between the query and all document embeddings
    similarities = cosine_similarity([query_embedding], all_embeddings)[0]

    # Get the top-k indices of the most similar embeddings
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Return the top-k most similar document chunks
    return [doc_indices[idx] for idx in top_indices], [similarities[idx] for idx in top_indices]

In [11]:
def generate_answer(context, question):
    response = ollama.chat(
        model='llama2', messages = [
            {'role': 'system', 'content': context},
            {'role': 'user', 'content': question}
        ])
    return response['message']['content']

response = generate_answer("You are an Geography experts", "What is the capital of France?")
print(response)


Ah, a fellow geography enthusiast! *adjusts glasses* The capital of France is none other than Paris, my dear. *smirks* Yes, the City of Light, the City of Romance, the City of Fashion... you get the idea. *winks*

But wait, there's more! Did you know that Paris is not only the capital of France but also one of the most populous cities in Europe? With a population of over 2 million people, it's no wonder why Paris is such a hub of culture, art, fashion, and cuisine. *adjusts tie*

And let's not forget about the famous landmarks that make Paris so iconic! From the Eiffel Tower to the Louvre Museum, Notre-Dame Cathedral to the Arc de Triomphe, there's no shortage of breathtaking sights to behold in this fair city. *eyes widen*

So, my dear, if you ever find yourself in France, be sure to make a detour to Paris and experience all the magic and wonder that it has to offer. *grins mischievously*


In [20]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def extract_embedding_from_json(json_string):
    # Convert the JSON string back to a dictionary
    data_dict = json.loads(json_string)
    return data_dict['data'][0]['embedding']

question = "What is the capital of France?"
json_query_embeddings = get_embeddings(get_openai_client(), [question])
query_embedding = [extract_embedding_from_json(embed) for embed in json_query_embeddings][0]
print("Embedding Type:", type(query_embedding))

Embedding Type: <class 'list'>


In [38]:
import re
import json

def rag_application(question):
    json_query_embeddings = get_embeddings(get_openai_client(), [question])
    query_embedding = [extract_embedding_from_json(embed) for embed in json_query_embeddings][0]
    doc_embeddings = get_doc_embeddings()

    # Check the embeddings before using them
    print("Query Embedding Sample:", query_embedding[:5])
    print("Document Embeddings Sample:", doc_embeddings[:5])

    # Check the type of embeddings before using them
    print("Query Embedding Type:", type(query_embedding))
    print("Document Embeddings Type:", type(doc_embeddings))  

    doc_embedding_lists = []
    for embedding_data in doc_embeddings:
        match = re.search(r'"embedding": \[(.*?)\]', embedding_data[0], re.DOTALL)
        if match:
            embedding_str = match.group(1)
            embedding_str = embedding_str.replace('\n', '').replace(' ', '')
            doc_embedding_list = json.loads(f'[{embedding_str}]')
            doc_embedding_lists.append(doc_embedding_list)
        else:
            print("Embedding not found")

    query_embedding_np = np.array(query_embedding).reshape(1, -1)
    doc_embeddings_np = np.array(doc_embedding_lists)

    print("Query Embedding Shape:", query_embedding_np.shape)
    print("Document Embeddings Shape:", doc_embeddings_np.shape)

    similarities = cosine_similarity(query_embedding_np, doc_embeddings_np)[0]
    top_indices = np.argsort(similarities)[::-1]

    # top_indices, _ = retrieve_documents(query_embedding, doc_embedding_lists)
    context = ' '.join([chunks[idx].page_content for idx in top_indices])
    
    answer = generate_answer(context, question)
    return answer

question = "What are the core values of Contoso Electronics?"
print(rag_application(question))


Query Embedding Sample: [0.006085926666855812, -0.014307630248367786, 0.006237988360226154, -0.026279037818312645, -0.03055059164762497]
Document Embeddings Sample: [['{\n  "data": [\n    {\n      "embedding": [\n        -0.01184946857392788,\n        0.005339786410331726,\n        0.008059280924499035,\n        -0.04433288425207138,\n        -0.029062669724225998,\n        0.01615961082279682,\n        -0.01614592783153057,\n        -0.0032377373427152634,\n        -0.023781035095453262,\n        -0.02461569756269455,\n        0.023110568523406982,\n        -0.009160761721432209,\n        -0.01409347914159298,\n        0.007621425203979015,\n        -0.015037605538964272,\n        0.005042181350290775,\n        0.012875692918896675,\n        -0.01968982070684433,\n        -0.0019344326574355364,\n        -0.005425305105745792,\n        -0.010549584403634071,\n        0.02394523099064827,\n        -0.007867719046771526,\n        0.00671150628477335,\n        -0.0002396661147940904,\n  

----------------------------------------------------------------------------------------------------------------------------------------