Using https://learnbybuilding.ai/tutorials/rag-from-scratch as a tutorial

In [None]:
""" Baby's First RAG """

# Toy knowledge base: ten one-sentence activity suggestions. The retrieval
# step ranks these against the user's input and returns the best match.
corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new.",
    "Attend a live music concert and feel the rhythm.",
    "Go for a hike and admire the natural scenery.",
    "Have a picnic with friends and share some laughs.",
    "Explore a new cuisine by dining at an ethnic restaurant.",
    "Take a yoga class and stretch your body and mind.",
    "Join a local sports league and enjoy some friendly competition.",
    "Attend a workshop or lecture on a topic you're interested in.",
    "Visit an amusement park and ride the roller coasters."
]

def jaccard_similarity(query: str, document: str) -> float:
    """Return the Jaccard similarity between two plain-text strings.

    Both strings are lower-cased and split on whitespace. The score is the
    number of distinct words they share divided by the number of distinct
    words that appear in either one. Punctuation is not stripped, so
    ``"friends"`` and ``"friends."`` count as different words.

    Args:
        query (str): The user's text.
        document (str): The candidate document to compare against.

    Returns:
        float: A score between 0.0 and 1.0; 0.0 when neither string
            contains any words.
    """
    # split() without an argument collapses runs of whitespace, so doubled
    # or trailing spaces never produce an empty-string "word" that would be
    # counted as shared between otherwise unrelated texts.
    query_words = set(query.lower().split())
    document_words = set(document.lower().split())
    union = query_words | document_words
    if not union:
        # Both inputs were empty or whitespace-only; avoid dividing by zero.
        return 0.0
    return len(query_words & document_words) / len(union)

def return_response(query: str, corpus) -> str:
    """Return the document in ``corpus`` that is most similar to ``query``.

    Every document is scored against ``query`` with ``jaccard_similarity``
    and the highest-scoring one is returned. On a tie the earliest document
    in ``corpus`` wins.

    Args:
        query (str): The user's text.
        corpus: The candidate documents (strings) to choose from.

    Returns:
        str: The best-matching document.

    Raises:
        ValueError: If ``corpus`` is empty.
    """
    # Score against the function's own arguments rather than notebook
    # globals, so the result always reflects what the caller passed in.
    return max(corpus, key=lambda doc: jaccard_similarity(query, doc))



In [None]:
# NOTE(review): user_prompt is assigned here but is not read by any code
# visible in this notebook.
user_prompt = "What is a leisure activity that you like?"
user_input = "I like to be with friends"
# Last expression of the cell, so the notebook displays the retrieved document.
return_response(user_input, corpus_of_documents)


In [None]:
import requests
import json
import ollama
# Pull the 'mistral' model through the ollama client; later cells request
# "mistral:latest" from the local Ollama HTTP API.
ollama.pull('mistral')


In [None]:

user_input = "I like to see my friends"
# Retrieval step: pick the single corpus entry that best matches the input.
relevant_document = return_response(user_input, corpus_of_documents)
# Accumulates the streamed response fragments; filled in by a later cell.
full_response = []
# https://github.com/jmorganca/ollama/blob/main/docs/api.md
# Prompt template; the {relevant_document} and {user_input} placeholders are
# filled in with str.format when the request payload is built below.
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

# Generate endpoint of the Ollama server expected on localhost:11434.
url = 'http://localhost:11434/api/generate'
# Request payload: the model to use plus the fully rendered prompt.
data = {
    "model": "mistral:latest",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}


In [None]:
headers = {'Content-Type': 'application/json'}
# stream=True leaves the body unread so a later cell can consume it line by
# line with response.iter_lines(); the response must be closed afterwards.
# NOTE(review): no timeout is passed, so this call can block indefinitely if
# the server never answers.
response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)


In [None]:
# Consume the streamed reply: every non-empty line is decoded as a JSON object
# and its 'response' text fragment is appended to full_response.
try:
    for line in response.iter_lines():
        # iter_lines() can yield empty keep-alive lines; skip them.
        if not line:
            continue
        decoded_line = json.loads(line.decode('utf-8'))
        if 'response' in decoded_line:
            full_response.append(decoded_line['response'])
        else:
            print("Warning: 'response' key not found in the data:", decoded_line)
finally:
    # Always release the streaming connection, even if decoding fails.
    response.close()
print(''.join(full_response))

In [None]:
# Word-overlap retrieval ignores negation: the only word this input shares
# with any corpus entry is "hike", so the hiking document is retrieved even
# though the user says they dislike hiking.
user_input = "I don't like to hike"
relevant_document = return_response(user_input, corpus_of_documents)
# https://github.com/jmorganca/ollama/blob/main/docs/api.md
full_response = []
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""
url = 'http://localhost:11434/api/generate'
data = {
    "model": "mistral:latest",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}
headers = {'Content-Type': 'application/json'}
response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)
try:
    for line in response.iter_lines():
        # iter_lines() can yield empty keep-alive lines; skip them.
        if not line:
            continue
        decoded_line = json.loads(line.decode('utf-8'))
        # A line without a 'response' key is reported instead of raising
        # KeyError, matching the earlier streaming cell.
        if 'response' in decoded_line:
            full_response.append(decoded_line['response'])
        else:
            print("Warning: 'response' key not found in the data:", decoded_line)
finally:
    # Always release the streaming connection, even if decoding fails.
    response.close()
print(''.join(full_response))

This concludes the tutorial at https://learnbybuilding.ai/tutorials/rag-from-scratch
Next, continue with part 2: https://learnbybuilding.ai/tutorials/rag-from-scratch-part-2-semantics-and-cosine-similarity

In [None]:
# Corpus for the semantic-search part: the same ten activity sentences used by
# the keyword-overlap example at the top of the notebook.
corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new.",
    "Attend a live music concert and feel the rhythm.",
    "Go for a hike and admire the natural scenery.",
    "Have a picnic with friends and share some laughs.",
    "Explore a new cuisine by dining at an ethnic restaurant.",
    "Take a yoga class and stretch your body and mind.",
    "Join a local sports league and enjoy some friendly competition.",
    "Attend a workshop or lecture on a topic you're interested in.",
    "Visit an amusement park and ride the roller coasters."
]


In [None]:
from sentence_transformers import SentenceTransformer
# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every corpus sentence; the result holds one embedding per document.
doc_embeddings = model.encode(corpus_of_documents)

In [None]:
# Inspect the raw document embeddings.
print(doc_embeddings)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
query = "What's the best activity to do with friends?"
# The query is wrapped in a list so its embedding comes back as a one-row
# batch, matching the 2-D input that cosine_similarity expects.
query_embedding = model.encode([query])
# Result has one row (the single query) and one column per document.
similarities = cosine_similarity(query_embedding, doc_embeddings)
print(similarities)

In [None]:
# Pair each similarity score with its document's position in the corpus, then
# order the pairs from most to least similar.
indexed = [(position, score) for position, score in enumerate(similarities[0])]
sorted_index = sorted(indexed, key=lambda pair: pair[1], reverse=True)
print(sorted_index)

In [None]:
recommended_documents = []
# sorted_index holds (corpus position, similarity) pairs, best match first, so
# `value` is the position of the document inside corpus_of_documents.
for value, score in sorted_index:
    formatted_score = "{:.2f}".format(score)
    print(f"{formatted_score} => {corpus_of_documents[value]}")
    # Keep only documents above a fixed similarity cut-off of 0.3.
    if score > 0.3:
        recommended_documents.append(corpus_of_documents[value])

In [None]:
# Prompt template; {recommended_activities} and {user_input} are filled in
# with str.format below.
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
These are potential activities:
{recommended_activities}
The user's query is: {user_input}
Provide the user with 2 recommended activities based on their query.
"""
# One retrieved document per line.
# NOTE(review): recommended_documents was ranked against `query` in an earlier
# cell, not against the user_input defined here — confirm this is intended.
recommended_activities = "\n".join(recommended_documents)
user_input = "I like to spend time with my friends"
full_prompt = prompt.format(user_input=user_input, recommended_activities=recommended_activities)
url = 'http://localhost:11434/api/generate'
data = {
    "model": "mistral:latest",
    "prompt": full_prompt
}
headers = {'Content-Type': 'application/json'}
response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)
full_response=[]
try:
    for line in response.iter_lines():
        # iter_lines() can yield empty keep-alive lines; skip them.
        if not line:
            continue
        decoded_line = json.loads(line.decode('utf-8'))
        # A line without a 'response' key is reported instead of raising
        # KeyError, matching the first streaming cell in this notebook.
        if 'response' in decoded_line:
            full_response.append(decoded_line['response'])
        else:
            print("Warning: 'response' key not found in the data:", decoded_line)
finally:
    # Always release the streaming connection, even if decoding fails.
    response.close()
print(''.join(full_response))

## Trying to isolate embedding before putting into Weaviate

In [None]:
# Install the dependencies listed in the requirements file one directory up.
# NOTE(review): a bare `pip ...` line relies on IPython automagic; `%pip` is
# the explicit form.
pip install -r ../requirements.in

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Title, NarrativeText, Text
from unstructured.chunking.basic import chunk_elements
from typing import List
import os
import weaviate
from weaviate.util import generate_uuid5
import ollama

In [None]:
../setup.sh

In [None]:
import ollama
# Pull the embedding model used by the cells below, then list the models the
# ollama client reports (the list is the cell's displayed output).
ollama.pull("mxbai-embed-large:v1")
ollama.list()

In [None]:
# Alternative input document, kept commented out for quick switching.
#FILE_PATH = "../docs/NIST.SP.800-171r2.pdf"
FILE_PATH = "../docs/Player_s Handbook.pdf"

# Parse the PDF into a list of elements; later cells filter them by their
# .category and read their .text.
elements = partition_pdf(filename=FILE_PATH)

In [None]:
# Keep only the elements whose category is "Title" and print their text.
titles = [elem for elem in elements if elem.category == "Title"]

for title in titles:
    print(title.text)

In [None]:
import textwrap

# Keep only the elements whose category is "NarrativeText".
narrative_texts = [elem for elem in elements if elem.category == "NarrativeText"]

# Preview the first five narrative passages, wrapped to 70 columns.
for index, elem in enumerate(narrative_texts[:5]):
    print(f"Narrative text {index + 1}:")
    # textwrap.fill joins the wrapped lines with newlines in one call.
    print(textwrap.fill(elem.text, width=70))
    print("\n" + "-" * 70 + "\n")

In [None]:
# Dump the text of every narrative element, unwrapped.
for text in narrative_texts:
    print(text.text)

In [None]:
# Connect to a locally running Weaviate instance using the default settings.
# NOTE(review): no visible cell closes this connection — confirm whether
# client.close() is needed.
client = weaviate.connect_to_local()

In [None]:
from llama_index.embeddings.ollama import OllamaEmbedding

# Embedding wrapper that talks to the Ollama server on localhost:11434 and
# uses the "mxbai-embed-large:v1" model.
# NOTE(review): the effect of the "mirostat" option on an embedding model is
# not evident from this notebook — confirm it is needed.
ollama_embedding = OllamaEmbedding(
    model_name="mxbai-embed-large:v1",
    base_url="http://localhost:11434",
    ollama_additional_kwargs={"mirostat": 0},
)


In [None]:

# Embed two sample passages in a single batch call. The batch API takes plain
# strings, so pass each element's .text rather than the element object itself.
pass_embedding = ollama_embedding.get_text_embedding_batch(
    [narrative_texts[1].text, narrative_texts[50].text], show_progress=True
)
print(pass_embedding)


In [None]:

# Embed a sample question with the Ollama embedding model. This rebinds
# query_embedding, which an earlier cell set from the sentence-transformers model.
query_embedding = ollama_embedding.get_query_embedding("What is a paladin?")
print(query_embedding)

In [None]:
data_objects = []
# Assuming you have extracted abstracts from the PDFs
# NOTE(review): list_of_pdf_files and extract_abstract are not defined anywhere
# in the visible notebook; this cell looks like a template and raises NameError
# as written.
for pdf_file in list_of_pdf_files:
    abstract = extract_abstract(pdf_file)
    # One record per PDF: its file name and the extracted abstract.
    data_object = {"source": pdf_file.name, "abstract": abstract}
    data_objects.append(data_object)

# Import the objects into Weaviate
# NOTE(review): presumably written against a different Weaviate client API than
# the one behind connect_to_local() — verify client.batch usage against the
# installed client version.
client.batch.configure(batch_size=100)
with client.batch as batch:
    for data_object in data_objects:
        batch.add_object(data_object)


In [None]:
# Request an embedding for every parsed element. The prompt must be a string,
# so the element's .text is sent rather than the element object.
# NOTE(review): `response` is overwritten on every iteration, so only the last
# embedding survives the loop — collect the results if they are needed.
for d in elements:
  response = ollama.embeddings(model="mxbai-embed-large:v1", prompt=d.text)

https://ollama.com/blog/embedding-models

In [None]:
# Install the ollama client and chromadb for the example that follows.
# NOTE(review): a bare `pip ...` line relies on IPython automagic; `%pip` is
# the explicit form.
pip install ollama chromadb


In [None]:
import ollama
import chromadb


In [None]:
# Drop the "docs" collection, presumably so the cell that creates it can be
# re-run.
# NOTE(review): `client` is only bound to chromadb.Client() in the following
# cell; on a fresh top-to-bottom run it still refers to the Weaviate client
# here — confirm the intended execution order.
client.delete_collection(name="docs")

In [None]:

# Sample facts to index.
documents = [
  "Llamas are members of the camelid family meaning they're pretty closely related to vicuñas and camels",
  "Llamas were first domesticated and used as pack animals 4,000 to 5,000 years ago in the Peruvian highlands",
  "Llamas can grow as much as 6 feet tall though the average llama between 5 feet 6 inches and 5 feet 9 inches tall",
  "Llamas weigh between 280 and 450 pounds and can carry 25 to 30 percent of their body weight",
  "Llamas are vegetarians and have very efficient digestive systems",
  "Llamas live to be about 20 years old, though some only live for 15 years and others live to be 30 years old",
]

# Rebinds `client` to a chromadb client and creates the "docs" collection.
client = chromadb.Client()
collection = client.create_collection(name="docs")

# store each document in a vector embedding database
for i, d in enumerate(documents):
  response = ollama.embeddings(model="mxbai-embed-large:v1", prompt=d)
  embedding = response["embedding"]
  # The document's position in the list, as a string, serves as its id.
  collection.add(
    ids=[str(i)],
    embeddings=[embedding],
    documents=[d]
  )

In [None]:
# an example prompt
# an example prompt
prompt = "What animals are llamas related to?"

# generate an embedding for the prompt and retrieve the most relevant doc
response = ollama.embeddings(
  prompt=prompt,
  model="mxbai-embed-large:v1"
)
# Ask the collection for the single nearest document to the prompt embedding.
results = collection.query(
  query_embeddings=[response["embedding"]],
  n_results=1
)
# First (and only) document returned for the first (and only) query embedding.
data = results['documents'][0][0]

In [None]:
# generate a response combining the prompt and data we retrieved in step 2
output = ollama.generate(
  model="mistral",
  prompt=f"Using this data: {data}. Respond to this prompt: {prompt}"
)

print(output['response'])

Now that we've proved that this actually works, let's try to use OUR data

In [None]:
# Rebind `client` to a local Weaviate connection; the chromadb example above
# had pointed it at a chromadb client.
client = weaviate.connect_to_local()


In [None]:
# Remove the "PHB" collection from Weaviate, presumably to start from a clean
# state before it is re-created.
client.collections.delete(name="PHB")

In [None]:
# Create the "PHB" collection; this rebinds `collection`, which the chromadb
# example above had set to its "docs" collection.
collection = client.collections.create(name="PHB")

In [None]:
# Split the parsed PDF elements by category.
narrative_texts = [elem for elem in elements if elem.category == "NarrativeText"]
titles = [elem for elem in elements if elem.category == "Title"]

# Embed the first five narrative passages as a smoke test. The prompt must be
# a string, so the element's .text is sent rather than the element object.
# NOTE(review): `embedding` is overwritten on every iteration and is not yet
# stored in the "PHB" collection.
for index, elem in enumerate(narrative_texts[:5]):
    response = ollama.embeddings(model="mxbai-embed-large:v1", prompt=elem.text)
    embedding = response["embedding"]
    # Progress indicator: one dot per embedded passage.
    print(".", end="", flush=True)
    
    
    