Using https://learnbybuilding.ai/tutorials/rag-from-scratch as a tutorial

In [None]:
""" Baby's First RAG """

corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new.",
    "Attend a live music concert and feel the rhythm.",
    "Go for a hike and admire the natural scenery.",
    "Have a picnic with friends and share some laughs.",
    "Explore a new cuisine by dining at an ethnic restaurant.",
    "Take a yoga class and stretch your body and mind.",
    "Join a local sports league and enjoy some friendly competition.",
    "Attend a workshop or lecture on a topic you're interested in.",
    "Visit an amusement park and ride the roller coasters."
]

def jaccard_similarity(query, document) -> float:
    """ Pre-processes plain strings into a set to perform comparisons.

    Args:
        query (_type_): _description_
        document (_type_): _description_

    Returns:
        float: _description_
    """
    query = query.lower().split(" ")
    document = document.lower().split(" ")
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

def return_response(query, corpus) -> str:
    """ Selects the best document to return to the user

    Args:
        query (_type_): _description_
        corpus (_type_): _description_

    Returns:
        str: _description_
    """
    similarities = []
    for doc in corpus:
        similarity = jaccard_similarity(user_input, doc)
        similarities.append(similarity)
    return corpus_of_documents[similarities.index(max(similarities))]



In [None]:
user_prompt = "What is a leisure activity that you like?"
user_input = "I like to be with friends"
return_response(user_input, corpus_of_documents)


In [None]:
import requests
import json
import ollama
ollama.pull('mistral')


In [None]:

user_input = "I like to see my friends"
relevant_document = return_response(user_input, corpus_of_documents)
full_response = []
# https://github.com/jmorganca/ollama/blob/main/docs/api.md
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""

url = 'http://localhost:11434/api/generate'
data = {
    "model": "mistral:latest",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}


In [None]:
headers = {'Content-Type': 'application/json'}
response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)


In [None]:
try:
    count = 0
    for line in response.iter_lines():
        # filter out keep-alive new lines
        # count += 1
        # if count % 5== 0:
        #     print(decoded_line['response']) # print every fifth token
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            if 'response' in decoded_line:
                full_response.append(decoded_line['response'])
            else:
                print("Warning: 'response' key not found in the data:", decoded_line)
finally:
    response.close()
print(''.join(full_response))

In [None]:
user_input = "I don't like to hike"
relevant_document = return_response(user_input, corpus_of_documents)
# https://github.com/jmorganca/ollama/blob/main/docs/api.md
full_response = []
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
This is the recommended activity: {relevant_document}
The user input is: {user_input}
Compile a recommendation to the user based on the recommended activity and the user input.
"""
url = 'http://localhost:11434/api/generate'
data = {
    "model": "mistral:latest",
    "prompt": prompt.format(user_input=user_input, relevant_document=relevant_document)
}
headers = {'Content-Type': 'application/json'}
response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)
try:
    for line in response.iter_lines():
        # filter out keep-alive new lines
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            # print(decoded_line['response'])  # uncomment to results, token by token
            full_response.append(decoded_line['response'])
finally:
    response.close()
print(''.join(full_response))

This ends the tutorial at https://learnbybuilding.ai/tutorials/rag-from-scratch 
Now to go into https://learnbybuilding.ai/tutorials/rag-from-scratch-part-2-semantics-and-cosine-similarity

In [None]:
corpus_of_documents = [
    "Take a leisurely walk in the park and enjoy the fresh air.",
    "Visit a local museum and discover something new.",
    "Attend a live music concert and feel the rhythm.",
    "Go for a hike and admire the natural scenery.",
    "Have a picnic with friends and share some laughs.",
    "Explore a new cuisine by dining at an ethnic restaurant.",
    "Take a yoga class and stretch your body and mind.",
    "Join a local sports league and enjoy some friendly competition.",
    "Attend a workshop or lecture on a topic you're interested in.",
    "Visit an amusement park and ride the roller coasters."
]


In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
doc_embeddings = model.encode(corpus_of_documents)

In [None]:
print(doc_embeddings)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
query = "What's the best activity to do with friends?"
query_embedding = model.encode([query])
similarities = cosine_similarity(query_embedding, doc_embeddings)
print(similarities)

In [None]:
indexed = list(enumerate(similarities[0]))
sorted_index = sorted(indexed, key=lambda x: x[1], reverse=True)
print(sorted_index)

In [None]:
recommended_documents = []
for value, score in sorted_index:
    formatted_score = "{:.2f}".format(score)
    print(f"{formatted_score} => {corpus_of_documents[value]}")
    if score > 0.3:
        recommended_documents.append(corpus_of_documents[value])

In [None]:
prompt = """
You are a bot that makes recommendations for activities. You answer in very short sentences and do not include extra information.
These are potential activities:
{recommended_activities}
The user's query is: {user_input}
Provide the user with 2 recommended activities based on their query.
"""
recommended_activities = "\n".join(recommended_documents)
user_input = "I like to spend time with my friends"
full_prompt = prompt.format(user_input=user_input, recommended_activities=recommended_activities)
url = 'http://localhost:11434/api/generate'
data = {
    "model": "mistral:latest",
    "prompt": full_prompt
}
headers = {'Content-Type': 'application/json'}
response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)
full_response=[]
try:
    count = 0
    for line in response.iter_lines():
        #filter out keep-alive new lines
        # count += 1
        # if count % 5== 0:
        #     print(decoded_line['response']) # print every fifth token
        if line:
            decoded_line = json.loads(line.decode('utf-8'))
            
            full_response.append(decoded_line['response'])
finally:
    response.close()
print(''.join(full_response))

In [None]:
from unstructured.ingest.connector.local import SimpleLocalConfig
from unstructured.ingest.connector.weaviate import (
    SimpleWeaviateConfig,
    WeaviateAccessConfig,
    WeaviateWriteConfig,
)
from unstructured.ingest.interfaces import (
    ChunkingConfig,
    EmbeddingConfig,
    PartitionConfig,
    ProcessorConfig,
    ReadConfig,
)
from unstructured.ingest.runner import LocalRunner
from unstructured.ingest.runner.writers.base_writer import Writer
from unstructured.ingest.runner.writers.weaviate import (
    WeaviateWriter,
)


def get_writer() -> Writer:
    return WeaviateWriter(
        connector_config=SimpleWeaviateConfig(
            access_config=WeaviateAccessConfig(),
            host_url="http://localhost:8080",
            class_name="elements",
        ),
        write_config=WeaviateWriteConfig(),
    )


if __name__ == "__main__":
    writer = get_writer()
    runner = LocalRunner(
        processor_config=ProcessorConfig(
            verbose=True,
            output_dir="docs",
            num_processes=2,
        ),
        connector_config=SimpleLocalConfig(
            input_path="docs/Player_s Handbook.pdf",
        ),
        read_config=ReadConfig(),
        partition_config=PartitionConfig(),
        chunking_config=ChunkingConfig(chunk_elements=True),
        embedding_config=EmbeddingConfig(
            provider="langchain-huggingface",
        ),
        writer=writer,
        writer_kwargs={},
    )
    runner.run()

## Trying to isolate embedding before putting into Weaviate

In [1]:
pip install -r ../requirements.in

Defaulting to user installation because normal site-packages is not writeable
Collecting gradio (from -r ../requirements.in (line 1))
  Downloading gradio-4.27.0-py3-none-any.whl.metadata (15 kB)
Collecting ollama (from -r ../requirements.in (line 2))
  Downloading ollama-0.1.8-py3-none-any.whl.metadata (3.8 kB)
Collecting loguru (from -r ../requirements.in (line 3))
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting sentence-transformers (from -r ../requirements.in (line 4))
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting torch (from -r ../requirements.in (line 5))
  Downloading torch-2.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision (from -r ../requirements.in (line 6))
  Downloading torchvision-0.17.2-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio (from -r ../requirements.in (line 7))
  Downloading torchaudio-2.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (6.4 kB)
C

In [1]:
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Title, NarrativeText, Text
from unstructured.chunking.basic import chunk_elements
from typing import List
import os
import weaviate
from weaviate.util import generate_uuid5
import ollama

ImportError: libGL.so.1: cannot open shared object file: No such file or directory

In [2]:
../setup.sh

SyntaxError: invalid syntax (1864167237.py, line 1)

In [None]:
import ollama
ollama.pull("mxbai-embed-large:v1")

In [None]:
FILE_PATH = "../docs/NIST.SP.800-171r2.pdf"

def process_pdf(file_path: str):
    # partition the pdf
    elements = partition_pdf(filename=file_path, strategy="fast")
    # convert elements into strings
    texts = [str(el) for el in elements]
    return texts

docs = process_pdf(FILE_PATH)