In [108]:
import torch
import ollama

from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.embeddings.base import Embeddings
import glob
from typing import List

In [122]:
class OllamaEmbeddings(Embeddings):
    def __init__(self, model: str = 'mxbai-embed-large'):
        self.model = model

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [ollama.embeddings(model=self.model, prompt=text)['embedding'] for text in texts]

    def embed_query(self, text: str) -> List[float]:
        return ollama.embeddings(model=self.model, prompt=text)['embedding']
    
def get_relevant_context(query: str, vector_db: Chroma, top_k: int = 3, threshold: float = 0.7) -> str:
    query_embedding = torch.tensor(OllamaEmbeddings().embed_query(query))
    
    all_embeddings = vector_db._collection.get(include=['embeddings', 'documents'])
    embeddings = torch.tensor(all_embeddings['embeddings'])
    documents = all_embeddings['documents']

    cos_scores = torch.nn.functional.cosine_similarity(query_embedding.unsqueeze(0), embeddings)
    
    filtered_scores = cos_scores[cos_scores >= threshold]
    if len(filtered_scores) == 0:
        return "No relevant documents found."

    top_k = min(top_k, len(filtered_scores))
    
    top_indices = torch.topk(filtered_scores, k=top_k)[1].tolist()
    
    res = ""
    for i, idx in enumerate(top_indices):
        res += f"{i+1}. {documents[idx]},\n\n"
    
    return res

In [123]:
import os
os.getcwd()

'/Users/kunkerdthaisong/Llamalama_II'

In [124]:
all_chunks=[]
for i in glob.glob("/Users/kunkerdthaisong/Llamalama_II/*.pdf"):
    print(i)
    loader = UnstructuredPDFLoader(file_path=i)
    data = loader.load()
    print(data)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    chunks=text_splitter.split_documents(data)
    all_chunks.extend(chunks)

/Users/kunkerdthaisong/Llamalama_II/Grammar_Cheatsheet.pdf
[Document(metadata={'source': '/Users/kunkerdthaisong/Llamalama_II/Grammar_Cheatsheet.pdf'}, page_content="Grammar Cheatsheet\n\nWho or Whom? Data or datum? And why do some people freak out when they see “15 items or less” at the grocery store?\n\nIf you’re writing a paper, getting grammar right is really helpful. For one thing, it will protect you from gangs of Grammar Nazis. But more important, it’ll make you a better writer. Your readers will appreciate it, even if they aren’t conscious of why.\n\nThis guide will cover those pesky words that we all get confused about from time to time. It will show you the right word to use when you’re clacking away at the keyboard. And above all, it will make your writing clearer.\n\nJust One Small Caveat\n\nThe purpose of grammar is not to be 100%, absolutely correct. It’s to make your writing easier to understand. And there are few things in grammar where everyone agrees anyway. What’s mo

In [125]:
all_chunks

[Document(metadata={'source': '/Users/kunkerdthaisong/Llamalama_II/Grammar_Cheatsheet.pdf'}, page_content='Grammar Cheatsheet\n\nWho or Whom? Data or datum? And why do some people freak out when they see “15 items or less” at the grocery store?\n\nIf you’re writing a paper, getting grammar right is really helpful. For one thing, it will protect you from gangs of Grammar Nazis. But more important, it’ll make you a better writer. Your readers will appreciate it, even if they aren’t conscious of why.\n\nThis guide will cover those pesky words that we all get confused about from time to time. It will show you the right word to use when you’re clacking away at the keyboard. And above all, it will make your writing clearer.\n\nJust One Small Caveat'),
 Document(metadata={'source': '/Users/kunkerdthaisong/Llamalama_II/Grammar_Cheatsheet.pdf'}, page_content='Just One Small Caveat\n\nThe purpose of grammar is not to be 100%, absolutely correct. It’s to make your writing easier to understand. An

In [126]:
embedding_function = OllamaEmbeddings(model='mxbai-embed-large')

vector_db = Chroma.from_documents(
    documents=all_chunks,
    embedding=embedding_function,
    collection_name="local-rag"
)

In [127]:
print(vector_db)

<langchain_community.vectorstores.chroma.Chroma object at 0x31d2f0920>


In [131]:
query = "too and to"
relevant_docs = get_relevant_context(query, vector_db)
print(relevant_docs)

No relevant documents found.


In [7]:
def get_llm_response(prompt, system_prompt="You are a helpful assistant. who help me study english and answer in briefly."):
    searched=get_relevant_context(prompt,vector_db)
    response = ollama.chat(model='phi3', messages=[
        {
            'role': 'system',
            'content': system_prompt,
        },
        {
            'role': 'user',
            'content': prompt+f",Here the User's querry (unnecessary): {searched},",
        }
    ])
    return response['message']['content']

In [10]:
get_llm_response("what can you do for help me study english")

'As your English assistant:\n\n1. I can help ensure you write error-free sentences by checking for correct grammar and sentence structure when composing papers or essays in English. This will enhance the overall quality of your writing. However, it\'s essential to understand that while avoiding grammatical mistakes is beneficial, developing a deeper understanding of language mechanics can truly improve as you write consistently.\n\n2. I am equipped with guides and tools designed to clarify commonly confused words in English—like homophones or frequently misused terms. This information will assist you while typing on your keyboard and prevent errors related to word choice, ultimately improving the clarity of your writing. Remember that practicing using these words correctly is crucial for mastery over time.\n\n3. Regarding "advice" versus "advise," I can provide examples of their correct usage in sentences or offer exercises aimed at helping you distinguish between when to use each word

In [121]:
vector_db.delete_collection()