In [None]:
from pathlib import Path

# Directory one level above the current working directory; all data paths
# in this notebook are resolved relative to it.
PARENT_PATH = Path.cwd().parent

# Folder holding the processed PMDG text files: <PARENT_PATH>/data/processed/pmdg
OUTPUT_PATH = PARENT_PATH.joinpath('data', 'processed', 'pmdg')

In [None]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient

# Load variables from a dotenv file into the process environment.
# override=True is passed so values from the file win over variables that
# are already set in the environment.
load_dotenv(override=True)

# Required secrets. Subscripting os.environ (rather than .get) raises
# KeyError immediately if either variable is missing.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
MONGO_URI = os.environ["MONGO_URI"]

# Names of the MongoDB database / collection and of the vector index and
# embedding field used by the cells below.
DB_NAME = "GAIA"
COLLECTION_NAME = "PMDG"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"
EMBEDDING_FIELD_NAME = "embedding"

# Shared MongoDB handles: client -> database -> collection.
client = MongoClient(MONGO_URI)
db = client.get_database(DB_NAME)
collection = db.get_collection(COLLECTION_NAME)

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch

# Numeric ids of the processed text files to ingest; each id maps to
# OUTPUT_PATH/<id>.txt.
docs = [10]

# The embedding client does not depend on the loop variable, so build it
# once instead of once per file.
embeddings = OpenAIEmbeddings(disallowed_special=())

for i in docs:
    # Hand TextLoader a plain string rather than a Path object.
    # NOTE(review): some langchain releases appear to copy the given path
    # verbatim into metadata["source"]; a Path there would not be
    # BSON-serialisable when the document is inserted -- confirm for the
    # installed version. str() is harmless either way.
    loader = TextLoader(str(OUTPUT_PATH / f"{i}.txt"))
    documents = loader.load()

    # Embed the loaded documents as-is (no text splitting is applied) and
    # insert them into the MongoDB Atlas collection.
    # NOTE(review): nothing here checks for existing entries, so re-running
    # this cell presumably inserts the same documents again -- confirm.
    x = MongoDBAtlasVectorSearch.from_documents(
        documents=documents,
        embedding=embeddings,
        collection=collection,
        index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    )

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch

# Vector-store handle addressed by connection string and the
# "<database>.<collection>" namespace, using the same index name as the
# ingestion cell.
vector_search = MongoDBAtlasVectorSearch.from_connection_string(
    MONGO_URI,
    f"{DB_NAME}.{COLLECTION_NAME}",
    OpenAIEmbeddings(disallowed_special=()),
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)

# Smoke-test query: ask for up to 20 nearest documents.
query = "Ageratum conyzoides"
results = vector_search.similarity_search(
    query=query,
    k=20,
)

# Show every hit exactly once. (The whole list used to be dumped with a
# single print() right before this loop, duplicating all of the output.)
for result in results:
    print(result)

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

# Retriever over the vector store: similarity search returning k=5 hits.
# NOTE(review): the post-filter "$limit" of 25 is larger than k=5, so it
# presumably never trims anything -- confirm whether it is needed.
retriever_options = {
    "k": 5,
    "post_filter_pipeline": [{"$limit": 25}],
}
qa_retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_options,
)

# Prompt with two placeholders: {context} (retrieved text) and {question}.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
"""
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Question-answering chain wired to the retriever and the custom prompt;
# return_source_documents=True keeps the retrieved documents in the output.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

# Run one question through the chain. The returned mapping is read below
# via its "result" and "source_documents" keys.
question_text = "Suggest biocontrols if my rice crop suffers from Meloidogyne graminicola"
docs = qa({"query": question_text})

print(docs["result"])
print(docs['source_documents'])