In [12]:
import os
import re
import openai
import xml.etree.ElementTree as ET
import requests
import json
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

import chromadb
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [13]:
# Project root. The notebook is expected to run one level below it; when the
# parent of the working directory is not already inside the
# 'publishingchatgptpocweb' tree, that folder name is appended instead.
PARENT_PATH = Path.cwd().parent
PARENT_PATH = (
    PARENT_PATH
    if 'publishingchatgptpocweb' in str(PARENT_PATH)
    else PARENT_PATH.joinpath('publishingchatgptpocweb')
)

# Top-level folders for input data and persisted models.
DATA_DIRECTORY = PARENT_PATH.joinpath('data')
MODEL_DIRECTORY = PARENT_PATH.joinpath('models')

# Persisted Chroma vector store.
CHROMA_DB_PUB = MODEL_DIRECTORY.joinpath('langchain_chroma_db_pub')

# Raw input plus the processed article / concept text files.
JATS_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('raw')
ARTICLES_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('processed', 'articles')
CONCEPTS_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('processed', 'concepts')

In [14]:
# Read the OpenAI key from the environment (load_dotenv() in the imports cell
# has already merged any .env file) and hand it to the openai client as well.
# The value is None when the variable is not set.
OPEN_AI_API_SECRET = openai.api_key = os.environ.get('Open__AI__API__Secret')

In [19]:
def update_article_metadata(document):
    """Fill ``document.metadata`` from the ``Label: value`` lines in its text.

    ``document.page_content`` is searched for the following labels and each
    non-empty value found is copied into the metadata dict:

    * ``Title`` -> ``title``
    * ``PAN`` -> ``pan``
    * ``Article Link/URL/Source`` -> ``source``
    * ``ISBN`` -> ``isbn``
    * ``Day`` / ``Month`` / ``Year`` -> ``publishing_date`` formatted as
      ``YYYY-MM-DD`` (written only when all three are present)

    ``document_type`` is always set to ``'article'``.

    Args:
        document: Object exposing ``page_content`` (str) and ``metadata``
            (dict), e.g. a LangChain ``Document``.

    Returns:
        The same ``document`` object, with its metadata updated in place.
    """
    text = document.page_content

    def _field(label, value_pattern=r'[^\n]*'):
        # Only horizontal whitespace ([ \t]*, not \s*) may separate the label
        # from its value; \s* would cross the newline and make an empty field
        # such as "Title:\n" capture the *next* line as its value.
        match = re.search(
            rf'{re.escape(label)}:[ \t]*({value_pattern})[ \t]*(?:\n|$)', text
        )
        return match.group(1).strip() if match else ''

    document.metadata['document_type'] = 'article'

    simple_fields = (
        ('title', 'Title'),
        ('pan', 'PAN'),
        ('source', 'Article Link/URL/Source'),
        ('isbn', 'ISBN'),
    )
    for metadata_key, label in simple_fields:
        value = _field(label)
        if value:
            document.metadata[metadata_key] = value

    # The publishing date is assembled from the separate numeric fields and
    # zero-padded so the result is always YYYY-MM-DD.
    day = _field('Day', r'\d{1,2}')
    month = _field('Month', r'\d{1,2}')
    year = _field('Year', r'\d{4}')
    if day and month and year:
        document.metadata['publishing_date'] = f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    return document

def update_concepts_metadata(document):
    """Fill ``document.metadata`` from the main ``Concept:`` section of a thesaurus file.

    The text between ``Thesaurus Concept:`` / ``Concept:`` and the first of
    ``Broader Concept:``, ``Narrower Concepts:``, ``Related Concepts:`` or the
    end of the text is scanned for ``name:`` and ``uri:`` lines:

    * ``name`` is stored under both ``name`` and ``title``
    * ``uri`` is stored under both ``uri`` and ``source``

    ``document_type`` is always set to ``'concept'``.

    Args:
        document: Object exposing ``page_content`` (str) and ``metadata``
            (dict), e.g. a LangChain ``Document``.

    Returns:
        The same ``document`` object, with its metadata updated in place.
    """
    document.metadata['document_type'] = 'concept'

    # The newline before the terminator is optional at end-of-text (\n?\Z) so a
    # file that holds only the concept section and has no trailing newline is
    # still recognised.
    concept_section = re.search(
        r'Thesaurus Concept:\n\s*Concept:\n(.*?)'
        r'(?:\n\s*Broader Concept:|\n\s*Narrower Concepts:|\n\s*Related Concepts:|\n?\Z)',
        document.page_content,
        re.DOTALL,
    )
    if not concept_section:
        return document

    concept_details = concept_section.group(1)

    def _field(label):
        # [ \t]* instead of \s*: an empty "name:" line must not capture the
        # following "uri: ..." line as its value.
        match = re.search(rf'{re.escape(label)}:[ \t]*([^\n]*)', concept_details)
        return match.group(1).strip() if match else ''

    name = _field('name')
    if name:
        document.metadata['name'] = name
        document.metadata['title'] = name

    uri = _field('uri')
    if uri:
        document.metadata['uri'] = uri
        document.metadata['source'] = uri

    return document


def _open_chroma_store():
    """Open the Chroma store persisted under ``CHROMA_DB_PUB`` with OpenAI embeddings."""
    embeddings = OpenAIEmbeddings(openai_api_key=OPEN_AI_API_SECRET)
    return Chroma(embedding_function=embeddings, persist_directory=str(CHROMA_DB_PUB))


def add_document_to_chroma_db(data_directory, type='article', instance=None):
    """Load every ``.txt`` file under ``data_directory`` into a Chroma store.

    The directory tree is walked recursively. Each text file is loaded as
    UTF-8, its first document gets metadata extracted according to ``type``,
    and the result is added to ``instance``. A failure on one file is reported
    and does not stop the remaining files from being processed.

    Args:
        data_directory: Root directory to scan for ``.txt`` files.
        type: ``'article'``/``'articles'`` or ``'concept'``/``'concepts'``
            selects the metadata extractor; any other value adds the
            documents without extra metadata.
        instance: Chroma store to add to. When omitted, the store persisted
            under ``CHROMA_DB_PUB`` is opened. (Previously the function read an
            undefined ``instance`` name, so every file failed and the final
            ``return`` raised ``NameError``.)

    Returns:
        The Chroma store the documents were added to.
    """
    if instance is None:
        instance = _open_chroma_store()

    # Singular and plural spellings are both accepted: the default value is
    # 'article' while the caller in this notebook passes 'articles'.
    metadata_updaters = {
        'article': update_article_metadata,
        'articles': update_article_metadata,
        'concept': update_concepts_metadata,
        'concepts': update_concepts_metadata,
    }
    update_metadata = metadata_updaters.get(type)

    for root_dir, _sub_dirs, files in os.walk(data_directory):
        for file_name in files:
            if not file_name.endswith('.txt'):
                continue
            text_file_path = os.path.join(root_dir, file_name)
            try:
                documents = TextLoader(text_file_path, encoding='utf8').load()
                if update_metadata is not None:
                    documents[0] = update_metadata(documents[0])
                instance.add_documents(documents)
            except Exception as exc:
                # Best effort per file: report the cause and continue.
                print('error adding document', file_name, exc)
    return instance


def create_vector_data_from_processed_documents():
    """Build the persisted Chroma store from the processed articles and concepts.

    Opens the store at ``CHROMA_DB_PUB``, adds every processed article and
    concept text file to it, and persists the result to disk.
    """
    instance = _open_chroma_store()
    add_document_to_chroma_db(ARTICLES_DATA_DIRECTORY_PATH, 'articles', instance)
    add_document_to_chroma_db(CONCEPTS_DATA_DIRECTORY_PATH, 'concepts', instance)
    instance.persist()

In [None]:
create_vector_data_from_processed_documents()

## Test Chroma DB Approach

In [None]:
def load_qa(model='gpt-3.5-turbo'):
    """Build a RetrievalQA chain over the persisted Chroma store.

    The chain retrieves the single closest document (``k=1``) from the store
    at ``CHROMA_DB_PUB``, inserts it as ``{context}`` into the EVA prompt and
    asks the chat model to answer ``{question}``. Tokens are streamed to
    stdout as they are generated and the temperature is 0.

    Args:
        model: OpenAI chat model name to use. (Previously this argument was
            ignored and ``"gpt-3.5-turbo"`` was always used.)

    Returns:
        The configured ``RetrievalQA`` chain.
    """
    instance = Chroma(persist_directory=str(CHROMA_DB_PUB), embedding_function=OpenAIEmbeddings(openai_api_key=OPEN_AI_API_SECRET))

    TEMPLATE = """As EVA (Expert Virtual Assistance), your role is to help user with their queries related to articles, concepts etc. Please adhere to the following guidelines:
    Utilize the given context (enclosed by <ctx></ctx>) to construct your responses:
    ------
    <ctx>
    {context}
    </ctx>
    ------
    
    Q: {question}
    A: """

    prompt_template = PromptTemplate(
        template=TEMPLATE,
        input_variables=["context", "question"],
    )

    llm = ChatOpenAI(
        model_name=model,  # honour the caller's choice instead of a hard-coded name
        streaming=True,
        callbacks=[StreamingStdOutCallbackHandler()],
        temperature=0,
        openai_api_key=OPEN_AI_API_SECRET,
    )

    retrieval_qa = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=instance.as_retriever(search_kwargs={"k": 1}),
        chain_type_kwargs={
            "verbose": False,
            "prompt": prompt_template,
        },
    )

    return retrieval_qa

def get_response(qa, request):
    """Run ``request`` through the QA chain and return its answer.

    Args:
        qa: Object with a ``run(query)`` method, e.g. the chain from ``load_qa``.
        request: The question text to ask.

    Returns:
        Whatever ``qa.run`` returns for ``request``.
    """
    # Use the argument, not the notebook-global `text`, so the function works
    # regardless of which cells have been executed.
    return qa.run(request)

# Model name handed to load_qa; the commented-out lines are alternatives to
# swap in by hand.
model = 'gpt-3.5-turbo'
# model = 'gpt-3.5-turbo-16k'
# model = 'gpt-4'
# Build the chain once at cell level; the query cell below reuses `qa`.
qa = load_qa(model)

In [None]:
# Example questions: keep exactly one `text` assignment active and leave the
# others commented out.
# text='Can you tell me a more about the Tropical Timber Market Report?'
# text='Tell me the articles names and pan where CABI-keyword preferredTerm is world markets?'
# text='When was this article related to Tropical Timber Market Report published? and who wrote it in terms of authors?'
text='who wrote The ebola epidemic in West Africa: proceedings of a workshop ?'

# Last expression in the cell, so the notebook displays the returned answer.
get_response(qa, text)