In [1]:
%pwd

'/data/01_ByMember/wgchoi/minerva_chatbot/Code'

In [None]:
%%writefile requirements.txt
langchain
langchain-community
llama-parse
fastembed
chromadb
python-dotenv
langchain-groq
chainlit
fastembed
unstructured[md]
python-dotenv


In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

llamaparse_api_key = os.getenv('LLAMA_CLOUD_API_KEY') #* https://cloud.llamaindex.ai/api-key
groq_api_key = os.getenv('GROQ_API_KEY') #* https://console.groq.com/docs/quickstart

In [3]:
##### LLAMAPARSE #####
from llama_parse import LlamaParse

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
#
from groq import Groq
from langchain_groq import ChatGroq
#
import joblib
import os
import nest_asyncio  # noqa: E402
nest_asyncio.apply()

In [4]:
def get_html_file_list(*file_dirs):
    html_files = []

    # Function to list HTML files in a directory
    def list_html_files(directory):
        for filename in os.listdir(directory):
            if filename.endswith('.html'):
                html_files.append(os.path.join(directory, filename))

    # List HTML files from each provided directory
    for directory in file_dirs:
        list_html_files(directory)

    return html_files

In [26]:
def load_or_parse_data():
    data_file = "./Data/parsed_data_240529_cms.pkl" ##TODO

    if os.path.exists(data_file):
        # Load the parsed data from the file
        parsed_data = joblib.load(data_file)
    else:
        
        docs = get_html_file_list('Data/scrapped_html/sitemap-cms') ##TODO
    
        # Perform the parsing step and store the result in llama_parse_documents
        parsingInstructionUber10k = """The provided document contains information about Minerva University.
        This form provides detailed information about Minerva University and their educational programs, and stories.\
        It contains many tables and figures.
        Try to be precise while answering the questions"""
        parser = LlamaParse(api_key=llamaparse_api_key,
                            result_type="markdown",
                            parsing_instruction=parsingInstructionUber10k,
                            max_timeout=5000,)
        llama_parse_documents = parser.load_data(docs)


        # Save the parsed data to a file
        print("Saving the parse results in .pkl format ..........")
        joblib.dump(llama_parse_documents, data_file)

        # Set the parsed data to the variable
        parsed_data = llama_parse_documents

    return parsed_data

In [27]:
# Create vector database
def create_vector_database():
    """
    *Creates a vector database using document loaders and embeddings.

    *This function loads urls,
    *splits the loaded documents into chunks, transforms them into embeddings using OllamaEmbeddings,
    *and finally persists the embeddings into a Chroma vector database.

    """
    db_directory = "Data/test-db-240529-cms"
    os.makedirs(db_directory, exist_ok =True)
    os.chmod(db_directory, 0o755)



    # Call the function to either load or parse the data
    llama_parse_documents = load_or_parse_data()
    print(llama_parse_documents[0].text[:300])

    with open('Data/test-db-240529-cms/output.md', 'a') as f:  #TODO
        for doc in llama_parse_documents:
            f.write(doc.text + '\n')

    markdown_path = "Data/test-db-240529-cms/output.md" ##TODO
    loader = UnstructuredMarkdownLoader(markdown_path)

   #loader = DirectoryLoader('data/', glob="**/*.md", show_progress=True)
    documents = loader.load()
    # Split loaded documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    #len(docs)
    print(f"length of documents loaded: {len(documents)}")
    print(f"total number of document chunks generated :{len(docs)}")
    #docs[0]

    # Initialize Embeddings
    embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

    # Create and persist a Chroma vector database from the chunked documents
    vs = Chroma.from_documents(
        documents=docs,
        embedding=embed_model,
        persist_directory=db_directory,  # Local mode with in-memory storage only
        collection_name="rag"
    )

    #query it
    #query = "what is the agend of Financial Statements for 2022 ?"
    #found_doc = qdrant.similarity_search(query, k=3)
    #print(found_doc[0][:100])
    #print(qdrant.get())

    print('Vector DB created successfully !')
    return vs,embed_model

In [28]:
vs,embed_model = create_vector_database()

Parsing files:   0%|          | 0/426 [00:00<?, ?it/s]

Parsing files: 100%|██████████| 426/426 [14:40<00:00,  2.07s/it]


Saving the parse results in .pkl format ..........
#

# Minerva University

# Academics

# Admissions

# About

# Stories

# Events

# MINERVA VOICES

Connecting to the Classroom: A Minerva Faculty Perspective

by Rohan Shekhar, Ph.D., Professor of Computational Sciences

May 4, 2018
---
#

# Minerva University Information

# About Minerva Universit
length of documents loaded: 1
total number of document chunks generated :1385


Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 71089.90it/s]


Vector DB created successfully !


# Chatting

In [12]:
# 240528 - merging cms + static

vectorstore_1 = Chroma(embedding_function=FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5"),
                      persist_directory="Data/test-db-240529-cms-and-static",
                      collection_name="rag")

vectorstore_2 = Chroma(embedding_function=FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5"),
                      persist_directory="Data/test-db-240529-static",
                      collection_name="rag")

vectorstore_2_data = vectorstore_2._collection.get(include=['documents','metadatas','embeddings'])

vectorstore_1._collection.add(
     embeddings=vectorstore_2_data['embeddings'],
     metadatas=vectorstore_2_data['metadatas'],
     documents=vectorstore_2_data['documents'],
     ids=vectorstore_2_data['ids']
)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 19803.14it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 63743.22it/s]


In [13]:
chat_model = ChatGroq(temperature=0.1,
                      model_name="llama3-8b-8192",
                      api_key=os.getenv('GROQ_API_KEY'),)
vectorstore = Chroma(embedding_function=FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5"),
                      persist_directory="Data/test-db-240529-cms-and-static",
                      collection_name="rag")
 
retriever=vectorstore.as_retriever(search_kwargs={'k': 3})

custom_prompt_template = """You are an tutor who have to give information about Minerva University to students who are curious about Minerva University.

This interaction concludes with your one reply, as a markdown format.

Use the following pieces of information to answer the student's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Also, please add that your response is based on the user provided information, and it is essential to consult with a qualified veterinarian.
Helpful answer:
"""

def set_custom_prompt():
    """
    Prompt template for QA retrieval for each vectorstore
    """
    prompt = PromptTemplate(template=custom_prompt_template,
                            input_variables=['context', 'question'])
    return prompt
#
prompt = set_custom_prompt()
prompt

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 22501.63it/s]


PromptTemplate(input_variables=['context', 'question'], template="You are an tutor who have to give information about Minerva University to students who are curious about Minerva University.\n\nThis interaction concludes with your one reply, as a markdown format.\n\nUse the following pieces of information to answer the student's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nContext: {context}\nQuestion: {question}\n\nOnly return the helpful answer below and nothing else.\nAlso, please add that your response is based on the user provided information, and it is essential to consult with a qualified veterinarian.\nHelpful answer:\n")

In [14]:
qa = RetrievalQA.from_chain_type(llm=chat_model,
                               chain_type="stuff",
                               retriever=retriever,
                               return_source_documents=True,
                               chain_type_kwargs={"prompt": prompt})

In [18]:
query = """Who is the president of Minerva University?"""

response = qa.invoke({"query": query})

In [19]:
response['result']

'Based on the provided information, the President of Minerva University is Mike Magee.'

In [20]:
response['source_documents']

[Document(page_content='Contact: info@minerva.edu\n\nMinerva University\n\nAbout Minerva University\n\nMinerva University is an independent, non-profit educational institution accredited by Western Senior Colleges and Universities Commission (WASC). The Minerva name, logo, and trade dress are trademarks of Minerva Project.\n\n© 2024. Minerva Project, Inc. All Rights Reserved.\n\nMinerva University\n\nAcademics\n\nAdmissions\n\nAbout\n\nStories\n\nEvents\n\nMINERVA VOICES\n\nA Letter from Teri Cannon, Minerva University Founding President\n\nLetter from the Founding President\n\nApril 20, 2022\n\nMinerva University\n\nWelcome to Minerva University\n\nThis form provides detailed information about Minerva University and their educational programs, and stories. It contains many tables and figures.\n\nMessage from the President\n\nDear Friends of Minerva University,\n\nAs we look to build on our remarkable progress and expand access to our innovative education model to help more thinkers, l

In [24]:
from telegram import Update, Bot
from telegram.ext import Application, CommandHandler, MessageHandler, filters, CallbackContext


TELEGRAM_TOKEN = '7231454526:AAH8ZAqt9USSWa2u6ojJHEKFeJYNRkovR2U'

async def start(update: Update, context: CallbackContext) -> None:
    await update.message.reply_text('Hi! Ask me anything about Minerva University.')

async def handle_message(update: Update, context: CallbackContext) -> None:
    query = update.message.text
    response = qa.invoke({"query": query})
    await update.message.reply_text(response['result'])

def main() -> None:
    application = Application.builder().token(TELEGRAM_TOKEN).build()

    application.add_handler(CommandHandler("start", start))
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

    application.run_polling()

if __name__ == '__main__':
    main()

RuntimeError: Cannot close a running event loop