In [18]:
from langchain_community.llms import Ollama
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

 

In [19]:
model_name = "llama3"

llm = Ollama(model=model_name)
llm.invoke("Tell me a joke")

"Why couldn't the bicycle stand up by itself?\n\nBecause it was two-tired!\n\nHope that made you smile! Do you want to hear another one?"

In [20]:
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
loader = UnstructuredWordDocumentLoader("../data/inst346-202401-heidenblad-v01.docx")
data = loader.load()
data


[Document(page_content='Technologies, Infrastructure and Architecture (INST346)\n\n2\t\t\t\t\t\t\t\t\t\t\tLast updated 1/24/2024\n\n1\n\nTerm: Spring 2024\n\nProfessor: Mr. Donal Heidenblad\n\nPronouns: he/him\n\nEmail: dheidenb@umd.edu \n\nOffice Hours: By Appt.\n\nhttps://calendly.com/donal-heidenblad/15min   \n\nTeaching Assistant: Shashank Ramprasad\n\nEmail: shashram@umd.edu\n\nOffice Hours: See Canvas\n\nCredits: 3\n\nCourse Dates: From Jan 25, 2024 - May 13, 2024  \n\nCourse Times: \n\nLecture:\n\nAll Sections – Tuesday 9:30 to 10:45am; SHM 2102\n\nDiscussion Sessions:\n\n0301 – Thursday 8:00 to 8:50am; HBK 0302H\n\n0302 – Thursday 9:00 to 9:50am; HBK 0302H\n\n0303 – Thursday 10:00 to 10:50am; HBK 0302H\n\nCourse Description\n\nExamines the basic concepts of computer hardware, systems software, networking, client/server architectures, cloud computing, distributed systems, and high-performance computing as applied to information rich domains. Technology and architectures will be 

In [21]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("../data/inst346-202401-heidenblad-v01.pdf")
docs = loader.load()

In [22]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs) 

vectorstore = Chroma.from_documents(documents=splits, embedding=OllamaEmbeddings(model=model_name))


In [23]:
retreiver = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
retreiver_docs = retreiver.invoke("Who is the instructor for this course?")

In [24]:
retreiver_docs[0].page_content

'Additionally, students may naturally choose to use online forums for course-wide discussions (e.g., Group lists or chats) to discuss concepts in the course. However, collaboration on graded assignments is strictly prohibited unless otherwise stated. Examples of prohibited collaboration include: asking classmates for answers on quizzes or exams, asking for access codes to clicker polls, etc. Please visit\xa0the Office of Undergraduate Studies’ full list of campus-wide policies and reach out if you have questions.'

In [25]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [26]:
prompt = hub.pull("rlm/rag-prompt-llama")
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt-llama', 'lc_hub_commit_hash': '693a2db5447e3b58c060a6ac02758dc7f1aaaaa4ee6214d127bf70b443158630'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<</SYS>> \nQuestion: {question} \nContext: {context} \nAnswer: [/INST]"))])

In [27]:
rag_chain = (
    {"context": retreiver | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [29]:
for chunk in rag_chain.stream("Who is the professor that is teaching this class?"):
    print(chunk, end="", flush=True)

[SYS]

The professor teaching this class is not explicitly mentioned in the provided context.

[/SYS]