In [1]:
import sys
sys.path.append("../..")
from os.path import expanduser

# from langchain.llms import LlamaCpp
# from langchain_community.llms.llamacpp import LlamaCpp
from langchain.llms.llamacpp import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.output_parsers import StrOutputParser
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.llms.ctransformers import CTransformers
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnablePick
from langchain_core.prompts import ChatPromptTemplate



In [2]:
# loader = PyPDFLoader("https://gcp-dev-lms.tutorify.ai/media/course_files/Syllabus-S21-AI.pdf")
# docs = loader.load_and_split()
# texts = [page.page_content for page in docs]

In [3]:
local_file = PyPDFLoader("../static/Syllabus.pdf")
docs = local_file.load_and_split()
texts = [page.page_content for page in docs]

In [4]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# GPU 

In [5]:
model_path = expanduser("~/workspace/models/llama-2-7b-chat.Q2_K.gguf")

n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
    n_ctx=3500,
    f16_kv=True,
    streaming=False
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/ehsanghaffarii/workspace/models/llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.att

In [6]:
rag_prompt = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question}

Context: {context}

Answer:
"""

prompt = ChatPromptTemplate.from_template(rag_prompt)


In [7]:
# rag_prompt = hub.pull("rlm/rag-prompt")
# rag_prompt.messages

In [8]:


# Chain
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

# Chain
chain = (
    RunnablePassthrough.assign(context=RunnablePick("context") | format_docs)
    | prompt
    | llm
    | StrOutputParser()
)

# Run
question = "What is Reporting Bias Incidents?"

result =  chain.invoke({"context": docs, "question": question})
print(result)


Of course, I'll do my best to help you with that! Here's a concise answer to the question: What is Reporting Bias Incidents?
Reporting Bias Incidents is an expectation at the University of New Haven. It means that all members of the community are committed to creating and supporting an environment that promotes mutual respect, civility, and open-mindedness. This includes being responsible for reporting any instances of bias that may affect themselves or others in the university community. The form for reporting bias incidents can be found on the University's website at www.newhaven.edu/biasreporting. It is important to report these incidents as soon as possible so that they can be addressed and resolved in a timely manner.
In addition, it is important to note that with freedom of expression comes the responsibility to support community members' right to live and work in an environment free from harassment and fear. This means engaging in anti-bias behavior and refraining from actions t


llama_print_timings:        load time =    9150.52 ms
llama_print_timings:      sample time =     681.93 ms /   233 runs   (    2.93 ms per token,   341.68 tokens per second)
llama_print_timings: prompt eval time =   56554.83 ms /  3267 tokens (   17.31 ms per token,    57.77 tokens per second)
llama_print_timings:        eval time =   71188.74 ms /   232 runs   (  306.85 ms per token,     3.26 tokens per second)
llama_print_timings:       total time =  153842.20 ms


: 