# Install packages

In [1]:
# !pip -q install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
# bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

In [2]:
!pip -q install transformers accelerate einops langchain xformers \
bitsandbytes sentence_transformers chromadb sentence-transformers huggingface_hub torch==2.1.0 rank_bm25

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m276.5/280.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m807.5/807.5 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.2/218.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━

In [3]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Replacement for ``locale.getpreferredencoding`` that always reports UTF-8.

    The ``do_setlocale`` parameter mirrors the standard-library signature so
    that callers using ``locale.getpreferredencoding(False)`` keep working;
    its value is ignored.
    """
    return "UTF-8"


# Force every consumer of the preferred encoding in this kernel to see UTF-8.
locale.getpreferredencoding = _utf8_preferred_encoding

In [4]:
import time
from functools import wraps

def timer(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function's return value is passed through unchanged and its
    metadata (name, docstring) is preserved via ``functools.wraps``. If
    ``func`` raises, the exception propagates and no timing line is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature as ``func``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring durations; time.time() can jump when the system clock
        # is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)  # Execute the function
        elapsed = time.perf_counter() - start_time
        print(f"{func.__name__} took {elapsed:.4f} seconds to run.")
        return result
    return wrapper

# To use this, add @timer above the function definition.

# Setting up LangChain and ChromaDB


In [5]:
# database
from langchain.vectorstores import Chroma
# textsplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embeddings
import langchain_community
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings import HuggingFaceHubEmbeddings

# retriever
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# QA and model
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate

# miscellany
from huggingface_hub import login

# torch
from torch import cuda, bfloat16
import torch
import transformers
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer

# Setting up Configurations and login

In [6]:
# Colab only: attach Google Drive to the runtime's filesystem.
from google.colab import drive

# force_remount=True re-mounts even when a mount is already present.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)
# %cd "drive/MyDrive/CMU/Junior/11711/chuangjl/raw_data"

Mounted at /content/drive


In [7]:
# Huggingface login
# The access token must never be stored in the notebook. It is read from the
# HF_TOKEN environment variable, falling back to an interactive hidden prompt.
# NOTE(review): the token that was previously hardcoded here is exposed in
# the notebook history and should be revoked in the Hugging Face settings.
import os
from getpass import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Model, Tokenizer, Query Pipeline

In [61]:
# Checkpoint to load ("HuggingFaceH4/zephyr-7b-beta" was noted as an alternative).
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Target the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Model configuration fetched for the chosen checkpoint.
model_config = transformers.AutoConfig.from_pretrained(model_name)

# 4-bit NF4 quantization with double quantization and bfloat16 compute.
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)

# Causal LM loaded with the quantization settings above; device placement is
# delegated to device_map='auto'.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [62]:
# query_pipeline
# Text-generation pipeline built around the already-loaded model and tokenizer.
pipeline_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)
query_pipeline = transformers.pipeline("text-generation", **pipeline_kwargs)

# LangChain-facing LLM object that delegates generation to the pipeline above.
llm = HuggingFacePipeline(pipeline=query_pipeline)

# Embeddings


In [63]:
# Disabled alternative kept for reference:
#   HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
#                                 model_kwargs={"device": "cuda"})
embedding_model_name = "mixedbread-ai/mxbai-embed-large-v1"
# Run the embedding model on the same device selected for the LLM.
embedding_model_kwargs = dict(device=device)

# The embeddings class is reached through its full module path in
# langchain_community rather than through an imported short name.
hf_embeddings_cls = langchain_community.embeddings.huggingface.HuggingFaceEmbeddings
embedding = hf_embeddings_cls(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
)




# Create ChromaDB as Retriever

Build the dense Chroma vector retriever first; BM25 is added as the sparse retriever in the Hybrid Search section below.

In [64]:
# Switch the working directory to the project's database folder on the mounted
# Drive; the relative persist directory 'db' used later resolves against it.
%cd /content/drive/MyDrive/CMU/Junior/11711/chuangjl/database

/content/drive/MyDrive/CMU/Junior/11711/chuangjl/database


In [None]:
# Load Document
loader = DirectoryLoader('/content/drive/MyDrive/CMU/Junior/11711/chuangjl/database', glob="./*.txt", loader_cls=TextLoader, )
documents = loader.load()

In [125]:
# Splitting Text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

In [None]:
import os

# Directory (relative to the current working directory) holding the Chroma index.
persist_directory = 'db'

# Building the store embeds every chunk and appends it to the persisted
# collection, so doing it on every run both repeats expensive work and fills
# the index with duplicate chunks. Reuse the on-disk index when it exists.
# To rebuild after changing the documents, the chunking, or the embedding
# model, delete the persist directory first.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    vectordb.persist()

In [None]:
# Dense retriever over the Chroma store, configured with k=3 search results.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

### Test Default Dense Vector Retriever

In [None]:
# Smoke test: query the dense retriever and inspect its first hit.
test_output = retriever.get_relevant_documents("what is the course 11711?")
print(len(test_output))            # number of documents returned
first_hit = test_output[0]
print("source: " + first_hit.metadata['source'])
print(first_hit.page_content)
print(len(first_hit.page_content))  # character length of the first hit

## Hybrid Search

In [None]:
# Sparse keyword retriever built from the same chunks as the vector store.
bm25_retriever = BM25Retriever.from_documents(texts)
# Match the dense retriever's result count.
bm25_retriever.k = 3

In [None]:
# Hybrid retriever: BM25 is listed first with weight 0.4, the dense Chroma
# retriever second with weight 0.6.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever],
    weights=[0.4, 0.6],
)

In [None]:
# Smoke test: query the hybrid retriever and inspect its first hit.
test_output = ensemble_retriever.get_relevant_documents("what is the course 11711?")
print(len(test_output))            # number of documents returned
first_hit = test_output[0]
print("source: " + first_hit.metadata['source'])
print(first_hit.page_content)
print(len(first_hit.page_content))  # character length of the first hit

## Parent Retriever or Big Chunk Retriever

In [None]:
from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever

# Two splitters: 2000-character parent chunks, and 500-character child chunks
# with a 100-character overlap.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)

# Separate Chroma collection for this retriever (OpenAIEmbeddings() was noted
# as an alternative embedding function), plus an in-memory document store.
vectorstore = Chroma(
    collection_name="split_parents",
    embedding_function=embedding,
)
store = InMemoryStore()

big_chunks_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

In [None]:
# Feed the loaded (unsplit) documents to the parent-document retriever.
# NOTE(review): re-running this cell presumably adds the same documents again - confirm.
big_chunks_retriever.add_documents(documents)

### Test Big Chunk Retriever

In [None]:
# Smoke test: query the parent-document retriever and inspect its first hit.
test_output = big_chunks_retriever.get_relevant_documents("what is the course 11711?")
print(len(test_output))            # number of documents returned
first_hit = test_output[0]
print("source: " + first_hit.metadata['source'])
print(first_hit.page_content)
print(len(first_hit.page_content))  # character length of the first hit

# Customized Prompt and Template

In [None]:
## Default LLaMA-2 prompt style
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

sys_prompt = """You are a honest assistant. You are given context about Carnegie Mellon University. You are suppose to answer questions based on context. You should only answer question ONCE and do not give unhelpful answer.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, don't share false information. """

instruction = """CONTEXT:/n/n {context}/n

Question: {question}"""


In [None]:
# prompt_template = """
# You are a honest assistant. You are given the below context about Carnegie Mellon University.

# Context:
# {context}

# Now, answer the question based ONLY on the above context.

# The question is

# {question}
# """

In [None]:
def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    """Assemble a complete prompt string in the LLaMA-2 chat layout.

    The result is the concatenation, in order, of: B_INST, the system prompt
    wrapped in B_SYS / E_SYS, the instruction text, and E_INST.

    Args:
        instruction: Text placed after the system block.
        new_system_prompt: System prompt to embed; defaults to
            DEFAULT_SYSTEM_PROMPT.

    Returns:
        The assembled prompt template string.
    """
    pieces = [B_INST, B_SYS, new_system_prompt, E_SYS, instruction, E_INST]
    return "".join(pieces)

In [None]:

# Build the template with the task-specific system prompt. `sys_prompt` was
# defined for this pipeline but never passed, so the chain silently fell back
# to the generic DEFAULT_SYSTEM_PROMPT.
prompt_template = get_prompt(instruction, sys_prompt)

# The template exposes the {context} and {question} placeholders.
llama_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [None]:
# Extra keyword arguments for the QA chain: use the custom LLaMA prompt.
chain_type_kwargs = dict(prompt=llama_prompt)

# Chain

In [None]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm = llm,
                                       chain_type = "stuff",
                                       retriever = big_chunks_retriever,
                                       chain_type_kwargs = chain_type_kwargs,
                                       return_source_documents = True)

# Experiment

In [None]:
## Cite sources
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to ``width`` columns while keeping its own line breaks.

    The text is split on newline characters, each piece is wrapped
    independently with ``textwrap.fill``, and the pieces are joined back
    together with newlines.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill`` (default 110).

    Returns:
        The wrapped string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print a chain response: the wrapped answer, then each source path.

    Args:
        llm_response: Mapping with a 'result' entry (the answer text) and a
            'source_documents' entry (an iterable of objects whose
            ``metadata['source']`` is printed, one per line).
    """
    answer_text = llm_response['result']
    print(wrap_text_preserve_newlines(answer_text))
    print('\n\nSources:')
    cited_documents = llm_response["source_documents"]
    for document in cited_documents:
        print(document.metadata['source'])

# The formatted output should include:
# The answer
# Time spent to infer
# The Source file

In [None]:
# Ask one sample question through the QA chain and print the answer together
# with the source of each document listed in the response.
query = "Could you tell me what is the course 48205?"
llm_response = qa_chain.invoke(query)
process_llm_response(llm_response)

In [None]:
# Show which fields the chain response contains.
print(llm_response.keys())

## Test Model

In [None]:
from transformers import pipeline

# Direct model check without the retrieval chain: the context is pasted into
# the prompt by hand. Line breaks around the context use the newline escape
# "\n"; the earlier "/n" spelling put the literal characters "/n" into the
# prompt text.
prompt = """\
You are a honest assistant. You are given context about Carnegie Mellon University. You are suppose to answer questions based on context. Your answer should only include answers and NOTHING ELSE. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.

CONTEXT:\n\n Speech Technology for Conversational AI, (class number 11692), is a 12.0-unit course offered by CMU Language Technologies Institute Department, which is taught by Watanabe on MW from 03:30PM to 04:50PM in GHC 5222 on Pittsburgh, Pennsylvania campus
MIIS Capstone Planning Seminar, (class number 11696), is a 6.0-unit course offered by CMU Language Technologies Institute Department, which is taught by Brown on TBA from None to None in DNM DNM on Pittsburgh, Pennsylvania campus
MSAII Program Capstone, (class number 11699), is a 36.0-unit course offered by CMU Language Technologies Institute Department, which is taught by Shamos on TBA from None to None in TBA on Pittsburgh, Pennsylvania campus
LTI Colloquium, (class number 11700), is a 6.0-unit course offered by CMU Language Technologies Institute Department, which is taught by Bisk, Diaz on F from 02:00PM to 03:20PM in SH 105 on Pittsburgh, Pennsylvania campus
Advanced Natural Language Processing, (class number 11711), is a 12.0-unit course offered by CMU Language Technologies Institute Department, which is taught by Neubig on TR from 12:30PM to 01:50PM in TEP 1403 on Pittsburgh, Pennsylvania campus
Lab in Natural Language Processing: Self-Paced, (class number 11712), is a 6.0-unit course offered by CMU Language Technologies Institute Department, which is taught by Frederking on TBA from None to None in TBA on Pittsburgh, Pennsylvania campus
Grammar Formalisms, (class number 11722), is a 12.0-unit course offered by CMU Language Technologies Institute Department, which is taught by Levin on MW from 02:00PM to 03:20PM in WEH 4707 on Pittsburgh, Pennsylvania campus
Linguistics Lab: Self-Paced, (class number 11723), is a 6.0-unit course offered by CMU Language Technologies Institute Department, which is taught by Levin on TBA from None to None in TBA on Pittsburgh, Pennsylvania campus \n

Question: what is course with course number 11711?
"""
# max_length=1000 is the generation length limit handed to the pipeline.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=1000)
# Wrap the prompt in the [INST] ... [/INST] markers before generating.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])