<a href="https://colab.research.google.com/github/fayaz1420/Github/blob/master/HandBook_LLM_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install or upgrade the third-party packages used by the cells below.
# NOTE(review): every package is unpinned (-U), so a re-run picks up whatever is
# latest at that moment — consider pinning versions for reproducibility.
# NOTE(review): no visible cell imports bitsandbytes; confirm it is still needed.
%pip install -U transformers
%pip install -U accelerate
%pip install -U bitsandbytes
%pip install -U sentence-transformers
%pip install -U langchain-community
%pip install -U chromadb

Collecting transformers
  Downloading transformers-5.1.0-py3-none-any.whl.metadata (31 kB)
Downloading transformers-5.1.0-py3-none-any.whl (10.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 5.0.0
    Uninstalling transformers-5.0.0:
      Successfully uninstalled transformers-5.0.0
Successfully installed transformers-5.1.0
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1
Collecting langchain-community
  Downloading langchain_community-

In [None]:
# Open Colab's file-upload dialog and keep its result in `uploaded`.
# NOTE(review): presumably this is how "phd-ie-program-handbook-2025.pdf" (read by
# the PDF loader cell) gets into the working directory — confirm.
from google.colab import files
uploaded = files.upload()

In [None]:
# Quietly install/upgrade pypdf.
# NOTE(review): presumably the PDF backend required by PyPDFLoader — confirm.
%pip install -qU pypdf

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# 1-based page numbers to leave out of the corpus (the first nine pages).
remove_pages = list(range(1, 10))

loader = PyPDFLoader("phd-ie-program-handbook-2025.pdf")

# Stream the PDF page by page and keep only pages that are not excluded.
pages = []
for i, page in enumerate(loader.lazy_load(), start=1):
    if i in remove_pages:
        continue
    pages.append(page)

In [None]:
# Sanity check: show the metadata and the text of the first page that was kept.
print(f"{pages[0].metadata}\n")
print(pages[0].page_content)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the kept pages into chunks of up to 1000 characters, with 200 characters
# of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(pages)

In [None]:
# Preview up to the first three chunks, one per line. Slicing (instead of
# indexing 0, 1 and 2 separately) keeps this cell from raising IndexError when
# the document yields fewer than three chunks.
print(*splits[:3], sep="\n")

In [None]:
# Number of chunks produced by the splitter.
print(len(splits))

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Sentence-embedding model used to vectorise every chunk.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
persist_directory = "phd_handbook_chromadb"

# Deterministic ids make re-running this cell overwrite the same records instead
# of appending a second copy of every chunk to the persisted collection (without
# ids, each run stores the chunks again under fresh random ids, and the
# retriever then returns duplicates).
# NOTE(review): chunks left over from an earlier run that produced MORE chunks
# are not removed; delete `persist_directory` when the chunking changes.
chunk_ids = [f"chunk-{i}" for i in range(len(splits))]

vector_db = Chroma.from_documents(
    splits,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=persist_directory,
)
# No explicit `vector_db.persist()` call: Chroma >= 0.4 writes to
# `persist_directory` automatically and the LangChain method is deprecated.
print(f"Stored embeddings in ChromaDB at '{persist_directory}'")

In [None]:
# Upgrade langchain and langchain-core to their latest releases.
# NOTE(review): earlier cells have already imported langchain packages, so an
# upgrade at this point may require a runtime restart to take effect — confirm,
# or move this line into the install cell at the top of the notebook.
%pip install --upgrade langchain langchain-core

In [None]:
from transformers import pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Local text-generation pipeline for the TinyLlama chat model.
llm_pipeline = pipeline(
    "text-generation",
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    torch_dtype="auto",
    device_map="auto",
    max_new_tokens=256,
    # Greedy decoding. `temperature` is deliberately not set: it only affects
    # sampling, so combining it with do_sample=False has no effect and merely
    # triggers a configuration warning.
    do_sample=False,
    # Return only the completion, not the prompt followed by the completion.
    return_full_text=False,
    # NOTE(review): presumably the id of TinyLlama's end-of-sequence token —
    # confirm against `llm_pipeline.tokenizer.eos_token_id`.
    eos_token_id=2,
)

# LangChain wrapper so the pipeline can be composed into a runnable chain.
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# Retriever over the vector store that returns the 2 most similar chunks.
retriever = vector_db.as_retriever(search_kwargs={"k": 2})

def format_docs(docs):
    """Concatenate the ``page_content`` of each document in ``docs``.

    Documents are joined in order, separated by a blank line. An empty
    iterable yields an empty string.
    """
    contents = (doc.page_content for doc in docs)
    return "\n\n".join(contents)



In [None]:
# Prompt made of a system instruction plus a human turn that carries the
# retrieved context and the user's question.
# NOTE(review): `llm` is a HuggingFacePipeline (a plain text-in/text-out LLM), so
# these chat messages are presumably flattened to text rather than rendered with
# the model's own chat template — confirm.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a concise expert assistant. Only answer user questions directly using the provided context."),
    ("human", "Context:\n{context}\n\nQuestion:\n{question}\n\nAnswer:")
])

# The input question feeds two branches: it goes through the retriever (whose
# hits are joined by format_docs) to fill `context`, and it is passed through
# unchanged as `question`. The filled prompt is sent to the LLM and the result
# is parsed to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# First query against the chain.
response = rag_chain.invoke("What are the graduation requirements for the PhD in Interdisciplinary Engineering?")
print(response)

In [None]:
# Query the RAG chain about required credit hours and print the answer.
response = rag_chain.invoke("How many credit hours are required for the degree?")
print(response)

In [None]:
# Same topic as the first query, but without naming the program in the question.
response = rag_chain.invoke("What are the graduation requirements?")
print(response)

In [None]:
# Query the RAG chain about the Graduate Research Assistant stipend and print the answer.
response = rag_chain.invoke("What is the stipend for a Graduate Research Assistant?")
print(response)

In [None]:
# Query the RAG chain about the number of pre-dissertation milestones and print the answer.
response = rag_chain.invoke("How many milestones should i complete before dissertation")
print(response)

In [None]:
# Variant of the milestone question that also asks the model to list the milestones.
response = rag_chain.invoke("How many milestones should i complete before dissertation and what are they?")
print(response)

In [None]:
# Query the RAG chain about GPA requirements and print the answer.
response = rag_chain.invoke("What are the GPA requirements to stay in the program?")
print(response)

In [None]:
def output_test(question):
    """Answer ``question`` with a manual retrieve-then-generate pass.

    Debug counterpart of the RAG chain: it retrieves the 3 most similar chunks
    from ``vector_db``, prints them, builds a chat prompt with the model's own
    chat template and generates greedily with the wrapped transformers
    pipeline.

    Args:
        question: The user's question as a string.

    Returns:
        The generated answer with surrounding whitespace stripped. The answer
        is also printed, together with the retrieved documents and the joined
        context.
    """
    # Step 1: Retrieve the most similar chunks and show them for inspection.
    docs = vector_db.similarity_search(question, k=3)
    print(docs)

    # Step 2: Format retrieved text
    context_text = "\n\n".join(d.page_content for d in docs)
    print(context_text)

    messages = [
        {
            "role": "system",
            "content": (
                "You are an academic assistant that answers questions "
                "about the PhD in Interdisciplinary Engineering program using only the given context. "
                "If the context does not contain the answer, say 'The document does not contain this information.'"
            ),
        },
        {
            "role": "user",
            "content": f"Context:\n{context_text}\n\nQuestion:\n{question}\n\nAnswer:",
        },
    ]

    # `llm` is the LangChain wrapper; it has no `tokenizer` attribute and, when
    # called, returns a plain string. The tokenizer and the list-of-dicts
    # generation API live on the transformers pipeline it wraps.
    hf_pipeline = llm.pipeline

    # Step 3: Render the messages with the model's chat template.
    prompt = hf_pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Step 4: Generate greedily. `temperature` is not passed because it is
    # ignored when do_sample=False. return_full_text=False is requested
    # explicitly so the result holds only the completion and must not be
    # sliced by len(prompt) (slicing would cut off the start of the answer).
    outputs = hf_pipeline(
        prompt,
        max_new_tokens=256,
        do_sample=False,
        return_full_text=False,
    )
    answer = outputs[0]["generated_text"].strip()
    print(answer)
    return answer

In [None]:
# Run the manual retrieve-and-generate helper on the GPA question.
# output_test() already prints the answer, so this print shows it a second time.
response = output_test("What are the GPA requirements to stay in the program?")
print(response)

In [None]:
import nbformat
from google.colab import drive

# The notebook to repair is read from Google Drive, so Drive is mounted first.
drive.mount('/content/drive')

# Input notebook and the location where the repaired copy is written.
notebook_path = '/content/drive/MyDrive/ColaNotebooks/HandBook_LLM.ipynb'
fixed_path = '/content/drive/MyDrive/HandBook_LLM_RAG.ipynb'

nb = nbformat.read(notebook_path, as_version=4)

# Drop the "widgets" metadata entry when it exists; a missing key is fine.
nb.metadata.pop("widgets", None)

# Write the cleaned notebook to its new path and report where it went.
nbformat.write(nb, fixed_path)
print("Notebook fixed and saved to:", fixed_path)
