In [1]:
import os
import torch
from huggingface_hub import login

In [None]:
# Setting working directory
# Redirect the Hugging Face caches to a directory under work_dir.
work_dir = "/storage/work/~~~" 
# NOTE(review): huggingface_hub is already imported when this cell runs and it
# appears to resolve HF_HOME at import time, so this assignment may only affect
# libraries imported afterwards -- confirm, or export HF_HOME before starting
# the kernel.
os.environ['HF_HOME'] = f"{work_dir}/.cache/huggingface"
os.environ['TRANSFORMERS_CACHE'] = f"{work_dir}/.cache/huggingface"

# Huggingface token
# Never store an access token in the notebook itself: take it from the
# HF_TOKEN environment variable and, when that is unset or empty, fall back
# to an interactive prompt that does not echo the input.
import getpass

hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
login(token = hf_token)

In [3]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding



In [4]:
# Register the embedding model on the global LlamaIndex settings object.
Settings.embed_model = HuggingFaceEmbedding(model_name = "BAAI/bge-small-en-v1.5")

### Llama-3
# Register the generator LLM on the global settings object.
# Decoding is greedy (do_sample=False). No "temperature" entry is passed:
# temperature only applies when sampling is enabled, so combining it with
# do_sample=False had no effect on the output and only made transformers warn
# about an unused generation flag.
Settings.llm = HuggingFaceLLM(
    model_name = "meta-llama/Meta-Llama-3-8B-Instruct",
    tokenizer_name = "meta-llama/Meta-Llama-3-8B-Instruct",
    context_window = 8192,
    max_new_tokens = 256,
    generate_kwargs = {"do_sample": False},
    device_map = "auto",
    model_kwargs = {"torch_dtype": torch.float16, "load_in_8bit": False} 
)



Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Read the PDF / Word files located in the current directory
reader = SimpleDirectoryReader(input_dir=".", required_exts=[".pdf", ".docx"])
documents = reader.load_data()

# Build a vector index from the loaded documents, then derive a query engine from it
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [6]:
# Question text that the following cell prints and sends to `query_engine`.
question = "What is difference between prompt sensitivity and model sensitivity?"

In [7]:
# Input (question)
print(f"Asking Question: {question}")
response = query_engine.query(question)

# Output (answer)
print("\n" + "=" * 10)
print("Response:")
print(response)
print("=" * 10)

# Source
# The reader is configured to accept .docx files as well as PDFs, and a source
# node is not guaranteed to carry a 'page_label' entry (presumably only
# paginated formats set it -- confirm). Look the key up defensively so the
# citation listing cannot abort with a KeyError.
print("Citations:")
for node in response.source_nodes:
    page_label = node.metadata.get('page_label', 'n/a')
    print(f"- Page {page_label}: {node.text[:100]}...")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Asking Question: What is difference between prompt sensitivity and model sensitivity?

Response:
 Prompt sensitivity refers to the variation in metrics across different prompt templates or phrasings, whereas model sensitivity refers to the comparison of different models (e.g., GPT, Llama, Deepseek, etc.) and examining where they disagree with humans and with each other. In other words, prompt sensitivity is about the impact of different prompts on the model's performance, while model sensitivity is about the differences in performance between different models.
Citations:
- Page 8: 8/13
Step 3-Stability Evaluation:Reduce cross-prompt/model variance
Prompt Sensitivity
- Test multip...
- Page 2: 2/5
Applications- GenAI vs. Human Fact-checker (Tai et al., 2025)
Models:
- GPT-4o (OpenAI)
- Llama ...


In [8]:
# One question, asked first to the bare LLM and then to the query engine
who_question = "Who is Cassandra Tai?"

# Without RAG: the LLM completes the prompt directly
raw_response = Settings.llm.complete(who_question)
print(f"[Without RAG]:\n{raw_response}\n")

# Using RAG: the same text goes through the query engine instead
rag_response = query_engine.query(who_question)
print(f"[Using RAG]:\n{rag_response}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Without RAG]:
 Cassandra Tai is a popular American social media influencer, content creator, and entrepreneur. She is known for her lifestyle, beauty, and fashion content on platforms like Instagram, TikTok, and YouTube.
Cassandra Tai was born on August 24, 1995, in the United States. She grew up in a loving family and was raised with a strong sense of values and morals. From a young age, Cassandra was fascinated by the world of beauty and fashion, and she spent hours watching makeup tutorials and fashion shows on TV.
After completing her high school education, Cassandra decided to pursue a career in the beauty and fashion industry. She started by working as a makeup artist and hairstylist, and she quickly gained a reputation for her skills and attention to detail.
In 2015, Cassandra created her Instagram account, where she began sharing her passion for beauty, fashion, and lifestyle. Her account quickly gained popularity, and she soon became known as a social media influencer. She us

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [19]:
### Pure python retrieval
# TF-IDF + cosine-similarity retrieval over fragments obtained by splitting the
# concatenated document text on '.'.
doc_text = " ".join([doc.text for doc in documents])
# Strip each fragment BEFORE measuring it, so that fragments consisting mostly
# of whitespace (or blank after stripping) do not slip past the length filter.
corpus = [
    fragment
    for fragment in (sent.strip() for sent in doc_text.split('.'))
    if len(fragment) > 20
]
if not corpus:
    # Fail early with a readable message rather than an opaque scikit-learn
    # error about an empty input further down.
    raise ValueError("No text fragment longer than 20 characters was found in the loaded documents.")

# Question
query = "What is difference between prompt sensitivity and model sensitivity?"

# Vectorization
# The query is appended as the last row so that it shares the vocabulary and
# IDF weights with the corpus rows.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus + [query])

# Calculating cosine similarity
# X[-1] is the query row; X[:-1] holds the corpus rows in `corpus` order.
similarities = cosine_similarity(X[-1], X[:-1])

# Top 3 sentences
# argsort is ascending: keep the last top_k indices and reverse them so the
# best match comes first. Slicing copes with a corpus shorter than top_k.
top_k = 3
top_indices = np.argsort(similarities[0])[-top_k:][::-1]

print(f"Question: {query}\n")
for idx in top_indices:
    score = similarities[0][idx]
    print(f"[score {score:.4f}] {corpus[idx]}")

Question: What is difference between prompt sensitivity and model sensitivity?

[score 0.3950] 8/13
Step 3-Stability Evaluation:Reduce cross-prompt/model variance
Prompt Sensitivity
- Test multiple prompt templates / phrasings
- Assess variation in metrics across prompts
Model sensitivity
- Compare models (GPT, Llama, Deepseek, etc
[score 0.0754] Results: GenAI has potential but is fundamentally limited in its
capacity to detect political content credibility
[score 0.0730] )
- Examine where models disagree with humans and with each other 9/13
Step 4-Explainability & Oversight: Automated systems require oversight
Use model-generated rationales as auditable artifacts
Audit for:
- Logical coherence and conceptual validity
- Biases, hallucinations, ethical red flags
Two-way human–AI collaboration
- LLM rationales expand human awareness
- Human experts correct, constrain, and document model behavior 10/13
Step 5-Uncertainty & Error Correction:Acknowledging AI uncertainties
Misclassificati

In [20]:
### Parameter tuning

# The same request is sent to two engines that differ only in similarity_top_k
summary_request = "Summarize the entire document including methodology, results, and conclusion."

# top-k = 1
engine_k1 = index.as_query_engine(similarity_top_k=1)
response_k1 = engine_k1.query(summary_request)
print(f"[Answer (Top-k=1)]: {response_k1}\n")

# top-k = 5
engine_k5 = index.as_query_engine(similarity_top_k=5)
response_k5 = engine_k5.query(summary_request)
print(f"[Answer (Top-k=5)]: {response_k5}")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Answer (Top-k=1)]:  Unfortunately, the provided context information does not contain the entire document, but rather a snippet from the document. The snippet appears to be an outline or table of contents for a document titled "Evaluation Pipeline: From annotation to oversight" by Ko, Tai, and Webb Williams (2025). The outline mentions annotation, impersonating respondents, text annotation, and application cases, but does not provide any specific details about the methodology, results, or conclusion. Therefore, it is not possible to summarize the entire document based on this information. Further context or access to the full document would be necessary to provide a comprehensive summary.  If you have any additional context or would like me to assist with anything else, please let me know! 

[Answer (Top-k=5)]:  The document discusses the evaluation pipeline for large language models (LLMs) and the need for a systematic framework to validate and document their use. The authors propose 

In [18]:
### Vector Embeddings Visualization

# Embed a short phrase with the embedding model registered on Settings
query_str = "Social Science"
query_embedding = Settings.embed_model.get_query_embedding(query_str)

# Report the vector's length and a preview of its leading components
embedding_dim = len(query_embedding)
leading_values = query_embedding[:10]
print(f"Vector dimension of question '{query_str}': {embedding_dim}")
print(f"First 10 elements of the vector: {leading_values}")

# Taken together, these numbers represent the semantic meaning of the phrase

Vector dimension of question 'Social Science': 384
First 10 elements of the vector: [0.061991237103939056, 0.046792175620794296, -0.027558419853448868, -0.035168781876564026, 0.009088116697967052, 0.008314818143844604, 0.0070055569522082806, 0.02457190304994583, 0.0032607668545097113, -0.011888917535543442]
