In [None]:
"""
import os
import torch
from huggingface_hub import login
"""

In [None]:
"""
# Setting working directory
work_dir = "/storage/work/~~~" 
os.environ['HF_HOME'] = f"{work_dir}/.cache/huggingface"
os.environ['TRANSFORMERS_CACHE'] = f"{work_dir}/.cache/huggingface"

# Huggingface token
hf_token = "hf_~~~" 
login(token = hf_token)
"""

In [6]:
%pip install llama-index-embeddings-ollama

Collecting llama-index-embeddings-ollama
  Obtaining dependency information for llama-index-embeddings-ollama from https://files.pythonhosted.org/packages/af/53/9ab65a3d0db29f49967a292b7c33cf512e493e68cb63ef337b1fffadf489/llama_index_embeddings_ollama-0.8.6-py3-none-any.whl.metadata
  Downloading llama_index_embeddings_ollama-0.8.6-py3-none-any.whl.metadata (8.4 kB)
Collecting pytest-asyncio>=0.23.8 (from llama-index-embeddings-ollama)
  Obtaining dependency information for pytest-asyncio>=0.23.8 from https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl.metadata
  Downloading pytest_asyncio-1.3.0-py3-none-any.whl.metadata (4.1 kB)
Collecting pytest<10,>=8.2 (from pytest-asyncio>=0.23.8->llama-index-embeddings-ollama)
  Obtaining dependency information for pytest<10,>=8.2 from https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-p


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
# from llama_index.llms.huggingface import HuggingFaceLLM
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

In [None]:
"""
Settings.embed_model = HuggingFaceEmbedding(model_name = "BAAI/bge-small-en-v1.5")

Settings.llm = HuggingFaceLLM(
    model_name = "meta-llama/Meta-Llama-3-8B-Instruct",
    tokenizer_name = "meta-llama/Meta-Llama-3-8B-Instruct",
    context_window = 8192,
    max_new_tokens = 256,
    generate_kwargs = {"temperature": 0.1, "do_sample": False},
    device_map = "auto",
    model_kwargs = {"torch_dtype": torch.float16, "load_in_8bit": False} 
)
"""

# Both the embedder and the chat model are served by the same Ollama model tag.
ollama_model_tag = "llama3"

# Global llama-index defaults: every index / query engine built afterwards
# picks these up from Settings.
Settings.embed_model = OllamaEmbedding(model_name=ollama_model_tag)
Settings.llm = Ollama(model=ollama_model_tag, request_timeout=360.0, temperature=0.1)

In [9]:
# Read every .pdf / .docx file found in the current working directory.
supported_exts = [".pdf", ".docx"]
reader = SimpleDirectoryReader(input_dir=".", required_exts=supported_exts)
documents = reader.load_data()

# Build the vector index over the loaded documents, then wrap it in a query
# engine using the library's default retrieval settings.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

2026-01-28 00:23:35,841 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2026-01-28 00:23:36,403 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2026-01-28 00:23:38,876 - INFO - HTTP Request: POST http://localhost:11434/api/show "HTTP/1.1 200 OK"


In [10]:
# Natural-language question that the next cell sends to `query_engine`.
question = "What is difference between prompt sensitivity and model sensitivity?"

In [11]:
# Input (question): run the question through the RAG query engine.
print(f"Asking Question: {question}")
response = query_engine.query(question)

# Output (answer)
print("\n" + "=" * 10)
print("Response:")
print(response)
print("=" * 10)

# Source: one line per retrieved chunk with a 100-character preview.
# The reader is configured to load .docx files as well as PDFs, and a node is
# not guaranteed to carry a 'page_label' entry in its metadata, so look it up
# with a fallback instead of indexing (which would raise KeyError and abort
# the citation listing).
print("Citations:")
for node in response.source_nodes:
    page_label = node.metadata.get('page_label', 'n/a')
    print(f"- Page {page_label}: {node.text[:100]}...")

Asking Question: What is difference between prompt sensitivity and model sensitivity?


2026-01-28 00:24:09,954 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2026-01-28 00:24:31,944 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"



Response:
Prompt Sensitivity refers to testing multiple prompt templates/phrasings to assess variation in metrics across prompts. This involves examining how different prompts affect the performance of a language model.

Model Sensitivity, on the other hand, involves comparing models (e.g., GPT, Llama, Deepseek) and examining where they disagree with humans and with each other. This highlights the differences in their capabilities and limitations.
Citations:
- Page 4: 4/13
Why We Need a Systematic Framework
Can an LLM do this task?
V.S.
How can we validate and docume...
- Page 8: 8/13
Step 3-Stability Evaluation:Reduce cross-prompt/model variance
Prompt Sensitivity
- Test multip...


In [14]:
# Same question asked twice so the two answers can be compared side by side.
who_question = "Who is Cassandra Tai?"

# Without RAG: the bare LLM completes the prompt with no retrieved context.
raw_response = Settings.llm.complete(who_question)
print(f"[Without RAG]:\n{raw_response}\n")

# Using RAG: the query engine built on the document index answers instead.
rag_response = query_engine.query(who_question)
print(f"[Using RAG]:\n{rag_response}")

2026-01-28 00:27:10,332 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


[Without RAG]:
Cassandra Tai is a popular Australian YouTuber and beauty influencer. She was born on October 24, 1994, in Melbourne, Australia. Tai initially gained fame on YouTube by sharing makeup tutorials, product reviews, and lifestyle vlogs.

Over time, she expanded her content to include skincare routines, hair care tips, and even mental health discussions. Her relatable personality, honesty, and authenticity have helped her build a massive following across various social media platforms.

Cassandra Tai has collaborated with several well-known beauty brands and has been featured in prominent publications like Vogue Australia and Harper's Bazaar Australia. She is also an advocate for body positivity, self-acceptance, and mental wellness, using her platform to raise awareness about important issues.

Would you like to know more about Cassandra Tai's personal life, or perhaps some of her most popular content?



2026-01-28 00:27:16,002 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2026-01-28 00:27:23,048 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


[Using RAG]:
According to the provided information, Cassandra Tai is the author of a research paper titled "Applications- GenAI vs. Human Fact-checker" published in 2025.


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [16]:
### Pure python retrieval
# Flatten all loaded documents into one string and cut it at every period.
# A fragment is kept when its raw (pre-strip) length exceeds 20 characters;
# the stored entry is the stripped fragment.
doc_text = " ".join(doc.text for doc in documents)
corpus = []
for sent in doc_text.split('.'):
    if len(sent) > 20:
        corpus.append(sent.strip())

# Question
query = "What is difference between prompt sensitivity and model sensitivity?"

# Vectorization: fit TF-IDF on the fragments plus the question so they share
# one vocabulary; the question ends up as the last row of X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform([*corpus, query])

# Cosine similarity of the question row against every fragment row
similarities = cosine_similarity(X[-1], X[:-1])
scores = similarities[0]

# Top 3 sentences, highest score first
top_k = 3
top_indices = np.argsort(scores)[-top_k:][::-1]

print(f"Question: {query}\n")
for idx, score in zip(top_indices, scores[top_indices]):
    print(f"[score {score:.4f}] {corpus[idx]}")

Question: What is difference between prompt sensitivity and model sensitivity?

[score 0.3950] 8/13
Step 3-Stability Evaluation:Reduce cross-prompt/model variance
Prompt Sensitivity
- Test multiple prompt templates / phrasings
- Assess variation in metrics across prompts
Model sensitivity
- Compare models (GPT, Llama, Deepseek, etc
[score 0.0754] Results: GenAI has potential but is fundamentally limited in its
capacity to detect political content credibility
[score 0.0730] )
- Examine where models disagree with humans and with each other 9/13
Step 4-Explainability & Oversight: Automated systems require oversight
Use model-generated rationales as auditable artifacts
Audit for:
- Logical coherence and conceptual validity
- Biases, hallucinations, ethical red flags
Two-way humanâ€“AI collaboration
- LLM rationales expand human awareness
- Human experts correct, constrain, and document model behavior 10/13
Step 5-Uncertainty & Error Correction:Acknowledging AI uncertainties
Misclassificati

In [17]:
### Parameter tuning
# The same summarisation prompt is answered with two retrieval depths so the
# effect of similarity_top_k on the answer can be compared.
summary_prompt = "Summarize the entire document including methodology, results, and conclusion."

# top-k = 1: only the single most similar chunk is retrieved
engine_k1 = index.as_query_engine(similarity_top_k=1)
response_k1 = engine_k1.query(summary_prompt)
print(f"[Answer (Top-k=1)]: {response_k1}\n")

# top-k = 5: the five most similar chunks are retrieved
engine_k5 = index.as_query_engine(similarity_top_k=5)
response_k5 = engine_k5.query(summary_prompt)
print(f"[Answer (Top-k=5)]: {response_k5}")

2026-01-28 00:28:05,130 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2026-01-28 00:28:17,959 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


[Answer (Top-k=1)]: The Conception Task Prompts Gold section outlines a framework for creating high-quality labels or clusters as gold standards. This involves two main approaches: zero-shot and few-shot learning. Zero-shot learning aims to recover gold labels without any prior training data, focusing on reliability metrics such as Cohen's kappa and Krippendorff's alpha. Few-shot learning, on the other hand, involves fine-tuning a machine learning system using limited labeled data, with validity metrics like precision, recall, and F1-score.

The Like-human section explores alternative methods for creating gold labels, including human subjects (silicon participants) who take surveys, play games, or engage in other activities to simulate human behavior. Validity metrics for this approach include mean/standard deviation relative to human baseline data and human-ness factors like fluency, cohesiveness, objectivity, readability, and more.

Overall, the document presents a comprehensive fram

2026-01-28 00:28:23,897 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"
2026-01-28 00:28:37,696 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


[Answer (Top-k=5)]: The importance of acknowledging AI uncertainties in machine learning systems is highlighted. The framework presented emphasizes the need for a systematic approach to validating and documenting LLM use to ensure credible inferences.

The methodology involves designing a dual-track system that evaluates both reliability and validity. This includes metrics such as Cohen's kappa, Krippendorff's alpha, precision, recall, F1 score, AUC, MCC, Silhouette coefficient, pair comparison, and mean/SD relative to human baseline data.

The framework also acknowledges the potential for misclassification bias in regression and descriptive analyses when treating AI labels as ground truth. To address this, methods such as Design-based Supervised Learning (DSL) and Misclassification / maximum-likelihood adjustment (MLA) can be employed.

In terms of applications, the framework is demonstrated through a case study comparing GenAI with human fact-checkers. The results show that GenAI can

In [18]:
### Vector Embeddings Visualization
# Embed a short phrase with the configured embedding model and inspect the
# raw vector: its length and its leading components.
query_str = "Social Science"
query_embedding = Settings.embed_model.get_query_embedding(query_str)

embedding_dim = len(query_embedding)
leading_values = query_embedding[:10]

print(f"Vector dimension of question '{query_str}': {embedding_dim}")
print(f"First 10 elements of the vector: {leading_values}")

# Taken together, these numbers encode the semantic meaning of the phrase

2026-01-28 00:29:12,687 - INFO - HTTP Request: POST http://localhost:11434/api/embed "HTTP/1.1 200 OK"


Vector dimension of question 'Social Science': 4096
First 10 elements of the vector: [-0.016794328, -0.006202411, 0.017395807, 0.0011353239, -0.0016906768, 0.01175532, -0.019479135, 0.017811432, -0.004520285, 0.017569028]
