In [None]:
# Legacy Hugging Face imports, kept for reference only; the active cells of
# this notebook use Ollama instead.
# They are written as comments rather than a bare triple-quoted string, because
# a string that is the last expression of a cell is echoed as cell output.
# import os
# import torch
# from huggingface_hub import login

In [None]:
# Legacy Hugging Face configuration, kept for reference only.
# It is written as comments rather than a bare triple-quoted string: as the
# last expression of a cell, the string (token literal included) would be
# echoed as cell output.
#
# # Setting working directory
# work_dir = "/storage/work/~~~"
# os.environ['HF_HOME'] = f"{work_dir}/.cache/huggingface"
# os.environ['TRANSFORMERS_CACHE'] = f"{work_dir}/.cache/huggingface"
#
# # Huggingface token
# # NOTE: if this is ever re-enabled, read the token from an environment
# # variable or getpass instead of pasting it into the notebook.
# hf_token = "hf_~~~"
# login(token = hf_token)

In [None]:
%pip install llama-index-embeddings-ollama

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
# from llama_index.llms.huggingface import HuggingFaceLLM
# from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

In [None]:
# Previous Hugging Face configuration, kept for reference only (written as
# comments instead of a bare string literal so it is clearly inactive).
#
# Settings.embed_model = HuggingFaceEmbedding(model_name = "BAAI/bge-small-en-v1.5")
#
# Settings.llm = HuggingFaceLLM(
#     model_name = "meta-llama/Meta-Llama-3-8B-Instruct",
#     tokenizer_name = "meta-llama/Meta-Llama-3-8B-Instruct",
#     context_window = 8192,
#     max_new_tokens = 256,
#     generate_kwargs = {"temperature": 0.1, "do_sample": False},
#     device_map = "auto",
#     model_kwargs = {"torch_dtype": torch.float16, "load_in_8bit": False}
# )

# Single source of truth for the Ollama model used by both the embedding model
# and the LLM below, so the two cannot drift apart by accident.
OLLAMA_MODEL = "llama3"

# NOTE(review): the same chat model is used to produce embeddings; a dedicated
# embedding model may retrieve better -- confirm before relying on results.
Settings.embed_model = OllamaEmbedding(model_name=OLLAMA_MODEL)

Settings.llm = Ollama(
    model=OLLAMA_MODEL,
    request_timeout=360.0,  # presumably seconds; generous because local generation can be slow
    temperature=0.1,
)

In [None]:
# Collect the .pdf and .docx files located in the current directory.
reader = SimpleDirectoryReader(input_dir=".", required_exts=[".pdf", ".docx"])
documents = reader.load_data()

# Build a vector store index over the loaded documents and expose it
# through a query engine with default settings.
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

In [None]:
# Question that the following cell sends to the query engine.
question = (
    "What is difference between prompt sensitivity and model sensitivity?"
)

In [None]:
# Input (question): send it through the query engine.
print(f"Asking Question: {question}")
response = query_engine.query(question)

# Output (answer), framed by separator lines.
separator = "=" * 10
print("\n" + separator)
print("Response:")
print(response)
print(separator)

# Source: list the source nodes returned together with the response.
print("Citations:")
for node in response.source_nodes:
    # 'page_label' is not guaranteed to be present in every node's metadata
    # (the reader also loads .docx files, which may not carry page numbers),
    # so fall back to a placeholder instead of raising KeyError.
    page_label = node.metadata.get("page_label", "n/a")
    print(f"- Page {page_label}: {node.text[:100]}...")

In [None]:
# Use one prompt for both calls so the comparison differs only in whether the
# query engine (retrieval) is involved.
comparison_prompt = "Who is Cassandra Tai?"

# Without RAG: call the configured LLM directly.
raw_response = Settings.llm.complete(comparison_prompt)
print(f"[Without RAG]:\n{raw_response}\n")

# Using RAG: route the same prompt through the query engine.
rag_response = query_engine.query(comparison_prompt)
print(f"[Using RAG]:\n{rag_response}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
### Pure python retrieval
# Keyword-based baseline: rank sentences of the loaded documents against the
# question using TF-IDF vectors and cosine similarity.
doc_text = " ".join(doc.text for doc in documents)

# Naive sentence split on '.'. Strip each fragment *before* applying the length
# threshold so that whitespace-padded fragments do not pass the filter.
MIN_SENTENCE_CHARS = 20
corpus = [
    sentence
    for sentence in (fragment.strip() for fragment in doc_text.split("."))
    if len(sentence) > MIN_SENTENCE_CHARS
]
if not corpus:
    # Without this guard the similarity computation below fails with a much
    # less helpful error about an empty array.
    raise ValueError(
        f"No sentences longer than {MIN_SENTENCE_CHARS} characters were found in `documents`."
    )

# Question
query = "What is difference between prompt sensitivity and model sensitivity?"

# Vectorization: the query is vectorized together with the corpus and ends up
# as the last row of X.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus + [query])

# Calculating cosine similarity between the query row and every corpus row;
# the result has a single row with one score per corpus sentence.
similarities = cosine_similarity(X[-1], X[:-1])

# Top 3 sentences, highest score first
top_k = 3
top_indices = np.argsort(similarities[0])[-top_k:][::-1]

print(f"Question: {query}\n")
for idx in top_indices:
    score = similarities[0][idx]
    print(f"[score {score:.4f}] {corpus[idx]}")

In [None]:
### Parameter tuning
# Run the same summarization prompt with two different similarity_top_k values
# and print both answers for comparison.

# Shared prompt, defined once so both runs are guaranteed to ask the same thing.
summary_prompt = "Summarize the entire document including methodology, results, and conclusion."

# top-k = 1
engine_k1 = index.as_query_engine(similarity_top_k=1)
response_k1 = engine_k1.query(summary_prompt)
print(f"[Answer (Top-k=1)]: {response_k1}\n")

# top-k = 5
engine_k5 = index.as_query_engine(similarity_top_k=5)
response_k5 = engine_k5.query(summary_prompt)
print(f"[Answer (Top-k=5)]: {response_k5}")

In [None]:
### Vector Embeddings Visualization

# Embed a short phrase with the configured embedding model and look at the
# raw vector it produces.
query_str = "Social Science"
query_embedding = Settings.embed_model.get_query_embedding(query_str)

embedding_dim = len(query_embedding)
embedding_head = query_embedding[:10]
print(f"Vector dimension of question '{query_str}': {embedding_dim}")
print(f"First 10 elements of the vector: {embedding_head}")

# Taken together, these numbers represent the semantic meaning of the phrase.