In [26]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

In [27]:
# Load every file under the blog's public directory (recursing into sub-folders).
reader = SimpleDirectoryReader(input_dir="~/pdev/yaub/frontend/simple-yaub/public/", recursive=True)
documents_all = reader.load_data()

# Keep only documents whose MIME type mentions "text" (e.g. 'text/markdown').
# The reader omits metadata keys whose value is unknown, so a file with an
# unrecognised extension has no "file_type" entry at all; .get() lets such
# files be skipped instead of raising KeyError.
documents = [
    doc
    for doc in documents_all
    if "text" in (doc.metadata.get("file_type") or "")
]

documents

[Document(id_='44b3cb82-03e2-4a40-93a2-f98ff9ea3086', embedding=None, metadata={'file_path': '/Users/maksim.rostov/pdev/yaub/frontend/simple-yaub/public/posts/bias_variance_decomposition/post.md', 'file_name': 'post.md', 'file_type': 'text/markdown', 'file_size': 12575, 'creation_date': '2025-01-13', 'last_modified_date': '2025-01-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='<center>\n    <h1> Bias-Variance Decomposition </h1>\n</center>\n\n$~$\n\n$~$\n## Why do we care? 🤔\n$~$\n\nThe point of this exercise is to try to create an understanding of what\noverfitting and underfitting mean in the context of machine learning.\n\n$~$\n\

In [28]:
# Automatically split the documents into nodes.
# A node is a chunk of text that we run the embedding model on,
# so instead of whole documents we embed smaller chunks, i.e. nodes.
parser = SentenceSplitter(chunk_overlap=0)  # chunk_overlap=0: no overlap between consecutive chunks
nodes = parser.get_nodes_from_documents(documents)

In [29]:
# Peek at the raw attributes of the first node to see what the splitter produced.
nodes[0].__dict__

{'id_': '32b041d7-a1cc-408c-b978-7e167d0bc4ff',
 'embedding': None,
 'metadata': {'file_path': '/Users/maksim.rostov/pdev/yaub/frontend/simple-yaub/public/posts/bias_variance_decomposition/post.md',
  'file_name': 'post.md',
  'file_type': 'text/markdown',
  'file_size': 12575,
  'creation_date': '2025-01-13',
  'last_modified_date': '2025-01-13'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='44b3cb82-03e2-4a40-93a2-f98ff9ea3086', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/Users/maksim.rostov/pdev/yaub/frontend/simple-yaub/public/posts/bias_variance_decomposition/post.md', 'file_name': 'post.md', 'file_type': 'text/markdown', 'file_size': 12575, 'creati

In [30]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Local HuggingFace embedding model used for both the nodes and, later, the queries.
emb_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Embed all chunks through the batch API instead of one model call per node:
# the texts are sent to the model in batches, which avoids the per-call
# overhead of embedding each node separately.
node_texts = [node.text for node in nodes]
node_embeddings = emb_model.get_text_embedding_batch(node_texts)

# Attach each vector to its node (the batch result is in the same order as the input).
for node, embedding in zip(nodes, node_embeddings):
    node.embedding = embedding

In [31]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from qdrant_client import QdrantClient

# In-memory Qdrant instance: handy for testing and CI/CD.
qdrant_client = QdrantClient(":memory:")
# Alternative: QdrantClient(path="path/to/db") persists changes to disk (fast prototyping).

# Vector store backed by the "blog" collection of the client created above.
vector_store = QdrantVectorStore(collection_name="blog", client=qdrant_client)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Building the index from `nodes` is what inserts them: the query run against
# this index in a later cell returns source nodes taken from these chunks.
# NOTE(review): the nodes already carry an `embedding`; presumably the index
# reuses it rather than re-embedding with `emb_model` -- confirm.
index = VectorStoreIndex(nodes, storage_context=storage_context, embed_model=emb_model)


In [32]:
from llama_index.llms.ollama import Ollama

# LLM that writes the final answer; "qwen2:7b" is the model name handed to Ollama.
# NOTE(review): presumably needs a running Ollama server with this model pulled -- confirm.
model = Ollama(model="qwen2:7b")

# Query engine over the index: ask one question and print the response text.
query_engine = index.as_query_engine(llm=model)
response = query_engine.query("What does author mean by bias and variance?")
print(response)

In this context, "bias" refers to the error between the true target function ($\bar{y}$) and the average prediction of the best estimator ($\bar{h}$). It indicates how much our algorithm is biased towards some other explanation that is not present in the data. Essentially, bias measures the model's systematic error when making predictions.

"Variance," on the other hand, refers to the variability of a model prediction for a given data point or set of data points under different training sets. When variance is high, it means that if we were to train our model with different samples, it might result in very different hypotheses due to more parameters being tuned and higher degrees of freedom. High variance indicates that the model has been overfitting the noise in the training dataset rather than learning the underlying patterns.

Together, bias and variance form what is known as the bias-variance trade-off in machine learning. Balancing these two aspects allows one to achieve a model th

In [33]:
# Show the retrieved nodes (with their scores) that are attached to the response.
response.source_nodes

[NodeWithScore(node=TextNode(id_='d20428d5-23bb-4424-82df-91df845518d9', embedding=None, metadata={'file_path': '/Users/maksim.rostov/pdev/yaub/frontend/simple-yaub/public/posts/bias_variance_decomposition/post.md', 'file_name': 'post.md', 'file_type': 'text/markdown', 'file_size': 12575, 'creation_date': '2025-01-13', 'last_modified_date': '2025-01-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='44b3cb82-03e2-4a40-93a2-f98ff9ea3086', node_type='4', metadata={'file_path': '/Users/maksim.rostov/pdev/yaub/frontend/simple-yaub/public/posts/bias_variance_decomposition/post.md', 'file_name': 'post.md', 'file_type': 'text/markdown', 'file_size': 12575, 'creation_date': '2025-01-13', 'last_modified_date': '2