In [None]:
import importlib

# Development helper: re-import the project modules so edits made on disk are
# picked up without restarting the kernel.
#
# Reload order matters: `importlib.reload` re-executes a module's top-level
# code, so a module must be reloaded *after* the modules it imports from.
# NOTE(review): presumably `pipeline` imports from `modules.retrieval.indexing`
# -- confirm; if so, reloading `indexing` first is required for `Pipeline` to
# see the refreshed index code.
from modules.retrieval import indexing
importlib.reload(indexing)
from modules.retrieval.indexing import FaissIndex

import pipeline
importlib.reload(pipeline)
# Re-bind the class name so it points at the freshly reloaded definition.
from pipeline import Pipeline


In [None]:
import os
from pipeline import Pipeline


def _pair_neighbors(distances, metadata):
    """Pair each retrieved neighbor's distance with its metadata record.

    Args:
        distances: Batched distances as returned by ``search_neighbors``;
            only the first row (``distances[0]``) is used, because a single
            query is searched at a time.
        metadata: Metadata returned by ``search_neighbors``. Accepted either
            as a flat sequence with one record per neighbor, or batched per
            query (a sequence whose first element is itself a list/tuple of
            records).
            NOTE(review): the saved output of this cell shows "Neighbor 1"
            carrying a whole list of chunk records, which suggests the batched
            form -- confirm against ``Pipeline.search_neighbors``.

    Returns:
        A list of ``{"distance": ..., "metadata": ...}`` dicts, one per
        neighbor, in the order the search returned them.
    """
    if len(distances) == 0 or len(metadata) == 0:
        return []

    neighbor_metadata = metadata
    # Unwrap the per-query batch dimension so that metadata lines up with
    # distances[0] instead of the whole record list landing on one neighbor.
    if isinstance(metadata[0], (list, tuple)):
        neighbor_metadata = metadata[0]

    return [
        {"distance": distance, "metadata": record}
        for distance, record in zip(distances[0], neighbor_metadata)
    ]


def _print_neighbors(neighbors):
    """Print one line per neighbor with its 1-based rank, distance and metadata."""
    print("Nearest Neighbors:")
    for rank, neighbor in enumerate(neighbors, start=1):
        print(f"Neighbor {rank}: Distance: {neighbor['distance']}, Metadata: {neighbor['metadata']}")


# Initialize
# NOTE: this rebinds the name `pipeline` from the imported module to a
# Pipeline instance; the name is kept because it is this cell's visible output.
pipeline = Pipeline()

corpus_directory = "storage/corpus"
print("Preprocessing the corpus...")
pipeline.preprocess_corpus(corpus_directory, chunking_strategy='sentence', overlap_size=2)
print("Corpus preprocessed and indexed.")

# (query text, number of neighbors k) pairs; the Frontier question is repeated
# with several k values to compare how k affects the retrieved context.
queries = [
    ("Who was Abraham Lincoln?", 15),
    ("Who was Abraham Adams?", 15),
    ("Did Abraham Lincoln live in the Frontier?", 1),
    ("Did Abraham Lincoln live in the Frontier?", 10),
    ("Did Abraham Lincoln live in the Frontier?", 20),
    ("Did Abraham Lincoln live in the Frontier?", 50),
    ("How did Fillmore ascend to the presidency?", 10),
    ("What is the capital of France?", 15)
]

results = []

for query, k in queries:
    print(f"\nQuery: {query}, k={k}")
    # NOTE(review): this reaches the name-mangled private method
    # `Pipeline.__encode`; prefer a public encode method if Pipeline has one.
    query_embedding = pipeline._Pipeline__encode(query)
    distances, indices, metadata = pipeline.search_neighbors(query_embedding, k)

    result = {
        "query": query,
        "k": k,
        "neighbors": _pair_neighbors(distances, metadata),
    }
    results.append(result)

    _print_neighbors(result["neighbors"])

# Second pass: print every collected result again in one place for review.
print("\nAnalysis of Results:")
for res in results:
    print(f"\nQuery: {res['query']}, k={res['k']}")
    _print_neighbors(res["neighbors"])


Preprocessing the corpus...
Corpus preprocessed and indexed.

Query: Who was Abraham Lincoln?, k=15
Nearest Neighbors:
Neighbor 1: Distance: 0.70039302110672, Metadata: [{'filename': 'S08_set3_a4.txt.clean', 'chunk': 'Abraham Lincoln Abraham Lincoln (February 12, 1809 – April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now."'}, {'filename': 'S08_set3_a4.txt.clean', 'chunk': 'Lincoln is well known for ending slavery in the United States. In 1861 – 1862, however, he made it clear that the North was fighting the war to preserve the Union, not to abolish slavery.'}, {'filename': 'S08_set3_a4.tx

# Task 1 Analysis
## `Query 1`
The results contained relevant information about Abraham Lincoln, such as his presidency, his role in the Civil War, and nicknames like 'Honest Abe'. The system was able to retrieve several metadata entries about Abraham Lincoln.
## `Query 2`
This time, the results mix up Abraham Adams with John Adams and Abigail Adams. This reflects a limitation of the embeddings/corpus context when a query is about people who are not as well-known.
## `Query 3`
For k=1, the retrieved chunk directly addresses Lincoln's association with the frontier, talking about his grandfather and the surrounding context. For k=10 and k=20, the larger values bring in more context around Lincoln's life and influences but nevertheless contain less immediately relevant information. Therefore, lower values of k give more precise answers; higher values provide more context, but that context may end up being less relevant.
## `Query 4` - k=10
I used k=10 here, and the results talk about Fillmore's rise following President Taylor's death and the politics surrounding that event. Essentially, a moderate k value is useful here because it captures a broad and diverse context while still answering the question without returning too much.
## `Query 5` - k=15 
I used k=15 here, and the system brought up many passages about France but did not explicitly say that Paris is the capital. This means that either the corpus did not contain enough information or the embedding model was not specific enough.

In [None]:
import getpass
import importlib
import os

# Reload dependencies before the module that uses them, so that `Pipeline`
# is rebuilt on top of the current on-disk code.
from modules.retrieval import indexing
importlib.reload(indexing)
from modules.retrieval.indexing import FaissIndex

import pipeline
importlib.reload(pipeline)
from pipeline import Pipeline

# SECURITY: never hardcode credentials in a notebook -- they end up in version
# control and in shared copies. The key is read from the GENERATOR_API_KEY
# environment variable, with an interactive prompt as a fallback.
# The key that was previously committed in this cell must be treated as leaked
# and revoked with the provider.
generator_api_key = os.environ.get("GENERATOR_API_KEY") or getpass.getpass("Generator API key: ")

# Initialize
# NOTE: this rebinds the name `pipeline` from the module to a Pipeline instance.
pipeline = Pipeline(generator_api_key=generator_api_key)

# Preprocess the corpus
corpus_directory = "storage/corpus"
pipeline.preprocess_corpus(corpus_directory, chunking_strategy='sentence', overlap_size=2)

# Queries and their retrieval configuration: k is the number of neighbors to
# retrieve, rerank toggles the reranking option passed to generate_answer.
queries = [
    {"query": "Who was Abraham Lincoln?", "k": 15, "rerank": True},
    {"query": "Who was Abraham Adams?", "k": 15, "rerank": False},
    {"query": "How did Fillmore ascend to the presidency?", "k": 10, "rerank": True},
    {"query": "What trail did Lincoln use a Farmers' Almanac in?", "k": 5, "rerank": False},
    {"query": "What is the capital of France?", "k": 15, "rerank": True},
]

# Generate and print an answer for every configured query.
for q in queries:
    print(f"Query: {q['query']} (k={q['k']}, rerank={q['rerank']})")
    answer = pipeline.generate_answer(q["query"], k=q["k"], rerank=q["rerank"])
    print(f"Answer: {answer}\n")


Query: Who was Abraham Lincoln? (k=15, rerank=True)
[ChatCompletionChoice(index=0, message=AssistantMessage(content='Abraham Lincoln was the sixteenth President of the United States, serving from March 4, 1861 until his assassination in 1865. He was an outspoken opponent of the expansion of slavery and played a crucial role in preserving the United States by leading the defeat of the secessionist Confederate States of America during the American Civil War.', tool_calls=None, prefix=False, role='assistant'), finish_reason='stop')]
Answer: Abraham Lincoln was the sixteenth President of the United States, serving from March 4, 1861 until his assassination in 1865. He was an outspoken opponent of the expansion of slavery and played a crucial role in preserving the United States by leading the defeat of the secessionist Confederate States of America during the American Civil War.

Query: Who was Abraham Adams? (k=15, rerank=False)
[ChatCompletionChoice(index=0, message=AssistantMessage(cont

# Task 2 Analysis
## `Query 1`
It successfully retrieved an accurate and detailed response, showing that the pipeline could use both the embedding-based retrieval and reranking effectively when context is in the corpus.
## `Query 2`
Abraham Adams isn't in the corpus, and the response shows that the pipeline recognizes when no relevant context is available. This query did not use reranking, but the pipeline was still able to determine that there was no context.
## `Query 3`
The pipeline gave a historically accurate response, showing that it retrieves and presents relevant context when it exists. The answer was also more concise.
## `Query 4` - k=5
It said no context, which is understandable as the corpus did not have the relevant information to answer this question.
## `Query 5` - k=15 
It also said no context, meaning that the corpus could not answer this question either.  