# Warmup Process to Index and Vectorize Data

## Initialize Embeddings and Vector Store

In [2]:
from langchain_ollama import OllamaEmbeddings

# Embedding model shared by every vector store built in this notebook.
# NOTE(review): presumably requires a reachable Ollama server with the
# "nomic-embed-text" model already pulled — confirm before running.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [3]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Embed a throwaway query once so the flat L2 index is sized to whatever
# vector length the embedding model actually produces.
index = faiss.IndexFlatL2(
    len(embeddings.embed_query("hello world"))
)

# Start from an empty store: no documents and no index-to-docstore mapping yet.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id=dict(),
    docstore=InMemoryDocstore(),
)

## Load Data

In [None]:
import pickle

file_path = 'data/scraped_urls.json'

# SECURITY: pickle.load can execute arbitrary code while deserializing.
# Only open files that were produced by this project's own scraping step.
# NOTE(review): the file has a .json extension but is read as a binary pickle —
# confirm the format and consider renaming the file or switching to real JSON.
with open(file_path, 'rb') as file:
    raw_data = pickle.load(file)

# Fail with a clear message instead of a bare IndexError on the preview below.
if len(raw_data) == 0:
    raise ValueError(f"No records found in {file_path}")

# Preview the first record to sanity-check the structure.
print(raw_data[0])

In [16]:
# Each non-empty record contributes its first element; falsy (empty) records
# are dropped before indexing into them.
documents_to_add_to_vectorstore = [
    record[0] for record in filter(None, raw_data)
]

In [None]:
# Spot-check the text of the first extracted document before indexing.
print(documents_to_add_to_vectorstore[0].page_content)

## Add Documents to Vector Store

In [None]:
# Embed the extracted documents and add them to the in-memory FAISS store.
vector_store.add_documents(documents=documents_to_add_to_vectorstore)

## Test Retrieval

In [None]:
# Smoke-test retrieval: fetch the five closest documents for a sample query.
results = vector_store.similarity_search_with_score(
    "Master in Data Science?", k=5
)
for res, score in results:
    # Use `.3f` (three decimals). The previous `3f` was a minimum *width* of 3
    # with the default six decimals, which is not what the label suggests.
    # NOTE(review): the index is IndexFlatL2, so this score is presumably an
    # L2 distance (lower = closer) rather than a similarity — confirm the label.
    print(f"* [SIM={score:.3f}] {res.page_content}")

## Save Vector Store

In [8]:
# Persist the current store to the local "faiss_index" folder.
# NOTE(review): in notebook order this runs before the course-list documents
# are added further down, so they are not part of this save — confirm intended.
vector_store.save_local("faiss_index")

## Load Course List from JSON

In [8]:
import json
from langchain_core.documents import Document

def load_course_data(file_path):
    """Load a JSON collection of courses and wrap each entry in a ``Document``.

    Args:
        file_path: Path to a JSON file whose top-level value is an iterable
            of course records.

    Returns:
        A list with one ``Document`` per course, in file order. Each
        document's ``page_content`` is ``str(course)`` and its ``metadata``
        is always ``{'source': 'course_list'}``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding,
    # which can corrupt non-ASCII course titles on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        courses = json.load(f)

    # The course record itself is stored only as text in page_content; the
    # metadata carries just a fixed source tag so retrieval can filter on it.
    return [
        Document(page_content=str(course), metadata={'source': 'course_list'})
        for course in courses
    ]

In [None]:
# Load the course catalogue into `course_docs` for the cells that follow.
try:
    course_docs = load_course_data("data/course_list.json")
    print(f"\nSimple loader: Successfully loaded {len(course_docs)} documents")
except Exception as e:
    print(f"Error with simple loader: {e}")
    # Re-raise so the failure stops here; otherwise `course_docs` is left
    # undefined and the next cells fail with an unrelated-looking NameError.
    raise

In [None]:
# Preview the first two course documents as the cell's output.
course_docs[:2]

In [None]:
# Index the course documents in the same store; each carries
# metadata {'source': 'course_list'}, which the filtered search below relies on.
vector_store.add_documents(documents=course_docs)

## Test Retrieval with Filtering

In [None]:
# Restrict the search to documents whose metadata 'source' is 'course_list'.
results = vector_store.similarity_search_with_score(
    "Erasmus", k=5, filter={"source": "course_list"}
)
for res, score in results:
    # Use `.3f` (three decimals). The previous `3f` was a minimum *width* of 3
    # with the default six decimals.
    print(f"* [SIM={score:.3f}] {res.page_content}")

## Firecrawl for UCY Website

In [None]:
from firecrawl import FirecrawlApp, ScrapeOptions

import os

# Read the Firecrawl key from the environment so no credential is ever stored
# in the notebook, and stop with a clear message when it is missing.
firecrawl_api_key = os.environ.get("FIRECRAWL_API_KEY", "")
if not firecrawl_api_key:
    raise RuntimeError(
        "Set the FIRECRAWL_API_KEY environment variable before running this cell."
    )

app = FirecrawlApp(api_key=firecrawl_api_key)

# Crawl the UCY site (capped at 10 pages) and request markdown output.
crawl_result = app.crawl_url(
  'https://www.ucy.ac.cy/?lang=en',
  limit=10,
  scrape_options=ScrapeOptions(formats=['markdown']),
)
print(crawl_result)

## Markdown File Splitting and Indexing

In [5]:
import os
import glob
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.docstore.document import Document

In [7]:
# --- Markdown ingestion settings ---
MARKDOWN_FILES_PATH = "data/markdown_extracts/"
# Split only on top-level ("#") headings; the heading text is stored by the
# splitter under the "Header 1" metadata key.
headers_to_split_on = [("#", "Header 1")]

markdown_files = glob.glob(os.path.join(MARKDOWN_FILES_PATH, "*.md"))
all_split_documents = []

if markdown_files:
    print(f"Found {len(markdown_files)} markdown files to process.")
else:
    print(f"No markdown files found in {MARKDOWN_FILES_PATH}")

md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

# NOTE(review): the recorded output shows one chunk for every file listed, so
# whole pages are being embedded as single documents — confirm that is intended.
for md_path in markdown_files:
    filename = os.path.basename(md_path)
    print(f"Processing: {filename}...")
    try:
        with open(md_path, encoding='utf-8') as md_handle:
            markdown_text = md_handle.read()

        # split_text yields Document-like chunks that are tagged in place below.
        chunks = md_splitter.split_text(markdown_text)

        for position, chunk in enumerate(chunks):
            # Defensive: guarantee a metadata dict exists before writing to it.
            if getattr(chunk, 'metadata', None) is None:
                chunk.metadata = {}

            # Record which file the chunk came from and its order in that file.
            chunk.metadata.update({"source": filename, "chunk_index": position})
            all_split_documents.append(chunk)

        print(f"  -> Processed into {len(chunks)} chunks.")

    except Exception as err:
        # Best effort: report the failing file and carry on with the rest.
        print(f"Error processing file {filename}: {err}")

print(f"\nTotal document chunks created: {len(all_split_documents)}")

# The embedding model is created in an earlier cell; stop early with a clear
# message if that cell has not been run in this kernel.
if 'embeddings' not in globals() or embeddings is None:
    raise ValueError("The 'embeddings' object must be initialized before creating the FAISS index.")
try:
    # Embed a short, constant string purely to learn the vector length.
    test_embedding = embeddings.embed_query("dimension_check")
    embedding_dimension = len(test_embedding)
    print(f"Embedding dimension detected: {embedding_dimension}")
except Exception as e:
    # Chain explicitly so the underlying embedding failure stays in the traceback.
    raise RuntimeError(f"Could not determine embedding dimension using embeddings.embed_query: {e}") from e

# This section's import cell does not import faiss, so import it here rather
# than depending on an earlier cell having been run in the same kernel.
import faiss

index = faiss.IndexFlatL2(embedding_dimension)  # sized to the detected dimension

# NOTE: this rebinds `vector_store` to a brand-new, empty store. Documents
# added to the store created earlier in the notebook are not carried over.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# --- Populate the vector store with the markdown chunks ---
if not all_split_documents:
    print("\nNo documents were processed to add to the vector store.")
else:
    print(f"\nAdding {len(all_split_documents)} processed document chunks to FAISS vector store...")
    vector_store.add_documents(all_split_documents)
    print("Documents added successfully.")

# Saving is deliberately not done in this cell.
# To keep an older index intact, save under a separate folder such as
# "faiss_index_markdown" via vector_store.save_local(...), and change the
# FAISS.load_local("faiss_index", ...) call in tools.py to load that folder.

Found 72 markdown files to process.
Processing: www.ucy.ac.cy_graduateschool_people_board.md...
  -> Processed into 1 chunks.
Processing: www.ucy.ac.cy_graduateschool_language-of-instruction-and-duration-of-studies.md...
  -> Processed into 1 chunks.
Processing: www.ucy.ac.cy_itis_policies.md...
  -> Processed into 1 chunks.
Processing: www.ucy.ac.cy_aasw_student-welfare_housing-office_summer-accommodation.md...
  -> Processed into 1 chunks.
Processing: dsi.education.md...
  -> Processed into 1 chunks.
Processing: www.ucy.ac.cy_aasw_studies_undergraduate-studies_international_students.md...
  -> Processed into 1 chunks.
Processing: www.ucy.ac.cy_sap_ma-peace-conflict-democracy.md...
  -> Processed into 1 chunks.
Processing: www.ucy.ac.cy_internationalsupport_home_faq.md...
  -> Processed into 1 chunks.
Processing: www.ucy.ac.cy_byz.md...
  -> Processed into 1 chunks.
Processing: www.ucy.ac.cy_graduateschool_postgraduate-programme-places-for-the-fall-semester-of-the-academic-year-2025-2

In [8]:
# --- Save the vector store ---
# NOTE(review): "faiss_index" is the same folder the earlier "Save Vector Store"
# cell writes to, so this save presumably replaces that index with the current
# contents of `vector_store`. Use a different folder name if the old index
# must be kept.
FAISS_INDEX_SAVE_PATH = "faiss_index"
print(f"\nSaving FAISS index to: {FAISS_INDEX_SAVE_PATH}")
vector_store.save_local(FAISS_INDEX_SAVE_PATH)
print("FAISS index saved.")


Saving FAISS index to: faiss_index
FAISS index saved.
