# Warmup Process to Index and Vectorize Data

## Initialize Embeddings and Vector Store

In [1]:
from langchain_ollama import OllamaEmbeddings

# Embedding client shared by the rest of the notebook: it is used to size the
# FAISS index and is passed to the vector store as its embedding function.
# NOTE(review): presumably requires a reachable Ollama server with the
# "nomic-embed-text" model already pulled — confirm the setup instructions.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [2]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Embed a throwaway query once to discover the embedding dimensionality, then
# create a flat (exhaustive) L2 index of that width.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Start from an empty store: no documents and no index -> docstore-id mapping.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

## Load Data

In [3]:
import pickle

# NOTE(review): despite the ".json" extension, this file is opened in binary
# mode and deserialized with pickle below, so it has to be a pickle dump.
# Consider renaming it (e.g. ".pkl") so the format is not misread.
file_path = 'data/scraped_urls.json'

# SECURITY: pickle.load can execute arbitrary code during deserialization.
# Only load this file if it was produced by a trusted step of this project.
with open(file_path, 'rb') as file:
    raw_data = pickle.load(file)

# Peek at the first entry to sanity-check the structure of the loaded data.
print(raw_data[0])

[Document(metadata={'og:url': 'https://datascience.cy/programme-structure/', 'language': 'en-US', 'article:published_time': '2021-03-09T20:37:00+00:00', 'ogUrl': 'https://datascience.cy/programme-structure/', 'ogImage': 'https://datascience.cy/wp-content/uploads/2021/03/program-header.jpg', 'generator': ['All in One SEO (AIOSEO) 4.8.1', 'WordPress 6.7.2', 'Powered by Slider Revolution 6.2.23 - responsive, Mobile-Friendly Slider Plugin for WordPress with comfortable drag and drop interface.', 'WP Rocket 3.18.3'], 'og:image:secure_url': 'https://datascience.cy/wp-content/uploads/2021/03/program-header.jpg', 'og:image:height': '500', 'twitter:title': 'Programme structure - Master in Data Science - University of Cyprus', 'og:description': 'The Master in Data Science is a highly-selective programme for students who want to begin or advance their careers in Data Science. The duration of the programme is 1,5-years (90 ECTS), while the language of instruction is English.', 'twitter:image': 'ht

In [4]:
# Drop empty entries, then keep only the first element of each remaining one.
# NOTE(review): any additional elements in an entry are ignored — confirm that
# each scraped entry is expected to hold a single Document.
documents_to_add_to_vectorstore = [
    scraped_entry[0] for scraped_entry in filter(None, raw_data)
]

In [5]:
# Show the text of the first selected document to check what will be embedded.
print(documents_to_add_to_vectorstore[0].page_content)

- [Programme structure](https://datascience.cy/programme-structure/)
- [People](https://datascience.cy/people/)
- [Admissions](https://datascience.cy/admissions/)
- [Capstone Projects](https://datascience.cy/capstone-projects/)
- [Contact](https://datascience.cy/contact/)

[![](https://datascience.cy/wp-content/uploads/2021/03/logo.png)](https://datascience.cy/)

[![](https://datascience.cy/wp-content/uploads/2021/03/logo.png)](https://datascience.cy/)

- [Programme structure](https://datascience.cy/programme-structure/)
- [People](https://datascience.cy/people/)
- [Admissions](https://datascience.cy/admissions/)
- [Capstone Projects](https://datascience.cy/capstone-projects/)
- [Contact](https://datascience.cy/contact/)

### >>>Master

### ... in Data Science

Programme\_Structure

[APPLY NOW](https://applications.ucy.ac.cy/postgraduate_appl/MNG_USER_en.login_frm) [DOWNLOAD BROCHURE](https://datascience.cy/wp-content/uploads/2021/05/Data-Science_Prospectus_lowres.pdf)

### >>Overview


## Add Documents to Vector Store

In [6]:
# Add the selected documents to the vector store; the value returned by
# add_documents (a list of document IDs) is shown as the cell output.
# NOTE(review): no text splitting is applied anywhere in this notebook, so each
# document is presumably embedded as one whole page — consider chunking if
# retrieval quality is poor.
# NOTE(review): re-running this cell presumably appends the same documents
# again under new IDs; re-create the store first if a clean index is needed.
vector_store.add_documents(documents=documents_to_add_to_vectorstore)

['2d64111c-63b4-42ab-8d88-bcb2e08ab0f6',
 '8cf1cb8f-a130-49f8-abd8-73a084af264a',
 '809ce0e1-ee58-42bd-bb91-f967784f590f',
 '0470a9b1-4fdb-4ca8-b4b6-4027a0b3c5e9',
 '0914bb0a-3b90-4dee-a37e-ce275839aa74',
 '839ab83f-e78d-4f83-a5b6-3641bf8d9f4e',
 '8e042ee7-816a-4711-9c08-434d022fc955',
 '280e5d8c-7609-48c1-8f72-36f622583067',
 'f0757229-80f2-485b-8471-06888d75eb15',
 '9675ad77-6e52-4360-a109-7251935d4818',
 '9c84634e-c869-47df-8548-66f6a76ee1f1',
 '84f2f4a4-7177-4253-ac14-70b5dba5e28b',
 '79573cf7-baf1-4132-b94f-24e5302ef88c',
 '012fd8b1-6208-4248-89d8-08b436a37d28',
 'a7506b2c-4677-4579-b147-42467f1019ea',
 'dcdd50e6-cb93-4fb6-b45d-b8d4efbca13a',
 'd6893210-2d05-4646-a3e5-11438dda8749',
 '8a680c5b-09cf-45ab-9601-c7f3b16b2c6a',
 '78d69b1b-2c14-49bd-8733-b540afed065b',
 '9ba461d9-b60f-46d7-8ebe-9179d49f283f',
 '17cfb0b9-ec80-4039-aa9d-9a03bc005947',
 'a059cf3b-140a-49ba-847d-9f8e71d96524',
 'c90c2df3-d939-4094-8271-a8ca2e2836e1',
 'bdc6c750-2a55-433d-a5f6-1f54cbaf871a',
 '3536003a-5534-

## Test Retrieval

In [7]:
# Sanity-check retrieval: fetch the 5 nearest documents for a sample query.
# NOTE(review): the index was built with faiss.IndexFlatL2, so the score is
# presumably an L2 distance (lower = closer) rather than a similarity, despite
# the "SIM" label — confirm against the vector store documentation.
results = vector_store.similarity_search_with_score(
    "Master in Data Science?", k=5
)
for res, score in results:
    # ".3f" prints three decimals; the former "3f" only set a minimum field
    # width of 3 and left the default six-decimal precision in place.
    print(f"* [SIM={score:.3f}] {res.page_content}")

* [SIM=0.974700] [![](https://www.ucy.ac.cy/graduateschool/wp-content/uploads/sites/45/2022/05/graduate-school-university-of-cyprus-logo-e1655374191364.jpg)](https://www.ucy.ac.cy/graduateschool/?lang=en "Graduate School")

[![](https://www.ucy.ac.cy/graduateschool/wp-content/uploads/sites/45/2023/04/Apply-for-Master-banner-300X200ENG.webp)](https://www.ucy.ac.cy/graduateschool/postgraduate-programmes-places/?lang=en)

[![](https://www.ucy.ac.cy/graduateschool/wp-content/uploads/sites/45/2022/05/0862_UCY_FUNDRAISING_CAMPAIGN_POST_620x400_ENG_002.png)](https://www.ucy.ac.cy/alumni-development/why-i-give/?lang=en)

[![](https://www.ucy.ac.cy/graduateschool/wp-content/uploads/sites/45/2025/01/International_Students_top.jpg)](https://www.ucy.ac.cy/internationalsupport/international-students/)

- [HOME](https://www.ucy.ac.cy/graduateschool/?lang=en)
- GRADUATE SCHOOL

  - [Vision](https://www.ucy.ac.cy/graduateschool/vision/?lang=en)
  - [Structure of the Graduate School](https://www.ucy.ac

## Save Vector Store

In [8]:
# Persist the vector store to the local "faiss_index" path.
# NOTE(review): at this point the store only contains the scraped pages. The
# course documents are added in a later cell, after this save, so they are not
# part of the persisted index unless save_local is called again afterwards.
vector_store.save_local("faiss_index")

## Load Course List from JSON

In [8]:
import json
from langchain_core.documents import Document

def load_course_data(file_path):
    """Load a JSON course list and wrap every entry in a ``Document``.

    Args:
        file_path: Path to a JSON file whose top-level value is an iterable of
            course entries (for example a list of dicts).

    Returns:
        A list with one ``Document`` per course entry, in file order. Each
        document's ``page_content`` is ``str(course)`` and its ``metadata`` is
        ``{'source': 'course_list'}``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        courses = json.load(f)

    # The content is the string form of the course entry. The metadata only
    # tags the origin of the document (it does NOT carry the course fields),
    # which allows filtering on {'source': 'course_list'} at query time.
    return [
        Document(page_content=str(course), metadata={'source': 'course_list'})
        for course in courses
    ]

In [None]:
# Load the course list and fail loudly on error: every following cell needs
# `course_docs`, so swallowing the exception here would only postpone the
# failure to a less informative NameError.
try:
    course_docs = load_course_data("data/course_list.json")
except Exception as e:
    print(f"Error with simple loader: {e}")
    raise  # surface the original error instead of continuing without data
else:
    print(f"\nSimple loader: Successfully loaded {len(course_docs)} documents")

In [None]:
# Display the first two course documents to verify content and metadata.
course_docs[:2]

In [None]:
# Add the course documents to the same vector store as the scraped pages.
# NOTE(review): the store was saved with save_local in an earlier cell, before
# this step, so the persisted index does not include these documents unless it
# is saved again after this cell.
vector_store.add_documents(documents=course_docs)

## Test Retrieval with Filtering

In [None]:
# Query again, restricting results to documents whose metadata has
# source == "course_list" (the tag set by load_course_data).
results = vector_store.similarity_search_with_score(
    "Erasmus", k=5, filter={"source": "course_list"}
)
for res, score in results:
    # ".3f" prints three decimals; the former "3f" only set a minimum field
    # width of 3 and left the default six-decimal precision in place.
    print(f"* [SIM={score:.3f}] {res.page_content}")