In this notebook, we populate a Qdrant vector database with the title, description, and tags of all datasets, models, and pipelines from EOTDL.

In [5]:
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from langchain_community.embeddings import HuggingFaceEmbeddings

# Load variables from a local .env file (if present) into the environment,
# so the os.getenv() lookups for QDRANT_URL / QDRANT_API_KEY below can see them.
load_dotenv()

True

In [6]:
# Qdrant connection settings: taken from the environment, with local defaults.
qdrant_url = os.environ.get('QDRANT_URL', 'http://localhost:6333')
qdrant_api_key = os.environ.get('QDRANT_API_KEY', '')

print(f"Connecting to Qdrant at: {qdrant_url}")
try:
    # Only pass an API key when one is actually configured.
    qdrant_client = (
        QdrantClient(url=qdrant_url, api_key=qdrant_api_key)
        if qdrant_api_key
        else QdrantClient(url=qdrant_url)
    )

    # Listing the collections doubles as a connectivity check.
    collections = qdrant_client.get_collections()
    print("✅ Successfully connected to Qdrant!")
    print(f"Existing collections: {[info.name for info in collections.collections]}")
except Exception as exc:
    # Report the failure and a hint instead of raising.
    print(f"❌ Failed to connect to Qdrant: {exc}")
    print("Make sure Qdrant is running: docker-compose up -d eotdl-qdrant")


Connecting to Qdrant at: http://localhost:6333
✅ Successfully connected to Qdrant!
Existing collections: []


In [7]:
# Embedding model configuration (same model as in your MCP server).
model_name = "nasa-impact/nasa-smd-ibm-st-v2"
encode_kwargs = dict(normalize_embeddings=True)

print(f"Loading embedding model: {model_name}")
try:
    embeddings = HuggingFaceEmbeddings(encode_kwargs=encode_kwargs, model_name=model_name)
    print("✅ Embedding model loaded successfully!")
except Exception as exc:
    # Report the failure and a hint instead of raising.
    print(f"❌ Failed to load embedding model: {exc}")
    print("Make sure you have the required dependencies installed")


Loading embedding model: nasa-impact/nasa-smd-ibm-st-v2
✅ Embedding model loaded successfully!


In [12]:
# Name of the Qdrant collection that test_vector_search() queries.
collection_name = "EOTDL"

# Test the vector database with a sample query
def test_vector_search(query: str, k: int = 3):
    """Test the vector database with a sample query"""
    
    print(f"\n🔍 Testing vector search with query: '{query}'")
    
    try:
        # Generate embedding for the query
        query_embedding = embeddings.embed_query(query)
        
        # Search in Qdrant
        search_results = qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_embedding,
            limit=k
        )
        
        print(f"Found {len(search_results)} results:")
        
        for i, result in enumerate(search_results, 1):
            print(f"\n--- Result {i} (Score: {result.score:.4f}) ---")
            print(f"Type: {result.payload.get('type', 'Unknown')}")
            print(f"Name: {result.payload.get('name', 'No Name')}")
            print(f"Description: {result.payload.get('description', 'No description')[:200]}...")
            print(f"Tags: {result.payload.get('tags', 'No tags')}")
            
    except Exception as e:
        print(f"❌ Error during search: {e}")

# Sample queries used to exercise the search function
test_queries = ["satellite imagery", "machine learning model", "earth observation data", "remote sensing pipeline"]

# Run each query in turn, printing a separator line after its results.
for query in test_queries:
    test_vector_search(query)
    print(f"\n{'=' * 50}")



🔍 Testing vector search with query: 'satellite imagery'
Found 3 results:

--- Result 1 (Score: 0.5056) ---
Type: dataset
Name: EuroSAT-RGB
Description: <h1><strong>EuroSAT: A land use and land cover classification dataset based on Sentinel-2 satellite images.</strong></h1><p><br></p><p><a href="https://arxiv.org/abs/1709.00029" rel="noopener noreferr...
Tags: []

--- Result 2 (Score: 0.4124) ---
Type: dataset
Name: EuroSAT-small
Description: # EuroSAT-small

This is a small subet of the EuroSAT dataset....
Tags: []

--- Result 3 (Score: 0.3795) ---
Type: pipeline
Name: EuroCropsPipeline
Description: # EuroCropsPipeline

This pipeline will extract features from a S1 and S2 time series for a given set of parcels in the EuroCrops dataset....
Tags: []


🔍 Testing vector search with query: 'machine learning model'
Found 3 results:

--- Result 1 (Score: 0.4598) ---
Type: pipeline
Name: EuroCropsPipeline
Description: # EuroCropsPipeline

This pipeline will extract features from a S1 and S2 t

  search_results = qdrant_client.search(
