# Search Wikipedia data - Vector queries

Explore semantic search capabilities using the imported Wikipedia collection with pre-computed embeddings.

## Connect to Weaviate

Connect to the Weaviate instance containing our Wikipedia collection.

In [None]:
import weaviate
import os
from weaviate.classes.init import Auth
# from weaviate.classes.init import AdditionalConfig, Timeout

# Alternative: connect via explicit HTTP/gRPC hosts and ports instead of
# Weaviate Cloud by uncommenting this call and filling in the placeholders.
# client = weaviate.connect_to_custom(
#     http_host="<http_host>",
#     http_port="<http_port>",
#     grpc_host="<grpc_host>",
#     grpc_port="<grpc_port>",
# )

# Connect to Weaviate Cloud. The cluster URL and API key are read from the
# environment; a missing variable raises KeyError before the connect call runs.
# The key is wrapped in Auth.api_key(...) (Auth is imported above) so the
# client receives an explicit credentials object rather than a bare string.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=os.environ["WCD_TEST_URL"],
    auth_credentials=Auth.api_key(os.environ["WCD_TEST_KEY"]),
)

# Last expression in the cell, so the notebook displays the readiness result.
client.is_ready()

## Verify Wikipedia collection

Check that our Wikipedia collection is available and populated.

In [None]:
# Look up the "Wiki" collection and count the objects it holds.
wiki = client.collections.get("Wiki")
total_articles = len(wiki)

print(f"Wikipedia collection contains {total_articles:,} articles")

# Either point the reader at the import notebook (empty collection) or show
# the title and first 100 characters of one stored article.
if not total_articles > 0:
    print("⚠️  No articles found. Please run 2.2-wiki-import-complete.ipynb first.")
else:
    sample = wiki.query.fetch_objects(limit=1)
    first_props = sample.objects[0].properties
    print(f"\nSample article: '{first_props['title']}'")
    print(f"Content preview: {first_props['text'][:100]}...")

## Basic semantic search

Perform vector-based searches on Wikipedia articles.

In [None]:
# Near-text query against the "main_vector" named vector, capped at 5 results.
response = wiki.query.near_text(
    query="musical instruments",
    target_vector="main_vector",
    limit=5,
)

print("🎵 Articles about musical instruments:")
print("=" * 40)

# Print each hit with its 1-based rank, a 120-character preview and its URL.
for rank, hit in enumerate(response.objects, start=1):
    fields = hit.properties
    print(f"\n{rank}. {fields['title']}")
    print(f"   Preview: {fields['text'][:120]}...")
    print(f"   URL: {fields['url']}")

## Advanced search examples

Explore different types of queries and topics.

### Science and Technology

In [None]:
# Near-text query on the "main_vector" named vector; at most 4 results.
response = wiki.query.near_text(
    query="artificial intelligence machine learning",
    target_vector="main_vector",
    limit=4,
)

print("🤖 AI and Machine Learning articles:")
# Numbered list of the returned article titles (ranks start at 1).
for rank, hit in enumerate(response.objects, start=1):
    print(f"{rank}. {hit.properties['title']}")

### Historical Topics

In [None]:
# Near-text query on the "main_vector" named vector; at most 4 results.
response = wiki.query.near_text(
    query="ancient Rome Roman Empire history",
    target_vector="main_vector",
    limit=4,
)

print("🏛️  Roman Empire and ancient history:")
# Numbered list of the returned article titles (ranks start at 1).
for rank, hit in enumerate(response.objects, start=1):
    print(f"{rank}. {hit.properties['title']}")

### Natural Sciences

In [None]:
# Near-text query on the "main_vector" named vector; at most 4 results.
response = wiki.query.near_text(
    query="biology evolution species animals",
    target_vector="main_vector",
    limit=4,
)

print("🧬 Biology and evolution articles:")
# Numbered list of the returned article titles (ranks start at 1).
for rank, hit in enumerate(response.objects, start=1):
    print(f"{rank}. {hit.properties['title']}")

## Search with metadata

Request the vector distance for each search result and convert it into a similarity score.

In [None]:
from weaviate.classes.query import MetadataQuery

# Same kind of near-text query as before, but also request each hit's
# vector distance through MetadataQuery.
response = wiki.query.near_text(
    query="space exploration NASA astronauts",
    target_vector="main_vector",
    return_metadata=MetadataQuery(distance=True),
    limit=5,
)

print("🚀 Space exploration articles with similarity scores:")
print("=" * 55)

for rank, hit in enumerate(response.objects, start=1):
    fields = hit.properties
    dist = hit.metadata.distance
    # The similarity shown is simply the complement of the distance.
    score = 1 - dist

    print(f"\n{rank}. {fields['title']}")
    print(f"   Similarity: {score:.3f} (distance: {dist:.3f})")
    print(f"   Preview: {fields['text'][:100]}...")

## Comparative searches

Compare search results for related but different topics.

In [None]:
# One query string per music genre to compare.
queries = [
    "classical music composers",
    "jazz music musicians",
    "rock music bands",
]

print("🎼 Comparing different music genres:")
print("=" * 40)

# Run the same near-text search for every genre and list its top 3 titles.
for genre_query in queries:
    response = wiki.query.near_text(
        query=genre_query,
        target_vector="main_vector",
        limit=3,
    )

    print(f"\n📝 Query: '{genre_query}'")
    for rank, hit in enumerate(response.objects, start=1):
        print(f"  {rank}. {hit.properties['title']}")

## Explore collection diversity

Randomly pick a handful of articles from the first 20 fetched objects to get a feel for the breadth of topics in our Wikipedia collection.

In [None]:
# Fetch up to 20 objects, then randomly keep at most 8 of them for display.
# Note: the random pick is drawn only from those fetched objects.
import random

response = wiki.query.fetch_objects(limit=20)
fetched = response.objects
sample_articles = random.sample(fetched, k=min(len(fetched), 8))

print("🌍 Random sample of Wikipedia articles in our collection:")
print("=" * 60)

# Show title, wiki ID and the first 80 characters of each sampled article.
for rank, hit in enumerate(sample_articles, start=1):
    fields = hit.properties
    print(f"\n{rank}. {fields['title']}")
    print(f"   Wiki ID: {fields['wiki_id']}")
    print(f"   Content: {fields['text'][:80]}...")

## Search quality analysis

Spot-check result relevance with domain-specific queries, using the similarity scores of the top matches.

In [None]:
# Queries drawn from four unrelated subject areas.
test_queries = [
    "programming languages Python Java",
    "European countries geography",
    "Olympic sports athletics swimming",
    "renewable energy solar wind power",
]

print("🔍 Testing search quality across different domains:")
print("=" * 55)

# For each query, show the two best matches with 1 - distance as similarity.
for domain_query in test_queries:
    response = wiki.query.near_text(
        query=domain_query,
        target_vector="main_vector",
        return_metadata=MetadataQuery(distance=True),
        limit=2,
    )

    print(f"\n🔎 '{domain_query}'")
    for rank, hit in enumerate(response.objects, start=1):
        score = 1 - hit.metadata.distance
        print(f"  {rank}. {hit.properties['title']} (similarity: {score:.3f})")

## Summary

This notebook demonstrated:
- Basic semantic search on Wikipedia articles
- Advanced queries across different topics and domains
- Similarity scoring and metadata analysis
- Comparative searches to understand semantic relationships
- Collection exploration and quality assessment

The pre-vectorized Wikipedia collection provides a rich dataset for testing various vector search scenarios and understanding semantic similarity in practice.

## Close the client

Always close your connection when finished.

In [None]:
# Close the client connection that was opened in the first code cell.
client.close()