# Collection setup and data loading

Learn how to create Weaviate collections and import data with vectorization. We'll create the FinancialArticles collection used in other notebooks.

In [1]:
%store -r WEAVIATE_IP
%store -r AWS_ACCESS_KEY
%store -r AWS_SECRET_KEY
%store -r AWS_SESSION_TOKEN

print(f"AWS_ACCESS_KEY:\t{AWS_ACCESS_KEY}")
print(f"AWS_SECRET_KEY:\t{AWS_SECRET_KEY}")
print(f"AWS_SESSION_TOKEN:\t{AWS_SESSION_TOKEN}")
print(f"WEAVIATE_IP:\t{WEAVIATE_IP}")

AWS_ACCESS_KEY:	<redacted>
AWS_SECRET_KEY:	<redacted>
AWS_SESSION_TOKEN:	<redacted>

## Connect to Weaviate

Connect to the Weaviate instance at `WEAVIATE_IP`, passing the AWS credentials restored with `%store -r` above as request headers.

In [2]:
import weaviate
import os
from weaviate.classes.init import Auth

# AWS credentials are sent to Weaviate as request headers on this connection.
aws_headers = {
    "X-AWS-Access-Key": AWS_ACCESS_KEY,
    "X-AWS-Secret-Key": AWS_SECRET_KEY,
    "X-AWS-Session-Token": AWS_SESSION_TOKEN,
}

# Open the connection against the host restored into WEAVIATE_IP.
client = weaviate.connect_to_local(WEAVIATE_IP, headers=aws_headers)

# Last expression of the cell: shows whether the instance reports ready.
client.is_ready()

True

## Create collection with vectorizer

Create a collection for financial articles with separate embeddings for title and content.

[Docs - Collection configuration](https://weaviate.io/developers/weaviate/manage-data/collections)

In [7]:
from weaviate.classes.config import Configure, Property, DataType

# Start from a clean slate: drop any existing copy of the collection.
# NOTE: this deletes all objects previously stored in it.
collection_name = "FinancialArticles"
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

# Both named vectors use the same AWS Bedrock embedding settings.
# (Alternative model previously noted here: "cohere.embed-multilingual-v3".)
bedrock_settings = {
    "region": "us-west-2",
    "service": "bedrock",
    "model": "amazon.titan-embed-text-v2:0",
}

# Named vector -> the single text property it is embedded from.
vector_sources = {
    "title": "article_title",
    "content": "article",
}

# Create the collection with one named vector per entry above and three
# TEXT properties. Kept as the last expression so the cell displays the
# returned collection object.
client.collections.create(
    name=collection_name,
    vector_config=[
        Configure.Vectors.text2vec_aws(
            name=vector_name,
            source_properties=[source_property],
            **bedrock_settings,
        )
        for vector_name, source_property in vector_sources.items()
    ],
    properties=[
        Property(name=property_name, data_type=DataType.TEXT)
        for property_name in ("article_title", "article", "url")
    ],
)

<weaviate.collections.collection.sync.Collection at 0x7f82b43f3080>

## Load financial articles data

Import the financial news dataset using batch processing for efficient loading.

In [8]:
import pandas as pd
from tqdm import tqdm
from weaviate.util import generate_uuid5

# Read the parquet file with the financial news articles into a DataFrame.
dataset_path = "data/fin_news_articles_5000.parquet"
df = pd.read_parquet(dataset_path)

# Report how many rows were read, then preview the first rows as the
# cell's rich output.
print(f"Loaded {len(df)} articles")
print("Sample data:")
df.head()

Loaded 5000 articles
Sample data:


Unnamed: 0,date,article_title,stock_symbol,url,article
0,2022-08-04 00:00:00 UTC,Ansys Q2 22 Earnings Conference Call At 8:30 A...,ANSS,https://www.nasdaq.com/articles/ansys-q2-22-ea...,(RTTNews) - Ansys Inc. (ANSS) will host a conf...
1,2012-11-26 00:00:00 UTC,Corrections Corporation of America (CXW) Ex-Di...,PAYX,https://www.nasdaq.com/articles/corrections-co...,Corrections Corporation of America ( CXW ) has...
2,2023-07-14 00:00:00 UTC,"Technology Sector Update for 07/14/2023: ASML,...",ASML,https://www.nasdaq.com/articles/technology-sec...,"Tech stocks were lower late Friday, with the T..."
3,2020-02-16 00:00:00 UTC,3 Best Biotech Stocks to Buy for the Next Decade,VRTX,https://www.nasdaq.com/articles/3-best-biotech...,Let me start out by acknowledging that predict...
4,2021-08-24 00:00:00 UTC,iShares U.S. Transportation ETF Experiences Bi...,CSX,https://www.nasdaq.com/articles/ishares-u.s.-t...,Looking today at week-over-week shares outstan...


In [9]:
# Handle to the collection the objects are written into.
articles = client.collections.use("FinancialArticles")

# Send the rows to Weaviate in fixed-size batches of 100 objects.
with articles.batch.fixed_size(batch_size=100) as batch:
    for _, row in tqdm(df.iterrows(), total=len(df)):
        title = row["article_title"]
        # Fall back to an empty string when the frame has no "url" column.
        link = row["url"] if "url" in row else ""

        # The UUID is derived from title + url, so the same article always
        # maps to the same object id.
        object_uuid = generate_uuid5(title + str(link))

        batch.add_object(
            properties={
                "article_title": title,
                "article": row["article"],
                "url": link,
            },
            uuid=object_uuid,
        )

        # Stop early once more than 10 errors have been reported.
        if batch.number_errors > 10:
            print("Too many errors during import")
            break

print(f"\nImported {len(articles)} articles")

100%|██████████| 5000/5000 [01:10<00:00, 71.15it/s]



Imported 5000 articles


## Check for import errors

In [6]:
# Objects the batch import reported as failed.
failed = articles.batch.failed_objects

if not failed:
    print("Import completed successfully with no errors")
else:
    print(f"Import completed with {len(failed)} errors")
    # Print at most the first five failures to keep the output short.
    for err in failed[:5]:
        print(err)

Import completed successfully with no errors


## Verify the data

Check that articles were imported correctly with embeddings.

In [None]:
# Number of objects currently stored in the collection.
print(f"Total articles in collection: {len(articles)}")

# Fetch three objects and print each title with the first 100 characters
# of the article body.
response = articles.query.fetch_objects(limit=3)

for item in response.objects:
    props = item.properties
    print(f"Title: {props['article_title']}")
    print(f"Content preview: {props['article'][:100]}...\n")

In [None]:
# Fetch two objects together with both named vectors and print the length
# of each vector, confirming that embeddings are present.
named_vectors = ["title", "content"]
response = articles.query.fetch_objects(limit=2, include_vector=named_vectors)

for item in response.objects:
    vectors = item.vector
    print(f"Article: {item.properties['article_title']}")
    print(f"Title vector dimension: {len(vectors['title'])}")
    print(f"Content vector dimension: {len(vectors['content'])}\n")

## Test basic search

Quick test to verify the collection works with semantic search.

In [None]:
# Smoke test: run a near-text query against the "title" named vector and
# list the titles of the top three matches.
search_args = {
    "query": "technology earnings",
    "target_vector": "title",
    "limit": 3,
}
response = articles.query.near_text(**search_args)

print("Search results for 'technology earnings':")
for match in response.objects:
    print(f"- {match.properties['article_title']}")

## Close the client

Always close your connection when finished.

In [None]:
# Close the connection opened at the top of the notebook; cells that use
# `client` or `articles` should not be run after this point.
client.close()