# Collection setup and data loading

Learn how to create Weaviate collections and import data with vectorization. We'll create the FinancialArticles collection used in other notebooks.

In [1]:
# Restore WEAVIATE_IP from IPython's %store persistence
# (presumably saved earlier with `%store WEAVIATE_IP` in a setup notebook — confirm).
%store -r WEAVIATE_IP
# Echo the restored address so it is visible which instance the notebook targets.
print(f'Weaviate IP: {WEAVIATE_IP}')

Weaviate IP: 44.244.93.188


In [2]:
from boto3 import Session

# Resolve AWS credentials through boto3's default session so that no secret is
# hard-coded in the notebook.
session = Session()
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when no credentials can be found;
    # fail here with a clear message instead of an AttributeError below.
    raise RuntimeError(
        "No AWS credentials found. Configure AWS credentials before running this notebook."
    )
current_credentials = credentials.get_frozen_credentials()

AWS_ACCESS_KEY = current_credentials.access_key
AWS_SECRET_KEY = current_credentials.secret_key
AWS_SESSION_TOKEN = current_credentials.token

# Report only whether each value is present. Cell output is saved with the
# notebook, so printing the raw values would leak them to anyone who can read
# the file.
for label, value in (
    ("AWS_ACCESS_KEY", AWS_ACCESS_KEY),
    ("AWS_SECRET_KEY", AWS_SECRET_KEY),
    ("AWS_SESSION_TOKEN", AWS_SESSION_TOKEN),
):
    print(f"{label}:\t{'<set>' if value else '<missing>'}")
print(f"WEAVIATE_IP:\t{WEAVIATE_IP}")

AWS_ACCESS_KEY:	[REDACTED]
AWS_SECRET_KEY:	[REDACTED]
AWS_SESSION_TOKEN:	[REDACTED]
WEAVIATE_IP:	44.244.93.188

## Connect to Weaviate

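Connect to the Weaviate instance at `WEAVIATE_IP`, passing the AWS credentials as request headers so that Weaviate can call Amazon Bedrock to vectorize the data.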

In [3]:
import weaviate
import os
from weaviate.classes.init import Auth

# AWS credentials forwarded to Weaviate as request headers.
# NOTE(review): these secrets are sent to a remote host via connect_to_local —
# confirm the transport is acceptable for this environment.
aws_headers = {
    "X-AWS-Access-Key": AWS_ACCESS_KEY,
    "X-AWS-Secret-Key": AWS_SECRET_KEY,
    "X-AWS-Session-Token": AWS_SESSION_TOKEN,
}

# Open the client against the host stored in WEAVIATE_IP.
client = weaviate.connect_to_local(WEAVIATE_IP, headers=aws_headers)

# Last expression of the cell: the readiness flag is shown as the cell output.
client.is_ready()

True

## Create collection with vectorizer

Create a collection for financial articles with separate embeddings for title and content.

[Docs - Collection configuration](https://weaviate.io/developers/weaviate/manage-data/collections)

In [13]:
from weaviate.classes.config import Configure, Property, DataType

collection_name = "FinancialArticles"

# Start from a clean slate: drop any existing copy of the collection first.
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

# Bedrock settings shared by both named vectors.
# (Alternative model noted in the original cell: "cohere.embed-multilingual-v3")
bedrock_options = {
    "region": "us-west-2",
    "service": "bedrock",
    "model": "amazon.titan-embed-text-v2:0",
}

# One named vector per text field: "title" is built from article_title,
# "content" is built from article.
named_vectors = [
    Configure.Vectors.text2vec_aws(
        name=vector_name,
        source_properties=[source_property],
        **bedrock_options,
    )
    for vector_name, source_property in (
        ("title", "article_title"),
        ("content", "article"),
    )
]

# Property schema: three TEXT properties.
text_properties = [
    Property(name=property_name, data_type=DataType.TEXT)
    for property_name in ("article_title", "article", "url")
]

# Create the collection; the returned handle is displayed as the cell output.
client.collections.create(
    name=collection_name,
    vector_config=named_vectors,
    properties=text_properties,
)

<weaviate.collections.collection.sync.Collection at 0x7f84192937d0>

## Load financial articles data

Load the financial news dataset from a parquet file (only the first 100 articles are kept), then import it using batch processing for efficient loading.

In [14]:
import pandas as pd
from tqdm import tqdm
from weaviate.util import generate_uuid5

# Read the parquet dataset and keep only its first 100 rows
df = pd.read_parquet("data/fin_news_articles_5000.parquet").iloc[:100]

# Report the row count, then end the cell on a preview of the frame
print(f"Loaded {len(df)} articles")
print("Sample data:")
df.head()

Loaded 100 articles
Sample data:


Unnamed: 0,date,article_title,stock_symbol,url,article
0,2022-08-04 00:00:00 UTC,Ansys Q2 22 Earnings Conference Call At 8:30 A...,ANSS,https://www.nasdaq.com/articles/ansys-q2-22-ea...,(RTTNews) - Ansys Inc. (ANSS) will host a conf...
1,2012-11-26 00:00:00 UTC,Corrections Corporation of America (CXW) Ex-Di...,PAYX,https://www.nasdaq.com/articles/corrections-co...,Corrections Corporation of America ( CXW ) has...
2,2023-07-14 00:00:00 UTC,"Technology Sector Update for 07/14/2023: ASML,...",ASML,https://www.nasdaq.com/articles/technology-sec...,"Tech stocks were lower late Friday, with the T..."
3,2020-02-16 00:00:00 UTC,3 Best Biotech Stocks to Buy for the Next Decade,VRTX,https://www.nasdaq.com/articles/3-best-biotech...,Let me start out by acknowledging that predict...
4,2021-08-24 00:00:00 UTC,iShares U.S. Transportation ETF Experiences Bi...,CSX,https://www.nasdaq.com/articles/ishares-u.s.-t...,Looking today at week-over-week shares outstan...


In [15]:
# Handle to the FinancialArticles collection
articles = client.collections.use("FinancialArticles")

# Send the dataframe rows to Weaviate through a fixed-size batch
with articles.batch.fixed_size(batch_size=100, concurrent_requests=2) as batch:
    progress = tqdm(df.iterrows(), total=len(df))
    for _, record in progress:
        # Properties for this object; url falls back to "" when the column is absent
        properties = {
            "article_title": record["article_title"],
            "article": record["article"],
            "url": record["url"] if "url" in record else ""
        }

        # UUID derived from title + url (used to prevent duplicates)
        object_id = generate_uuid5(record["article_title"] + str(record.get("url", "")))

        batch.add_object(properties=properties, uuid=object_id)

        # Stop feeding the batch once more than 10 errors have been reported
        if batch.number_errors > 10:
            print("Too many errors during import")
            break

print(f"\nImported {len(articles)} articles")

100%|██████████| 100/100 [00:00<00:00, 9666.75it/s]
{'message': 'Failed to send 1 in a batch of 100', 'errors': {"couldn't invoke amazon.titan-embed-text-v2:0 model: operation error Bedrock Runtime: InvokeModel, https response error StatusCode: 400, RequestID: d490e8af-937f-4ca5-b473-bf30d8dce2aa, ValidationException: 400 Bad Request: Too many input tokens. Max input tokens: 8192, request input token count: 9078 "}}
{'message': 'Failed to send 1 objects in a batch of 100. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}



Imported 99 articles


## Check for import errors

In [16]:
# Objects the batch import recorded as failed
failed_objects = articles.batch.failed_objects
failure_count = len(failed_objects)

if failure_count == 0:
    print("Import completed successfully with no errors")
else:
    print(f"Import completed with {failure_count} errors")
    # Print at most the first five failures to keep the output short
    for failure in failed_objects[:5]:
        print(failure)

Import completed with 1 errors
ErrorObject(message="couldn't invoke amazon.titan-embed-text-v2:0 model: operation error Bedrock Runtime: InvokeModel, https response error StatusCode: 400, RequestID: d490e8af-937f-4ca5-b473-bf30d8dce2aa, ValidationException: 400 Bad Request: Too many input tokens. Max input tokens: 8192, request input token count: 9078 ", object_=BatchObject(collection='FinancialArticles', properties={'article_title': 'Why a Dividend Cut Can Be a Good Sign', 'article': 'In this episode of Motley Fool Money, Chris Hill chats with Motley Fool analysts Jason Moser, Ron Gross, and Andy Cross about the latest news. They talk about:\nThe potential impact of the slowdown over the coming year.\nThe importance of value and price when hunting for stocks at a discount.\nTwo great stock recommendations to put on you watch ist and much more.\nTo catch full episodes of all The Motley Fool\'s free podcasts, check out our podcast center. To get started investing, check out our quick-st

## Verify the data

Check that articles were imported correctly with embeddings.

In [17]:
# Report how many objects the collection holds
article_count = len(articles)
print(f"Total articles in collection: {article_count}")

# Fetch three objects and show each title with the first 100 characters of its article
response = articles.query.fetch_objects(limit=3)

for fetched in response.objects:
    props = fetched.properties
    print(f"Title: {props['article_title']}")
    print(f"Content preview: {props['article'][:100]}...\n")

Total articles in collection: 99
Title: Why Twenty-First Century Fox Stock Jumped 39% Last Year
Content preview: What happened
Shares of Twenty-First Century Fox (NASDAQ: FOXA) surged last year as the entertainmen...

Title: Accenture forecasts fourth-quarter revenue below estimates on forex hit
Content preview: adds forecast, recasts lead, shares
June 23 (Reuters) - IT services company Accenture Plc ACN.N fore...

Title: Noteworthy Tuesday Option Activity: GILD, CMI, AR
Content preview: Looking at options trading activity among components of the Russell 3000 index, there is noteworthy ...



In [18]:
# Fetch two objects together with both named vectors
requested_vectors = ["title", "content"]
response = articles.query.fetch_objects(limit=2, include_vector=requested_vectors)

# Print the length of each named vector per object
for fetched in response.objects:
    print(f"Article: {fetched.properties['article_title']}")
    vectors = fetched.vector
    print(f"Title vector dimension: {len(vectors['title'])}")
    print(f"Content vector dimension: {len(vectors['content'])}\n")

Article: Why Twenty-First Century Fox Stock Jumped 39% Last Year
Title vector dimension: 1024
Content vector dimension: 1024

Article: Accenture forecasts fourth-quarter revenue below estimates on forex hit
Title vector dimension: 1024
Content vector dimension: 1024



## Test basic search

Quick test to verify the collection works with semantic search.

In [19]:
# Semantic search against the "title" named vector, top three matches
search_args = {
    "query": "technology earnings",
    "target_vector": "title",
    "limit": 3,
}
response = articles.query.near_text(**search_args)

# List the title of every hit
print("Search results for 'technology earnings':")
for hit in response.objects:
    print(f"- {hit.properties['article_title']}")

Search results for 'technology earnings':
- Technology Sector Update for 07/14/2023: ASML, WDC, META, OPRA
- Wednesday Sector Laggards: Utilities, Technology & Communications
- Applied Materials (AMAT) Surpasses Q1 Earnings and Revenue Estimates


## Close the client

Always close your connection when finished.

In [20]:
# Close the Weaviate client connection now that the notebook is finished with it.
client.close()