In [1]:
import boto3
import pandas as pd
from io import BytesIO

In [2]:
# --- Run configuration -------------------------------------------------------
# Data source: which country's master file to read, and from which S3 bucket.
s3_bucket = "eurovoices-news-articles-data"
country = "Latvia"

# Vector store / embedding settings.
index_name = "eurovoices-news-articles-vdb"
embbeding_model = "voyage-3.5"

# Number of chunks embedded and upserted per request.
batch_size = 100

In [3]:
# Download the country's master parquet file from S3 and load it in memory.
s3 = boto3.client("s3")
master_key = f"{country}_master.parquet.gzip"
country_data_s3 = s3.get_object(Bucket=s3_bucket, Key=master_key)

# Parse the downloaded bytes and keep a single row per article id.
master_buffer = BytesIO(country_data_s3["Body"].read())
country_data_df = pd.read_parquet(master_buffer, engine="pyarrow").drop_duplicates(
    subset="id"
)

# One plain dict per article, consumed by the chunking step.
country_data = country_data_df.to_dict(orient="records")

In [4]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Tokenizer used to measure how long an article is, in tokens.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

# Splitter applied to long articles: tries paragraph breaks first, then line
# breaks, then sentence ends.
# NOTE(review): no length_function is passed, so chunk_size / chunk_overlap are
# presumably counted in characters rather than tokens, while the article length
# above is measured in tokens -- confirm this mismatch is intended.
split_separators = ["\n\n", "\n", ". "]
splitter = RecursiveCharacterTextSplitter(
    separators=split_separators,
    chunk_overlap=75,
    chunk_size=400,
)

In [5]:
def prepare_document_for_embedding(doc, country):
    """Split one article into chunks and attach the metadata stored with each.

    Articles of at most 500 tokens (measured with the module-level `encoding`)
    are kept as a single chunk; longer ones are split with the module-level
    `splitter`.

    Parameters
    ----------
    doc : dict
        Article record. Reads the keys "id", "content_trans", "link",
        "title_trans", "domain_url", "pillar_1" .. "pillar_8",
        "impact_score" and "published_date".
    country : str
        Country label copied into every chunk's metadata.

    Returns
    -------
    list[dict]
        One dict per chunk with the keys "id", "text" and "metadata".
        "id" is "<document id>#<chunk index>", so it is unique per chunk;
        the parent id is kept in metadata["document_id"]. The list is empty
        when the article has no usable text.
    """
    doc_id = doc["id"]
    text = doc["content_trans"]

    # A missing (None / NaN) or blank translation cannot be tokenized or
    # embedded, so the article contributes no chunks.
    if not isinstance(text, str) or not text.strip():
        return []

    text_length = len(encoding.encode(text))

    if text_length <= 500:
        chunks = [text]
    else:
        chunks = splitter.split_text(text)

    # Trim separator residue (dots / spaces) and drop chunks that end up empty.
    cleaned_chunks = [chunk.strip(". ") for chunk in chunks]
    cleaned_chunks = [chunk for chunk in cleaned_chunks if chunk]

    # Metadata shared by every chunk of this article.
    shared_metadata = {
        "document_id": str(doc_id),
        "url": doc["link"],
        "title": doc["title_trans"],
        "source": doc["domain_url"],
        "country": country,
        **{f"pillar_{n}": doc[f"pillar_{n}"] for n in range(1, 9)},
        "impact_score": doc["impact_score"],
        "published_date": doc["published_date"],
    }

    chunk_data_list = [
        {
            # The id must differ per chunk: reusing the bare document id makes
            # each upserted chunk overwrite the previous one in the index.
            "id": f"{doc_id}#{i}",
            "text": chunk,
            "metadata": {"chunk_id": i, **shared_metadata},
        }
        for i, chunk in enumerate(cleaned_chunks)
    ]

    return chunk_data_list

In [6]:
# Chunk every article. `processed_chunks` keeps one list of chunks per article,
# `flattened_processed_chunks` is the same content as a single flat list.
processed_chunks = []
flattened_processed_chunks = []
for doc in country_data:
    doc_chunks = prepare_document_for_embedding(doc, country=country)
    processed_chunks.append(doc_chunks)
    flattened_processed_chunks.extend(doc_chunks)

In [37]:
import os
import voyageai
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# Load API keys from a local .env file into the process environment, then
# build the embedding (Voyage AI) and vector database (Pinecone) clients.
load_dotenv()

vc = voyageai.Client(api_key=os.environ.get("VOYAGEAI_API_KEY"))
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY"),
    pool_threads=30,
)

In [8]:
# Create the serverless index on first use; later runs reuse the existing one.
index_already_exists = pc.has_index(index_name)
if not index_already_exists:
    # NOTE(review): 1024 presumably matches the output dimension of
    # `embbeding_model` -- confirm before switching models.
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        index_name,
        spec=serverless_spec,
        metric="cosine",
        dimension=1024,
    )

In [None]:
def embbed_and_ingest(
        batch, vc, pc, 
        country,
        index_name=index_name,
        embbeding_model=embbeding_model,
        namespace="testing"
):
    """Embed one batch of chunks and upsert the resulting vectors.

    Parameters
    ----------
    batch : list[dict]
        Chunk records, each with the keys "id", "text" and "metadata".
    vc
        Embedding client; `vc.embed(...)` must return an object exposing
        `.embeddings`.
    pc
        Pinecone client used to open the target index.
    country
        Currently unused; kept so existing callers keep working.
    index_name : str
        Index to write to. The default is bound when this function is
        defined, so re-run this cell after changing the module-level value.
    embbeding_model : str
        Embedding model name (default bound at definition time, as above).
    namespace : str
        Index namespace receiving the vectors. Defaults to "testing", the
        value that was previously hard-coded.

    Returns
    -------
    The resolved upsert response, or None when `batch` is empty.

    NOTE(review): only the metadata is stored with each vector, not the chunk
    text itself -- confirm that retrieval does not need the text.
    """
    # Nothing to embed: skip the API calls instead of sending an empty request.
    if not batch:
        return None

    ids = [item["id"] for item in batch]
    texts = [item["text"] for item in batch]
    metadata = [item["metadata"] for item in batch]

    embeddings = vc.embed(
        texts,
        model = embbeding_model,
        input_type = "document"
    ).embeddings

    vectors = [
        {"id": vector_id, "values": values, "metadata": meta}
        for vector_id, values, meta in zip(ids, embeddings, metadata)
    ]

    with pc.Index(index_name, pool_threads=30) as index:
        async_result = index.upsert(
            vectors=vectors, 
            namespace=namespace,
            async_req=True
        )
        # Resolve the request before leaving the `with` block, i.e. while the
        # index client opened above is still in use.
        return async_result.get()

In [14]:
# Embed and upsert the chunks in consecutive slices of `batch_size` items.
# `r` holds the response of the most recent batch only.
total_chunks = len(flattened_processed_chunks)
for batch_start in range(0, total_chunks, batch_size):
    batch_end = batch_start + batch_size
    current_batch = flattened_processed_chunks[batch_start:batch_end]
    r = embbed_and_ingest(
        current_batch,
        vc=vc,
        pc=pc,
        country=country,
        index_name=index_name,
    )