In [None]:
import os
import time
from dotenv import load_dotenv

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import google.generativeai as genai

from articleCleaner import clean_all_articles
from articleFetcher import fetchArticles

# Load environment variables from .env file
load_dotenv()

# Access variables
GOOGLE_GENAI_API_KEY = os.getenv("GOOGLE_GENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Configure the Google Generative AI library
genai.configure(api_key=GOOGLE_GENAI_API_KEY)

# Configure the Pinecone database
pc = Pinecone(api_key=PINECONE_API_KEY)

# Fetch & Clean Articles

In [5]:
# Fetch articles
STARTING_PAGE = 1
ENDING_PAGE = 3
articles = fetchArticles(starting_page=STARTING_PAGE, ending_page=ENDING_PAGE)

if (articles != -1):
    # Clean articles
    print(len(articles), "articles successfully fetched")
    clean_all_articles(articles)
else:
    print("Error fetching articles.")

Successfully fetched articles from page 1
Successfully fetched articles from page 2
Successfully fetched articles from page 3
300 articles successfully fetched


# Upsert Data

In [None]:
# Create new Pinecone index (only run this once for one index)
index_name = "768dim"

# We still don't know what an index is :) maybe we won't ever know
pc.create_index(
    name=index_name,
    dimension=768, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [None]:
# Generate embeddings
embeddings = []
for article in articles:
    content = article['content']['rendered']
    article_id = str(article['id'])
    
    if content and article_id:
        try:
            # Generate embedding using Google Gemini
            embedding_response = genai.embed_content(
                  model="models/text-embedding-004",
                  content=content)
            embedding_vector = embedding_response['embedding']
            print(embedding_vector)
            
            # Append to embeddings list
            embeddings.append({
                "id": article_id,
                "values": embedding_vector,
                "metadata": {
                    "date": article['date'],
                    "date_gmt": article['date_gmt'],
                    "link": article['link']
                }
            })
            
        except Exception as e:
            print(f"Error generating embedding for article {article_id}: {e}")
    else:
        print(f"Skipping article with missing content or ID: {article}")

print(embeddings[0])

{'values': [0.04913330078125, -0.01306915283203125, ..., -0.0196990966796875, -0.0110321044921875]}


In [None]:
# Wait for the index to be ready
while not pc.describe_index("768dim").status['ready']:
    time.sleep(1)

index = pc.Index("768dim")

index.upsert(
    vectors=embeddings
)

In [None]:
print(index.describe_index_stats())

# Query

In [None]:
query = "Tell me about the tech company known as Apple."

embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

In [None]:
results = index.query(
    namespace="ns1",
    vector=embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)