In [1]:
import os
import time
from dotenv import load_dotenv

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import google.generativeai as genai
from langchain_text_splitters import RecursiveCharacterTextSplitter

from articleCleaner import clean_all_articles
from articleFetcher import fetchArticles
from embeddingFuncs import embedArticle
from embeddingFuncs import embedChunksAsArticle
from embeddingFuncs import generateQueryEmbedding

# Load environment variables from .env file
load_dotenv()

# Access variables
GOOGLE_GENAI_API_KEY = os.getenv("GOOGLE_GENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Configure the Google Generative AI library
genai.configure(api_key=GOOGLE_GENAI_API_KEY)

# Configure the Pinecone database
pc = Pinecone(api_key=PINECONE_API_KEY)

# Constants used throughout
EMBEDDING_MODEL = "models/text-embedding-004"
MODEL_MAX_CHUNKS = 9500
CHUNK_OVERLAP = 200

# Fetch & Clean Articles

In [2]:
# Fetch articles
STARTING_PAGE = 1
ENDING_PAGE = 1
articles = fetchArticles(starting_page=STARTING_PAGE, ending_page=ENDING_PAGE)

if (articles != -1):
    # Clean articles
    print(len(articles), "articles successfully fetched")
    clean_all_articles(articles)
else:
    print("Error fetching articles.")

Successfully fetched articles from page 1
100 articles successfully fetched


# Create Pinecone Index (Only Run Once)

In [3]:
# Create new Pinecone index (only run this once for one index)
index_name = "768dim"

# We still don't know what an index is :) maybe we won't ever know
pc.create_index(
    name=index_name,
    dimension=768, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-west-2"
    ) 
)

# Embed Articles


In [None]:

# Generate embeddings
embeddings = []
for article in articles:
    content = article['content']['rendered']
    article_id = str(article['id'])
    
    # Only run if there is both content to embed and an id to associate it with
    if content and article_id:
        try:
            # Embed the article
            embedArticle(genai, embeddings, EMBEDDING_MODEL, article)
        
        # The article may be too big. In that case, try splitting it into chunks
        except Exception as e:
            print(f"Error generating embedding for article {article_id}: {e} (index {articles.index(article)}). Attempting to split into chunks...")
            
            # Split text into chunks of up to 10000
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=(MODEL_MAX_CHUNKS-CHUNK_OVERLAP), chunk_overlap=CHUNK_OVERLAP)
            texts = text_splitter.split_text(content)

            # Embed the chunks
            embed_success = embedChunksAsArticle(genai, embeddings, EMBEDDING_MODEL, article, texts)

            # If the embed of chunks was successful, print to indicate
            if (embed_success):
                print(f"Successfully split into {len(texts)} chunks.")
    # If the article is missing an id or content, skip it
    else:
        print(f"Skipping article with missing content or ID: {article}")

print(embeddings[0])

Skipping article with missing content or ID: {'id': 474862, 'date': '2024-11-22T11:08:38', 'date_gmt': '2024-11-22T19:08:38', 'guid': {'rendered': 'https://dailybruin.com/?p=474862'}, 'modified': '2024-11-22T11:08:38', 'modified_gmt': '2024-11-22T19:08:38', 'slug': 'rivalry-issue-2024', 'status': 'publish', 'type': 'post', 'link': 'https://features.dailybruin.com/2024/rivalry-issue-2024/', 'title': {'rendered': 'Rivalry Issue 2024'}, 'content': {'rendered': '', 'protected': False}, 'excerpt': {'rendered': '', 'protected': False}, 'author': 5011, 'featured_media': 474863, 'comment_status': 'open', 'ping_status': 'closed', 'sticky': False, 'template': '', 'format': 'standard', 'meta': {'ngg_post_thumbnail': 0}, 'categories': [1435, 12329, 1431], 'tags': [4849], 'acf': {'db_article_format': 'default', 'db_subhead': '', 'db_infobox': '', 'db_display_options': ['mugshot'], 'db_number_of_paws': '0', 'db_link': 'https://features.dailybruin.com/2024/rivalry-issue-2024/', 'db_gallery_id': '', '

In [5]:
print(embeddings[5])

{'id': '474805_chunk0', 'values': [0.019146547, 0.031711392, -0.053437013, -0.02842133, 0.050757315, 0.021991286, 0.06506992, -0.00029112797, -0.03798336, 0.01458556, 0.007812995, 0.051237755, 0.07057972, 0.022170378, 0.016413862, -0.037984904, 0.035407446, 0.044826683, -0.036138467, -0.0132068535, -0.03588374, -0.003093484, 0.0006684202, -0.041698173, -0.054381553, -0.045001123, 0.008250715, -0.0013165585, 0.00810528, -0.008040871, 0.02830547, 0.057916418, 0.038172223, -0.024225697, -0.014857428, 0.05292677, -0.05761495, -0.020760795, 0.028597297, -0.061195113, -0.05368167, 0.032104664, -0.037203066, 0.07186823, 0.029878045, -0.03953896, 0.008316112, -0.015504016, -0.02657268, 0.017227886, -0.056058712, 0.016214356, -0.022564972, 0.006426252, -0.004885253, -0.01744472, -0.04872045, 0.019485505, 0.043086674, 0.012142081, -0.020820944, -0.00947134, 0.030554218, -0.047706258, 0.004535533, 0.011879919, -0.055360377, -0.033900246, -0.09895191, -0.017511383, -0.026995871, 0.0377095, 0.02171

# Upsert Data

In [6]:
# Wait for the index to be ready
while not pc.describe_index("768dim").status['ready']:
    time.sleep(1)

index = pc.Index("768dim")

index.upsert(
    vectors=embeddings
)

upserted_count: 102

In [7]:
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 102}},
 'total_vector_count': 102}


# Query

In [5]:
query = "Tell me about the UCLA vs USC rivalry week."

embedding = generateQueryEmbedding(genai=genai,
                                   embedding_model=EMBEDDING_MODEL,
                                   query=query)

print(embedding)


[0.029556723, 0.043099117, 0.035590913, -0.017413387, -0.0067362585, 0.012077898, 0.0009641394, 0.021458013, -0.03668091, -0.029928058, 0.009321113, -0.00083798805, 0.027120002, 0.05949801, -0.0006805549, -0.036034405, 0.0052870554, 0.031724185, -0.034881968, -0.053180777, 0.0036514574, -0.037847456, 0.011511457, 0.018021103, -0.058684982, 0.014516724, 0.007933762, 0.0044077337, -0.026602382, -0.011998659, -0.0038940387, 8.253991e-05, -0.022759434, -0.034792293, -0.010642159, 0.028791886, 0.02458089, -0.03551998, -0.010502192, 0.0011906658, -0.018374506, 0.06561581, 0.01633788, 0.02481278, 0.06238692, 0.0064852852, 0.015355291, 0.04674555, -0.02337698, 0.05632215, -0.027027585, 0.049947813, -0.009030428, 0.016939009, 0.013650355, 0.009596712, -0.054164264, 0.024588365, -0.026142024, 0.022534428, -0.03352221, -0.0018216543, 0.0031953494, -0.03550093, 0.02772857, 0.0032096684, -0.025771609, 0.02382891, -0.028914468, 0.008326313, -0.03538421, -0.027091246, 0.00824265, 0.021469172, -0.0104

In [None]:
results = index.query(
    namespace="ns1",
    vector=embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)