In [1]:
import os
import time
from dotenv import load_dotenv

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import google.generativeai as genai
from langchain_text_splitters import RecursiveCharacterTextSplitter

from articleCleaner import clean_all_articles
from articleCleaner import clean_article
from articleFetcher import fetchArticles
from articleFetcher import fetchArticleById
from embeddingFuncs import embedArticle
from embeddingFuncs import embedChunksAsArticle
from embeddingFuncs import generateQueryEmbedding

# Load environment variables from .env file
load_dotenv()

# Access variables
GOOGLE_GENAI_API_KEY = os.getenv("GOOGLE_GENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Configure the Google Generative AI library
genai.configure(api_key=GOOGLE_GENAI_API_KEY)

# Configure the Pinecone database
pc = Pinecone(api_key=PINECONE_API_KEY)

# Constants used throughout
EMBEDDING_MODEL = "models/text-embedding-004"
MODEL_MAX_CHUNKS = 9500
CHUNK_OVERLAP = 200

# Fetch & Clean Articles

In [16]:
# Fetch articles
STARTING_PAGE = 1
ENDING_PAGE = 3
articles = fetchArticles(starting_page=STARTING_PAGE, ending_page=ENDING_PAGE)

if (articles != -1):
    # Clean articles
    print(len(articles), "articles successfully fetched")
    clean_all_articles(articles)
else:
    print("Error fetching articles.")

Successfully fetched articles from page 1
Successfully fetched articles from page 2
Successfully fetched articles from page 3
300 articles successfully fetched


# Create Pinecone Index (Only Run Once)

In [3]:
# Create new Pinecone index (only run this once for one index)
index_name = "768dim"

# We still don't know what an index is :) maybe we won't ever know
pc.create_index(
    name=index_name,
    dimension=768, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-west-2"
    ) 
)

# Embed Articles


In [17]:

# Generate embeddings
embeddings = []
for article in articles:
    content = article['content']['rendered']
    article_id = str(article['id'])
    
    # Only run if there is both content to embed and an id to associate it with
    if content and article_id:
        try:
            # Embed the article
            embedArticle(genai, embeddings, EMBEDDING_MODEL, article)
        
        # The article may be too big. In that case, try splitting it into chunks
        except Exception as e:
            print(f"Error generating embedding for article {article_id}: {e} (index {articles.index(article)}). Attempting to split into chunks...")
            
            # Split text into chunks of up to 10000
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=(MODEL_MAX_CHUNKS-CHUNK_OVERLAP), chunk_overlap=CHUNK_OVERLAP)
            texts = text_splitter.split_text(content)

            # Embed the chunks
            embed_success = embedChunksAsArticle(genai, embeddings, EMBEDDING_MODEL, article, texts)

            # If the embed of chunks was successful, print to indicate
            if (embed_success):
                print(f"Successfully split into {len(texts)} chunks.")
    # If the article is missing an id or content, skip it
    else:
        print(f"Skipping article with missing content or ID: {article}")

print(f"Successfully embedded {len(embeddings)} articles")

Skipping article with missing content or ID: {'id': 474939, 'date': '2024-11-23T16:38:50', 'date_gmt': '2024-11-24T00:38:50', 'guid': {'rendered': 'https://dailybruin.com/?p=474939'}, 'modified': '2024-11-25T14:35:18', 'modified_gmt': '2024-11-25T22:35:18', 'slug': 'live-ucla-vs-usc', 'status': 'publish', 'type': 'post', 'link': 'https://dailybruin.com/category/breaking/ucla-vs-usc-2024', 'title': {'rendered': 'UCLA football succumbs to USC in 2024 Battle for Los Angeles – as it happened'}, 'content': {'rendered': '', 'protected': False}, 'excerpt': {'rendered': '', 'protected': False}, 'author': 5669, 'featured_media': 475047, 'comment_status': 'open', 'ping_status': 'closed', 'sticky': False, 'template': '', 'format': 'standard', 'meta': {'ngg_post_thumbnail': 0}, 'categories': [1435, 1431], 'tags': [4849], 'acf': {'db_article_format': 'default', 'db_subhead': '', 'db_infobox': '', 'db_display_options': ['mugshot'], 'db_number_of_paws': '0', 'db_link': 'https://dailybruin.com/categor

# Upsert Data

In [25]:
# Wait for the index to be ready
while not pc.describe_index("768dim").status['ready']:
    time.sleep(1)

index = pc.Index("768dim")

index.upsert(
    vectors=embeddings
)

upserted_count: 302

In [27]:
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 302}},
 'total_vector_count': 302}


# Query

In [30]:
query = "How did UCLA handle stresses caused by the election?"

embedding = generateQueryEmbedding(genai=genai,
                                   embedding_model=EMBEDDING_MODEL,
                                   query=query)

# Wait for the index to be ready
while not pc.describe_index("768dim").status['ready']:
    time.sleep(1)

index = pc.Index("768dim")

results = index.query(
    vector=embedding,
    top_k=5,
    include_values=False,
    include_metadata=True
)

# Generate Response


In [31]:
model = genai.GenerativeModel("gemini-1.5-flash")

context = ""

for result in results['matches']:
    id = result['id']
    article = fetchArticleById(id)
    cleanedArticle = clean_article(article)

    link = result['metadata']['link']
    context += f"""\nARTICLE START (Source: {link})\n
    {cleanedArticle}
    \nARTICLE END\n
    """

instructions = f"""
You are an expert in whatever context is provided. Provide only factual information that you can back up using the context. Only mention facts, while keeping a light tone. Act like you are responding direclty to a question as a human.
DO NOT SHARE REFERENCE URLS THAT ARE NOT INCLUDED IN THE CONTEXT BLOCK.
You will not apologize for previous responses, but instead will indicate new information was gained.
If user asks about or refers to the current "workspace" AI will refer to the the content after START CONTEXT BLOCK and before END OF CONTEXT BLOCK as the CONTEXT BLOCK. 
If you are asked to give quotes, please bias towards providing reference links to the original source of the quote.
You will take into account any CONTEXT BLOCK that is provided in a conversation. It will say it does not know if the CONTEXT BLOCK is empty.
You will not invent anything that is not drawn directly from the context.
You will not answer questions that are not related to the context.
The question that is being asked is below. Respond directly to this question only with the context provided.
START QUESTION BLOCK
{query}
END QUESTION BLOCK

START CONTEXT BLOCK
{context}
END OF CONTEXT BLOCK
"""

response = model.generate_content(instructions)
print(response.text)

Based on the provided text, UCLA offered resources to help students manage election-related stress.  Octave, a mental health service, had outreach teams at UCLA voting centers on Election Day, providing resources like breathing exercises and access to therapy.  Many students reported increased stress due to the election, citing impacts on motivation, sleep, and focus.  Some students felt more animosity in this election than previous ones, and the uncertainty surrounding the results caused significant stress, particularly for non-citizens.  UCLA also hosted several election watch parties, which provided a community setting for students to share reactions and process their emotions.

