In [2]:
import os
from dotenv import load_dotenv

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import google.generativeai as genai

from articleCleaner import clean_all_articles

# Load environment variables from .env file
load_dotenv()

# Access variables
GOOGLE_GENAI_API_KEY = os.getenv("GOOGLE_GENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Configure the Google Generative AI library
genai.configure(api_key=GOOGLE_GENAI_API_KEY)

# Configure the Pinecone database
pc = Pinecone(api_key=PINECONE_API_KEY)

# Format articles

In [3]:
# Sample articles with HTML and encoded characters
articles = [
    {"id": "article1", "text": "<p>This is an example with &#8217; encoded character.</p>"},
    {"id": "article2", "text": "<p>Another example with <b>bold</b> text and &amp; sign.</p>"},
    {"id": "article3", "text": "<p>Whether it&#8217;s the heartwarming story of two young sisters or the tale of a pirate who falls in love with the moon, UCLA alumni are making their artistic voices heard at an upcoming screening.</p>\n<p>On Nov. 23, Los Angeles filmmakers, including two UCLA alumni, will present their thesis films at the Los Angeles International Children&#8217;s Film Festival-hosted event &#8220;L.A. Grown: A Showcase of Short Thesis Films.&#8221; Among the roster of up-and-coming artists are UCLA alumni Charlotte Oxley, who will showcase the drama &#8220;Purple and Green,&#8221; and Courtney Chapman, presenting the short &#8220;Land Lover.&#8221; The screenings will also include a panel featuring a special guest: Maija Burnett, the director of the character animation program at CalArts, who is also a UCLA alumnus. Oxley said she hopes the festival will inspire the next generation of filmmakers. </p>\n<p>&#8220;I hope that a younger audience coming to one of the LA International Children&#8217;s Film Festival&#8217;s festivals is inspired to create and to keep doing,&#8221; Oxley said. &#8220;I hope that they&#8217;re inspired and continue to be inspired &#8230; and they saw these amazing films, and are like, &#8216;I could do that too.&#8217;&#8221; </p>\n<p><b>[Related:</b><a target=\"_blank\" rel=\"nofollow noopener noreferrer\" href=\"https://dailybruin.com/2024/10/29/girl-meets-world-alumnus-august-maturo-takes-on-love-of-education-at-ucla\"><b> &#8216;Girl Meets World&#8217; alumnus August Maturo takes on love of education at UCLA</b></a><b>]</b></p>\n<p>Oxley&#8217;s film, &#8220;Purple and Green,&#8221; follows two young sisters whose tight-knit bond helps them overcome their parents&#8217; divorce. During the production process, Oxley said she was able to learn the ins and outs of working with child actors. The filmmaker added that she ensured the environment on set was one in which the actresses could stretch their creative muscles without being overwhelmed. Oxley sung the actresses&#8217; praises, saying she discussed exploring these complex themes of family and divorce with the girls and was left amazed by their intelligence and professionalism.</p>\n<figure id=\"attachment_473657\" class=\"thumbnail wp-caption aligncenter\" style=\"width: 640px\"><img alt='Pictured is a film still from Charlotte Oxley&squot;s \"Purple and Green\" showing two girls laying next to each other as they draw in the park. Oxley&squot;s film will screen at The Los Angeles International Children&squot;s Film Festival&squot;s \"L.A. Grown: A Showcase of Short Thesis Films\" on Nov. 23. (Courtesy of Charlotte Oxley)' src=\"https://wp.dailybruin.com/images/2024/11/web.ae_.childrensfilmfestival.courtesy.jpg\" data-uniq-id=\"bd108\" width=\"640\" height=\"817\" class=\"aligncenter wp-image-473657 size-large\" srcset=\"https://wp.dailybruin.com/images/2024/11/web.ae_.childrensfilmfestival.courtesy.jpg 640w, https://wp.dailybruin.com/images/2024/11/web.ae_.childrensfilmfestival.courtesy-235x300.jpg 235w, https://wp.dailybruin.com/images/2024/11/web.ae_.childrensfilmfestival.courtesy-118x150.jpg 118w, https://wp.dailybruin.com/images/2024/11/web.ae_.childrensfilmfestival.courtesy-573x732.jpg 573w, https://wp.dailybruin.com/images/2024/11/web.ae_.childrensfilmfestival.courtesy-501x640.jpg 501w\" sizes=\"(max-width: 640px) 100vw, 640px\" /><figcaption class=\"caption wp-caption-text\">Pictured is a film still from Charlotte Oxley&#39;s &quot;Purple and Green&quot; showing two girls laying next to each other as they draw in the park. Oxley&#39;s film will screen at The Los Angeles International Children&#39;s Film Festival&#39;s &quot;L.A. Grown: A Showcase of Short Thesis Films&quot; on Nov. 23. (Courtesy of Charlotte Oxley)</figcaption></figure>\n<p>Oxley&#8217;s film openly discusses the effects of divorce, but she said one of the most prominent themes is the importance of sisterhood. She said she and her sister have an incredibly close relationship. The alumnus added that she&#8217;s interested in telling the stories of siblings, as well as the effect divorce can have on children.</p>\n<p>&#8220;I was watching a lot of media about divorce &#8230; and I hadn&#8217;t seen a lot of that (media) from the children&#8217;s point of view where it&#8217;s focusing on them,&#8221; Oxley said.</p>\n<p>Oxley said it is important for topics like these not to be dumbed down, even if the intended audience is made up of children and families. The filmmaker said she hopes to emphasize the significance of having a piece of media that children of divorce or young adults who went through the experience as adolescents would be able to fully relate to without the ugliness of the topic being excluded. She said she is honored to be part of this festival and hopes her film will have a positive effect on the future generation of filmmakers.</p>\n<p>&#8220;I&#8217;ve been so grateful for this festival. &#8230; Seeing how children&#8217;s content is such a wide range of things &#8230; It&#8217;s really cool to watch,&#8221; Oxley said. </p>\n<p>Chapman, director of the stop motion film &#8220;Land Lover,&#8221; said she started working with clay when she was around seven years old. The UCLA alumnus said she was in high school when she discovered she could combine her love of clay and theater by creating stop-motion  films. When it came time for her to create her thesis film, she said she decided it would be best to make a film that incorporates her third love: Pirates. The animated film follows Miles, a lone swabbie who is moved to sing in adoration for his love of the moon.</p>\n<p>When the filmmaker was at a loss for a proper shooting space, she said she used her bedroom to film. Over the course of the two-and-a-half-month process, Chapman said she created a miniature pirate ship and ocean, one that made her ability to get to her queen-sized bed more difficult. To accommodate the lost space, she had to get rid of her desk. The animator said her motivation never wavered during the process, as she was creating something that lit a personal creative fire.</p>\n<p>For Chapman, a large part of her infatuation with stop motion comes from making things with her hands, and she said she hopes the younger audience at the festival will be inspired by engaging with the animation. </p>\n<p>&#8220;I think it&#8217;s important for children to see things that challenge them in some sort of way, or inspire them or just make them think,&#8221; Chapman said. &#8220;When you watch stop motion, there is an implicit question of, &#8216;How is this real life?&#8217; &#8230; Getting kids to see that is so fun because it encourages them to be more tactile and curious about the things that they watch.&#8221; </p>\n<p>Another Bruin will be in attendance at the event: Burnett, who graduated from the UCLA School of Theater, Film and Television with an MFA in Animation before becoming CalArts&#8217; Director of Character Animation. Burnett said she is eager to take part in this event since it is affiliated with the very festival in which her first film premiered back in 2006.</p>\n<p>The filmmaker and professor said she has long had a fascination and infatuation for the world of children&#8217;s film &#8211; specifically animation. Burnett said she traces this love back to the animated films aired on Canada&#8217;s public broadcasting network, such as &#8220;The Sweater&#8221; and &#8220;The Man Who Planted Trees.&#8221; She added that much of her inspiration as an animator came from Cartoon Network&#8217;s hit show, &#8220;Ed, Edd n Eddy.&#8221;</p>\n<p><b>[Related: </b><a target=\"_blank\" rel=\"nofollow noopener noreferrer\" href=\"https://dailybruin.com/2024/09/07/suryansu-guha-acknowledges-the-overlooked-workers-in-the-film-industry\"><b>Suryansu Guha acknowledges the overlooked workers in the film industry</b></a><b>]</b></p>\n<p>Burnett said the labor-intensive process of animation is a considerable barrier to overcome for all filmmakers, including those just starting out. The creation of an animated film is an interdisciplinary task, added Burnett &#8211; not only must one have a creative vision, but an understanding of movement, a character&#8217;s motivation and even basic physics is also necessary. She added that the learning curve for these concepts may be steep, yet she encourages aspiring filmmakers to work hard to overcome these challenges and hopes to spread this message by taking part in the screening event. </p>\n<p>&#8220;(I hope they will be) inspired, to maybe pick up a pencil and do some drawing. If members of the audience maybe wanted to and didn&#8217;t feel like they could, or want to or didn&#8217;t have the time, maybe there&#8217;s five minutes to sit down and be able to draw something and express themselves through the page or the screen,&#8221; Burnett said. </p>\n"}
]

cleaned_articles = clean_all_articles(articles)

# Output cleaned articles
for article in cleaned_articles:
    print(f"ID: {article['id']}, Cleaned Text: {article['text']}")

ID: article1, Cleaned Text: This is an example with ’ encoded character.
ID: article2, Cleaned Text: Another example with bold text and & sign.
ID: article3, Cleaned Text: Whether it’s the heartwarming story of two young sisters or the tale of a pirate who falls in love with the moon, UCLA alumni are making their artistic voices heard at an upcoming screening. On Nov. 23, Los Angeles filmmakers, including two UCLA alumni, will present their thesis films at the Los Angeles International Children’s Film Festival-hosted event “L.A. Grown: A Showcase of Short Thesis Films.” Among the roster of up-and-coming artists are UCLA alumni Charlotte Oxley, who will showcase the drama “Purple and Green,” and Courtney Chapman, presenting the short “Land Lover.” The screenings will also include a panel featuring a special guest: Maija Burnett, the director of the character animation program at CalArts, who is also a UCLA alumnus. Oxley said she hopes the festival will inspire the next generation of fi

# Upsert Data

In [None]:
# Create new Pinecone index (only run this once for one index)
index_name = "test2"

# We still don't know what an index is :) maybe we won't ever know
pc.create_index(
    name=index_name,
    dimension=1024, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [11]:
# Sample data
# Eventually, call wordpress endpoint to get articles
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
    {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."}
]

# model embeddings
# original embedding 
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[d['text'] for d in data],
    parameters={"input_type": "passage", "truncate": "END"}
)

# gemini embedding
"""
model = 'models/embedding-001'
embedding = genai.embed_content(model=model,
                                content=data,
                                task_type="retrieval_document",
                                title=title)
                                
                                
def embed_fn(title, text):
      return genai.embed_content(model=model,
                             content=text,
                             task_type="retrieval_document",
                             title=title)["embedding"]
df['Embeddings'] = data.apply(lambda row: embed_fn(row['Title'], row['Text']), axis=1)                             
"""

print(embeddings[0])

{'values': [0.04913330078125, -0.01306915283203125, ..., -0.0196990966796875, -0.0110321044921875]}


In [12]:
# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

vectors = []
for d, e in zip(data, embeddings):
    vectors.append({
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    })

index.upsert(
    vectors=vectors,
    namespace="ns1"
)

upserted_count: 6

In [13]:
print(index.describe_index_stats())

{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 0}},
 'total_vector_count': 0}


# Query

In [14]:
query = "Tell me about the tech company known as Apple."

embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

In [15]:
results = index.query(
    namespace="ns1",
    vector=embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [], 'namespace': 'ns1', 'usage': {'read_units': 1}}
