In [45]:
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup

# Run load_dotenv() first so that any variables it loads are visible to the
# client constructors below (presumably sourced from a local .env file — confirm).
load_dotenv()
# OpenAI client built with no explicit arguments.
# NOTE(review): presumably it reads its API key from the environment — confirm.
client = OpenAI()
# Pinecone client keyed by PINECONE_DEFAULT_API_KEY; os.getenv yields None when
# that variable is unset, so a missing key is not detected on this line.
pc = Pinecone(api_key=os.getenv('PINECONE_DEFAULT_API_KEY'))

In [22]:
def extract_text(url, timeout=30):
    """Download a transcript page and return the text that follows "Prepared Remarks:".

    The page is expected to contain a ``<div class="article-body">`` holding an
    ``<h2>`` whose text is exactly ``Prepared Remarks:``. Every ``<p>`` sibling
    that comes after that heading is collected and joined with single spaces.

    Args:
        url: Address of the transcript page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The concatenated paragraph text as one string (empty if the heading has
        no following ``<p>`` siblings).

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: If the expected article body or heading is not on the page.
    """
    # A request without a timeout can hang the kernel indefinitely.
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')
    div = soup.find('div', class_="article-body")
    if div is None:
        raise ValueError(f"No <div class=\"article-body\"> found at {url}")

    heading = div.find('h2', string='Prepared Remarks:')
    if heading is None:
        raise ValueError(f"No 'Prepared Remarks:' heading found at {url}")

    eles = heading.find_next_siblings("p")
    return ' '.join(e.text for e in eles)

In [23]:
# Fetch the Apple Q1 2025 earnings-call transcript; `t` holds the extracted
# text and is the input to chunk_text in a later cell.
t = extract_text('https://www.fool.com/earnings/call-transcripts/2025/01/30/apple-aapl-q1-2025-earnings-call-transcript/')

In [26]:
def chunk_text(text, chunk_size=500):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are obtained with ``str.split()``, so any run of whitespace
    (including newlines) is a separator and is normalised to a single space
    inside each chunk. Chunks do not overlap; the last one may be shorter.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() yield nothing and silently drop the
    # whole text; zero would raise a less helpful error from range() itself.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Chunk the transcript with the default chunk_size (500 words per chunk).
chunks = chunk_text(t)

In [28]:
# Sanity check: print how many chunks were produced, then show the first chunk
# as the cell's displayed value.
print(len(chunks))
chunks[0]

17


"Suhasini Chandramouli -- Director, Investor Relations Good afternoon, and welcome to the Apple Q1 fiscal year 2025 earnings conference call. My name is Suhasini Chandramouli, Director of investor relations. Today's call is being recorded. Speaking first today are Apple CEO, Tim Cook; and he'll be followed by CFO, Kevan Parekh. After that, we'll open the call to questions from analysts. Please note that some of the information you'll hear during our discussion today will consist of forward-looking statements, including, without limitation, those regarding revenue, gross margin, operating expenses, other income and expense, taxes, capital allocation, and future business outlook, including the potential impact of macroeconomic conditions on the company's business and results of operations. These statements involve risks and uncertainties that may cause actual results or trends to differ materially from our forecast. For more information, please refer to the risk factors discussed in Appl

In [32]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings endpoint.

    Uses the module-level ``client``. Only the first item of the response is
    returned, so ``text`` should be a single input rather than a batch.

    Args:
        text: The input to embed.
        model: Name of the embedding model to use. The vector length depends
            on the model, so it must agree with the dimension of any index the
            vectors are stored in.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

In [33]:
# Embed the first chunk as a smoke test. The global `embedding` is reused by
# the later cell that upserts chunk 0, and the printed length should match the
# `dimension` the Pinecone index is created with.
embedding = get_embedding(chunks[0])
print(f"Embedding length: {len(embedding)}")

Embedding length: 1536


In [46]:
# Create the Pinecone index if it doesn't exist yet.
index_name = 'needle-earnings-transcripts'
if not pc.has_index(index_name):
    print("Index not found, creating new")
    pc.create_index(
        name=index_name,
        vector_type="dense",
        # Must equal the length of the vectors returned by get_embedding.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        deletion_protection="disabled",
        tags={
            "environment": "development"
        }
    )

# Look the host up from the index description instead of hard-coding it: a
# literal host is specific to one project and would not match an index that
# the branch above has just created.
index = pc.Index(host=pc.describe_index(index_name).host)

In [38]:
# Metadata stored alongside the vector for chunk 0. Only the first 200
# characters of the chunk are kept, as a preview snippet.
metadata = {
    "company": "Apple",
    "year": 2025,
    "quarter": 1,
    "document": "Earnings Transcript",
    "url": 'https://www.fool.com/earnings/call-transcripts/2025/01/30/apple-aapl-q1-2025-earnings-call-transcript/',
    "snippet": chunks[0][:200]
}
# Upsert a single (id, values, metadata) tuple. `embedding` is the global set
# by the earlier smoke-test cell, so it must still hold the embedding of
# chunks[0] when this cell runs.
index.upsert([
    ("apple-earnings-transcript-Q1-chunk-0", embedding, metadata)
])

{'upserted_count': 1}

In [40]:
def embed_and_upsert(
    chunks,
    index,
    base_metadata=None,
    id_prefix="apple-earnings-transcript-Q1-chunk-",
    start_index=1,
    batch_size=100,
):
    """Embed each chunk and upsert the resulting vectors into ``index``.

    Each vector is stored as ``(id, embedding, metadata)``, where the id is
    ``id_prefix`` followed by a running number and the metadata is
    ``base_metadata`` plus a ``"snippet"`` key holding the first 200
    characters of the chunk.

    Args:
        chunks: Iterable of text chunks to embed.
        index: Index object exposing ``upsert(list_of_tuples)``.
        base_metadata: Metadata shared by every chunk. Defaults to the Apple
            Q1 2025 earnings-transcript metadata used elsewhere in this
            notebook.
        id_prefix: Prefix for every vector id.
        start_index: Number given to the first chunk. The default of 1 suits
            the case where chunk 0 has already been upserted separately.
        batch_size: Maximum number of vectors sent per ``upsert`` call.
    """
    if base_metadata is None:
        base_metadata = {
            "company": "Apple",
            "year": 2025,
            "quarter": 1,
            "document": "Earnings Transcript",
            "url": 'https://www.fool.com/earnings/call-transcripts/2025/01/30/apple-aapl-q1-2025-earnings-call-transcript/',
        }

    # Collect vectors and send them in batches rather than making one network
    # round trip per chunk.
    pending = []
    for i, c in enumerate(chunks, start_index):
        metadata = {**base_metadata, "snippet": c[:200]}
        pending.append((f"{id_prefix}{i}", get_embedding(c), metadata))
        if len(pending) >= batch_size:
            index.upsert(pending)
            pending = []

    # Flush whatever is left after the final full batch.
    if pending:
        index.upsert(pending)

# Chunk 0 was upserted on its own in an earlier cell, so only the remaining
# chunks are passed here; the function numbers them starting from 1, which
# keeps the ids aligned with each chunk's position in `chunks`.
embed_and_upsert(chunks[1:], index)


In [44]:
# Embed the query with the same helper used for the chunks so that both live
# in the same vector space.
query = "risks related to supply chain"
query_embedding = get_embedding(query)

# Retrieve the 3 closest matches, returning their metadata but not the raw
# vector values.
result = index.query(
    vector=query_embedding,
    top_k=3,
    include_metadata=True,
    include_values=False
)
print(result)

{'matches': [{'id': 'apple-earnings-transcript-Q1-chunk-10',
              'metadata': {'company': 'Apple',
                           'document': 'Earnings Transcript',
                           'quarter': 1.0,
                           'snippet': 'margins flattish on a sequential basis. '
                                      'Typically, I think it tends to be '
                                      'guided up a little bit, 50 basis '
                                      'points, so sequentially. Can we just '
                                      'touch on what are the offsets of the '
                                      'puts and takes you ',
                           'url': 'https://www.fool.com/earnings/call-transcripts/2025/01/30/apple-aapl-q1-2025-earnings-call-transcript/',
                           'year': 2025.0},
              'score': 0.28052029,
              'values': []},
             {'id': 'apple-earnings-transcript-Q1-chunk-5',
              'metadata': {'comp