In [49]:
import os
import re
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from datetime import datetime
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup

# Run python-dotenv's loader first so that variables from a local .env file
# (if one exists) are in the environment before the clients are built.
load_dotenv()

# OpenAI client constructed with no explicit arguments.
client = OpenAI()

# Pinecone client; the key is looked up under PINECONE_DEFAULT_API_KEY and is
# None when that variable is not set.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_DEFAULT_API_KEY"),
)

In [45]:
# Create the Pinecone index on first run, then open a handle to it.
index_name = 'needle-earnings-transcripts'
if not pc.has_index(index_name):
    print("Index not found, creating new")
    pc.create_index(
        name=index_name,
        vector_type="dense",
        # Must equal the length of the vectors returned by get_embedding
        # (text-embedding-3-small).
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        deletion_protection="disabled",
        tags={
            "environment": "development"
        }
    )

# Look the host up from the index name instead of hard-coding it: a
# hard-coded host silently points at the wrong (or a non-existent) index
# whenever the index above was just created, recreated, or lives in a
# different project.
index = pc.Index(host=pc.describe_index(index_name).host)

In [58]:
def extract_text(url, timeout=30):
    """Download a transcript page and pull out its prepared-remarks text and date.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would hang the notebook.

    Returns:
        A ``(full_text, date)`` tuple. ``full_text`` is the text of every
        ``<p>`` that is a later sibling of the ``<h2>`` whose string is
        exactly ``'Prepared Remarks:'`` (searched inside
        ``<div class="article-body">``), joined with single spaces. ``date``
        is the raw text of ``<span id="date">``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If any of the expected elements is missing from the page.
    """
    res = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    date_tag = soup.find('span', id='date')
    if date_tag is None:
        raise ValueError(f"No <span id='date'> element found at {url}")

    body = soup.find('div', class_="article-body")
    if body is None:
        raise ValueError(f"No <div class='article-body'> element found at {url}")

    heading = body.find('h2', string='Prepared Remarks:')
    if heading is None:
        raise ValueError(f"No 'Prepared Remarks:' heading found at {url}")

    paragraphs = heading.find_next_siblings("p")
    full_text = ' '.join(p.text for p in paragraphs)
    return full_text, date_tag.text

def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (runs of whitespace are the
    separators), and each chunk is those words re-joined with single spaces.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings, in order. Empty or whitespace-only input
        gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. A negative size would
            otherwise silently produce no chunks at all.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Split once; the word list is reused for both the length and the slices.
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def get_embedding(text):
    """Return the embedding for ``text``.

    Calls ``client.embeddings.create`` (the module-level ``client``) with the
    "text-embedding-3-small" model and returns the ``embedding`` attribute of
    the first entry in the response's ``data``.
    """
    request = {
        "input": text,
        "model": "text-embedding-3-small",
    }
    first_item = client.embeddings.create(**request).data[0]
    return first_item.embedding

def embed_and_upsert(chunks, index, url, company, ticker, quarter, year, date, batch_size=100):
    """Embed each chunk and upsert the resulting records into ``index`` in batches.

    Every record is a ``(id, embedding, metadata)`` tuple. The id is
    ``"{company}-earnings-transcript-{year}Q{quarter}-chunk-{i}"`` with ``i``
    counting from 0, and the metadata carries the company (capitalized),
    symbol (upper-cased), date, year, quarter, document type, url, and the
    first 200 characters of the chunk as ``snippet``.

    Args:
        chunks: Iterable of text chunks to embed.
        index: Object exposing ``upsert(list_of_records)``.
        url: Source URL stored in the metadata.
        company: Company name; used as given in the id, capitalized in metadata.
        ticker: Ticker symbol; upper-cased in metadata.
        quarter: Fiscal quarter, stored as given.
        year: Fiscal year, stored as given.
        date: Date value stored as given.
        batch_size: Maximum number of records sent per ``upsert`` call.
            Batching avoids one network round-trip per chunk.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Fields that are identical for every chunk are built once, outside the loop.
    shared_metadata = {
        "company": company.capitalize(),
        "symbol": ticker.upper(),
        "date": date,
        "year": year,
        "quarter": quarter,
        "document": "Earnings Transcript",
        "url": url,
    }
    id_prefix = f"{company}-earnings-transcript-{year}Q{quarter}-chunk-"

    batch = []
    for i, c in enumerate(chunks):
        metadata = {**shared_metadata, "snippet": c[:200]}
        batch.append((f"{id_prefix}{i}", get_embedding(c), metadata))
        # Flush as soon as the batch is full so memory stays bounded.
        if len(batch) >= batch_size:
            index.upsert(batch)
            batch = []

    # Send whatever is left; skipped entirely when there were no chunks.
    if batch:
        index.upsert(batch)

def format_date(s):
    """Convert a date like ``"Jul 24, 2024"`` to ``"07-24-2024"``.

    Args:
        s: Date string in ``"%b %d, %Y"`` form (abbreviated month name, day,
            four-digit year). Leading and trailing whitespace is ignored,
            since text scraped from HTML often carries it.

    Returns:
        The same date formatted as ``"%m-%d-%Y"``.

    Raises:
        ValueError: If the stripped string does not match the expected format.
    """
    date_obj = datetime.strptime(s.strip(), "%b %d, %Y")
    return date_obj.strftime("%m-%d-%Y")

def parse_url(url):
    """Extract company, ticker, quarter and year from a transcript URL.

    The URL must contain ``call-transcripts/YYYY/MM/DD/`` followed by a slug
    of the form ``<company>-<ticker>-q<digit>-<year>``. The company part may
    itself contain hyphens (e.g. ``bank-of-america``); the ticker is taken to
    be the single hyphen-free token immediately before ``-q<digit>``.

    Args:
        url: The transcript URL to parse.

    Returns:
        A tuple of four strings: ``(company, ticker, quarter, year)``.

    Raises:
        ValueError: If the URL does not contain the expected pattern.
    """
    # [\w-]+ is greedy but backtracks until the "-<ticker>-q<d>-<yyyy>" tail fits,
    # so multi-word company slugs are captured whole.
    URL_REGEX = r'call-transcripts/\d{4}/\d{2}/\d{2}/([\w-]+)-(\w+)-q(\d)-(\d{4})'
    match = re.search(URL_REGEX, url)
    if match is None:
        raise ValueError(f"URL does not look like an earnings-call transcript link: {url!r}")
    return match.groups()

def extract_and_upsert(url, index):
    """Run the full pipeline for one transcript URL.

    Fetches the text and date with ``extract_text``, splits the text with
    ``chunk_text`` (default chunk size), reads company/ticker/quarter/year
    from the URL with ``parse_url``, and hands everything to
    ``embed_and_upsert`` with the date passed through ``format_date``.
    """
    transcript, raw_date = extract_text(url)
    pieces = chunk_text(transcript)
    company_name, ticker, quarter, year = parse_url(url)
    embed_and_upsert(
        chunks=pieces,
        index=index,
        url=url,
        company=company_name,
        ticker=ticker,
        quarter=quarter,
        year=year,
        date=format_date(raw_date),
    )

In [59]:
# Begin here!
# Single-transcript run: push one Tesla call through the whole pipeline.
extract_and_upsert(
    url='https://www.fool.com/earnings/call-transcripts/2024/07/24/tesla-tsla-q2-2024-earnings-call-transcript/',
    index=index,
)

In [None]:
"""
Already pulled
    'https://www.fool.com/earnings/call-transcripts/2024/10/23/tesla-tsla-q3-2024-earnings-call-transcript/',
    'https://www.fool.com/earnings/call-transcripts/2025/02/05/alphabet-goog-q4-2024-earnings-call-transcript/',
    'https://www.fool.com/earnings/call-transcripts/2025/01/30/apple-aapl-q1-2025-earnings-call-transcript/'
"""

In [63]:
# Ingest a batch of Apple transcripts, reporting each one once it has been upserted.
for u in (
    'https://www.fool.com/earnings/call-transcripts/2024/10/31/apple-aapl-q4-2024-earnings-call-transcript/',
    'https://www.fool.com/earnings/call-transcripts/2023/02/02/apple-aapl-q1-2023-earnings-call-transcript/',
    'https://www.fool.com/earnings/call-transcripts/2022/10/27/apple-aapl-q4-2022-earnings-call-transcript/',
    'https://www.fool.com/earnings/call-transcripts/2022/07/28/apple-aapl-q3-2022-earnings-call-transcript/',
    'https://www.fool.com/earnings/call-transcripts/2020/07/31/apple-aapl-q3-2020-earnings-call-transcript.aspx',
):
    # Parsed here only to label the progress line; the ticker is not needed.
    name, _, q, y = parse_url(u)
    extract_and_upsert(u, index)
    print(f"Upserted {name.capitalize()} earnings from {y}Q{q}.")

Upserted apple earnings from 2024Q4.
Upserted apple earnings from 2023Q1.
Upserted apple earnings from 2022Q4.
Upserted apple earnings from 2022Q3.
Upserted apple earnings from 2020Q3.


In [36]:
# url = 'https://www.fool.com/earnings/call-transcripts/2025/01/30/apple-aapl-q1-2025-earnings-call-transcript/'
# url = 'https://www.fool.com/earnings/call-transcripts/2025/02/05/alphabet-goog-q4-2024-earnings-call-transcript/'
url = 'https://www.fool.com/earnings/call-transcripts/2024/10/23/tesla-tsla-q3-2024-earnings-call-transcript/'
# extract_text returns a (full_text, date) tuple; unpack it so that chunk_text
# receives the text string rather than the tuple (which has no .split()).
t, transcript_date = extract_text(url)
chunks = chunk_text(t)

In [37]:
# Sanity check on the chunking: print how many chunks were produced, then
# let the cell display the first chunk as its output.
print(len(chunks))
chunks[0]

20


"Travis Axelrod -- Head of Investor Relations Good afternoon, everyone, and welcome to Tesla's fourth quarter 2024 q&a webcast. My name is Travis Axelrod, the here at Tesla. And I'm joined today by Elon Musk and Vaibhav Taneja and a number of other executives. Our Q4 results were announced at about 3:00 p.m. Central Time in the update deck we published at the same link as this webcast. During this call, we will discuss our business outlook and make forward-looking statements. These comments are based on predictions and expectations as of today. Actual events or results could differ materially due to a number of risks and uncertainties, including those mentioned in our most recent filings with the SEC. During the question-and-answer portion of today's call, please limit yourself to one question and one follow-up. [Operator instructions] Before we jump into Q&A, Elon has some opening remarks. Elon? Elon Reeve Musk -- Chief Executive Officer and Product Architect Thank you. So, in summary

In [38]:
# Manual one-record upsert of the first chunk.
# NOTE(review): the company/year/quarter values and the record id below are
# hard-coded, while `url` and `chunks` come from whichever transcript the
# earlier scratch cell loaded — confirm they describe the same transcript
# before running this.
metadata = {
    "company": "Apple",
    "year": 2025,
    "quarter": 1,
    "document": "Earnings Transcript",
    "url": url,
    "snippet": chunks[0][:200]
}
# Compute the vector here: `embedding` is not defined by any other cell, so
# the upsert below would raise NameError on a fresh kernel.
embedding = get_embedding(chunks[0])
index.upsert([
    ("apple-earnings-transcript-Q1-chunk-0", embedding, metadata)
])

{'upserted_count': 1}

In [64]:
# Embed a free-text query with the same helper used for the chunks, then ask
# the index for its three closest records (metadata only, no raw vectors).
query = "pandemic issues"
query_embedding = get_embedding(query)

result = index.query(
    top_k=3,
    vector=query_embedding,
    include_values=False,
    include_metadata=True,
)
print(result)

{'matches': [{'id': 'apple-earnings-transcript-2023Q1-chunk-14',
              'metadata': {'company': 'Apple',
                           'date': '02-02-2023',
                           'document': 'Earnings Transcript',
                           'quarter': '1',
                           'snippet': '-- the iPhone has become so integral '
                                      "into people's lives. It contains their "
                                      'contacts and their health information '
                                      'and their banking information and their '
                                      'smart home and so many different parts '
                                      'of thei',
                           'symbol': 'AAPL',
                           'url': 'https://www.fool.com/earnings/call-transcripts/2023/02/02/apple-aapl-q1-2023-earnings-call-transcript/',
                           'year': '2023'},
              'score': 0.295121133,
              'values'

In [27]:
# Print what index.list() yields, one item per iteration; in the recorded
# output each item is a list of record ids.
for ids in index.list():
    print(ids)

['apple-earnings-transcript-2025Q1-chunk-0', 'apple-earnings-transcript-2025Q1-chunk-1', 'apple-earnings-transcript-2025Q1-chunk-10', 'apple-earnings-transcript-2025Q1-chunk-11', 'apple-earnings-transcript-2025Q1-chunk-12', 'apple-earnings-transcript-2025Q1-chunk-13', 'apple-earnings-transcript-2025Q1-chunk-14', 'apple-earnings-transcript-2025Q1-chunk-15', 'apple-earnings-transcript-2025Q1-chunk-16', 'apple-earnings-transcript-2025Q1-chunk-2', 'apple-earnings-transcript-2025Q1-chunk-3', 'apple-earnings-transcript-2025Q1-chunk-4', 'apple-earnings-transcript-2025Q1-chunk-5', 'apple-earnings-transcript-2025Q1-chunk-6', 'apple-earnings-transcript-2025Q1-chunk-7', 'apple-earnings-transcript-2025Q1-chunk-8', 'apple-earnings-transcript-2025Q1-chunk-9', 'apple-earnings-transcript-Q1-chunk-0', 'apple-earnings-transcript-Q1-chunk-1', 'apple-earnings-transcript-Q1-chunk-10', 'apple-earnings-transcript-Q1-chunk-11', 'apple-earnings-transcript-Q1-chunk-12', 'apple-earnings-transcript-Q1-chunk-13', 

In [28]:
# Ids written under the old naming scheme (no year in the id): chunks 0-18.
# Sorting the strings reproduces the lexicographic order in which the ids
# were listed (0, 1, 10, ..., 18, 2, ..., 9).
ids_to_delete = sorted(
    f"apple-earnings-transcript-Q1-chunk-{chunk_no}" for chunk_no in range(19)
)
index.delete(ids_to_delete)


{}

In [57]:
# WARNING: destructive. Asks the index to delete everything (delete_all=True);
# no namespace is given, so presumably this targets the default namespace —
# confirm before running. Skip this cell during a normal top-to-bottom run.
index.delete(delete_all=True)

{}