In [1]:
"""Module for fetching data from the SEC EDGAR Archives"""
import json
import os
import re
import requests
from typing import List, Optional, Tuple, Union
import sys

if sys.version_info < (3, 8):
    from typing_extensions import Final
else:
    from typing import Final

import webbrowser

from ratelimit import limits, sleep_and_retry

# Form types this module knows how to look up, including the amended ("/A") variants.
VALID_FILING_TYPES: Final[List[str]] = [
    "10-K",
    "10-Q",
    "S-1",
    "10-K/A",
    "10-Q/A",
    "S-1/A",
]

# Base URLs for the SEC endpoints used below. All of them use HTTPS so that no
# request (and its identifying User-Agent) is sent in clear text or has to be
# redirected first.
SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
SEC_SEARCH_URL: Final[str] = "https://www.sec.gov/cgi-bin/browse-edgar"
SEC_SUBMISSIONS_URL: Final[str] = "https://data.sec.gov/submissions"


def get_filing(
    cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
) -> str:
    """Download the text of a single filing from the SEC EDGAR Archives.

    A new session identifying ``company`` and ``email`` is created and handed to
    the rate-limited ``_get_filing`` helper, which performs the request.
    ref: https://www.sec.gov/os/accessing-edgar-data"""
    return _get_filing(_get_session(company, email), cik, accession_number)


@sleep_and_retry
@limits(calls=10, period=1)
def _get_filing(
    session: requests.Session, cik: Union[str, int], accession_number: Union[str, int]
) -> str:
    """Fetch a filing's text through an already-configured session.

    Kept separate from ``get_filing`` so callers that already hold a session can
    reuse it. The request goes to the URL built by ``archive_url``; an HTTP error
    status is raised by ``raise_for_status``.
    """
    filing_response = session.get(archive_url(cik, accession_number))
    filing_response.raise_for_status()
    return filing_response.text


@sleep_and_retry
@limits(calls=10, period=1)
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
    """Gets a CIK number from a stock ticker by running a search on the SEC website.

    Args:
        session: Session used to issue the search request.
        ticker: Ticker symbol to look up.

    Returns:
        The first 10-digit CIK found in the search result page.

    Raises:
        requests.HTTPError: If the search request returns an error status.
        ValueError: If the result page contains no ``CIK=<10 digits>`` token,
            i.e. the ticker could not be resolved.
    """
    cik_re = re.compile(r".*CIK=(\d{10}).*")
    url = _search_url(ticker)
    response = session.get(url)
    response.raise_for_status()
    # search() yields the same leftmost match that findall()[0] would, but lets
    # the "nothing found" case be reported explicitly instead of as an IndexError.
    match = cik_re.search(response.text)
    if match is None:
        raise ValueError(f"No CIK found for ticker {ticker!r}")
    return match.group(1)


@sleep_and_retry
@limits(calls=10, period=1)
def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict:
    """Retrieves a dict of recent SEC form filings for a given CIK number.

    Args:
        session: Session used to issue the request.
        cik: Central Index Key, as an int or a string with or without leading zeros.

    Returns:
        A dict mapping accession number to form type, built from the
        ``filings.recent`` section of the submissions JSON, in the order the
        JSON lists them.

    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    # The submissions endpoint names its files CIK##########.json using the
    # 10-digit CIK with leading zeros, so pad ints and short strings.
    json_name = f"CIK{str(cik).zfill(10)}.json"
    response = session.get(f"{SEC_SUBMISSIONS_URL}/{json_name}")
    response.raise_for_status()
    content = json.loads(response.content)
    recent_forms = content["filings"]["recent"]
    return dict(zip(recent_forms["accessionNumber"], recent_forms["form"]))


def _get_recent_acc_num_by_cik(
    session: requests.Session, cik: Union[str, int], form_types: List[str]
) -> Tuple[str, str]:
    """Find the first listed filing of ``cik`` whose form type is in ``form_types``.

    Filings are scanned in the order returned by ``get_forms_by_cik``. The result
    is ``(accession_number_without_dashes, form_type)``; a ``ValueError`` is raised
    when none of the listed filings has one of the requested form types.
    """
    filings = get_forms_by_cik(session, cik)
    first_hit = next(
        ((number, kind) for number, kind in filings.items() if kind in form_types),
        None,
    )
    if first_hit is None:
        raise ValueError(f"No filings found for {cik}, looking for any of: {form_types}")
    number, kind = first_hit
    return _drop_dashes(number), kind


def get_recent_acc_by_cik(
    cik: str,
    form_type: str,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> Tuple[str, str]:
    """Look up ``(accession_number, retrieved_form_type)`` for a CIK and form type.

    Amended filings are accepted as well, so the retrieved form type can differ
    from the requested one (e.g. 10-Q/A when 10-Q was requested).
    """
    edgar_session = _get_session(company, email)
    accepted_types = _form_types(form_type)
    return _get_recent_acc_num_by_cik(edgar_session, cik, accepted_types)


def get_recent_cik_and_acc_by_ticker(
    ticker: str,
    form_type: str,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> Tuple[str, str, str]:
    """Look up ``(cik, accession_number, retrieved_form_type)`` for a ticker and form type.

    The ticker is first resolved to a CIK; both lookups share one session.
    Amended filings are accepted as well, so the retrieved form type can differ
    from the requested one (e.g. 10-Q/A when 10-Q was requested).
    """
    edgar_session = _get_session(company, email)
    cik = get_cik_by_ticker(edgar_session, ticker)
    accepted_types = _form_types(form_type)
    acc_num, retrieved_form_type = _get_recent_acc_num_by_cik(edgar_session, cik, accepted_types)
    return (cik, acc_num, retrieved_form_type)


def get_form_by_ticker(
    ticker: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> str:
    """For a given ticker, gets the most recent form of a given form_type.

    Args:
        ticker: Ticker symbol of the company.
        form_type: One of ``VALID_FILING_TYPES``.
        allow_amended_filing: When true, the amended ("/A") variant of
            ``form_type`` is accepted as well.
        company: Organization name for the User-Agent header; see ``_get_session``.
        email: Contact email for the User-Agent header; see ``_get_session``.

    Returns:
        The text of the retrieved filing.
    """
    # One session serves the ticker lookup, the filing index lookup and the
    # download, instead of building a second session for the last two steps.
    session = _get_session(company, email)
    cik = get_cik_by_ticker(session, ticker)
    acc_num, _ = _get_recent_acc_num_by_cik(
        session, cik, _form_types(form_type, allow_amended_filing)
    )
    return _get_filing(session, cik, acc_num)


def _form_types(form_type: str, allow_amended_filing: Optional[bool] = True) -> List[str]:
    """Potentially expand a form type to include its amended filing, e.g.:
    "10-Q" -> ["10-Q", "10-Q/A"]

    Args:
        form_type: One of ``VALID_FILING_TYPES``.
        allow_amended_filing: When true and ``form_type`` is not already an
            amended type, the "/A" variant is added to the result.

    Returns:
        The list of form types to accept.

    Raises:
        ValueError: If ``form_type`` is not in ``VALID_FILING_TYPES``.
    """
    # Raise explicitly: an ``assert`` would be skipped when Python runs with -O.
    if form_type not in VALID_FILING_TYPES:
        raise ValueError(
            f"Invalid form type {form_type!r}; expected one of: {VALID_FILING_TYPES}"
        )
    if allow_amended_filing and not form_type.endswith("/A"):
        return [form_type, f"{form_type}/A"]
    return [form_type]


def get_form_by_cik(
    cik: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
) -> str:
    """Return the text of the most recent form of ``form_type`` for a CIK.

    With ``allow_amended_filing=True`` (the default) an amended version may be
    returned instead, e.g. a 10-Q/A when "10-Q" was requested.
    """
    edgar_session = _get_session(company, email)
    accepted_types = _form_types(form_type, allow_amended_filing)
    acc_num, _ = _get_recent_acc_num_by_cik(edgar_session, cik, accepted_types)
    return _get_filing(edgar_session, cik, acc_num)


def open_form(cik, acc_num):
    """Open the index page of an SEC form in a new tab of the default browser.

    The accession number is normalised to its dash-free form for the directory
    part of the URL and re-dashed for the ``-index.html`` file name.
    """
    compact = _drop_dashes(acc_num)
    index_url = f"{SEC_ARCHIVE_URL}/{cik}/{compact}/{_add_dashes(compact)}-index.html"
    webbrowser.open_new_tab(index_url)


def open_form_by_ticker(
    ticker: str,
    form_type: str,
    allow_amended_filing: Optional[bool] = True,
    company: Optional[str] = None,
    email: Optional[str] = None,
):
    """Open, in the default browser, the index page of the most recent form of
    ``form_type`` filed under ``ticker``.

    The ticker is resolved to a CIK and the accession number looked up with the
    same session before ``open_form`` is called.
    """
    edgar_session = _get_session(company, email)
    cik = get_cik_by_ticker(edgar_session, ticker)
    accepted_types = _form_types(form_type, allow_amended_filing)
    acc_num, _ = _get_recent_acc_num_by_cik(edgar_session, cik, accepted_types)
    open_form(cik, acc_num)


def archive_url(cik: Union[str, int], accession_number: Union[str, int]) -> str:
    """Builds the archive URL for the SEC accession number. Looks for the .txt file for the
    filing, which follows a {accession_number}.txt format.

    Args:
        cik: Central Index Key of the filer; used verbatim as a path segment.
        accession_number: Accession number, with or without dashes, or as an int.

    Returns:
        The URL of the filing's ``.txt`` file under ``SEC_ARCHIVE_URL``.
    """
    # Normalise first: _add_dashes slices by position, so it must only ever see
    # the 18-digit dash-free form (as open_form already does).
    compact = _drop_dashes(accession_number)
    filename = f"{_add_dashes(compact)}.txt"
    return f"{SEC_ARCHIVE_URL}/{cik}/{compact}/{filename}"


def _search_url(cik: Union[str, int]) -> str:
    """Build the EDGAR company-search URL for ``cik``.

    The value is interpolated verbatim into the ``CIK=`` query parameter;
    ``get_cik_by_ticker`` passes a ticker symbol here.
    """
    return f"{SEC_SEARCH_URL}?CIK={cik}&Find=Search&owner=exclude&action=getcompany"


def _add_dashes(accession_number: Union[str, int]) -> str:
    """Adds the dashes back into the accession number.

    The input is first normalised to 18 digits (existing dashes removed, leading
    zeros restored), so already-dashed strings and ints are handled as well as
    the dash-free string form.

    Returns:
        The accession number formatted as ``##########-##-######``.
    """
    digits = str(accession_number).replace("-", "").zfill(18)
    return f"{digits[:10]}-{digits[10:12]}-{digits[12:]}"


def _drop_dashes(accession_number: Union[str, int]) -> str:
    """Return the accession number as a dash-free string, left-padded with
    zeros to 18 characters."""
    return str(accession_number).replace("-", "").zfill(18)


def _get_session(company: Optional[str] = None, email: Optional[str] = None) -> requests.Session:
    """Creates a requests session with the appropriate headers set. If these headers are not
    set, SEC will reject your request.
    ref: https://www.sec.gov/os/accessing-edgar-data

    Args:
        company: Organization name for the User-Agent header. Falls back to the
            ``SEC_API_ORGANIZATION`` environment variable when ``None``.
        email: Contact email for the User-Agent header. Falls back to the
            ``SEC_API_EMAIL`` environment variable when ``None``.

    Returns:
        A new session whose User-Agent is ``"{company} {email}"``.

    Raises:
        ValueError: If either value is still missing or empty after the
            environment fallback.
    """
    if company is None:
        company = os.environ.get("SEC_API_ORGANIZATION")
    if email is None:
        email = os.environ.get("SEC_API_EMAIL")
    # Raise explicitly: an ``assert`` would be skipped when Python runs with -O.
    if not company:
        raise ValueError(
            "A company name is required: pass company= or set SEC_API_ORGANIZATION"
        )
    if not email:
        raise ValueError("A contact email is required: pass email= or set SEC_API_EMAIL")
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": f"{company} {email}",
            "Content-Type": "text/html",
        }
    )
    return session

In [2]:
# Download the text of the most recent 10-K (or 10-K/A) filed under the AAPL ticker.
text = get_form_by_ticker(
    ticker="aapl",
    form_type="10-K",
    company="Unstructured Technologies",
    email="support@unstructured.io",
)

In [4]:
from unstructured.documents.html import HTMLDocument

# Parse the raw filing text, then apply the document cleaners with
# skip_headers_and_footers enabled.
parsed_filing = HTMLDocument.from_string(text)
html_document = parsed_filing.doc_after_cleaners(skip_headers_and_footers=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\spurt\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\spurt\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [5]:
# Print every element of the first page only, each followed by blank lines.
first_page = html_document.pages[0]
for element in first_page.elements[::]:
    print(element)
    print("\n")

UNITED STATES


SECURITIES AND EXCHANGE COMMISSION


Washington, D.C. 20549


FORM 10-K


(Mark One)


ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934


For the fiscal year ended September 24, 2022


or


TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934


For the transition period from


to


Commission File Number:


001-36743


Apple Inc.


(Exact name of Registrant as specified in its charter)


California 94-2404110 (State or other jurisdiction of incorporation or organization) (I.R.S. Employer Identification No.)  One Apple Park Way Cupertino ,  California 95014 (Address of principal executive offices) (Zip Code)


(408) 996-1010


(Registrant’s telephone number, including area code)


Securities registered pursuant to Section 12(b) of the Act:


Title of each class Trading symbol(s) Name of each exchange on which registered Common Stock, $0.00001 par value per share AAPL The Nasdaq Stock Market LLC 1.000

In [7]:
import re
from unstructured.documents.elements import Title

In [8]:
# Case-insensitive pattern for an item heading prefix: "item", a space, 1-3 digits,
# an optional letter suffix (bare or parenthesised), an optional "." and an optional ":".
ITEM_TITLE_RE = re.compile(r"(?i)item \d{1,3}(?:[a-z]|\([a-z]\))?(?:\.)?(?::)?")

In [9]:
def is_10k_item_title(title: str) -> bool:
    """Return True when ``title`` begins with a 10-K item heading such as "Item 1A."."""
    # A match object is always truthy, so bool() distinguishes match from None.
    return bool(ITEM_TITLE_RE.match(title))

In [10]:
# Print every Title element whose text starts with a 10-K item heading.
for element in html_document.elements:
    if not isinstance(element, Title):
        continue
    if is_10k_item_title(element.text):
        print(element)


Item 1.    Business
Item 1A.    Risk Factors
Item 1B.    Unresolved Staff Comments
Item 2.    Properties
Item 3.    Legal Proceedings
Item 4.    Mine Safety Disclosures
Item 6.    [Reserved]
Item 7.    Management’s Discussion and Analysis of Financial Condition and Results of Operations
Item 7A.    Quantitative and Qualitative Disclosures About Market Risk
Item 8.    Financial Statements and Supplementary Data
Item 9.    Changes in and Disagreements with Accountants on Accounting and Financial Disclosure
Item 9A.    Controls and Procedures
Item 9B.    Other Information
Item 9C.    Disclosure Regarding Foreign Jurisdictions that Prevent Inspections
Item 10.    Directors, Executive Officers and Corporate Governance
Item 11.    Executive Compensation
Item 13.    Certain Relationships and Related Transactions, and Director Independence
Item 14.    Principal Accountant Fees and Services
Item 15.    Exhibit and Financial Statement Schedules
Item 16.    Form 10-K Summary


In [11]:
from unstructured.cleaners.core import clean_extra_whitespace

In [12]:
# Collect the 10-K item headings: `titles` holds the elements themselves and
# `titles_names` their text.
titles = []
titles_names = []
for element in html_document.elements:
    # The whitespace clean-up is applied in place to every element, not only to titles.
    element.text = clean_extra_whitespace(element.text)
    if not (isinstance(element, Title) and is_10k_item_title(element.text)):
        continue
    titles.append(element)
    titles_names.append(element.text)
    print(element)

print(titles_names)

Item 1. Business
Item 1A. Risk Factors
Item 1B. Unresolved Staff Comments
Item 2. Properties
Item 3. Legal Proceedings
Item 4. Mine Safety Disclosures
Item 6. [Reserved]
Item 7. Management’s Discussion and Analysis of Financial Condition and Results of Operations
Item 7A. Quantitative and Qualitative Disclosures About Market Risk
Item 8. Financial Statements and Supplementary Data
Item 9. Changes in and Disagreements with Accountants on Accounting and Financial Disclosure
Item 9A. Controls and Procedures
Item 9B. Other Information
Item 9C. Disclosure Regarding Foreign Jurisdictions that Prevent Inspections
Item 10. Directors, Executive Officers and Corporate Governance
Item 11. Executive Compensation
Item 13. Certain Relationships and Related Transactions, and Director Independence
Item 14. Principal Accountant Fees and Services
Item 15. Exhibit and Financial Statement Schedules
Item 16. Form 10-K Summary
['Item 1. Business', 'Item 1A. Risk Factors', 'Item 1B. Unresolved Staff Comments

In [14]:
# Collect the text of up to 30 elements starting at the first item title.
first_title_id = titles[0].id
first_title_index = next(
    (idx for idx, el in enumerate(html_document.elements) if el.id == first_title_id),
    None,
)
if first_title_index is None:
    raise ValueError("The first item title was not found in html_document.elements")
# A slice is clamped at the end of the list, so a document with fewer than 30
# elements after the title does not raise IndexError.
section_elements = html_document.elements[first_title_index : first_title_index + 30]
section_info = [el.text for el in section_elements]

## Using OpenAI

In [16]:
import os
import openai

# Read the API key from the environment; None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [17]:
def find_relevant_section(question, list_elements):
    """Ask the OpenAI chat API which 10-K section is most relevant to a question.

    Args:
        question (String): The question to ask GPT
        list_elements (list): A list of all the titles in the 10-K

    Returns:
        output (String): The model's raw reply. The prompt asks for the 0-based
            index of the most relevant section as a bare integer, but the reply
            is returned unparsed.
    """

    prompt = f"""You are helping me answer questions about a 10-K document for me. Enclosed by the word START and END, I will give you a list of section titles from a 10-K. Each item in the list is a different section.
    I will then give you a question, and you must tell me which section I am most likely to find the answer to that question.

    Here is the 10-K data:
    START
    {list_elements}
    END

    This is the question: {question}

    Which section is most relevant? Please only give me no text, and only give me the index of the section, with the first item of the list being index 0, that is the most relevant as a single integer. Give me a 1 or 2 digit integer.
    """

    # A single user message carries the whole prompt.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        temperature=0,
        messages=[{"role": "user", "content": prompt}],
        api_key=OPENAI_API_KEY,
    )
    return completion["choices"][0]["message"]["content"]

In [18]:
def get_answer(question, index):
    """Use the OpenAI API to attempt to get an answer to a question based off one section in the 10-K

    The section is the run of elements in ``html_document.elements`` from the
    title ``titles[index]`` up to (not including) the next title. For the last
    title the section runs to the end of the document.

    Args:
        question (String): The question to ask GPT
        index (int): The index of the most relevant section

    Returns:
        answer (String): GPT's answer to the question
    """

    # Getting the relevant section
    elements = html_document.elements
    current_title = titles[index]
    # The last title has no successor; indexing titles[index + 1] there would
    # raise IndexError, so treat "no next title" as "until the end".
    next_title = titles[index + 1] if index + 1 < len(titles) else None

    start_index_elements = 0
    end_index_elements = len(elements)
    for i, el in enumerate(elements):
        if el.id == current_title.id:
            start_index_elements = i
        if next_title is not None and el.id == next_title.id:
            end_index_elements = i
            break
    section_info = [el.text for el in elements[start_index_elements:end_index_elements]]

    # The prompt refers to the START/END markers that actually delimit the data.
    human_template = f"""You are analyzing a 10-K document for me. Enclosed by the word START and END, I will give you the data of the 10-K and you will answer a question based ONLY on information you find between START and END, so you will use no outside information.
    If you cannot find the answer to the question in the information provided, please tell me that you were unable to find an answer.

    Here is the 10-K data:
    START
    {section_info}
    END

    This is the question: {question}
    """

    human_message_prompt = {"role": "user", "content": human_template}

    out = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        temperature=0,
        messages=[
            human_message_prompt,
        ],
        api_key=OPENAI_API_KEY,
    )
    answer = out["choices"][0]["message"]["content"]

    return answer


In [19]:
def ask_gpt_question(question):
    """Uses the two functions to find the index, then query GPT to answer the question.

    Args:
        question (String): The question to ask GPT

    Returns:
        answer (String): GPT's answer to the question

    Raises:
        ValueError: If the model's reply contains no integer to use as the
            section index.
    """
    reply = find_relevant_section(question, titles_names)

    # The model is asked for a bare integer but may wrap it in text
    # (e.g. "Section 3" or "3."), which int() alone would reject.
    index_match = re.search(r"\d+", str(reply))
    if index_match is None:
        raise ValueError(f"Could not find a section index in the model reply: {reply!r}")
    index = int(index_match.group())

    return get_answer(question, index)

In [None]:
# Example query run against the parsed 10-K.
question = "How does the company generate its revenue? What are its main products or services?"
print(ask_gpt_question(question))

## Fine Tuning

In [24]:
# Set the key on the openai module itself; the fine-tuning calls below do not
# pass an api_key argument of their own.
openai.api_key = OPENAI_API_KEY

In [28]:
# Upload the training data for fine-tuning. The file is opened in a `with`
# block so its handle is closed once the upload call returns.
with open("traindata.jsonl", "rb") as training_file:
    response = openai.File.create(
        file=training_file,
        purpose="fine-tune",
        user_provided_filename="train",
    )

file_id = response["id"]
print(f"File successfully uploaded with ID: {file_id}")

File successfully uploaded with ID: file-JdN4p42MGSqfW0GX0RXDzKSO


In [29]:
# List the account's fine-tuning jobs; as the last expression of the cell the
# result is displayed as the cell output.
openai.FineTuningJob.list()

<OpenAIObject list at 0x23cd95813b0> JSON: {
  "object": "list",
  "data": [
    {
      "object": "fine_tuning.job",
      "id": "ftjob-J5u6waEh6yBnaNr8hTHsFQdC",
      "model": "gpt-3.5-turbo-0613",
      "created_at": 1695769609,
      "finished_at": null,
      "fine_tuned_model": null,
      "organization_id": "org-WM0C8DmpfiXb4ySIeUGFLcf2",
      "result_files": [],
      "status": "failed",
      "validation_file": null,
      "training_file": "file-rM7jUv9hku5NUxNtbkx5P9mZ",
      "hyperparameters": {
        "n_epochs": "auto"
      },
      "trained_tokens": null,
      "error": {
        "code": "invalid_training_file",
        "param": "training_file",
        "message": "The job failed due to an invalid training file"
      }
    }
  ],
  "has_more": false
}

In [35]:
# Start a fine-tuning job on the uploaded training file.
response_finetuned = openai.FineTuningJob.create(training_file=file_id, model="gpt-3.5-turbo")

# The job ID comes from the job-creation response, not from the file-upload
# response held in `response`.
job_id = response_finetuned["id"]

print(f"Fine-tuning job created successfully with ID: {job_id}")

InvalidRequestError: An error occurred while processing file 'file-JdN4p42MGSqfW0GX0RXDzKSO' and it cannot be used for fine-tuning. Details may be available in the file's status_details.

In [32]:
# Display the ID of the uploaded training file.
file_id

'file-JdN4p42MGSqfW0GX0RXDzKSO'

In [34]:
# Display the fine-tuning job ID; requires job_id to have been assigned by an
# earlier cell, otherwise this raises NameError.
job_id

NameError: name 'job_id' is not defined