#### Tutorial linke below:
#### https://github.com/dgunning/edgartools

In [9]:
import pandas as pd
from edgar import *

# Tell the SEC who you are
set_identity("erinyyu3@gmail.com")

In [1]:
pip install edgartools

Collecting edgartools
  Downloading edgartools-2.10.1-py3-none-any.whl.metadata (26 kB)
Collecting fastcore>=1.5.29 (from edgartools)
  Downloading fastcore-1.5.29-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx>=0.25.0 (from edgartools)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting humanize>=4.0.0 (from edgartools)
  Downloading humanize-4.9.0-py3-none-any.whl.metadata (7.9 kB)
Collecting markdownify>0.11.0 (from edgartools)
  Downloading markdownify-0.11.6-py3-none-any.whl.metadata (7.3 kB)
Collecting pandas>=2.0.0 (from edgartools)
  Downloading pandas-2.2.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pyarrow>=14.0.0 (from edgartools)
  Downloading pyarrow-15.0.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting rank-bm25==0.2.1 (from edgartools)
  Downloading rank_bm25-0.2.1-py3-none-any.whl.metadata (3.1 kB)
Collecting retry>=0.9.2 (from edgartools)
  Downloading retry-0.9.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collec

Note: you may need to restart the kernel to use updated packages.


In [111]:
# do I need to get financial statements from 10-K filing or 
# I can directly query the balance sheet and income statements

filings = get_filings(year=range(2014, 2024),form="10-K")
#filings

df = filings.to_pandas()

# create a copy of the original data received from EDGAR
df_copy = df.copy()

In [112]:
#df['filing_date'] = pd.to_datetime(df['filing_date'])
#df['year'] = df['filing_date'].dt.year
df_copy = df_copy.loc[df_copy['form']=='10-K'] # to exclude form == 10-K/A which does not usually have the financial statements
df_copy



#df.groupby(['company']).agg({'year': 'count', 'month': 'count'})

Unnamed: 0,form,company,cik,filing_date,accession_number
0,10-K,"Atmos Energy Kansas Securitization I, LLC",1967097,2023-12-29,0001967097-23-000004
1,10-K,"Citius Pharmaceuticals, Inc.",1506251,2023-12-29,0001213900-23-099889
2,10-K,FLANIGANS ENTERPRISES INC,12040,2023-12-29,0001174947-23-001489
3,10-K,GLOBAL TECHNOLOGIES LTD,932021,2023-12-29,0001493152-23-046428
4,10-K,MariaDB plc,1929589,2023-12-29,0001929589-23-000010
...,...,...,...,...,...
84700,10-K,PHOTRONICS INC,810136,2014-01-03,0001140361-14-000609
84701,10-K,Vanguard Energy Corp,1497649,2014-01-03,0001354488-14-000018
84704,10-K,CASTLE HOLDING CORP,802510,2014-01-02,0001477932-14-000009
84705,10-K,Hydrogen Future Corp,1381054,2014-01-02,0001354488-14-000016


In [81]:
# Read the CSV file into a DataFrame
df_500 = pd.read_csv('sp500_component.csv')
df_500

Unnamed: 0,tickers_wiki,CIK_wiki
0,MMM,66740
1,AOS,91142
2,ABT,1800
3,ABBV,1551152
4,ACN,1467373
...,...,...
498,YUM,1041061
499,ZBRA,877212
500,ZBH,1136869
501,ZION,109380


In [113]:
# find the filings from companies that are in the sp500 index
df_sec_sp500 = pd.merge(df_500['CIK_wiki'], df_copy, left_on='CIK_wiki', right_on='cik', how='left')
df_sec_sp500 = df_sec_sp500.drop(columns = ['cik'])
df_sec_sp500 = df_sec_sp500.drop_duplicates()

In [114]:
df_sec_sp500

Unnamed: 0,CIK_wiki,form,company,filing_date,accession_number
0,66740,10-K,3M CO,2023-02-08,0000066740-23-000014
1,66740,10-K,3M CO,2022-02-09,0000066740-22-000010
2,66740,10-K,3M CO,2021-02-04,0001558370-21-000737
3,66740,10-K,3M CO,2020-02-06,0001558370-20-000581
4,66740,10-K,3M CO,2019-02-07,0001558370-19-000470
...,...,...,...,...,...
4817,1555280,10-K,Zoetis Inc.,2018-02-15,0001555280-18-000053
4818,1555280,10-K,Zoetis Inc.,2017-02-16,0001555280-17-000044
4819,1555280,10-K,Zoetis Inc.,2016-02-24,0001555280-16-000344
4820,1555280,10-K,Zoetis Inc.,2015-02-27,0001555280-15-000057


In [115]:
# calculate how many filings each cik has
counts = df_sec_sp500['CIK_wiki'].value_counts()
value_counts_df = counts.reset_index()
value_counts_df

Unnamed: 0,CIK_wiki,count
0,66740,10
1,1103982,10
2,753308,10
3,1564708,10
4,1164727,10
...,...,...
495,1841666,2
496,1967680,1
497,1932393,1
498,1944048,1


In [118]:
# create a subset of sp500 constituents companies that have 10 years of historical data
cik_with10 = value_counts_df[value_counts_df['count']==10]
len(cik_with10) # 451 sp500 constituents have full 10 years of history

451

In [121]:
df_in_scope = pd.merge(cik_with10['CIK_wiki'], df_sec_sp500, on='CIK_wiki', how='left')
df_in_scope

Unnamed: 0,CIK_wiki,form,company,filing_date,accession_number
0,66740,10-K,3M CO,2023-02-08,0000066740-23-000014
1,66740,10-K,3M CO,2022-02-09,0000066740-22-000010
2,66740,10-K,3M CO,2021-02-04,0001558370-21-000737
3,66740,10-K,3M CO,2020-02-06,0001558370-20-000581
4,66740,10-K,3M CO,2019-02-07,0001558370-19-000470
...,...,...,...,...,...
4505,1156039,10-K,"Anthem, Inc.",2018-02-21,0001156039-18-000003
4506,1156039,10-K,"Anthem, Inc.",2017-02-22,0001156039-17-000002
4507,1156039,10-K,"Anthem, Inc.",2016-02-19,0001156039-16-000018
4508,1156039,10-K,"Anthem, Inc.",2015-02-24,0001156039-15-000003
