**Table of contents**<a id='toc0_'></a>    
- [Libraries](#toc1_)    
- [Find current companies with CIK codes](#toc2_)    
- [Find basic information on companies](#toc3_)    
- [Download the index files listing all filings by year and quarter](#toc4_)    
- [Download and store text files](#toc5_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc1_'></a>[Libraries](#toc0_)

In [10]:
import os
import time

# Switch the working directory to the location named by the PROJECT_PATH
# environment variable before the project imports below.
# Indexing os.environ (rather than .get) makes a missing variable fail with a
# KeyError that names PROJECT_PATH, instead of the opaque TypeError raised by
# os.chdir(None).
os.chdir(os.environ['PROJECT_PATH'])

from secnlp.ml_logic import data as d
from secnlp import utils as u
# NOTE(review): the star import presumably supplies the constants used below
# (AGENT, SERVICE_ACCOUNT, PROJECT, DATASET_ID, FILINGS_10KQ_TABLE_ID);
# consider importing them by name once confirmed.
from secnlp.params import *
import pandas as pd

# <a id='toc2_'></a>[Find current companies with CIK codes](#toc0_)

In [2]:
# Build the table of companies via d.current_edgar_companies_list; the printed
# frame is indexed by CIK and carries `ticker` and `name` columns.
companies_list = d.current_edgar_companies_list(
    agent=AGENT,
)
print(companies_list)

           ticker                                name
cik                                                  
0000320193   AAPL                          Apple inc.
0000789019   MSFT                      Microsoft corp
0001652044  GOOGL                       Alphabet inc.
0001018724   AMZN                      Amazon com inc
0001045810   NVDA                         Nvidia corp
...           ...                                 ...
0001921332  AZTGY              Aztech global ltd./adr
0001788756   PFTY  Parallel flight technologies, inc.
0001748680  OWSCX              1ws credit income fund
0000013372  NSARO                   Nstar electric co
0001795938   CEAI                       Creations inc

[8191 rows x 2 columns]


# <a id='toc3_'></a>[Find basic information on companies](#toc0_)

In [3]:
# Take the first 10 CIKs from the index of companies_list ...
cik_list = companies_list.index.values[:10]
# ... and request the basic company information for exactly those CIKs.
companies_info = d.basic_info_company(
    cik_list=cik_list,
    agent=AGENT,
)
print(companies_info)

             sic                                     sicDescription tickers  \
cik                                                                           
0000320193  3571                               Electronic Computers    AAPL   
0000789019  7372                      Services-Prepackaged Software    MSFT   
0001652044  7370  Services-Computer Programming, Data Processing...   GOOGL   
0001018724  5961                 Retail-Catalog & Mail-Order Houses    AMZN   
0001045810  3674                   Semiconductors & Related Devices    NVDA   
0001326801  7370  Services-Computer Programming, Data Processing...    META   
0001067983  6331                  Fire, Marine & Casualty Insurance   BRK-B   
0001318605  3711              Motor Vehicles & Passenger Car Bodies    TSLA   
0000059478  2834                        Pharmaceutical Preparations     LLY   
0001403161  7389                    Services-Business Services, NEC       V   

           exchanges fiscalYearEnd  
cik           

# <a id='toc4_'></a>[Download the index files listing all filings by year and quarter](#toc0_)

In [None]:
# Download the filing index for a single year (2023) and the quarters listed below.
# NOTE(review): the variable name says "q4" but the call requests 'QTR1' --
# confirm which quarter is intended and align the name or the argument.
filings_10k_2023_q4 = d.bulk_download_url_filings(
    start_year=2023,
    end_year=2023,
    quarters=['QTR1'],
    agent=AGENT,
    uncompress=True,
)

# <a id='toc5_'></a>[Download and store text files](#toc0_)

In [6]:
# Read the table identified by FILINGS_10KQ_TABLE_ID into `df` via the project's
# BigQuery helper; later cells read its `date_filed` and `file_name` columns.
df = u.read_data_from_bq(
    credentials=SERVICE_ACCOUNT,
    gcp_project=PROJECT,
    bq_dataset=DATASET_ID,
    table=FILINGS_10KQ_TABLE_ID,
)

In [7]:
# Select the rows whose filing date falls in 2023 and keep the first 10 of them.
condition_subset = df['date_filed'].astype('datetime64[ns]').dt.year == 2023
# .copy() makes subset_df an independent frame: adding a column to a frame
# derived from df[...].head(...) may otherwise emit pandas' SettingWithCopyWarning.
subset_df = df[condition_subset].head(10).copy()
# Call d.fetch_text_from_url once per `file_name` value and store the results
# in a new `raw_filing` column.
subset_df['raw_filing'] = subset_df['file_name'].apply(lambda url: d.fetch_text_from_url(url, agent = AGENT))


In [9]:
# Merge the fetched subset back into the full frame, in place, aligned on index.
# NOTE(review): DataFrame.update only overwrites cells with non-NA values from
# subset_df -- confirm that df already has a `raw_filing` column, otherwise the
# fetched text may not be carried into df.
df.update(subset_df)

# Write the full frame back to the same table it was read from.
# NOTE(review): truncate=True presumably replaces the existing table contents --
# confirm before running against shared data.
u.load_data_to_bq(
    data=df,
    credentials=SERVICE_ACCOUNT,
    gcp_project=PROJECT,
    bq_dataset=DATASET_ID,
    table=FILINGS_10KQ_TABLE_ID,
    truncate=True,
)

# Keep a local pickle of the subset (path is relative to the working directory).
subset_df.to_pickle('data.pkl')

0    <SEC-DOCUMENT>0001564590-17-005503.txt : 20170...
1    <SEC-DOCUMENT>0001564590-19-010132.txt : 20190...
2    <SEC-DOCUMENT>0001558370-20-003383.txt : 20200...
3    <SEC-DOCUMENT>0001558370-23-004942.txt : 20230...
4    <SEC-DOCUMENT>0001558370-22-004753.txt : 20220...
5    <SEC-DOCUMENT>0001558370-21-003728.txt : 20210...
6    <SEC-DOCUMENT>0001564590-18-007158.txt : 20180...
7    -----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...
8    -----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...
9    -----BEGIN PRIVACY-ENHANCED MESSAGE-----\nProc...
Name: raw_filing, dtype: object