In [9]:
import argparse
import logging
import os
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [11]:
# Scrape the EDGAR "browse" page for one company (CIK): save the rendered HTML,
# export every table found on the page to CSV, then write one
# "<reporting date> <url>" line per 10-K / 10-Q document link to urls/<CIK>.txt.
CIK = '1572694'
url = f'https://www.sec.gov/edgar/browse/?CIK={CIK}'
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(url)
    html_content = driver.page_source

    os.makedirs('htmls', exist_ok=True)
    with open(os.path.join('htmls', f"{CIK}.html"), "w", encoding='utf-8') as file:
        file.write(html_content)

    # Wrap the markup in StringIO: recent pandas deprecates passing literal HTML.
    dfs = pd.read_html(StringIO(html_content))
    os.makedirs('csv', exist_ok=True)
    for i, df in enumerate(dfs):
        df.to_csv(os.path.join('csv', f"{CIK}_link_table_{i}.csv"))

    # Expand the 10-K / 10-Q section by clicking its header.
    # find_elements(By...) is used because the find_elements_by_* helpers were
    # removed in Selenium 4.3; this form works on older releases as well.
    h5_tags = driver.find_elements(By.TAG_NAME, "h5")
    for h5_tag in h5_tags:
        if h5_tag.text == "[+] 10-K (annual reports) and 10-Q (quarterly reports)":
            h5_tag.click()
            break

    wait = WebDriverWait(driver, 3)
    xpath = '//button[text()="View all 10-Ks and 10-Qs"]'
    element = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
    # Click through JavaScript rather than element.click().
    driver.execute_script("arguments[0].click();", element)

    conditions = '@data-original-title="Open document" and contains(@href, "Archive") and not(contains(@href, "index")) and not(contains(@href, "xml"))'
    # Wait for the results container instead of indexing an empty list if it
    # has not been attached to the DOM yet.
    table = wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'div.dataTables_scroll')))
    # The leading "." scopes the XPath to this element; a bare "//" is evaluated
    # against the whole document even when called on an element.
    links = table[0].find_elements(By.XPATH, f'.//td//a[{conditions}]')

    logging.debug("LINKS - %d", len(links))
    df = pd.read_html(StringIO(table[0].get_attribute('innerHTML')))[-1]
    filing_date = df['Reporting date']
    logging.debug("DATES - %d", len(filing_date))

    # zip() below stops at the shorter sequence, so surface any mismatch.
    # NOTE(review): links are filtered by `conditions` while filing_date holds
    # every table row - confirm the two are expected to line up one-to-one.
    if len(links) != len(filing_date):
        logging.warning("Found %d links but %d reporting dates; pairs may be misaligned",
                        len(links), len(filing_date))

    os.makedirs('urls', exist_ok=True)
    with open(os.path.join('urls', f"{CIK}.txt"), 'w', encoding='utf-8') as url_out:
        for a, date in zip(links, filing_date):
            # Keep only the part of the date cell that precedes "View".
            line = '\n%s %s' % (date.split("View")[0], a.get_attribute('href'))
            url_out.write(line)
            logging.debug(line)
finally:
    # quit() ends the whole WebDriver session (close() only closes the window),
    # and the finally block guarantees it runs even if a step above fails.
    driver.quit()

In [29]:
# Load the filings table exported from the EDGAR landing page, parse
# 'Reporting date' as a datetime and keep only rows dated after 2013-01-01.
landing_page_csv = '../../../../../Downloads/EDGAR Entity Landing Page.csv'
table = (
    pd.read_csv(landing_page_csv)
    .assign(**{
        'Reporting date': lambda frame: pd.to_datetime(
            frame['Reporting date'], format='%Y-%m-%d'),
    })
    .loc[lambda frame: frame['Reporting date'] > '2013-01-01']
)

In [30]:
# Summarise the columns, dtypes and non-null counts of `table`.
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Form type         37 non-null     object        
 1   Form description  37 non-null     object        
 2   Filing date       37 non-null     object        
 3   Reporting date    37 non-null     datetime64[ns]
 4   Film number(s)    37 non-null     int64         
 5   File number(s)    0 non-null      float64       
 6   Accession number  37 non-null     object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 2.1+ KB


In [26]:
# Load whitespace-separated "<reporting date> <url>" pairs for this CIK.
# NOTE(review): presumably this is the urls/<CIK>.txt file written by the
# scraping cell - confirm the relative path matches where that cell ran.
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated in recent pandas releases.
url_table = pd.read_csv(f'../Extract_links/urls/{CIK}.txt',
                        names=['Reporting date', 'url'], sep=r'\s+')

In [27]:
# Display the reporting-date / URL pairs loaded above.
url_table

Unnamed: 0,Reporting date,url
0,2023-09-30,https://www.sec.gov/ix?doc=/Archives/edgar/dat...
1,2023-06-30,https://www.sec.gov/ix?doc=/Archives/edgar/dat...
2,2023-03-31,https://www.sec.gov/ix?doc=/Archives/edgar/dat...
3,2022-12-31,https://www.sec.gov/ix?doc=/Archives/edgar/dat...
4,2022-09-30,https://www.sec.gov/ix?doc=/Archives/edgar/dat...
5,2022-06-30,https://www.sec.gov/ix?doc=/Archives/edgar/dat...
6,2022-06-30,https://www.sec.gov/Archives/edgar/data/000157...
7,2022-03-31,https://www.sec.gov/Archives/edgar/data/000157...
8,2021-12-31,https://www.sec.gov/Archives/edgar/data/000157...
9,2021-09-30,https://www.sec.gov/Archives/edgar/data/000157...


In [34]:
# Parse 'Reporting date' in both frames with the same explicit format so the
# two columns share a datetime dtype.
# The date filter on url_table (> '2013-01-01') is currently disabled.
date_format = '%Y-%m-%d'
for frame in (url_table, table):
    frame['Reporting date'] = pd.to_datetime(
        frame['Reporting date'], format=date_format)

In [54]:
# Print the reporting dates of url_table and table pairwise, in row order
# (presumably to check by eye that the rows of the two tables line up).
url_dates = url_table['Reporting date']
table_dates = table['Reporting date']
zipped_dates = zip(url_dates, table_dates)

for date_pair in zipped_dates:
    print(date_pair)

(Timestamp('2023-09-30 00:00:00'), '2023-09-30')
(Timestamp('2023-06-30 00:00:00'), '2023-06-30')
(Timestamp('2023-03-31 00:00:00'), '2023-03-31')
(Timestamp('2022-12-31 00:00:00'), '2022-12-31')
(Timestamp('2022-09-30 00:00:00'), '2022-09-30')
(Timestamp('2022-06-30 00:00:00'), '2022-06-30')
(Timestamp('2022-06-30 00:00:00'), '2022-06-30')
(Timestamp('2022-03-31 00:00:00'), '2022-03-31')
(Timestamp('2021-12-31 00:00:00'), '2021-12-31')
(Timestamp('2021-09-30 00:00:00'), '2021-09-30')
(Timestamp('2021-06-30 00:00:00'), '2021-06-30')
(Timestamp('2021-03-31 00:00:00'), '2021-03-31')
(Timestamp('2020-12-31 00:00:00'), '2020-12-31')
(Timestamp('2020-09-30 00:00:00'), '2020-09-30')
(Timestamp('2020-06-30 00:00:00'), '2020-06-30')
(Timestamp('2020-03-31 00:00:00'), '2020-03-31')
(Timestamp('2019-12-31 00:00:00'), '2019-12-31')
(Timestamp('2019-09-30 00:00:00'), '2019-09-30')
(Timestamp('2019-06-30 00:00:00'), '2019-06-30')
(Timestamp('2019-03-31 00:00:00'), '2019-03-31')
(Timestamp('2018-12-

In [40]:
# Attach the document URL to each filing row.
# 'Reporting date' is not unique (e.g. an original filing and its amendment
# share a date), so a plain merge on the date alone would cross-join those rows
# and give every filing each of the same-day URLs. Pair the n-th occurrence of
# a date in `table` with the n-th occurrence of that date in `url_table`.
# NOTE(review): this assumes both tables list same-date filings in the same
# order - confirm against the pairwise date printout before relying on it.
table = (
    table.assign(_occurrence=table.groupby('Reporting date').cumcount())
    .merge(
        url_table.assign(
            _occurrence=url_table.groupby('Reporting date').cumcount()),
        on=['Reporting date', '_occurrence'],
        validate='one_to_one',  # fail loudly if the keys are not unique
    )
    .drop(columns='_occurrence')
)

In [47]:
# Remove every row whose 'Form description' mentions "amendment"
# (case-insensitive), renumber the index, then store 'Reporting date' as text.
is_amendment = table['Form description'].str.contains('amendment', case=False)
amendment_labels = table.loc[is_amendment].index
table = table.drop(index=amendment_labels).reset_index(drop=True)
table['Reporting date'] = table['Reporting date'].astype(str)

In [48]:
# Write `table` to test.csv in the current working directory.
table.to_csv('test.csv')

In [49]:
# Export `table` to an Excel workbook two directories above the working directory.
table.to_excel('../../GSBC_sec_filing_links.xlsx')