In [3]:
import argparse
import logging
import os
import time
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
# Scrape the EDGAR browse page for one CIK: save the raw HTML, dump every
# table found on the page to CSV, then expand the 10-K/10-Q section and write
# each document link next to its reporting date into urls/<CIK>.txt.
url = 'https://www.sec.gov/edgar/browse/?CIK=1278752'
cik = url.split("=")[-1]  # text after the last '=' in the URL, i.e. the CIK

# Create all output folders up front. 'urls' was previously never created, so
# the final open() raised FileNotFoundError on a fresh working directory.
for out_dir in ('htmls', 'csv', 'urls'):
    os.makedirs(out_dir, exist_ok=True)

# NOTE(review): newer Selenium releases expect a Service(...) object instead of
# a positional driver path — confirm against the installed Selenium version.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(url)
    html_content = driver.page_source
    with open(os.path.join('htmls', cik + ".html"), "w", encoding='utf-8') as file:
        file.write(html_content)

    # Wrap the markup in StringIO: passing literal HTML to read_html is
    # deprecated in recent pandas releases.
    dfs = pd.read_html(StringIO(html_content))
    for i, df in enumerate(dfs):
        df.to_csv(os.path.join('csv', cik + f"_link_table_{i}.csv"))

    # find_elements(By.*, ...) replaces the find_elements_by_* helpers, which
    # were removed in Selenium 4.3; this form works on older releases too.
    h5_tags = driver.find_elements(By.TAG_NAME, "h5")
    for h5_tag in h5_tags:
        if h5_tag.text == "[+] 10-K (annual reports) and 10-Q (quarterly reports)":
            # Click the section header, then stop at the first match.
            h5_tag.click()
            break

    xpath = '//button[text()="View all 10-Ks and 10-Qs"]'
    element = WebDriverWait(driver, 3).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    # The button is clicked through JavaScript rather than WebElement.click().
    driver.execute_script("arguments[0].click();", element)

    conditions = '@data-original-title="Open document" and contains(@href, "Archive") and not(contains(@href, "index")) and not(contains(@href, "xml"))'
    table = driver.find_elements(By.CSS_SELECTOR, 'div.dataTables_scroll')
    # The leading '.' scopes the XPath to this table element; a bare '//'
    # searches the whole document even when called on an element, which can
    # misalign the links with the dates read from the same table below.
    links = table[0].find_elements(By.XPATH, f'.//td//a[{conditions}]')

    logging.debug(f"LINKS - {len(links)}")
    df = pd.read_html(StringIO(table[0].get_attribute('innerHTML')))[-1]
    filing_date = df['Reporting date']
    logging.debug(f"DATES - {len(filing_date)}")

    with open(os.path.join('urls', cik + ".txt"), 'w') as url_out:
        # zip() pairs links and dates positionally and stops at the shorter one.
        for a, date in zip(links, filing_date):
            # Keep only the text before "View" in the date cell.
            entry = '\n%s %s' % (date.split("View")[0], a.get_attribute('href'))
            url_out.write(entry)
            logging.debug(entry)
finally:
    # quit() ends the whole browser session, even if a step above raised.
    driver.quit()

In [127]:
# Load the filing list CSV (the file name suggests an export of the EDGAR
# entity landing page), parse 'Reporting date' as a datetime, and keep only
# rows reported strictly after 2013-01-01.
table = (
    pd.read_csv('../../../../Downloads/EDGAR Entity Landing Page.csv')
    .assign(**{'Reporting date': lambda frame: pd.to_datetime(
        frame['Reporting date'], format='%Y-%m-%d')})
    .loc[lambda frame: frame['Reporting date'] > '2013-01-01']
)

In [128]:
# Print the index range, column dtypes and non-null counts of the filtered frame.
table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, 0 to 46
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Form type         47 non-null     object        
 1   Form description  47 non-null     object        
 2   Filing date       47 non-null     object        
 3   Reporting date    47 non-null     datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 1.8+ KB


In [129]:
# Read the whitespace-separated "<reporting date> <url>" pairs into a
# two-column frame. sep=r'\s+' splits on runs of whitespace and replaces
# delim_whitespace=True, which is deprecated in pandas 2.2.
# NOTE(review): presumably this is the file the scraping cell writes under
# 'urls/' — confirm the working directory, since the relative paths differ.
url_table = pd.read_csv('../Code/urls/1278752.txt',
                        names=['Reporting date', 'url'], sep=r'\s+')

In [130]:
# Parse 'Reporting date' as a datetime and keep only the links whose
# reporting date is strictly after 2013-01-01.
url_table = (
    url_table
    .assign(**{'Reporting date': lambda frame: pd.to_datetime(
        frame['Reporting date'], format='%Y-%m-%d')})
    .loc[lambda frame: frame['Reporting date'] > '2013-01-01']
)

In [131]:
# Attach the document URL to each filing by matching on 'Reporting date'
# (inner join: rows without a match on either side are dropped).
table = pd.merge(table, url_table, how='inner', on='Reporting date')

In [132]:
# Write the merged frame, index column included, to test.csv.
table.to_csv(path_or_buf='test.csv', index=True)

In [133]:
# Remove every filing whose description mentions "amendment" (any case),
# renumber the rows, and turn the reporting dates back into strings.
# Filtering with a negated boolean mask avoids dropping rows by index label.
# na=False treats a missing description as "not an amendment", so a NaN no
# longer makes the boolean selection raise.
table = (
    table
    .loc[lambda frame: ~frame['Form description'].str.contains(
        'amendment', case=False, na=False)]
    .reset_index(drop=True)
    .assign(**{'Reporting date': lambda frame: frame['Reporting date'].astype(str)})
)

In [134]:
# Write the cleaned frame, index column included, to the Excel workbook.
table.to_excel(excel_writer='MFIC__sec_filing_links.xlsx', index=True)