### GET LINKS

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import json
from tqdm import tqdm

In [None]:
def setup_driver(*, headless=False):
    """Create a Chrome WebDriver using the chromedriver installed by ChromeDriverManager.

    Args:
        headless: When True, start Chrome without a visible window. The
            default (False) keeps the original behaviour of opening a
            regular browser window.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it when finished.
    """
    options = Options()
    if headless:
        # The ``options.headless`` attribute is deprecated in Selenium 4;
        # passing the command-line switch is the supported way to go headless.
        options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

In [None]:
def read_ticker_file(file_path):
    """Read whitespace-separated ``ticker cik`` pairs from a text file.

    Blank and whitespace-only lines are skipped, so stray empty lines in the
    file do not abort the read.

    Args:
        file_path: Path of the text file, one ``ticker cik`` pair per line.

    Returns:
        A list of ``(ticker, cik)`` tuples of strings, in file order.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    companies = []
    with open(file_path, 'r') as file:
        # Iterate the file directly instead of loading every line up front.
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                continue  # Blank line: nothing to parse.
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'ticker cik', got {line.strip()!r}"
                )
            ticker, cik = fields
            companies.append((ticker, cik))
    return companies

In [None]:
# Exploratory run: a single hard-coded EDGAR browse page (CIK 715957) and a
# browser session that the following cells reuse through the global `driver`.
company_url = "https://www.sec.gov/edgar/browse/?CIK=715957&owner=exclude"
driver = setup_driver()

In [None]:
# Navigate the shared browser session to the company page.
driver.get(company_url)

In [None]:
# Click the "View all 10-Ks and 10-Qs" button on the page currently loaded in `driver`.
try:
    # Locate the button by its visible text
    button = driver.find_element(By.XPATH, "//button[contains(text(),'View all 10-Ks and 10-Qs')]")
    # Click through JavaScript instead of WebElement.click()
    driver.execute_script("arguments[0].click();", button)
    print("Button clicked successfully.")
except NoSuchElementException:
    # find_element raises when no matching button exists at lookup time
    print("Button not found.")

In [None]:
# Block for up to 10 seconds until at least one <table> element is present in the DOM.
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.XPATH, "//table")))

In [None]:
# Collect every body row of every table on the page (the XPath is not limited to one table).
rows = driver.find_elements(By.XPATH, "//table/tbody/tr")

In [None]:
# Accumulator for the scraped filing records; the extraction cell below appends to it.
data = []

In [None]:
# Exploratory extraction: read the filing rows of the page currently loaded in
# `driver` and append one dict per usable row to the global `data` list.
try:
    # Wait (up to 10 seconds) for the table to be present
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.XPATH, "//table")))

    # Locate all rows in the table
    rows = driver.find_elements(By.XPATH, "//table/tbody/tr")

    for row in rows:
        try:
            # Extract the type of document from the first cell
            document_type = row.find_element(By.XPATH, ".//td[1]").text
            
            # Extract the document link (anchor with class 'document-link')
            document_link = row.find_element(By.XPATH, ".//td/div/a[@class='document-link']").get_attribute("href")
            
            # Extract the filing date
            filing_date = row.find_element(By.XPATH, ".//td[3]").text  # Adjust the index based on the actual filing date column
            
            # Extract the reporting date
            reporting_date = row.find_element(By.XPATH, ".//td[4]").text  # Adjust the index based on the actual reporting date column

            data.append({
                "Type of Document": document_type,
                "Link": document_link,
                "Filing Date": filing_date,
                "Reporting Date": reporting_date
            })

        except NoSuchElementException:
            continue  # If any element is not found in the current row, skip to the next one

except (NoSuchElementException, TimeoutException):
    # Raised when the wait for the table times out or a lookup fails outside the per-row handler
    print("Table not found or could not load the page.")

In [None]:
# Inspect the records collected by the exploratory extraction.
print(data)

In [None]:
def _extract_filing_row(row):
    """Return one table row's filing fields as a dict, or None if a cell or the link is missing."""
    try:
        return {
            # First cell holds the document type
            "Type of Document": row.find_element(By.XPATH, ".//td[1]").text,
            # Anchor with class 'document-link' holds the document URL
            "Link": row.find_element(By.XPATH, ".//td/div/a[@class='document-link']").get_attribute("href"),
            # Adjust the indexes below if the filing/reporting date columns move
            "Filing Date": row.find_element(By.XPATH, ".//td[3]").text,
            "Reporting Date": row.find_element(By.XPATH, ".//td[4]").text,
        }
    except NoSuchElementException:
        return None


def scrape_company_data(driver, cik):
    """Scrape the 10-K / 10-Q filing rows from a company's EDGAR browse page.

    Opens the browse page for ``cik``, clicks the "View all 10-Ks and 10-Qs"
    button, waits for a table to appear and reads every table body row.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        cik: Company CIK, interpolated into the browse URL.

    Returns:
        A list of dicts with the keys "Type of Document", "Link",
        "Filing Date" and "Reporting Date". Rows missing any of these
        elements are skipped. The list is empty when the button or the
        table cannot be found within the wait timeout.
    """
    company_url = f"https://www.sec.gov/edgar/browse/?CIK={cik}&owner=exclude"
    driver.get(company_url)
    data = []
    button_xpath = "//button[contains(text(),'View all 10-Ks and 10-Qs')]"

    try:
        wait = WebDriverWait(driver, 10)

        # Wait for the button instead of looking it up immediately after
        # driver.get(): an immediate find_element fails if the page has not
        # rendered the button yet, which silently produced an empty result.
        button = wait.until(EC.presence_of_element_located((By.XPATH, button_xpath)))
        # Click through JavaScript instead of WebElement.click()
        driver.execute_script("arguments[0].click();", button)
        print(f"Button clicked successfully for CIK: {cik}.")

        # Wait for the table to be present
        wait.until(EC.presence_of_element_located((By.XPATH, "//table")))

        # Read all body rows, keeping only those with every expected element
        for row in driver.find_elements(By.XPATH, "//table/tbody/tr"):
            record = _extract_filing_row(row)
            if record is not None:
                data.append(record)

    except (NoSuchElementException, TimeoutException):
        print(f"Table not found or could not load the page for CIK: {cik}.")

    return data

In [None]:
def get_documents_links(ticker_file='ticker.txt', max_companies=11):
    """Scrape filing links for the companies listed in a ticker file.

    Args:
        ticker_file: Path passed to ``read_ticker_file``. Defaults to
            'ticker.txt', the path that was previously hard-coded.
        max_companies: Non-negative maximum number of companies to process,
            taken from the start of the file, or None to process all of them.
            The default of 11 matches the previous counter logic, which
            stopped after the 11th company had been processed.

    Returns:
        A dict mapping each processed ticker to the list of filing records
        returned by ``scrape_company_data``.
    """
    companies = read_ticker_file(ticker_file)
    if max_companies is not None:
        companies = companies[:max_companies]

    driver = setup_driver()
    all_data = {}

    try:
        for ticker, cik in tqdm(companies):
            print(f"Processing {ticker} with CIK: {cik}")
            company_data = scrape_company_data(driver, cik)
            all_data[ticker] = company_data
            print(company_data)
    finally:
        # Always close the browser, even if scraping a company raises.
        driver.quit()

    return all_data

In [None]:
# Run the link scrape; the result maps each ticker to its list of filing records.
all_data=get_documents_links()

In [None]:
# Persist the scraped links as indented JSON; main() in the "GET HTMLs"
# section reads this same 'company_data.txt' path via read_company_data.
with open('company_data.txt', 'w') as outfile:
    json.dump(all_data, outfile, indent=4)

### GET HTMLs


In [None]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import time

In [None]:
def setup_driver(*, headless=False):
    """Create a Chrome WebDriver using the chromedriver installed by ChromeDriverManager.

    Args:
        headless: When True, start Chrome without a visible window. The
            default (False) keeps the original behaviour of opening a
            regular browser window.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it when finished.
    """
    options = Options()
    if headless:
        # The ``options.headless`` attribute is deprecated in Selenium 4;
        # passing the command-line switch is the supported way to go headless.
        options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

def read_company_data(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content.
    """
    with open(file_path, 'r') as handle:
        return json.load(handle)

In [None]:
def _safe_filename_part(value):
    """Return ``value`` as text with characters that are unsafe in file names replaced by '_'."""
    return "".join("_" if ch in '\\/:*?"<>|' else ch for ch in str(value))


def _write_html(html_content, company, doc_type, reporting_date):
    """Write ``html_content`` to html_files/<company>_<doc_type>_<reporting_date>.html."""
    # Create the directory if it doesn't exist
    os.makedirs('html_files', exist_ok=True)

    # Sanitize each part: a form type such as "10-K/A" would otherwise put a
    # path separator into the name and make open() fail on a missing directory.
    file_name = (
        f"html_files/{_safe_filename_part(company)}"
        f"_{_safe_filename_part(doc_type)}"
        f"_{_safe_filename_part(reporting_date)}.html"
    )
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(html_content)

    print(f"Saved {file_name} successfully.")


def save_html(driver, url, company, doc_type, reporting_date):
    """Load ``url`` and save the document HTML under the ``html_files`` directory.

    If the page contains an iframe with id "ixvFrame", the source of that
    frame is saved. If the iframe (or its body) does not appear within 30
    seconds, or an element lookup fails, the source of the current browsing
    context is saved instead.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the document to fetch.
        company: Company identifier used as the first part of the file name.
        doc_type: Document type used as the second part of the file name.
        reporting_date: Reporting date used as the third part of the file name.

    Returns:
        None. The HTML is written to disk and a status line is printed.
    """
    driver.get(url)
    # Fixed pause after navigation, kept from the original.
    # NOTE(review): presumably lets the page render and spaces out requests - confirm.
    time.sleep(6)

    try:
        # Wait for the iframe to be present; until() returns the element or
        # raises TimeoutException, so no "not present" branch is needed.
        wait = WebDriverWait(driver, 30)
        iframe = wait.until(EC.presence_of_element_located((By.ID, "ixvFrame")))

        # Switch to the iframe and wait for its body before reading the source
        driver.switch_to.frame(iframe)
        wait.until(EC.presence_of_element_located((By.XPATH, "//body")))
        html_content = driver.page_source

    except TimeoutException:
        print(f"iframe not found for URL: {url}")
        html_content = driver.page_source

    except NoSuchElementException:
        print(f"Element not found for URL: {url}")
        html_content = driver.page_source

    finally:
        # Always return to the top-level document, including on the error
        # paths, so the driver is never left inside the iframe.
        driver.switch_to.default_content()

    _write_html(html_content, company, doc_type, reporting_date)


In [None]:
def main():
    """Download the HTML of every document listed in 'company_data.txt'.

    Reads the ticker -> documents mapping with ``read_company_data``, opens one
    browser session and calls ``save_html`` for each document. The browser is
    closed when the loop finishes or an exception is raised.
    """
    # Imported here because this section's import cell does not import tqdm;
    # without it, running this section in a fresh session raises NameError.
    from tqdm import tqdm

    company_data = read_company_data('company_data.txt')
    driver = setup_driver()

    try:
        for company, documents in tqdm(company_data.items()):
            print(f"Processing company: {company}")
            for doc in documents:
                url = doc['Link']
                doc_type = doc['Type of Document']
                reporting_date = doc['Reporting Date']
                print(f"Fetching data for {company}: {doc_type} on {reporting_date}")
                save_html(driver, url, company, doc_type, reporting_date)
    finally:
        # Always close the browser, even if a download raises.
        driver.quit()

In [None]:
# Entry point: run the HTML download only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()