In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import re
import pandas as pd
import openpyxl

In [2]:
#Función que genera el dataframe
def create_df(l_col, l_per, l_val, company):
    """Build one company's statement table with one row per period.

    Parameters
    ----------
    l_col : list of str
        Column names. The first column receives the period label; every
        remaining column receives one scraped value per period.
    l_per : list of str
        Period labels; one output row is produced per entry.
    l_val : sequence
        Scraped cells, each exposing a ``.text`` attribute. They are read as
        consecutive groups of ``len(l_per)`` cells: group ``j`` supplies the
        values of column ``l_col[j + 1]`` for every period, in ``l_per`` order.
    company : str
        Value written to the "Company" column, inserted as the first column.

    Returns
    -------
    pandas.DataFrame
        Columns ``["Company"] + l_col`` and ``len(l_per)`` rows.

    Raises
    ------
    IndexError
        If ``l_val`` holds fewer than ``len(l_per) * (len(l_col) - 1)`` cells.
    """
    n_periods = len(l_per)
    n_metrics = len(l_col) - 1

    # Collect every row first and build the frame once: appending with
    # ``df.loc[len(df)] = ...`` re-allocates the frame on each insertion.
    records = []
    for offset, period in enumerate(l_per):
        # The cell for (metric j, this period) sits at j * n_periods + offset.
        cells = [l_val[offset + j * n_periods].text for j in range(n_metrics)]
        records.append([period] + cells)

    df = pd.DataFrame(records, columns=l_col)
    df.insert(0, "Company", company)
    return df

#Función para obtener las diferentes columnas que formarán parte del dataframe
def get_columns(yahoo_html_table):
    """Collect the column names found in the statement section of the page.

    The elements are visited in order and each one's ``.text`` is inspected.
    Collection is switched on by an element whose text is exactly "Breakdown"
    and switched off by "People Also Watch" or "Data Disclaimer". While it is
    on, every text that begins with an uppercase ASCII letter is kept, the
    "Breakdown" marker itself included.

    Parameters
    ----------
    yahoo_html_table : iterable
        Elements exposing a ``.text`` attribute.

    Returns
    -------
    list of str
        The kept texts, in the order they were encountered.
    """
    stop_markers = ("People Also Watch", "Data Disclaimer")
    starts_uppercase = "^[A-Z]"
    columns = []
    collecting = False

    for node in yahoo_html_table:
        label = node.text
        if label == "Breakdown":
            collecting = True
        if label in stop_markers:
            collecting = False
        if not collecting:
            continue
        if re.match(starts_uppercase, label):
            columns.append(label)
    return columns

# Extracts the period labels ("ttm" or date-like strings) for which data is available for a company
def get_periods(yahoo_html_table):
    """Collect the period labels found in the statement section of the page.

    The elements are visited in order and each one's ``.text`` is inspected.
    Collection is switched on by an element whose text is exactly "Breakdown"
    and switched off by "People Also Watch". While it is on, a text is kept
    when it equals "ttm" or contains at least one of the characters
    ``@ _ ! # ? / | } { ~ :`` (this is how date strings such as "12/31/2022"
    are recognised).

    Parameters
    ----------
    yahoo_html_table : iterable
        Elements exposing a ``.text`` attribute.

    Returns
    -------
    list of str
        The kept texts, in the order they were encountered.
    """
    # Raw literal: in a plain string "\|" is an invalid escape sequence, which
    # newer Python versions report as a SyntaxWarning. The compiled pattern is
    # unchanged.
    period_marker = re.compile(r'[@_!#?/\|}{~:]')
    periods = []
    collecting = False

    for node in yahoo_html_table:
        text = node.text
        if text == "Breakdown":
            collecting = True
        if text == "People Also Watch":
            collecting = False
        if collecting and (text == "ttm" or period_marker.search(text) is not None):
            periods.append(text)
    return periods

In [10]:
# --- Configuration -----------------------------------------------------------
# NOTE(review): machine-specific absolute paths; adjust them for the local install.
FIREFOX_BINARY = r'C:\Program Files\Mozilla Firefox\firefox.exe'
GECKODRIVER_PATH = r'C:\Users\dpriego\Anaconda3\geckodriver.exe'

wait_sec = 20          # timeout (seconds) used for implicit and explicit waits
MAX_COMPANIES = 20     # only the first N tickers of the listing are scraped

# Absolute XPaths copied from the live pages; they break if the layout changes.
COOKIE_ACCEPT_XPATH = "/html/body/div/div/div/div/form/div[2]/div[2]/button[1]"
NEXT_PAGE_XPATH = "/html/body/div[1]/div/div/div[1]/div/div[2]/div/div/div[6]/div/div/section/div/div[2]/div[2]/button[3]"
INCOME_EXPAND_XPATH = "/html/body/div[1]/div/div/div[1]/div/div[3]/div[1]/div/div[2]/div/div/section/div[2]/button/div/span"
BALANCE_EXPAND_XPATH = "/html/body/div[1]/div/div/div[1]/div/div[3]/div[1]/div/div[2]/section/div[2]/button/div/span"


def _scrape_statement(driver, expand_xpath, company_name, timeout):
    """Click the "expand" control at ``expand_xpath`` and return the statement
    currently displayed by ``driver`` as a DataFrame built with ``create_df``."""
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, expand_xpath))).click()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    values = soup.findAll('div', attrs={"data-test": "fin-col"})
    spans = soup.findAll('span')
    return create_df(get_columns(spans), get_periods(spans), values, company_name)


options = Options()
options.binary_location = FIREFOX_BINARY
# NOTE(review): `executable_path` was removed in Selenium 4.10; on newer
# versions pass a Service object instead (or call webdriver.Firefox() and let
# Selenium locate the driver). It is kept here so older installs keep working.
driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH, options=options)

rows = []
header = []
companys = {}
income_frames = []
balance_frames = []

try:
    # The implicit wait is a session-wide setting, so it is configured once.
    driver.implicitly_wait(wait_sec)

    home_url = 'https://finance.yahoo.com'
    driver.get(home_url)
    driver.maximize_window()

    # Report the browser's user agent
    user_agent = driver.execute_script("return navigator.userAgent;")
    print("User agent:", user_agent)

    # Accept the cookie banner (clicked through JavaScript)
    cookie_button = driver.find_element(By.XPATH, COOKIE_ACCEPT_XPATH)
    driver.execute_script("arguments[0].click();", cookie_button)

    # Navigate: Industries -> Technology
    driver.find_element(By.XPATH, "//a[@title='Industries']").click()
    driver.find_element(By.XPATH, "//a[@title='Technology']").click()

    # Make sure the listing table exists before parsing the page source;
    # otherwise `find('table')` returns None and the loop below fails.
    WebDriverWait(driver, wait_sec).until(
        EC.presence_of_element_located((By.TAG_NAME, "table")))

    # Walk through every page of the company listing.
    # NOTE(review): after clicking "next" the page source is read immediately,
    # with no wait for the table to refresh — confirm this is reliable.
    while True:
        html_tech_sheet = BeautifulSoup(driver.page_source, 'html.parser')
        table = html_tech_sheet.find('table')
        for i, row in enumerate(table.find_all('tr')):
            if i == 0:
                header = [el.text.strip() for el in row.find_all('th')]
            else:
                rows.append([el.text.strip() for el in row.find_all('td')])

        next_button = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
        if not next_button.is_enabled():
            # Last page reached: do not click the disabled button.
            break
        next_button.click()

    # Map ticker -> company name, keeping the first occurrence of each ticker.
    for cells in rows:
        if len(cells) < 2:
            # Row without the ticker/name cells; nothing to record.
            continue
        companys.setdefault(cells[0], cells[1])

    # Scrape the income statement and the balance sheet of each company
    for ticker in list(companys)[:MAX_COMPANIES]:
        url_income_statement = 'https://finance.yahoo.com/quote/'+ticker+'/financials?p='+ticker
        driver.get(url_income_statement)

        # Income statement (the page opened by the URL above)
        income_frames.append(
            _scrape_statement(driver, INCOME_EXPAND_XPATH, companys[ticker], wait_sec))

        # Switch to the balance-sheet tab
        WebDriverWait(driver, wait_sec).until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "a[href*='balance-sheet']"))).click()
        balance_frames.append(
            _scrape_statement(driver, BALANCE_EXPAND_XPATH, companys[ticker], wait_sec))
finally:
    # Always close the browser, even when scraping fails part-way through.
    driver.quit()

# Concatenate once instead of growing the frames inside the loop.
dfIncome = pd.concat(income_frames) if income_frames else pd.DataFrame()
dfBalance = pd.concat(balance_frames) if balance_frames else pd.DataFrame()

dfIncome.to_csv("Income.csv", index=False)
dfBalance.to_csv("Balance.csv", index=False)

User agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0
