In [349]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime

### Obtener Datos Financieros

In [350]:
def get_profile(ticker):
    """Scrape the company name, sector and industry of a ticker from Yahoo Finance.

    The values are read from the HTML of
    ``https://finance.yahoo.com/quote/{ticker}/profile`` using hard-coded CSS
    class names, so the function depends on the page keeping that markup.

    Parameters
    ----------
    ticker : str
        Symbol interpolated into the profile URL.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``Name``, ``Sector``, ``Industry`` and
        ``Ticker``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    IndexError
        If the page has fewer than two matching ``header`` elements.
    ValueError
        If the page does not yield exactly two sector/industry links.
    """
    # Build the URL of the profile page
    url = f"https://finance.yahoo.com/quote/{ticker}/profile"

    # Headers that imitate a browser request
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    # A timeout keeps a single stalled request from hanging the notebook
    response = requests.get(url, headers=request_headers, timeout=30)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # The company name is taken from the second matching header element
    name_headers = soup.find_all('header', class_="medium mb-4 yf-1trny4b font-condensed")
    if len(name_headers) < 2:
        raise IndexError(
            f"Could not locate the company name on the profile page of {ticker!r}"
        )
    name = name_headers[1].get_text().strip()

    # The matching links are expected to hold sector and industry, in that order
    links = [
        link.get_text().strip()
        for link in soup.find_all('a', class_="subtle-link fin-size-large yf-13p9sh2")
    ]
    if len(links) != 2:
        raise ValueError(
            f"Expected 2 sector/industry links on the profile page of {ticker!r}, "
            f"found {len(links)}"
        )

    # Assemble the single-row result
    df = pd.DataFrame([[name] + links], columns=['Name', 'Sector', 'Industry'])
    df['Ticker'] = ticker

    return df


In [351]:
def get_income_statement(ticker):
    """Scrape the income statement table of a ticker from Yahoo Finance.

    The table is read from the HTML of
    ``https://finance.yahoo.com/quote/{ticker}/financials`` using hard-coded
    CSS class names. Cells are collected in two groups (class ending in
    ``alt`` and class without it) and then interleaved again.

    Parameters
    ----------
    ticker : str
        Symbol interpolated into the financials URL.

    Returns
    -------
    pandas.DataFrame
        One row per table column header (presumably the reporting periods),
        one column per row label of the page; values are floats, with the
        ``"--"`` placeholder mapped to 0.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    ValueError
        If the page does not expose 5 or 6 column headers (the original code
        failed with an unbound ``order`` variable in that case).
    """
    # Build the URL of the income statement page
    url = f"https://finance.yahoo.com/quote/{ticker}/financials"

    # Headers that imitate a browser request
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    # A timeout keeps a single stalled request from hanging the notebook
    response = requests.get(url, headers=request_headers, timeout=30)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Column headers: "alt" cells first, then the remaining cells
    alt_titles = soup.find_all('div', class_="column yf-1ezv2n5 alt")
    plain_titles = soup.find_all('div', class_="column yf-1ezv2n5")
    group = [cell.text.strip() for cell in alt_titles + plain_titles]

    # Index order that interleaves both groups again
    # (presumably restoring the on-page column order)
    if len(group) == 6:
        order = [0, 3, 1, 4, 2, 5]
    elif len(group) == 5:
        order = [0, 3, 1, 4, 2]
    else:
        raise ValueError(
            f"Unexpected number of column headers for {ticker!r}: "
            f"{len(group)} (expected 5 or 6)"
        )

    titles = [group[i] for i in order]

    # Row labels
    header = [link.get_text().strip() for link in soup.find_all('div', class_="column sticky yf-1xjz32c")]

    # Cell values, again split into "alt" and remaining cells
    alt_values = [link.get_text().strip() for link in soup.find_all('div', class_="column yf-1xjz32c alt")]
    plain_values = [link.get_text().strip() for link in soup.find_all('div', class_="column yf-1xjz32c")]

    # Chunk each flat list into one sub-list per table row
    plain_width = 3 if len(titles) == 6 else 2
    alt_rows = [alt_values[i:i + 3] for i in range(0, len(alt_values), 3)]
    plain_rows = [plain_values[i:i + plain_width] for i in range(0, len(plain_values), plain_width)]

    # Join both halves of every row
    combined = [alt_part + plain_part for alt_part, plain_part in zip(alt_rows, plain_rows)]

    # Reorder each row and convert to float; "--" becomes 0
    data = [
        [float(row[i].replace(",", "")) if row[i] != "--" else 0 for i in order]
        for row in combined
    ]

    # Row label -> list of values, then one DataFrame column per row label
    df = pd.DataFrame(dict(zip(header, data)))
    df.index = titles
    return df


In [352]:
def get_balance_sheet(ticker):
    """Scrape the balance sheet table of a ticker from Yahoo Finance.

    The table is read from the HTML of
    ``https://finance.yahoo.com/quote/{ticker}/balance-sheet`` using
    hard-coded CSS class names. Cells are collected in two groups (class
    ending in ``alt`` and class without it) and then interleaved again.

    Parameters
    ----------
    ticker : str
        Symbol interpolated into the balance-sheet URL.

    Returns
    -------
    pandas.DataFrame
        One row per table column header (presumably the reporting periods),
        one column per row label of the page; values are floats, with the
        ``"--"`` placeholder mapped to 0.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    ValueError
        If the page does not expose 4 or 5 column headers (the original code
        failed with an unbound ``order`` variable in that case).
    """
    # Build the URL of the balance sheet page
    url = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet"

    # Headers that imitate a browser request
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    # A timeout keeps a single stalled request from hanging the notebook
    response = requests.get(url, headers=request_headers, timeout=30)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Column headers: "alt" cells first, then the remaining cells
    alt_titles = soup.find_all('div', class_="column yf-1ezv2n5 alt")
    plain_titles = soup.find_all('div', class_="column yf-1ezv2n5")
    group = [cell.text.strip() for cell in alt_titles + plain_titles]

    # Index order that interleaves both groups again, by number of columns
    # (presumably restoring the on-page column order)
    if len(group) == 5:
        order = [0, 3, 1, 4, 2]
    elif len(group) == 4:
        order = [0, 2, 1, 3]
    else:
        raise ValueError(
            f"Unexpected number of column headers for {ticker!r}: "
            f"{len(group)} (expected 4 or 5)"
        )
    titles = [group[i] for i in order]

    # Row labels
    header = [link.get_text().strip() for link in soup.find_all('div', class_="column sticky yf-1xjz32c")]

    # Cell values, again split into "alt" and remaining cells
    alt_values = [link.get_text().strip() for link in soup.find_all('div', class_="column yf-1xjz32c alt")]
    plain_values = [link.get_text().strip() for link in soup.find_all('div', class_="column yf-1xjz32c")]

    # Chunk each flat list into one sub-list per table row
    alt_width = 3 if len(titles) == 5 else 2
    alt_rows = [alt_values[i:i + alt_width] for i in range(0, len(alt_values), alt_width)]
    plain_rows = [plain_values[i:i + 2] for i in range(0, len(plain_values), 2)]

    # Join both halves of every row
    combined = [alt_part + plain_part for alt_part, plain_part in zip(alt_rows, plain_rows)]

    # Reorder each row and convert to float; "--" becomes 0
    data = [
        [float(row[i].replace(",", "")) if row[i] != "--" else 0 for i in order]
        for row in combined
    ]

    # Row label -> list of values, then one DataFrame column per row label
    df = pd.DataFrame(dict(zip(header, data)))
    df.index = titles
    return df


In [353]:
def get_cash_flow(ticker):
    """Scrape the cash flow table of a ticker from Yahoo Finance.

    The table is read from the HTML of
    ``https://finance.yahoo.com/quote/{ticker}/cash-flow`` using hard-coded
    CSS class names. Cells are collected in two groups (class ending in
    ``alt`` and class without it) and then interleaved again.

    Parameters
    ----------
    ticker : str
        Symbol interpolated into the cash-flow URL.

    Returns
    -------
    pandas.DataFrame
        One row per table column header (presumably the reporting periods),
        one column per row label of the page; values are floats, with the
        ``"--"`` placeholder mapped to 0.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    ValueError
        If the page does not expose 5 or 6 column headers (the original code
        failed with an unbound ``order`` variable in that case).
    """
    # Build the URL of the cash flow page
    url = f"https://finance.yahoo.com/quote/{ticker}/cash-flow"

    # Headers that imitate a browser request
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    # A timeout keeps a single stalled request from hanging the notebook
    response = requests.get(url, headers=request_headers, timeout=30)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Column headers: "alt" cells first, then the remaining cells
    alt_titles = soup.find_all('div', class_="column yf-1ezv2n5 alt")
    plain_titles = soup.find_all('div', class_="column yf-1ezv2n5")
    group = [cell.text.strip() for cell in alt_titles + plain_titles]

    # Index order that interleaves both groups again
    # (presumably restoring the on-page column order)
    if len(group) == 6:
        order = [0, 3, 1, 4, 2, 5]
    elif len(group) == 5:
        order = [0, 3, 1, 4, 2]
    else:
        raise ValueError(
            f"Unexpected number of column headers for {ticker!r}: "
            f"{len(group)} (expected 5 or 6)"
        )

    titles = [group[i] for i in order]

    # Row labels
    header = [link.get_text().strip() for link in soup.find_all('div', class_="column sticky yf-1xjz32c")]

    # Cell values, again split into "alt" and remaining cells
    alt_values = [link.get_text().strip() for link in soup.find_all('div', class_="column yf-1xjz32c alt")]
    plain_values = [link.get_text().strip() for link in soup.find_all('div', class_="column yf-1xjz32c")]

    # Chunk each flat list into one sub-list per table row
    plain_width = 3 if len(titles) == 6 else 2
    alt_rows = [alt_values[i:i + 3] for i in range(0, len(alt_values), 3)]
    plain_rows = [plain_values[i:i + plain_width] for i in range(0, len(plain_values), plain_width)]

    # Join both halves of every row
    combined = [alt_part + plain_part for alt_part, plain_part in zip(alt_rows, plain_rows)]

    # Reorder each row and convert to float; "--" becomes 0
    data = [
        [float(row[i].replace(",", "")) if row[i] != "--" else 0 for i in order]
        for row in combined
    ]

    # Row label -> list of values, then one DataFrame column per row label
    df = pd.DataFrame(dict(zip(header, data)))
    df.index = titles
    return df


In [354]:
def get_price(ticker):
    """Download the daily opening prices of a ticker through yfinance.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.Ticker``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (string formatted as ``dd/mm/YYYY``), ``Price``
        (the ``Open`` column of the history) and ``Ticker``. The history
        starts at 2020-09-30 and uses today's date as the ``end`` argument.
    """
    # pd.Timestamp.now() is used instead of datetime.now() because a later
    # cell runs `import datetime`, which rebinds the name `datetime` to the
    # module and would make `datetime.now()` fail on any subsequent call.
    today = pd.Timestamp.now().strftime('%Y-%m-%d')

    # Fetch the price history of the stock
    history = yf.Ticker(ticker).history(start='2020-09-30', end=today)

    # Select the opening price by name (not by position) and expose the
    # date index as a regular column
    prices = history["Open"].reset_index()

    # Format the dates as strings and rename the price column
    prices["Date"] = prices["Date"].dt.strftime('%d/%m/%Y')
    prices = prices.rename(columns={"Open": "Price"})

    # Tag every row with the stock symbol
    prices["Ticker"] = ticker

    return prices


In [355]:
def selections(ticker):
    """Combine the three scraped financial statements of a ticker into one table.

    The income statement, balance sheet and cash flow frames are concatenated
    side by side, the first row of the result is dropped by position
    (presumably the trailing-twelve-months row -- TODO confirm), and the index
    becomes a ``Date`` column. A ``Ticker`` column holding ``ticker`` is added.
    """
    # Fetch the statements in the order: income statement, balance sheet, cash flow
    frames = [
        get_income_statement(ticker),
        get_balance_sheet(ticker),
        get_cash_flow(ticker),
    ]

    # Place the statements side by side, aligned on their index
    merged = pd.concat(frames, axis=1)

    # Drop the first row and turn the index into a regular column
    select = merged.iloc[1:].reset_index()

    # Tag every row with the stock symbol
    select['Ticker'] = ticker

    # The former index column is named "index"; expose it as "Date"
    return select.rename(columns={"index": "Date"})

In [356]:
# Ticker symbols to process, in processing order
list_stock = [
    "AAPL", "QCOM", "AMD", "NVDA", "SHOP", "CRM", "MSFT", "ZM", "UBER", "ADBE", "DBX", "ORCL", "ACN", "GLOB",
    "ABNB", "BKNG", "LULU", "MBUU", "DOOO", "PII", "NKE", "ADS.DE", "AZO", "BBY", "CPRI", "MC.PA", "SIG",
    "GRBK", "LEN", "RACE", "TSLA", "AMZN", "MELI", "SBUX", "MCD", "SHAK", "DPZ", "PZZA",
    "V", "MA", "PYPL", "AXP",
    "KO", "PEP", "MNST", "WMT", "COST", "TGT", "SFM", "KR", "DNUT",
    "NFLX", "DIS", "META", "GOOGL", "SPOT", "DASH", "MTCH", "RBLX", "EA",
]

### Procesar Precios de Acciones

In [357]:
# Post-process the stock price DataFrame
def process_stock_prices(dataframe):
    """Parse the ``Date`` column and round the ``Price`` column.

    ``Date`` is parsed with the format ``%d/%m/%Y`` and ``Price`` is rounded
    to two decimals. The frame is modified in place and also returned.
    """
    date_strings = dataframe['Date']
    dataframe['Date'] = pd.to_datetime(date_strings, format='%d/%m/%Y')
    dataframe['Price'] = dataframe['Price'].round(decimals=2)
    return dataframe

# Fetch the price history of every ticker in list_stock
historical = list(map(get_price, list_stock))

# Stack the per-ticker frames into one table, then parse dates and round prices
stocks = process_stock_prices(pd.concat(historical, ignore_index=True))

In [358]:
# Display the combined price table
stocks

Unnamed: 0,Date,Price,Ticker
0,2020-09-30,111.20,AAPL
1,2020-10-01,114.96,AAPL
2,2020-10-02,110.32,AAPL
3,2020-10-05,111.32,AAPL
4,2020-10-06,113.07,AAPL
...,...,...,...
58194,2024-08-12,146.11,EA
58195,2024-08-13,146.35,EA
58196,2024-08-14,146.38,EA
58197,2024-08-15,147.86,EA


In [359]:
# Write the price table to CSV without the index column
stocks.to_csv("../Scraping/Data/stocks.csv",index = False)

### Extracción del Perfil de Acciones

In [360]:
# Scrape the profile of every ticker in list_stock
info = list(map(get_profile, list_stock))

# Stack the one-row profile frames into a single table
profile = pd.concat(info, ignore_index=True)

In [361]:
# Display the combined profile table
profile

Unnamed: 0,Name,Sector,Industry,Ticker
0,Apple Inc.,Technology,Consumer Electronics,AAPL
1,QUALCOMM Incorporated,Technology,Semiconductors,QCOM
2,"Advanced Micro Devices, Inc.",Technology,Semiconductors,AMD
3,NVIDIA Corporation,Technology,Semiconductors,NVDA
4,Shopify Inc.,Technology,Software - Application,SHOP
5,"Salesforce, Inc.",Technology,Software - Application,CRM
6,Microsoft Corporation,Technology,Software - Infrastructure,MSFT
7,"Zoom Video Communications, Inc.",Technology,Software - Application,ZM
8,"Uber Technologies, Inc.",Technology,Software - Application,UBER
9,Adobe Inc.,Technology,Software - Infrastructure,ADBE


In [362]:
# Write the profile table to CSV without the index column
profile.to_csv("../Scraping/Data/profile.csv",index = False)

### Extracción de Datos Financieros

In [363]:
# Convert the 'Date' column to datetime and fill NaN with 0
def process_financials(dataframe):
    """Parse the ``Date`` column and replace missing values with 0.

    ``Date`` is parsed with the format ``%m/%d/%Y``; afterwards every missing
    value in the frame is filled with 0. The frame is modified in place and
    also returned.
    """
    date_strings = dataframe['Date']
    dataframe['Date'] = pd.to_datetime(date_strings, format='%m/%d/%Y')
    dataframe.fillna(value=0, inplace=True)
    return dataframe

# Build the combined financial statements of every ticker in list_stock
bucle = list(map(selections, list_stock))

# Stack the per-ticker frames, then parse the dates and fill missing values with 0
financials = process_financials(pd.concat(bucle, ignore_index=True))

In [364]:
import datetime

# Check whether a date falls on a weekend
def es_fin_de_semana(fecha):
    """Return whether ``fecha.weekday()`` is at least 5 (Saturday=5, Sunday=6)."""
    return fecha.weekday() >= 5

# Move dates that fall on a weekend back to the preceding Friday
def ajustar_fecha(df):
    """Shift weekend dates in ``df['Date']`` back to the preceding Friday.

    Saturdays are moved back one day and Sundays two days; every other value
    (including missing dates) is left unchanged. The computation is
    vectorised with pandas, so it does not depend on which ``datetime`` name
    is currently bound in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column of datetime values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its ``Date`` column updated in place.
    """
    dates = pd.to_datetime(df['Date'])

    # Monday=0 ... Saturday=5, Sunday=6
    weekday = dates.dt.weekday

    # Saturday -> Friday (minus one day), Sunday -> Friday (minus two days)
    adjusted = dates.mask(weekday == 5, dates - pd.Timedelta(days=1))
    adjusted = adjusted.mask(weekday == 6, dates - pd.Timedelta(days=2))

    df['Date'] = adjusted
    return df

# Apply the weekend date adjustment to the 'Date' column of financials
financials = ajustar_fecha(financials)

In [365]:
# Display the combined financials table
financials

Unnamed: 0,Date,Total Revenue,Cost of Revenue,Gross Profit,Operating Expense,Operating Income,Net Non Operating Interest Income Expense,Other Income Expense,Pretax Income,Tax Provision,...,Treasury Shares Number,Rent Expense Supplemental,Preferred Stock Equity,Preferred Shares Number,Credit Losses Provision,Non Interest Expense,Special Income Charges,Interest Income after Provision for Loan Loss,Total Money Market Investments,Cash Flow from Discontinued Operation
0,2023-09-29,383285000.0,214137000.0,169148000.0,54847000.0,114301000.0,-183000.0,-382000.0,113736000.0,16741000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-09-30,394328000.0,223546000.0,170782000.0,51345000.0,119437000.0,-106000.0,-228000.0,119103000.0,19300000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2021-09-30,365817000.0,212981000.0,152836000.0,43887000.0,108949000.0,198000.0,60000.0,109207000.0,14527000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-09-30,274515000.0,169559000.0,104956000.0,38668000.0,66288000.0,890000.0,-87000.0,67091000.0,9680000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2019-09-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,2024-03-29,7562000.0,1710000.0,5852000.0,4272000.0,1580000.0,68000.0,-59000.0,1589000.0,316000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
295,2023-03-31,7426000.0,1792000.0,5634000.0,4191000.0,1443000.0,-9000.0,-108000.0,1326000.0,524000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
296,2022-03-31,6991000.0,1859000.0,5132000.0,4003000.0,1129000.0,-54000.0,6000.0,1081000.0,292000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297,2021-03-31,5629000.0,1494000.0,4135000.0,3089000.0,1046000.0,-21000.0,-8000.0,1017000.0,180000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [366]:
# Write the financials table to CSV without the index column
financials.to_csv("../Scraping/Data/financials.csv",index = False)