In [38]:
from datetime import datetime
import requests
import numpy as np
import pandas as pd

In [39]:
def get_page(url):
    # Set up the request headers that we're going to use, to simulate
    # a request by the Chrome browser. Simulating a request from a browser
    # is generally good practice when building a scraper
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'close',
        'DNT': '1', # Do Not Track Request Header 
        'Pragma': 'no-cache',
        'Referrer': 'https://google.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
    }

    return requests.get(url, headers=headers)

def parse_rows(table_rows):
    parsed_rows = []

    for table_row in table_rows:
        parsed_row = []
        el = table_row.xpath("./div")

        none_count = 0

        for rs in el:
            try:
                (text,) = rs.xpath('.//span/text()[1]')
                parsed_row.append(text)
            except ValueError:
                parsed_row.append(np.NaN)
                none_count += 1

        if (none_count < 4):
            parsed_rows.append(parsed_row)
            
    return pd.DataFrame(parsed_rows)

def clean_data(df):
    df = df.set_index(0) # Set the index to the first column: 'Period Ending'.
    df = df.transpose() # Transpose the DataFrame, so that our header contains the account names
    
    # Rename the "Breakdown" column to "Date"
    cols = list(df.columns)
    cols[0] = 'Date'
    df = df.set_axis(cols, axis='columns', inplace=False)
    
    numeric_columns = list(df.columns)[1::] # Take all columns, except the first (which is the 'Date' column)

    for column_index in range(1, len(df.columns)): # Take all columns, except the first (which is the 'Date' column)
        df.iloc[:,column_index] = df.iloc[:,column_index].str.replace(',', '') # Remove the thousands separator
        df.iloc[:,column_index] = df.iloc[:,column_index].astype(np.float64) # Convert the column to float64
        
    return df

def scrape_table(url):
    # Fetch the page that we're going to parse
    page = get_page(url);

    # Parse the page with LXML, so that we can start doing some XPATH queries
    # to extract the data that we want
    tree = html.fromstring(page.content)

    # Fetch all div elements which have class 'D(tbr)'
    table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")
    
    # Ensure that some table rows are found; if none are found, then it's possible
    # that Yahoo Finance has changed their page layout, or have detected
    # that you're scraping the page.
    assert len(table_rows) > 0
    
    df = parse_rows(table_rows)
    df = clean_data(df)
        
    return df


In [40]:
def scrape(symbol):
    print('Attempting to scrape data for ' + symbol)

    df_balance_sheet = scrape_table('https://finance.yahoo.com/quote/' + symbol + '/balance-sheet?p=' + symbol)
    df_balance_sheet = df_balance_sheet.set_index('Date')

    df_income_statement = scrape_table('https://finance.yahoo.com/quote/' + symbol + '/financials?p=' + symbol)
    df_income_statement = df_income_statement.set_index('Date')
    
    df_cash_flow = scrape_table('https://finance.yahoo.com/quote/' + symbol + '/cash-flow?p=' + symbol)
    df_cash_flow = df_cash_flow.set_index('Date')
    
    df_joined = df_balance_sheet \
        .join(df_income_statement, on='Date', how='outer', rsuffix=' - Income Statement') \
        .join(df_cash_flow, on='Date', how='outer', rsuffix=' - Cash Flow') \
        .dropna(axis=1, how='all') \
        .reset_index()
            
    df_joined.insert(1, 'Symbol', symbol)
    
    return df_joined

In [41]:
def scrape_multi(symbols):
    joint = [scrape(df) for df in symbols]
    return pd.concat(joint,sort=False)

In [44]:
symbols = ['AAPL', 'SPY']
df_combined = scrape_multi(symbols)

Attempting to scrape data for AAPL
Attempting to scrape data for SPY


AssertionError: 

In [43]:
df_combined

Unnamed: 0,index,Symbol,Date,Total Assets,Total Liabilities Net Minority Interest,Total Equity Gross Minority Interest,Total Capitalization,Common Stock Equity,Net Tangible Assets,Working Capital,...,Capital Expenditure,Issuance of Capital Stock,Issuance of Debt,Repayment of Debt,Repurchase of Capital Stock,Free Cash Flow,Capital Lease Obligations,Average Dilution Earnings,Total Unusual Items Excluding Goodwill,Total Unusual Items
0,9/30/2021,AAPL,9/30/2021,351002000.0,287912000.0,63090000.0,172196000.0,63090000.0,63090000.0,9355000.0,...,-11085000.0,1105000.0,20393000.0,-8750000.0,-85971000.0,92953000.0,,,,
1,9/30/2020,AAPL,9/30/2020,323888000.0,258549000.0,65339000.0,164006000.0,65339000.0,65339000.0,38321000.0,...,-7309000.0,880000.0,16091000.0,-12629000.0,-72358000.0,73365000.0,,,,
2,9/30/2019,AAPL,9/30/2019,338516000.0,248028000.0,90488000.0,182295000.0,90488000.0,90488000.0,57101000.0,...,-10495000.0,781000.0,6963000.0,-8805000.0,-66897000.0,58896000.0,,,,
3,9/30/2018,AAPL,9/30/2018,365725000.0,258578000.0,107147000.0,200882000.0,107147000.0,107147000.0,14473000.0,...,-13313000.0,669000.0,6969000.0,-6500000.0,-72738000.0,64121000.0,,,,
4,,AAPL,ttm,,,,,,,,...,-10388000.0,,,-7750000.0,-81674000.0,101853000.0,,,,
0,12/31/2021,FB,12/31/2021,165987000.0,41108000.0,124879000.0,124879000.0,124879000.0,105048000.0,45531000.0,...,-18567000.0,,,-677000.0,-44537000.0,39116000.0,13873000.0,,-140000.0,-140000.0
1,12/31/2020,FB,12/31/2020,159316000.0,31026000.0,128290000.0,128290000.0,128290000.0,108617000.0,60689000.0,...,-15115000.0,,,-604000.0,-6272000.0,23632000.0,10654000.0,,-129000.0,-129000.0
2,12/31/2019,FB,12/31/2019,133376000.0,32322000.0,101054000.0,101054000.0,101054000.0,81445000.0,51172000.0,...,-15102000.0,,,-552000.0,-4202000.0,21212000.0,10324000.0,0.0,-105000.0,-105000.0
3,12/31/2018,FB,12/31/2018,97334000.0,13207000.0,84127000.0,84127000.0,84127000.0,64532000.0,43463000.0,...,-13915000.0,,,0.0,-12879000.0,15359000.0,,1000.0,-213000.0,-213000.0
4,,FB,ttm,,,,,,,,...,-18567000.0,,,-677000.0,-44537000.0,39116000.0,,,-140000.0,-140000.0


In [46]:
scrape('nvda')

Attempting to scrape data for nvda


Unnamed: 0,index,Symbol,Date,Total Assets,Total Liabilities Net Minority Interest,Total Equity Gross Minority Interest,Total Capitalization,Common Stock Equity,Capital Lease Obligations,Net Tangible Assets,...,Investing Cash Flow,Financing Cash Flow,End Cash Position,Income Tax Paid Supplemental Data,Interest Paid Supplemental Data,Capital Expenditure,Issuance of Debt,Repayment of Debt,Repurchase of Capital Stock,Free Cash Flow
0,1/31/2022,nvda,1/31/2022,44187000.0,17575000.0,26612000.0,37558000.0,26612000.0,741000.0,19924000.0,...,-9830000.0,1865000.0,1990000.0,396000.0,246000.0,-976000.0,4977000.0,-1000000.0,,8132000.0
1,1/31/2021,nvda,1/31/2021,28791000.0,11898000.0,16893000.0,22857000.0,16893000.0,755000.0,9963000.0,...,-19675000.0,3804000.0,847000.0,249000.0,138000.0,-1128000.0,4968000.0,0.0,0.0,4694000.0
2,1/31/2020,nvda,1/31/2020,17315000.0,5111000.0,12204000.0,14195000.0,12204000.0,652000.0,11537000.0,...,6145000.0,-792000.0,10896000.0,176000.0,54000.0,-489000.0,,0.0,0.0,4272000.0
3,1/31/2019,nvda,1/31/2019,13292000.0,3950000.0,9342000.0,11330000.0,9342000.0,,8679000.0,...,-4097000.0,-2866000.0,782000.0,61000.0,55000.0,-600000.0,,-16000.0,-1579000.0,3143000.0
4,,nvda,ttm,,,,,,,,...,-9830000.0,1865000.0,1990000.0,396000.0,246000.0,-976000.0,4977000.0,-1000000.0,,8132000.0
