In [2]:
import lxml
from lxml import html
import requests
import numpy as np
import pandas as pd

In [3]:
# Ticker symbol that is inserted into the Yahoo Finance URLs requested by the scraping cells below.
symbol = 'CAP.PA'

In [4]:
def get_page(url, timeout=30):
    """Download ``url`` using request headers that imitate a Chrome browser.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout the call can block
        indefinitely when the server never answers. Defaults to 30 seconds.

    Returns
    -------
    requests.Response
        The response exactly as returned by ``requests.get``. The HTTP status
        is not checked here; callers decide how to treat error responses.
    """
    # Set up the request headers that we're going to use, to simulate
    # a request by the Chrome browser. Simulating a request from a browser
    # is generally good practice when building a scraper
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # NOTE(review): 'br' is advertised here; confirm that the environment
        # can decode Brotli responses, otherwise drop it from this list.
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'close',
        'DNT': '1', # Do Not Track Request Header
        'Pragma': 'no-cache',
        # The HTTP header name is spelled "Referer" (single "r" in the middle);
        # the previous "Referrer" key was sent as an unrelated custom header.
        'Referer': 'https://google.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
    }

    return requests.get(url, headers=headers, timeout=timeout)

def parse_rows(table_rows):
    """Turn table-row elements into a DataFrame holding the text of each cell.

    Every direct ``div`` child of a row is treated as one cell. A cell
    contributes the text node matched by ``.//span/text()[1]``; when that
    query does not yield exactly one node, ``NaN`` is stored for the cell.
    Rows in which four or more cells are missing are discarded.

    Parameters
    ----------
    table_rows : iterable
        Row elements exposing an lxml-style ``xpath`` method.

    Returns
    -------
    pandas.DataFrame
        One row per retained table row, with default integer column labels.
    """
    parsed_rows = []

    for table_row in table_rows:
        parsed_row = []
        missing_cells = 0

        for cell in table_row.xpath("./div"):
            matches = cell.xpath('.//span/text()[1]')
            if len(matches) == 1:
                parsed_row.append(matches[0])
            else:
                # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias
                # no longer exists in NumPy 2.0 and later.
                parsed_row.append(np.nan)
                missing_cells += 1

        # Keep only rows that carry enough actual cell text.
        if missing_cells < 4:
            parsed_rows.append(parsed_row)

    return pd.DataFrame(parsed_rows)

def clean_data(df):
    """Reshape the raw scraped table and convert its figures to floats.

    The column labelled ``0`` becomes the index and the frame is transposed,
    so the values of that column turn into the column labels. The first
    resulting column is renamed to 'Date'. In every other column the
    thousands separator (',') is stripped and the values are converted to
    ``float64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table whose column ``0`` holds the row labels and whose remaining
        columns hold strings (or NaN for missing cells).

    Returns
    -------
    pandas.DataFrame
        A new frame with a 'Date' column followed by ``float64`` columns.

    Raises
    ------
    ValueError
        If a value outside the 'Date' column cannot be converted to a float.
    """
    df = df.set_index(0) # Set the index to the first column: 'Period Ending'.
    df = df.transpose() # Transpose the DataFrame, so that our header contains the account names

    # Rename the "Breakdown" column to "Date". Assigning ``columns`` directly
    # avoids ``set_axis(..., inplace=...)``, whose ``inplace`` keyword was
    # removed in pandas 2.0.
    cols = list(df.columns)
    cols[0] = 'Date'
    df.columns = cols

    # Rebuild the frame from converted columns instead of writing back with
    # ``df.iloc[:, i] = ...``: that assignment may keep the column's object
    # dtype, and positional access also copes with duplicate column labels.
    pieces = [df.iloc[:, :1]]
    for position in range(1, df.shape[1]):
        column = df.iloc[:, position]
        without_separators = column.str.replace(',', '', regex=False) # Remove the thousands separator
        pieces.append(without_separators.astype(np.float64)) # Convert the column to float64

    return pd.concat(pieces, axis=1)

def scrape_table(url):
    """Download a page and return its 'D(tbr)' table rows as a cleaned DataFrame.

    The page is fetched with ``get_page``, parsed with lxml, and every
    ``div`` whose class attribute contains 'D(tbr)' is handed to
    ``parse_rows``; the result is then passed through ``clean_data``.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``clean_data``.

    Raises
    ------
    AssertionError
        If the page contains no matching table rows.
    """
    # Fetch the page that we're going to parse
    page = get_page(url)

    # Parse the page with LXML, so that we can start doing some XPATH queries
    # to extract the data that we want
    tree = html.fromstring(page.content)

    # Fetch all div elements which have class 'D(tbr)'
    table_rows = tree.xpath("//div[contains(@class, 'D(tbr)')]")

    # Ensure that some table rows are found; if none are found, then it's possible
    # that Yahoo Finance has changed their page layout, or have detected
    # that you're scraping the page. The error is raised explicitly (rather
    # than through ``assert``) so the check survives ``python -O`` and carries
    # enough context to diagnose the failure.
    if len(table_rows) == 0:
        raise AssertionError(
            "No 'D(tbr)' table rows found at {} (HTTP status {})".format(
                url, page.status_code
            )
        )

    return clean_data(parse_rows(table_rows))

In [11]:
# Scrape the income-statement page for the chosen ticker and key the rows by 'Date'.
df_income_statement = scrape_table(
    'https://finance.yahoo.com/quote/{0}/financials?p={0}'.format(symbol)
).set_index(['Date'])

# Flip the frame so each account is a row and each period is a column.
income_statement = df_income_statement.transpose()

print("\nScraping the Income Statement")

# Figures are in thousands: 18160000.0 is read as 18,160,000,000 (18.16B).
income_statement


Scraping the Income Statement


Date,ttm,12/31/2021,12/31/2020,12/31/2019,12/31/2018
Total Revenue,18160000.0,18160000.0,15848000.0,14125000.0,13197000.0
Cost of Revenue,13368000.0,13368000.0,11712000.0,10274000.0,9627000.0
Gross Profit,4792000.0,4792000.0,4136000.0,3851000.0,3570000.0
Operating Expense,2732000.0,2732000.0,2378000.0,2286000.0,2159000.0
Operating Income,2060000.0,2060000.0,1758000.0,1565000.0,1411000.0
Net Non Operating Interest Income Expense,-157000.0,-157000.0,-144000.0,-62000.0,-48000.0
Pretax Income,1680000.0,1680000.0,1355000.0,1354000.0,1171000.0
Tax Provision,526000.0,526000.0,400000.0,502000.0,447000.0
Net Income Common Stockholders,1157000.0,1157000.0,957000.0,856000.0,730000.0
Diluted NI Available to Com Stockholders,1157000.0,1157000.0,957000.0,856000.0,730000.0


In [12]:
# Scrape the balance-sheet page for the chosen ticker and key the rows by 'Date'.
df_balance_sheet = scrape_table(
    'https://finance.yahoo.com/quote/{0}/balance-sheet?p={0}'.format(symbol)
).set_index(['Date'])

# Flip the frame so each account is a row and each period is a column.
balance_sheet = df_balance_sheet.transpose()

print("\nScraping the Balance Sheet")

# Figures are in thousands.
balance_sheet


Scraping the Balance Sheet


Date,12/31/2021,12/31/2020,12/31/2019,12/31/2018
Total Assets,24033000.0,21954000.0,18135000.0,16498000.0
Total Liabilities Net Minority Interest,15554000.0,15839000.0,9716000.0,9019000.0
Total Equity Gross Minority Interest,8479000.0,6115000.0,8419000.0,7479000.0
Total Capitalization,15121000.0,13230000.0,10988000.0,10713000.0
Common Stock Equity,8467000.0,6103000.0,8424000.0,7480000.0
Capital Lease Obligations,901000.0,968000.0,813000.0,80000.0
Net Tangible Assets,-3169000.0,-4792000.0,117000.0,-648000.0
Working Capital,2482000.0,1864000.0,1443000.0,1924000.0
Invested Capital,15208000.0,14181000.0,11705000.0,10757000.0
Tangible Book Value,-3169000.0,-4792000.0,117000.0,-648000.0


In [13]:
# Scrape the cash-flow page for the chosen ticker and key the rows by 'Date'.
df_cash_flow = scrape_table(
    'https://finance.yahoo.com/quote/{0}/cash-flow?p={0}'.format(symbol)
).set_index(['Date'])

# Flip the frame so each account is a row and each period is a column.
cash_flow = df_cash_flow.transpose()

print("\nScraping the Cash Flow")

# Figures are in thousands.
cash_flow


Scraping the Cash Flow


Date,ttm,12/31/2021,12/31/2020,12/31/2019,12/31/2018
Operating Cash Flow,2581000.0,2581000.0,1661000.0,1794000.0,1396000.0
Investing Cash Flow,-678000.0,-678000.0,-1714000.0,-868000.0,-728000.0
Financing Cash Flow,-1746000.0,-1746000.0,562000.0,-468000.0,-611000.0
End Cash Position,3119000.0,3119000.0,2828000.0,2450000.0,2004000.0
Capital Expenditure,-266000.0,-266000.0,-206000.0,-222000.0,-236000.0
Issuance of Capital Stock,587000.0,587000.0,277000.0,253000.0,230000.0
Issuance of Debt,137000.0,137000.0,9308000.0,430000.0,525000.0
Repayment of Debt,-1498000.0,-1498000.0,-6273000.0,-448000.0,-592000.0
Repurchase of Capital Stock,-197000.0,-197000.0,-514000.0,,-483000.0
Free Cash Flow,2315000.0,2315000.0,1455000.0,1572000.0,1160000.0
