### Bloomberg WebScraping

In the following notebook I scrape the Bloomberg website, retrieve the key statistics for a company, and put everything into a pandas DataFrame.

In [64]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd



In [5]:
def simple_get(url):
    """
    Fetch `url` with an HTTP GET request and return the raw response body.

    The body (`resp.content`) is returned only when `is_good_response`
    accepts the response. In every other case -- a rejected response or a
    `RequestException` raised while requesting -- the result is None.
    Request errors are reported through `log_error`.
    """
    try:
        with closing(get(url, stream=True)) as response:
            # Only hand back the body for responses that look like HTML.
            return response.content if is_good_response(response) else None
    except RequestException as err:
        log_error('Error during requests to {0} : {1}'.format(url, str(err)))
        return None

In [6]:
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    resp : a response object exposing `status_code` and a `headers`
           mapping with a `.get` method.

    A response is considered good when its status code is 200 and its
    Content-Type header contains 'html' (case-insensitive). A response
    with no Content-Type header is reported as not good instead of
    raising.
    """
    content_type = resp.headers.get('Content-Type')
    # Check for a missing header *before* calling .lower(); otherwise a
    # response without Content-Type raises AttributeError on None.
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()

In [7]:
def log_error(e):
    """
    Report an error.

    For now the error is simply printed to standard output; swap the
    body for a real logger if richer handling is needed.
    """
    message = str(e)
    print(message)

In [8]:
# Identifiers interpolated into the Bloomberg quote URL as
# "{COMPANY}:{COUNTRY}" -- presumably the ticker symbol and the country
# code of the listing (TODO confirm against Bloomberg's URL scheme).
COMPANY = 'AAPL'
COUNTRY = 'US'

In [9]:
# Quote page to scrape, built from COMPANY and COUNTRY. Re-run this cell
# after changing either constant, since the string is formatted once here.
URL = f"https://www.bloomberg.com/quote/{COMPANY}:{COUNTRY}"

In [26]:
# Path handed to webdriver.Chrome in get_using_selenuim. This is an
# absolute, machine-specific location -- adjust it to wherever
# chromedriver is installed locally.
DRIVER_PATH = "/usr/local/bin/chromedriver"

In [23]:
# First attempt: plain HTTP GET. simple_get returns the response body, or
# None when the response is not accepted as HTML or the request fails.
raw_html = simple_get(URL)

In [24]:
# Parse the body fetched with requests.
# NOTE(review): raw_html is None whenever simple_get fails, in which case
# None is passed straight to BeautifulSoup -- confirm the fetch succeeded
# before relying on this cell.
html = BeautifulSoup(raw_html, 'html.parser')

In [32]:
def get_using_selenuim(URL=URL):
    """
    Load a page in Chrome through a Selenium webdriver and return its HTML.

    URL : address to open. Defaults to the module-level URL as it was
          bound when this function was defined.

    Returns the driver's `page_source` read after `get(URL)`. The Chrome
    driver is created from DRIVER_PATH and used inside a `with` block.
    """
    with webdriver.Chrome(DRIVER_PATH) as browser:
        browser.get(URL)
        return browser.page_source

In [33]:
# Second attempt: fetch the same URL through a Selenium-driven Chrome and
# keep the page source the driver reports.
html_content = get_using_selenuim(URL)

In [34]:
# Parse the Selenium page source. This rebinds `html`, replacing the
# document parsed from the requests-based fetch in the earlier cell.
html = BeautifulSoup(html_content, 'html.parser')

According to the page layout, the data we need is split between a left pane and a right pane, so we read both.

In [35]:
# Collect every <div> carrying the left-pane / right-pane CSS class.
# NOTE(review): the class names look auto-generated (hash-like suffixes)
# and may change when the site is redeployed -- verify before reuse.
left_pannel = html.find_all('div', class_="left__fe2675a4")
right_pannel = html.find_all('div', class_="right__913e6006")

In [37]:
# Display the matched right-pane markup to inspect its row structure.
right_pannel

[<div class="rowList__9489bc6c right__913e6006"><div class="rowListItemWrap__4121c877"><div><span class="fieldLabel__9f45bef7"><span>1 Year Return</span></span><span class="fieldValue__2d582aa7">33.17%</span></div></div><div class="rowListItemWrap__4121c877"><div><span class="fieldLabel__9f45bef7"><span>30 Day Avg Volume</span></span><span class="fieldValue__2d582aa7">63,535,448</span></div></div><div class="rowListItemWrap__4121c877"><div><span class="fieldLabel__9f45bef7"><span>EPS</span></span><span class="fieldValue__2d582aa7">12.65</span></div></div><div class="rowListItemWrap__4121c877"><div><span class="fieldLabel__9f45bef7"><span>Dividend</span></span><span class="fieldValue__2d582aa7">1.25%</span></div></div><div class="rowListItemWrap__4121c877"><div><span class="fieldLabel__9f45bef7"><span>Last Dividend Reported</span></span><span class="fieldValue__2d582aa7">0.77</span></div></div></div>]

In [41]:
# Take the first match for each pane and collect its row wrappers; each
# wrapper holds one label/value pair. Indexing with [0] fails if the pane
# lookup above matched nothing.
left_rows = left_pannel[0].find_all('div', class_="rowListItemWrap__4121c877")
right_rows = right_pannel[0].find_all('div', class_="rowListItemWrap__4121c877")

In [60]:
def _split_unit(raw_text):
    """
    Split a displayed field value into ``(number_text, unit)``.

    Thousands separators are removed. A trailing '%' yields the unit
    'Percents' and a trailing 'B' yields the unit 'B', but only when what
    precedes the suffix parses as a number; otherwise the text is returned
    unchanged (apart from whitespace/comma removal) with unit None.
    Matching on the suffix, rather than on any occurrence of the
    character, keeps non-numeric values such as 'Buy' intact.
    """
    text = raw_text.strip().replace(',', '')
    for suffix, unit in (('%', 'Percents'), ('B', 'B')):
        if not text.endswith(suffix):
            continue
        number_text = text[:-len(suffix)]
        try:
            float(number_text)
        except ValueError:
            # Suffix character is part of a non-numeric value; leave as is.
            break
        return number_text, unit
    return text, None


# Walk every row of both panes and build three parallel lists:
# the field label, its value text (unit suffix removed) and the unit.
labels = []
values = []
units = []
for row in left_rows + right_rows:
    label_tag = row.find('span', class_="fieldLabel__9f45bef7").find('span')
    value_tag = row.find('span', class_="fieldValue__2d582aa7")
    number_text, unit = _split_unit(value_tag.text)
    # Append to all three lists together so they always stay aligned.
    labels.append(label_tag.text)
    values.append(number_text)
    units.append(unit)

In [63]:
# Inspect the unit detected for each row (None when no unit suffix matched).
units

[None, None, 'B', None, None, 'Percents', None, None, 'Percents', None]

In [68]:
# One (value, unit) record per label; the labels become the row index.
records = zip(values, units)
stock_data = pd.DataFrame(
    data=records,
    index=labels,
    columns=['Values', 'Units'],
)

In [73]:
# Name the index axis so the label column has a header when displayed.
stock_data = stock_data.rename_axis('Label')

In [75]:
# Display the assembled table: one row per label, with its value and unit.
stock_data

Unnamed: 0_level_0,Values,Units
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
P/E Ratio,19.4,
PEGY Ratio,1.4674,
Shares Outstanding,4.38,B
Price to Book Ratio,12.0248,
Price to Sales Ratio,4.1444,
1 Year Return,33.17,Percents
30 Day Avg Volume,63535448.0,
EPS,12.65,
Dividend,1.25,Percents
Last Dividend Reported,0.77,


In [78]:
# Sanity check: the scraped labels must match the expected fields, in
# order (left-pane rows first, then right-pane rows).
expected_labels = [
    'P/E Ratio',
    'PEGY Ratio',
    'Shares Outstanding',
    'Price to Book Ratio',
    'Price to Sales Ratio',
    '1 Year Return',
    '30 Day Avg Volume',
    'EPS',
    'Dividend',
    'Last Dividend Reported',
]
assert list(stock_data.index) == expected_labels