### Bloomberg WebScraping

In the following notebook I will scrape the Bloomberg website, retrieve data about a company, and put everything into a pandas DataFrame.

In [64]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd



In [5]:
def simple_get(url):
    """
    Fetch `url` with an HTTP GET request and return the response body.

    url : address to request.

    Returns `resp.content` when `is_good_response` accepts the response,
    otherwise None. If the request raises a `RequestException`, the error
    is reported through `log_error` and None is returned.
    """
    try:
        # closing() guarantees the streamed response is released on exit.
        with closing(get(url, stream=True)) as response:
            return response.content if is_good_response(response) else None
    except RequestException as err:
        log_error('Error during requests to {0} : {1}'.format(url, str(err)))
        return None

In [6]:
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    resp : response object exposing `status_code` and a `headers` mapping
           with a `.get` method (as used by `simple_get`).

    A response counts as good when its status code is 200 and its
    Content-Type header mentions 'html' (case-insensitive). A response
    without a Content-Type header is reported as not good instead of
    raising.
    """
    # Read the header first and lower-case it only after the None check:
    # calling .lower() on a missing header (None) raises AttributeError.
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())

In [7]:
def log_error(e):
    """
    Report an error by printing it to standard output.

    e : the message (or any printable object) to report.

    This is a deliberately minimal hook: it only prints, but it is the
    single place to change if errors should go to a real logger instead.
    """
    print(e)

In [85]:
# Ticker symbol and country code; together they select the quote page to scrape.
COMPANY = 'MSFT'
COUNTRY = 'US'

In [86]:
# Quote page address built from the ticker and country, e.g. .../quote/MSFT:US
URL = f"https://www.bloomberg.com/quote/{COMPANY}:{COUNTRY}"

In [87]:
# Path handed to webdriver.Chrome when the browser is started.
# NOTE(review): absolute, machine-specific location — adjust for your environment.
DRIVER_PATH = "/usr/local/bin/chromedriver"

In [88]:
def get_using_selenuim(URL=URL):
    """
    Open a page in a Selenium-driven Chrome browser and return its HTML.

    URL : address to load. The default is the value the module-level `URL`
          held when this function was defined.

    Returns the browser's `page_source` as read right after `driver.get`
    returns. The Chrome driver is started from `DRIVER_PATH` and used as a
    context manager for the duration of the call.
    """
    with webdriver.Chrome(DRIVER_PATH) as browser:
        browser.get(URL)
        return browser.page_source

In [89]:
# Load the quote page through the Selenium-driven browser and keep its HTML source.
html_content = get_using_selenuim(URL)

In [90]:
# Parse the fetched page so it can be searched by tag and CSS class.
html = BeautifulSoup(html_content, 'html.parser')

According to the site layout, we need to read the data from both the left pane and the right pane.

In [91]:
# Collect every <div> matching the left-pane and right-pane CSS classes.
# NOTE(review): these class names look build-generated and may change between
# site releases — confirm against the live page before relying on them.
left_pannel = html.find_all('div', class_="left__fe2675a4")
right_pannel = html.find_all('div', class_="right__913e6006")

In [92]:
# Use the first match for each pane and gather its row wrappers.
# Indexing [0] fails if no matching pane was found on the page.
left_rows = left_pannel[0].find_all('div', class_="rowListItemWrap__4121c877")
right_rows = right_pannel[0].find_all('div', class_="rowListItemWrap__4121c877")

In [93]:
# Parallel lists: one label, one cleaned value string and one unit per row.
labels, values, units = [], [], []
for row in left_rows + right_rows:
    label_tag = row.find('span', class_="fieldLabel__9f45bef7").find('span')
    raw_text = row.find('span', class_="fieldValue__2d582aa7").text
    # The first marker found (checked in this order) decides the unit and is
    # stripped from the value; values with no marker get a unit of None.
    for marker, unit in (('%', 'Percents'), ('B', 'B')):
        if marker in raw_text:
            cleaned = raw_text.replace(marker, '')
            break
    else:
        unit, cleaned = None, raw_text
    units.append(unit)
    labels.append(label_tag.text)
    # Drop thousands separators; the value itself stays a string.
    values.append(cleaned.replace(',', ''))

In [94]:
# Quick look at the unit detected for each scraped row.
units

[None, None, 'B', None, None, 'Percents', None, None, 'Percents', None]

In [95]:
# One row per scraped label; each row pairs the cleaned value string with its unit.
stock_data = pd.DataFrame(
    data=zip(values, units),
    index=labels,
    columns=['Values', 'Units'],
)

In [96]:
# Name the index so the label column gets a header when the frame is displayed.
stock_data.index.name = 'Label'

### Key Statistics

In [97]:
# Show the assembled table of scraped statistics.
stock_data

Unnamed: 0_level_0,Values,Units
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
P/E Ratio,27.26,
PEGY Ratio,1.9515,
Shares Outstanding,7.61,B
Price to Book Ratio,10.1555,
Price to Sales Ratio,8.3658,
1 Year Return,26.26,Percents
30 Day Avg Volume,66057060.0,
EPS,5.39,
Dividend,1.39,Percents
Last Dividend Reported,0.51,


In [78]:
# Sanity check: the scraped labels must be exactly these, in this order.
assert stock_data.index.tolist() == [
    'P/E Ratio',
    'PEGY Ratio',
    'Shares Outstanding',
    'Price to Book Ratio',
    'Price to Sales Ratio',
    '1 Year Return',
    '30 Day Avg Volume',
    'EPS',
    'Dividend',
    'Last Dividend Reported',
]

In [82]:
# For comparison, fetch the same page with a plain HTTP GET instead of a browser.
response = simple_get(URL)

In [83]:
# Parse the plain-GET result the same way as the browser-fetched page.
# NOTE(review): simple_get returns None on failure or non-HTML responses;
# presumably this call needs real markup — confirm and guard if needed.
parsed_response = BeautifulSoup(response, 'html.parser')

In [84]:
# Display what the plain GET received: the captured output is the
# "Are you a robot?" page rather than the quote data.
parsed_response

<!DOCTYPE html>

<html>
<head>
<title>Bloomberg - Are you a robot?</title>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://assets.bwbx.io/font-service/css/BWHaasGrotesk-55Roman-Web,BWHaasGrotesk-75Bold-Web,BW%20Haas%20Text%20Mono%20A-55%20Roman/font-face.css" rel="stylesheet" type="text/css"/>
<style rel="stylesheet" type="text/css">
        html, body, div, span, applet, object, iframe,
        h1, h2, h3, h4, h5, h6, p, blockquote, pre,
        a, abbr, acronym, address, big, cite, code,
        del, dfn, em, img, ins, kbd, q, s, samp,
        small, strike, strong, sub, sup, tt, var,
        b, u, i, center,
        dl, dt, dd, ol, ul, li,
        fieldset, form, label, legend,
        table, caption, tbody, tfoot, thead, tr, th, td,
        article, aside, canvas, details, embed,
        figure, figcaption, footer, header, hgroup,
        menu, nav, output, ruby, section, summary,
        time, mark, audio, video {
            margin: 0;
  