### Beautiful Soup

In [4]:
from bs4 import BeautifulSoup as bs 

In [5]:
html = """
<!DOCTYPE html>
<html>
<head>
    <title>Beautiful Soup Demo</title>
    <meta charset="UTF-8">
    <meta name="description" content="This is a really exciting website">
    <meta name="keywords" content="real excitement">
</head>
<body>

</body>
</html>
"""

In [6]:
html_demo = bs(html, 'html.parser')

In [7]:
html_demo.head

<head>
<title>Beautiful Soup Demo</title>
<meta charset="utf-8"/>
<meta content="This is a really exciting website" name="description"/>
<meta content="real excitement" name="keywords"/>
</head>

In [8]:
html_demo('meta')

[<meta charset="utf-8"/>,
 <meta content="This is a really exciting website" name="description"/>,
 <meta content="real excitement" name="keywords"/>]

In [9]:
# extract() detaches the first <meta> tag from the parsed tree and returns it,
# so later searches on html_demo no longer see that tag.
html_demo.meta.extract()

<meta charset="utf-8"/>

In [10]:
html_demo.find('meta')

<meta content="This is a really exciting website" name="description"/>

## Viewing Source Code

In [12]:
from bs4 import BeautifulSoup as bs
import requests

In [17]:
link = 'https://packtpub.com'

In [19]:
def show_html(link, timeout=10):
    """Fetch a page and return its HTML, pretty-printed, with scripts removed.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on an unresponsive host.

    Returns:
        str: the prettified markup with every <script> element removed.

    Raises:
        requests.exceptions.Timeout: if the server does not respond in time.
    """
    response = requests.get(link, timeout=timeout)
    soup = bs(response.text, 'html.parser')
    # Calling the soup object is shorthand for find_all(); drop every script tag.
    for js in soup('script'):
        js.decompose()
    return soup.prettify()

In [20]:
print(show_html(link))

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="Packt | Programming Books, eBooks &amp; Videos for Developers" name="title"/>
  <meta content="Packt is the online library and learning platform for professional developers. Learn Python, JavaScript, Angular and more with eBooks, videos and courses" name="description"/>
  <meta content="INDEX,FOLLOW" name="robots"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=1, shrink-to-fit=no" name="viewport"/>
  <meta content="charset=utf-8" name=""/>
  <title>
   Packt | Programming Books, eBooks &amp; Videos for Developers
  </title>
  <link href="https://www.packtpub.com/static/version1592830775/frontend/Packt/default/en_GB/mage/calendar.min.css" media="all" rel="stylesheet" type="text/css"/>
  <link href="https://www.packtpub.com/static/version1592830775/frontend/Packt/default/en_GB/css/styles.min.css" media="all" rel="stylesheet" type="text/css"/>
  <link href="https://w

### Extracting Hyperlinks from the Source Code 

In [21]:
from bs4 import BeautifulSoup as bs
import requests

In [22]:
link = 'https://packtpub.com'

In [23]:
soup = bs(requests.get(link).text, 'html.parser')

In [24]:
links = soup.find_all('a')

In [25]:
type(links)

bs4.element.ResultSet

In [26]:
# ResultSet is a list subclass, so len() works on it directly without copying.
len(links)

182

In [27]:
# Use a loop name other than `link`, which already holds the page URL string;
# reusing it would leave `link` bound to the last <a> tag after the loop and
# break any re-run of the cell that downloads `link`.
for anchor in links:
    print(anchor.text)



Skip to Content 

 Browse All

Web Development 

Programming 

Data 

Cloud & Networking 

Mobile 

Security 

IOT & Hardware 

Business 

Browse Categories 
 Web Development 
 Programming 
 Data 
 Cloud & Networking 
 Mobile 
 Security 
 IOT & Hardware 
 Business 

All Products 

All Books 

All Videos 

Best Sellers 

Latest Releases 

Browse By 
 All Products 
 All Books 
 All Videos 
 Best Sellers 
 Latest Releases 

Python 

C# 

C++ 

Javascript 

Java 

AWS 

React 

Azure 

Angular 

PHP 

Trending 
 Python 
 C# 
 C++ 
 Javascript 
 Java 
 AWS 
 React 
 Azure 
 Angular 
 PHP 
 Free Learning

Sign In 


Subscribe 

 Browse All

Web Development 

Programming  

Data 

Cloud & Networking 

Mobile 

Security 

IOT & Hardware 

Business 

Browse Categories 
 Web Development 
 Programming 
 Data 
 Cloud & Networking 
 Mobile 
 Security 
 IOT & Hardware 
 Business 

All Products 

All Books  

All Videos 

Best Sellers 

Latest Releases 

Browse By 
 All Products 
 All Books 
 All V

### Get List of the Most Recommended Books
#### This will be a little more useful than just getting hyperlinks.

In [37]:
from bs4 import BeautifulSoup as bs
import requests 

In [38]:
link = "https://www.penguinrandomhouse.com/the-read-down/21-books-youve-been-meaning-to-read"

In [39]:
def show_html(link, timeout=10):
    """Fetch a page and return its HTML, pretty-printed, with scripts removed.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests.get can block indefinitely on an unresponsive host.

    Returns:
        str: the prettified markup (a string object) with every <script>
        element removed.

    Raises:
        requests.exceptions.Timeout: if the server does not respond in time.
    """
    response = requests.get(link, timeout=timeout)
    soup = bs(response.text, 'html.parser')
    # Calling the soup object is shorthand for find_all(); drop every script tag.
    for js in soup('script'):
        js.decompose()
    return soup.prettify()

In [40]:
print(show_html(link))

<!DOCTYPE html>
<!--[if IE 7]>
<html class="ie ie7" lang="en-US" class="no-js">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8" lang="en-US" class="no-js">
<![endif]-->
<!--[if !(IE 7) | !(IE 8) ]><!-->
<html class="no-js" lang="en-US">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="initial-scale=1,width=device-width,height=device-height" name="viewport">
   <meta content="https://www.penguinrandomhouse.com/the-read-down/21-books-youve-been-meaning-to-read" property="og:url"/>
   <meta content="PenguinRandomhouse.com" property="og:site_name"/>
   <meta content="356679881167712" property="fb:app_id"/>
   <meta content="summary" name="twitter:card"/>
   <meta content="@penguinrandomhouse" name="twitter:site"/>
   <title>
    21 Books You've Been Meaning To Read | Penguin Random House
   </title>
   <meta content="21 Books You've Been Meaning To Read | Penguin Random House" itemprop="name"/>
   <meta content="21 Books You've Been Meaning To Read | Penguin Random Hou

In [43]:
soup = bs(requests.get(link).text, 'html.parser')

In [44]:
books = soup.find('ol', {'class': 'awesome-list'})

In [45]:
books

<ol class="awesome-list">
<li authid="31231" author="Leo Tolstoy" cat-data="Literary Fiction | Fiction Classics | Historical Fiction" class="inner-facade" data-ebook="9780307806581" data-format="Paperback" data-format-name="Paperback" data-isbn="9781400079988" format-data="Paperback,Ebook" id="book1" imprint-data="Vintage" isbn="9781400079988" ttl="War and Peace" workid="208646">
<div class="container relative">
<div class="list-number"><span>1</span></div>
<div class="list-meta-wrap">
<div class="list-meta seemoreenable">
<div class="readdown-cover-wrap-desktop" href="/books/208646/war-and-peace-by-leo-tolstoy-a-new-translation-by-richard-pevear-and-larissa-volokhonsky/">
<div class="three-d">
<div class="cover">
<span class="">
<img alt="War and Peace Book Cover Picture" class="img-responsive" src="https://images4.penguinrandomhouse.com/cover/9781400079988"/></span>
<div class="cover__backcover-container">
<img alt="War and Peace Book Cover Picture" class="cover__backcover" src="http

In [46]:
# Print the text of every link nested inside an <h2> heading.
for title_link in soup.select('h2 a'):
    print(title_link.get_text())

War and Peace
Song of Solomon
Ulysses
The Shadow of the Wind
The Lord of the Rings
The Satanic Verses
Don Quixote
The Golden Compass
Catch-22
1984
The Kite Runner
Little Women
The Cloud Atlas
The Fountainhead
The Picture of Dorian Gray
Lolita
The Help
The Liar’s Club
Moby-Dick
Gravity’s Rainbow
The Handmaid’s Tale


### Scraping and Cleaning

In [56]:
import requests
from bs4 import BeautifulSoup as bs 
import pandas as pd 

In [57]:
link = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

In [58]:
soup = bs(requests.get(link).text, 'lxml')


In [66]:
# Look up the element whose id attribute is 'constituents' and print its text content.
table = soup.find(id='constituents')
print(table.text)



Symbol

Security
SEC filings
GICS Sector
GICS Sub Industry
Headquarters Location
Date first added
CIK
Founded


MMM

3M Company
reports
Industrials
Industrial Conglomerates
St. Paul, Minnesota
1976-08-09
0000066740
1902


ABT

Abbott Laboratories
reports
Health Care
Health Care Equipment
North Chicago, Illinois
1964-03-31
0000001800
1888


ABBV

AbbVie Inc.
reports
Health Care
Pharmaceuticals
North Chicago, Illinois
2012-12-31
0001551152
2013 (1888)


ABMD

ABIOMED Inc
reports
Health Care
Health Care Equipment
Danvers, Massachusetts
2018-05-31
0000815094
1981


ACN

Accenture plc
reports
Information Technology
IT Consulting & Other Services
Dublin, Ireland
2011-07-06
0001467373
1989


ATVI

Activision Blizzard
reports
Communication Services
Interactive Home Entertainment
Santa Monica, California
2015-08-31
0000718877
2008


ADBE

Adobe Inc.
reports
Information Technology
Application Software
San Jose, California
1997-05-05
0000796343
1982


AMD

Advanced Micro Devices Inc
reports
Inf

In [67]:
# Dump the table to a tab-separated text file, one output line per <tr>.
# The with-block guarantees the file is flushed and closed before it is read back.
# NOTE(review): the file is written with the platform default encoding while it
# is later read as cp1252 — confirm these match on your system.
with open('sp500.txt', 'w') as file:
    for row in table.find_all('tr'):  # every row in the table
        cells = row.text.split('\n')  # split the row text on newline characters
        # Normalise the field count: pad a 12-field row with "NA",
        # and drop the last field of a row longer than 13.
        if len(cells) == 12:
            cells.append("NA")
        if len(cells) > 13:
            cells.pop()
        # Skip the first field and terminate every written field with a tab.
        for column in cells[1:]:
            file.write(column + '\t')
        file.write('\n')

In [61]:
data= pd.read_csv('sp500.txt', sep='\t', encoding='cp1252')

In [62]:
data.columns

Index(['Symbol', 'Unnamed: 1', 'Security', 'SEC filings', 'GICS Sector',
       'GICS Sub Industry', 'Headquarters Location', 'Date first added', 'CIK',
       'Founded', 'Unnamed: 10', 'NA', 'Unnamed: 12'],
      dtype='object')

In [63]:
# Reassign instead of mutating in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
data = data.drop(columns=['Unnamed: 1', 'Unnamed: 10', 'NA', 'Unnamed: 12'], errors='ignore')

In [65]:
data.head(20)

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",1976-08-09,66740,1902
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
2,ABBV,AbbVie Inc.,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
3,ABMD,ABIOMED Inc,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981
4,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
5,ATVI,Activision Blizzard,reports,Communication Services,Interactive Home Entertainment,"Santa Monica, California",2015-08-31,718877,2008
6,ADBE,Adobe Inc.,reports,Information Technology,Application Software,"San Jose, California",1997-05-05,796343,1982
7,AMD,Advanced Micro Devices Inc,reports,Information Technology,Semiconductors,"Santa Clara, California",2017-03-20,2488,1969
8,AAP,Advance Auto Parts,reports,Consumer Discretionary,Automotive Retail,"Raleigh, North Carolina",2015-07-09,1158449,1932
9,AES,AES Corp,reports,Utilities,Independent Power Producers & Energy Traders,"Arlington, Virginia",1998-10-02,874761,1981
