In [26]:
import requests
from bs4 import BeautifulSoup as bs

In [27]:
#load our first page
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

#convert to bs object
soup=bs(r.content)

print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [28]:
#start scraping us Beautiful Soup
#find and find_all
first_header =soup.find("h2")
first_header

headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [29]:
first_header =soup.find_all(["h2", "h1"])
first_header

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [30]:
#pass an attribute to the find/find_all function
paragraph = soup.find_all("p", attrs={"id":"paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [31]:
#nest find/find_all calls
body = soup.find("body")
div = body.find("div")
header =div.find("h1")
header

<h1>HTML Webpage</h1>

In [32]:
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [33]:
#search specific strings in ur find/find_all calls
import re
paragraphs = soup.find_all("p", string=re.compile("Some"))
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [34]:
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

**select (CSS Selector)**


In [39]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [40]:
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [41]:
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [42]:
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [43]:
paragraphs = soup.select("body > p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [45]:
for paragraph in paragraphs:
    print(paragraph.select("i"))

[<i>Some italicized text</i>]
[]


In [None]:
#GEt different properties of HTML

In [47]:
header = soup.find("h2")
header

<h2>A Header</h2>

In [48]:
div = soup.find("div")
print(div.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [49]:
#Get a specific property from an element
link = soup.find("a")
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [50]:
paragraphs = soup.select("p#paragraph.id")
paragraphs[0]['id']

IndexError: list index out of range

In [55]:
#code navigation
soup.body.div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [None]:
#parent, sibling, child

In [56]:
soup.body.find('div')

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [57]:
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [58]:
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

#convert to bs object
webpage=bs(r.content)

print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

In [60]:
#gRab all the social links from the webpage
links = webpage.select("a")
links

[<a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a>,
 <a href="#footer"><sup>1</sup></a>,
 <a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016"> ACHA II </a>,
 <a href="https://www.elite

In [62]:
#Solution 1
links = webpage.select("ul.socials a")
actual_links = [link["href"] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [66]:
#Solution 2
ulinks = webpage.find('ul', attrs={"class": "socials"})
links = ulinks.find_all("a")
actual_links = [link["href"] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [68]:
#solution 3
links = webpage.select("li.social a")
actual_links = [link["href"] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
#reference from stackoverflow
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
pd.DataFrame(l, columns=["A","B","C"])

In [78]:
#Scrape a table
import pandas as pd

table = webpage.select("table.hockey-stats")[0]
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]


table_rows = table.find("tbody").find_all("tr")
l=[]
for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)

df=pd.DataFrame(l, columns=column_names)
df["Team"]

0    MIT (Mass. Inst. of Tech.)
1    MIT (Mass. Inst. of Tech.)
2    MIT (Mass. Inst. of Tech.)
3                  Did not play
4    MIT (Mass. Inst. of Tech.)
Name: Team, dtype: object

In [38]:
d = requests.get("https://www.delcampe.net/en_GB")

#convert to bs object
sauce=bs(d.content)
paragraphs1 = sauce.find_all("div")
paragraphs1

[<div id="cookie-banner">
 <div class="container flex space-lg lg-1 v-center">
 <div class="main">
 <p><strong class="font-xl">Information about cookies</strong></p>
 <div class="margin-top-xxs">
 <p>Our website uses cookies for the following purposes: to provide you with the services you have requested, to ensure the security of our platform, to remember your preferences in order to make your browsing more pleasant, to produce statistics in order to adapt our website to your needs, to offer you personalized advertising according to your interests.</p>
 <p>Some of these cookies are necessary for the operation of our site, others can be set according to your preferences.
                         <a href="/en_GB/cookies">More info</a>
 </p>
 </div>
 </div>
 <div class="flex space-xs sm-1 h-right">
 <a class="btn-white" data-accept-cookies="" href="">Accept all cookies</a>
 <a class="btn-white-outline" data-target="#cookie-modal" data-toggle="modal" href="">
 <i class="fa fa-cog margin-ri