In [1]:
!pip install requests




In [2]:
!pip install beautifulsoup4




## 1. Importing Dependencies


In [3]:
import requests
from bs4 import BeautifulSoup as bs

## 2. Load the WebPage Content

In [7]:
url = 'https://keithgalli.github.io/web-scraping/example.html'
send_request = requests.get(url)

## 3. Convert HMTL Content into a BeautifulSoup Object

In [8]:
soup = bs(send_request.content)

In [12]:
#to print content in a more indented way for easy read
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## 4. Start Using BeautifulSoup to Scrape

In [23]:
# find --> returns just the first h2 tag's element in the webpage.
first_header = soup.find("h2")

first_header

<h2>A Header</h2>

In [24]:
# find_all --> returns all the elements of all h2 tags in the webpage.

all_headers = soup.find_all("h2")
all_headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [26]:
# you can pass in a list of tags you're looking for..

all_headers = soup.find_all(['h1','h2'])
all_headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [28]:
# you can pass in attributes of tags you're looking for..

paragraph = soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [29]:
paragraph = soup.find_all("p",attrs={'id': 'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [41]:
# when you wanna search tags inside a particular section, say here body tag, you found body tag, and then you looked for div in body tag and then you needed paragraph tag in div tag..

body = soup.find('body')
print(body)



<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>


In [39]:
div = body.find('div')
print(div)

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>


In [40]:
para = div.find('p')
para

<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>

In [44]:
# to search for specific string in the webpage
# here, we are looking for the specific string "text" in the entire web page in paragraphs tags

import re 

para1 = soup.find_all('p',string = re.compile("text"))
para1

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [47]:
# to find matches for the specific string with case ignorance
headers = soup.find_all('h2',string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [49]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [None]:
# using select (CSS Selector)

# 1.  selecting tags... very similar to find method

content = soup.select("p")
content

In [52]:
# 2. to grab certain tags that are inside a div sections

para2 = soup.select("div p")
para2

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [53]:
# 3. to grab tags preceding some other tags..

para3 = soup.select("h2 ~ p")
para3

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [54]:
# 4. to grab a tag with it's id..

bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

## 6. Different Properties of the HTML

In [55]:
# get.text(), string --> to get texts inside tags

header = soup.find("h2")
header

<h2>A Header</h2>

In [56]:
header.string

'A Header'

In [57]:
div_sec = soup.find("div")
print(div_sec.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [59]:
print(div_sec.string)

None


In [61]:
print(div_sec.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [65]:
# to get the links inside 'a'  href tag..

link = soup.find("a")
link

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

In [63]:
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [73]:
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## 7. Code Navigation

In [71]:
# you can directly grab tags as follow..
# path syntax
soup.body.div.h1.string

'HTML Webpage'

In [72]:
# parent, siblings, children
# find next sibling(s)

In [80]:
div_sibling = soup.body.find("div").findChildren()

In [82]:
div_sibling

[<h1>HTML Webpage</h1>,
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>]

In [84]:
for i in div_sibling:
    just_text = i.string
    print(just_text)

HTML Webpage
None
keithgalli.github.io/web-scraping/webpage.html


## 8.1 Exercises

In [85]:
url = "https://keithgalli.github.io/web-scraping/webpage.html"

In [86]:
send_request = requests.get(url)

In [88]:
#if response is 200, then our request is processed successfully.
print(send_request)

<Response [200]>


In [91]:
webpage = bs(send_request.content)

In [92]:
# display the contents of soup object

print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

In [110]:
# grab all of the social links in the webpage

# 1. using select

In [119]:
links = webpage.select("ul.socials a")
for link in links:
    actual_link = link['href']
    print(actual_link)

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [125]:
# 2. using find/find_all

ulist = webpage.find("ul",attrs={"class":"socials"})
ulist

<ul class="socials">
<li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
<li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
<li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
<li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>
</ul>

In [129]:
links = ulist.find_all("a")
links
actual_link1 = [link['href'] for link in links]

In [130]:
actual_link1

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [133]:
# 3. 

links2 = webpage.select("li.social a")
links2
actual_link3 = [link['href'] for link in links2]
actual_link3

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## 8.2 Scrape the Table

In [139]:
import pandas as pd

table = webpage.select("table.hockey-stats")[0]
table

<table class="hockey-stats">
<thead>
<tr>
<th class="season" data-sort="">S</th>
<th class="team" data-sort="team">Team</th>
<th class="league" data-sort="league">League</th>
<th class="regular gp" data-sort="gp">GP</th>
<th class="regular g" data-sort="g">G</th>
<th class="regular a" data-sort="a">A</th>
<th class="regular tp" data-sort="tp">TP</th>
<th class="regular pim" data-sort="pim">PIM</th>
<th class="regular pm" data-sort="pm">+/-</th>
<th class="separator"> </th>
<th class="postseason">POST</th>
<th class="postseason gp" data-sort="playoffs-gp">GP</th>
<th class="postseason g" data-sort="playoffs-g">G</th>
<th class="postseason a" data-sort="playoffs-a">A</th>
<th class="postseason tp" data-sort="playoffs-tp">TP</th>
<th class="postseason pim" data-sort="playoffs-pim">PIM</th>
<th class="postseason pm" data-sort="playoffs-pm">+/-</th>
</tr>
</thead>
<tbody>
<tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team"

In [143]:
columns = webpage.find("thead").find_all("th")
columns

[<th class="season" data-sort="">S</th>,
 <th class="team" data-sort="team">Team</th>,
 <th class="league" data-sort="league">League</th>,
 <th class="regular gp" data-sort="gp">GP</th>,
 <th class="regular g" data-sort="g">G</th>,
 <th class="regular a" data-sort="a">A</th>,
 <th class="regular tp" data-sort="tp">TP</th>,
 <th class="regular pim" data-sort="pim">PIM</th>,
 <th class="regular pm" data-sort="pm">+/-</th>,
 <th class="separator"> </th>,
 <th class="postseason">POST</th>,
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>,
 <th class="postseason g" data-sort="playoffs-g">G</th>,
 <th class="postseason a" data-sort="playoffs-a">A</th>,
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>,
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>,
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>]

In [144]:
column_names = [c.string for c in columns]
column_names

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [152]:
table_rows = webpage.find("tbody").find_all("tr")
l = []
for tr in table_rows:
    td = tr.find_all("td")
    row = [str(tr.get_text()).strip() for tr in td]
    l.append(row)

In [153]:
l

[['2014-15',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '17',
  '3',
  '9',
  '12',
  '20',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2015-16',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '9',
  '1',
  '1',
  '2',
  '2',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2016-17',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '12',
  '5',
  '5',
  '10',
  '8',
  '0',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2017-18',
  'Did not play',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2018-19',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA III',
  '8',
  '5',
  '10',
  '15',
  '8',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  '']]

In [154]:
df = pd.DataFrame(l,columns=column_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [162]:
# grab all fun facts that has the string "is"

import re

facts_sec = webpage.select("ul.fun-facts li")
facts_sec

[<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>,
 <li>Middle name is Ronald</li>,
 <li>Never had been on a plane until college</li>,
 <li>Dunkin Donuts coffee is better than Starbucks</li>,
 <li>A favorite book series of mine is <i>Ender's Game</i></li>,
 <li>Current video game of choice is <i>Rocket League</i></li>,
 <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>]

In [163]:
fun_facts = [fact.get_text() for fact in facts_sec]

In [167]:
#for fact in fun_facts:
    #actual_fun_facts = fact.find(string=re.compile("is"))
    #print(actual_fun_facts)

TypeError: find() takes no keyword arguments

## 8.3 Download an Image from the webpage