## Load the necessary libraries

In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup as bs

## Load our first page

In [6]:
# Load the webpage content.
# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() makes sure we never parse an HTTP error page by mistake.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=30)
r.raise_for_status()

# Convert to the BS object.
# Naming the parser explicitly keeps the parse tree the same on every machine;
# without it bs4 picks whichever parser happens to be installed and warns.
soup = bs(r.content, "html.parser")

# Print out the html
print(soup.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

## Start using Beautiful Soup to SCRAPE

### find and find_all

In [7]:
# find() stops at the first tag that matches
first_header = soup.find(name="h2")
print(first_header)

<h2>About me</h2>


In [8]:
# find_all() collects every matching tag into a list-like result
headers = soup.find_all(name="h2")
headers

[<h2>About me</h2>,
 <h2>Social Media</h2>,
 <h2>Photos</h2>,
 <h2> Table </h2>,
 <h2>Mystery Message Challenge!</h2>,
 <h2>Footnotes</h2>]

In [9]:
# A list of tag names matches any of them; find() still returns
# only the first match it encounters.
first_header = soup.find(name=["h2", "h1"])

In [10]:
# Display the match: on this page the <h1> comes before any <h2>, so it is the one returned
first_header

<h1>Welcome to my page!</h1>

In [12]:
# The list form works with find_all() too, returning every h1/h2 on the page
headers = soup.find_all(name=["h2", "h1"])
headers

[<h1>Welcome to my page!</h1>,
 <h2>About me</h2>,
 <h2>Social Media</h2>,
 <h2>Photos</h2>,
 <h2> Table </h2>,
 <h2>Mystery Message Challenge!</h2>,
 <h2>Footnotes</h2>]

In [15]:
# Filter by attribute: keyword arguments to find/find_all are treated as
# attribute filters, equivalent to passing an attrs dict.
paragraph = soup.find_all("p", id="paragraph-id")
paragraph

[]

In [19]:
# You can nest find/find_all calls: each search is scoped to the tag
# returned by the previous one.
body = soup.find("body")
div = body.find("div")
# The first <div> on this page only holds images, so there is no <h1>
# inside it and this nested search yields None (the cell shows no output).
header = div.find("h1")
header

In [22]:
# The string argument restricts matches to tags whose text equals it exactly
string_search = soup.find_all(
    "p",
    string="Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!",
)
string_search

[<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>]

In [23]:
import re

# A compiled pattern matches on part of the text instead of the whole string
string_search = soup.find_all(
    "p",
    string=re.compile("Hi, my"),
)
string_search

[<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>]

### select (CSS selector)

In [26]:
# Pretty-print just the <body> so the structure we are about to select from is visible
print(soup.find("body").prettify())

<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </a>
 </p>
 <p>
  I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.
 </p>
 <h3>
  Hobbies
 </h3>
 <p>
  Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey &amp; table tennis as well as run, hike, skat

In [28]:
# Descendant combinator: every <a> nested anywhere inside a <p>
content = soup.select("p a")
content

[<a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a>]

In [30]:
# General sibling combinator: every <p> that comes after an <h2> under the same parent
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>,
 <p>Here is a link to my channel: <a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a></p>,
 <p>I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.</p>,
 <p>Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey &amp; table tennis as well as run, hike, skateboard, and snowboard. In addition to sports, I am a board game enthusiast. The two that I've been playing the most recently are <i>Settlers of Catan</i> and <i>O

In [33]:
# Every <i> tag nested inside a list item
italics_text = soup.select("li i")
italics_text

[<i>Ender's Game</i>, <i>Rocket League</i>, <i>Zac Brown Band</i>]

In [34]:
# Child combinator: only <p> tags that are direct children of <body>
paragraphs = soup.select("body > p")
paragraphs

[<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>,
 <p>Here is a link to my channel: <a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a></p>,
 <p>I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.</p>,
 <p>Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey &amp; table tennis as well as run, hike, skateboard, and snowboard. In addition to sports, I am a board game enthusiast. The two that I've been playing the most recently are <i>Settlers of Catan</i> and <i>O

In [35]:
# select() also works on an individual tag: list the <i> tags inside each paragraph
for paragraph in paragraphs:
    italic_tags = paragraph.select("i")
    print(italic_tags)

[]
[]
[]
[<i>Settlers of Catan</i>, <i>Othello</i>]
[]
[]


In [42]:
# Attribute selector: grab any element whose align attribute is "left"
soup.select("[align=left]")

[<div align="left" class="block">
 <ul>
 <li><a href="challenge/file_1.html">File 1</a></li>
 <li><a href="challenge/file_2.html">File 2</a></li>
 <li><a href="challenge/file_3.html">File 3</a></li>
 <li><a href="challenge/file_4.html">File 4</a></li>
 <li><a href="challenge/file_5.html">File 5</a></li>
 </ul>
 </div>]

## Get different properties of the HTML

In [47]:
# .string returns the text of a tag that contains a single string
header = soup.find(name="h2")
header.string

'About me'

In [48]:
# With several child elements .string is None; get_text() instead gathers
# the text of everything nested inside the tag.
div = soup.find(name="div")
print(div.prettify(), div.string, div.get_text(), sep="\n")

<div class="row">
 <div class="column">
  <img alt="Lake Como" src="images/italy/lake_como.jpg" style="height:100%"/>
 </div>
 <div class="column">
  <img alt="Pontevecchio, Florence" src="images/italy/pontevecchio.jpg" style="height:100%"/>
 </div>
 <div class="column">
  <img alt="Riomaggiore, Cinque de Terre" src="images/italy/riomaggiore.jpg" style="height:100%"/>
 </div>
</div>

None













In [50]:
# Attributes of a tag live in its .attrs dict (tag["href"] is shorthand for this lookup)
link = soup.find(name="a")
link.attrs["href"]

'https://www.youtube.com/kgmit'

In [57]:
# Select the <p> with id "footer", then read the id back from its attributes
paragraphs = soup.select("p#footer")
paragraphs[0].attrs["id"]

'footer'

## Code navigation

In [64]:
# Path syntax: attribute access walks down the tree one tag name at a time
# (body -> h1), and .string reads the text of the tag reached.

print(soup.body.h1.string)

Welcome to my page!


In [67]:
# Parent, Sibling, Child navigation.
# Here: take the first <div> inside <body> and list the siblings that follow it.

soup.body.find("div").find_next_siblings()

[<div></div>,
 <h2> Table </h2>,
 <br/>,
 <table class="hockey-stats">
 <thead>
 <tr>
 <th class="season" data-sort="">S</th>
 <th class="team" data-sort="team">Team</th>
 <th class="league" data-sort="league">League</th>
 <th class="regular gp" data-sort="gp">GP</th>
 <th class="regular g" data-sort="g">G</th>
 <th class="regular a" data-sort="a">A</th>
 <th class="regular tp" data-sort="tp">TP</th>
 <th class="regular pim" data-sort="pim">PIM</th>
 <th class="regular pm" data-sort="pm">+/-</th>
 <th class="separator"> </th>
 <th class="postseason">POST</th>
 <th class="postseason gp" data-sort="playoffs-gp">GP</th>
 <th class="postseason g" data-sort="playoffs-g">G</th>
 <th class="postseason a" data-sort="playoffs-a">A</th>
 <th class="postseason tp" data-sort="playoffs-tp">TP</th>
 <th class="postseason pim" data-sort="playoffs-pim">PIM</th>
 <th class="postseason pm" data-sort="playoffs-pm">+/-</th>
 </tr>
 </thead>
 <tbody>
 <tr class="team-continent-NA">
 <td class="season sorte

## Task 1

In [72]:
# Every <a> inside the <ul> that carries the "socials" class
links = soup.select("ul.socials a")

In [73]:
# Pull the URL out of each anchor's attributes
actual_links = [anchor.attrs["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [86]:
# Same result with find/find_all: locate the socials list, then the anchors inside it
links = soup.find("ul", class_="socials")
links_v = links.find_all(name="a")
actual_links = [anchor.attrs["href"] for anchor in links_v]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [88]:
# Third variant: go through the list items that carry the "social" class
links_vv = soup.select("li.social a")
actual_links = [anchor.attrs["href"] for anchor in links_vv]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## Task 2

In [107]:
import pandas as pd

# Locate the hockey stats table and read the column labels from its <thead>.
table = soup.select("table.hockey-stats")[0]
columns = table.find("thead").find_all("th")
# get_text() always returns a plain str; .string can be None for a tag with
# several children and otherwise returns an object tied to the parse tree.
# Note: the regular-season and postseason headers share labels (GP, G, A, ...),
# so those column names appear twice in the frame.
column_names = [th.get_text() for th in columns]

# Build one list of cell texts per body row. Distinct names for the row and
# the cell avoid shadowing the row variable inside the comprehension.
table_rows = table.find("tbody").find_all("tr")
rows = [
    [cell.get_text().strip() for cell in table_row.find_all("td")]
    for table_row in table_rows
]

df = pd.DataFrame(rows, columns=column_names)
# Show only the seasons that were actually played
df.loc[df["Team"] != "Did not play"]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


In [112]:
# Fun facts that use the word "is".
# The \b word boundaries stop the pattern from matching inside longer words
# (e.g. "this" or "list"). Testing the full text of each <li> returns the
# whole fact even when part of the sentence sits in a nested tag such as <i>.
facts = soup.select("ul.fun-facts li")
fact_texts = [fact.get_text() for fact in facts]
facts_with_is = [text for text in fact_texts if re.search(r"\bis\b", text)]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

## DOWNLOAD THE IMAGE

In [115]:
from urllib.parse import urljoin

url = "https://keithgalli.github.io/web-scraping/webpage.html"

# First image inside the photo row
images = soup.select("div.row div.column img")
image_url = images[0]["src"]
# The src is relative to the page, so resolve it against the page URL.
# Plain string concatenation would glue it onto "webpage.html" and produce
# ".../webpage.htmlimages/italy/lake_como.jpg", which is not a valid address.
full_url = urljoin(url, image_url)

response = requests.get(full_url, timeout=30)
# Fail loudly instead of saving an HTTP error page under a .jpg name
response.raise_for_status()
img_data = response.content
with open("lake_como.jpg", "wb") as handler:
    handler.write(img_data)