In [102]:
import requests
from bs4 import BeautifulSoup as bs
import re #~ Regular Expression (Regex)
import pandas as pd
from io import StringIO
from os.path import basename
from PIL import Image #~ This is a library used to work with images
from io import BytesIO

#### Load our first page

In [3]:
# Load the web page content.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# here on an HTTP error instead of parsing an error page further down.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert to a beautiful soup object.
# Naming the parser makes the resulting tree independent of which optional
# parsers happen to be installed (and silences the "no parser specified" warning).
soup = bs(r.content, "html.parser")

# Print out the html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



#### Start using Beautiful Soup to Scrape

##### find and find_all

In [4]:
# Print out the first instance of a tag.
# Attribute access (soup.h2) is shorthand for soup.find("h2").
first_header = soup.h2
print(first_header)

# Collect every <h2> on the page, in document order
headers = soup.find_all(name="h2")
print(headers)

<h2>A Header</h2>
[<h2>A Header</h2>, <h2>Another header</h2>]


In [5]:
# Pass in a list of elements to look for: a tag matches if its name is any of them
heading_tags = ["h1", "h2"]

# First match only. Not echoed: a cell displays just its last expression.
first_header = soup.find(heading_tags)

# Every match, in document order
headers = soup.find_all(heading_tags)
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [6]:
# You can pass in attributes to the find/find_all function.
# Keyword arguments such as id=... are an alternative spelling of attrs={...}.
paragraph = soup.find_all("p", id="paragraph-id")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [7]:
# You can nest find/find_all calls: every result is itself searchable.
# Tag-name attributes (x.div) return the first matching descendant, like x.find("div").
body = soup.body
div = body.div
header = div.h1
header

<h1>HTML Webpage</h1>

In [8]:
# We can search specific strings in our find/find_all calls.
# A compiled regex passed as string=... is matched against each tag's text.
some_pattern = re.compile("Some")
paragraphs = soup.find_all("p", string=some_pattern)

# Accept both capitalisations of "header"
header_pattern = re.compile("(H|h)eader")
headers = soup.find_all("h2", string=header_pattern)
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

#### Select (CSS Selector)

In [9]:
# Pretty-print only the <body> subtree of the page
body_markup = soup.body.prettify()
print(body_markup)

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [10]:
# Paragraphs nested inside the first <div> of <body> (not echoed by this cell)
first_div = soup.body.div
content = first_div.find_all("p")

# "h2 ~ p": every <p> that comes after an <h2> under the same parent
sibling_selector = "h2 ~ p"
paragraphs = soup.select(sibling_selector)
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [11]:
# Descendant selector: <b> tags inside the <p> whose id is "paragraph-id"
bold_selector = "p#paragraph-id b"
bold_text = soup.select(bold_selector)
bold_text

[<b>Some bold text</b>]

In [12]:
# Child combinator: only <p> tags that sit directly under <body>
paragraphs = soup.select("body > p")
print(paragraphs)

# select() can be called on an individual tag too; an empty list means no <i> inside it
for p_tag in paragraphs:
    italics = p_tag.select("i")
    print(italics)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


#### Get different properties of the HTML

In [13]:
# .string on a tag (value not echoed here: only a cell's last expression is displayed)
header = soup.h2
header.string

div = soup.div
print(div.prettify())

# The <div> holds several child tags, and .string evaluates to None for it
print(div.string)

# get_text() returns the text of the tag together with the text of all its
# nested tags (here the <h1>, the <p> and the <a> inside it)
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

None

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [14]:
# Get a specific property from an element: attributes are read dictionary-style
link = soup.a
link['href']  # the link target only (not echoed: not the last expression of the cell)

# In CSS selectors "#" selects by id and "." selects by class
paragraphs = soup.select("p#paragraph-id")
first_match = paragraphs[0]
first_match['id']

'paragraph-id'

#### Code navigation

In [15]:
# Path syntax: chain tag names as attributes to walk down the tree
print(soup.prettify())

heading_tag = soup.body.div.h1
heading_tag.string

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



'HTML Webpage'

In [16]:
# Know the terms: Parent, Sibling, Child
# find_next_siblings() lists the siblings that come after the tag it is called on
first_div = soup.body.find("div")
first_div.find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

#### Exercises

##### Go to https://keithgalli.github.io/web-scraping/webpage.html

In [17]:
# Load the web page content.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# here on an HTTP error instead of handing an error page to the exercises.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
r.raise_for_status()

# Convert to a beautiful soup object.
# Naming the parser makes the resulting tree independent of which optional
# parsers happen to be installed (and silences the "no parser specified" warning).
webpage = bs(r.content, "html.parser")

# Print out the html
print(webpage.prettify())

<head>
 <title>
  Keith Galli's Page
 </title>
 <style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
 </style>
</head>
<body>
 <h1>
  Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
  About me
 </h2>
 <p>
  Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
  Here is a link to my channel:
  <a href="https://www.youtube.com/kgmit">
   youtube.com/kgmit
  </

#### Exercise #1: Grab all social links on webpage in 3 different ways

In [18]:
#1
# Walk the direct <li> children of the socials list and read the link in each one.
# Asking for the <li> tags explicitly (recursive=False = direct children only)
# replaces the earlier "every odd index of the child list" trick, which only
# worked because whitespace text nodes happened to sit between the items.
socials1 = webpage.find('ul', attrs={'class': 'socials'})
socials2 = socials1.find_all('li', recursive=False)
for item in socials2:
    print(item.find('a')['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [19]:
# Second way: find_all on the list, then find_all for the links inside the first match.
# class_=... is the keyword form of attrs={'class': ...}.
socials1 = webpage.find_all('ul', class_='socials')
socials2 = socials1[0].find_all('a')
for anchor in socials2:
    print(anchor['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [20]:
# Third way: one CSS selector that goes from the list down to the links
socials1 = webpage.select('ul.socials li a')
for anchor in socials1:
    print(anchor['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [21]:
# Solutions by Keith Galli
#1
# CSS selector: any <a> nested inside the <ul class="socials">
social_anchor_selector = "ul.socials a"
links = webpage.select(social_anchor_selector)
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [24]:
#2
# find the list first, then collect the links inside it
socials_list = webpage.find("ul", class_="socials")
links = socials_list.find_all("a")
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [27]:
#3
# Select through the class on the list items instead of the class on the list
item_anchor_selector = "li.social a"
links = webpage.select(item_anchor_selector)
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [75]:
#4 (By me)
# Regex over the page text: this picks up URLs that are written out in the text
# returned by get_text(); it does not look at href attribute values.
url_pattern = re.compile(r'https?://[^\s]+')
urls_site = webpage.get_text()
urls = url_pattern.findall(urls_site)
urls

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

#### Exercise #2: Scrape an HTML table into a Pandas Dataframe

In [45]:
# select() returns a list, so [0] picks the first table matching the selector
table = webpage.select("table.hockey-stats")[0]

# Wrap the table markup in StringIO so read_html receives a file-like object
table_markup = StringIO(str(table))

# read_html returns a list of DataFrames; take the first one, then
#   - drop columns in which every value is NaN (axis=1 -> columns, how='all')
#   - drop columns whose name starts with 'Unnamed'
df = (
    pd.read_html(table_markup)[0]
    .dropna(axis=1, how='all')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)
df.head()


Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0
3,2017-18,Did not play,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,


#### Exercise #3: Grab all fun facts that contain the word “is”

In [50]:
fun_facts = webpage.select('ul.fun-facts li')

# Match "is" as a whole word. A plain substring test ('is' in text) would also
# accept facts that merely contain the letters, e.g. inside "this" or "list".
is_word = re.compile(r'\bis\b')

# Extract each fact's text once, then keep the ones containing the word
fact_texts = [fact.get_text() for fact in fun_facts]
facts_with_is = [text for text in fact_texts if is_word.search(text)]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

#### Exercise #4: Use beautiful soup to help download an image from a webpage

In [104]:
# Base URL the page's relative paths are appended to
url_site = 'https://keithgalli.github.io/web-scraping/'

# Build the full URL of every image inside the row/column layout
images = webpage.select('div.row div.column img')
images_url = [url_site + image['src'] for image in images]

for image_url in images_url:
    # Fail on HTTP errors here; otherwise an error page would be handed to
    # Image.open below and surface as a confusing "cannot identify image" error.
    response = requests.get(image_url, timeout=10)
    response.raise_for_status()

    # Last path component of the URL becomes the local file name
    image_name = basename(image_url)

    # BytesIO wraps the downloaded bytes in a file-like object for PIL
    img = Image.open(BytesIO(response.content))
    img.save(image_name)  # saved into the current working directory

#### Exercise #5: Solve the mystery challenge!!!

In [109]:
# Follow every link in the "block" divs and print the secret word found on each page.
# NOTE: relies on url_site having been defined by the image-download cell above.
files = webpage.select('div.block a')
relative_files = [file['href'] for file in files]

for relative_path in relative_files:
    full_url = url_site + relative_path

    # Stop on HTTP errors instead of parsing an error page
    page = requests.get(full_url, timeout=10)
    page.raise_for_status()

    # Explicit parser: same tree regardless of which optional parsers are installed
    bs_page = bs(page.content, "html.parser")

    secret = bs_page.find('p', attrs={'id': 'secret-word'})
    if secret is None:
        # Name the offending page rather than failing with "'NoneType' has no attribute 'string'"
        raise ValueError(f"No <p id='secret-word'> element found at {full_url}")
    print(secret.string)

Make
sure
to
smash
that
like
button
and
subscribe
!!!
