# Tuto Web Scrapping - BeautifulSoup

## Import necessary packages

In [2]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

## Load page

In [7]:
# Load the webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")
# Convert to a beautiful soup object
soup = bs(r.content)
# Print out our html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [16]:
## Strat using Beautiful SOup to Scrape
first_header = soup.find("h2")
first_header
headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [17]:
# Pass in a list of elements to look for
all_headers = soup.find_all(["h1", "h2"])
all_headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [22]:
# You can pass in attributes to the find/find_all function
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [23]:
# You can nest find/find_all calls
body = soup.find('body')
div = soup.body('div')
header = soup.div('h1')
header

[<h1>HTML Webpage</h1>]

In [26]:
# We can search specific strings in our find/find_all calls
import re
paragraphs = soup.find_all("p", string=re.compile("Some"))
paragraphs
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

## Get different proprieties of HTML

In [31]:
# To get string
header = soup.find("h2")
header.string
# if multiple child element use get°text
div = soup.find("div")
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [33]:
# Get a specific property from an element
link = soup.find('a')
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

## Code navigation

In [40]:
# Path syntax
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [44]:
# Know the terms : Parents, Sibling, Child
soup.body.find ("div").find_next_siblings()


[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

## Exercice

Go to https://keithgalli.github.io/web-scraping/webpage.html

In [54]:
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html')
webpage = bs(r.content)
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

## Grab all of the social links from webpage

In [80]:
# Way 1
links1 = webpage.find_all("a")
links1[2:6]

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [87]:
# Way 2
links2 = webpage.select("ul.socials a")
actual_links2 = [link['href'] for link in links2]
actual_links2
#[for link in links2:
#    print(link["href"])]

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [89]:
# Way 3
ulist = webpage.find("ul", attrs={"class" : "socials"})
links3 = ulist.find_all('a')
actual_links3 = [link['href'] for link in links3]
actual_links3

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [96]:
# Using pandas
import pandas as pd
url = "https://keithgalli.github.io/web-scraping/webpage.html"
table = pd.read_html(url)
df = table[0]
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 9,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   S           5 non-null      object 
 1   Team        5 non-null      object 
 2   League      4 non-null      object 
 3   GP          4 non-null      float64
 4   G           4 non-null      float64
 5   A           4 non-null      float64
 6   TP          4 non-null      float64
 7   PIM         4 non-null      float64
 8   +/-         1 non-null      float64
 9   Unnamed: 9  5 non-null      object 
 10  POST        0 non-null      float64
 11  GP.1        0 non-null      float64
 12  G.1         0 non-null      float64
 13  A.1         0 non-null      float64
 14  TP.1        0 non-null      float64
 15  PIM.1       0 non-null      float64
 16  +/-.1       0 non-null      float64
dtypes: float64(13), object(4)
memory usage: 812.0+ bytes
