# Load necessary libraries

In [6]:
import requests

In [7]:
from bs4 import BeautifulSoup as bs


# Load Our Webpage

In [8]:
# Load the webpage content.
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() surfaces HTTP errors (4xx/5xx) here instead of letting an
# error page be parsed later as if it were the real content.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
r.raise_for_status()

In [9]:
# Display the response object; its repr shows the HTTP status code
r

<Response [200]>

In [10]:
# Raw body of the response, as bytes
r.content

b'<html>\n<head>\n<title>HTML Example</title>\n</head>\n<body>\n\n<div align="middle">\n<h1>HTML Webpage</h1>\n<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>\n</div>\n\n<h2>A Header</h2>\n<p><i>Some italicized text</i></p>\n\n<h2>Another header</h2>\n<p id="paragraph-id"><b>Some bold text</b></p>\n\n</body>\n</html>\n'

In [11]:
# Convert the raw HTML into a BeautifulSoup object.
# The parser is named explicitly so the result does not depend on which optional
# parsers happen to be installed, and so bs4 does not warn about guessing one.
soup = bs(r.content, 'html.parser')

In [13]:
# Display the parsed HTML document
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [14]:
# prettify() renders the document as an indented string, one tag or text node per line
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# start using beautifulsoup to scrape

In [15]:
# Re-display the parsed document as a reference for the queries that follow
soup

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>

In [18]:
# find() returns only the first element that matches
first_header = soup.find('h2')
first_header

<h2>A Header</h2>

In [19]:
# find_all() returns a list of every matching element
headers = soup.find_all('h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [22]:
# find() also accepts a list of tag names. It still returns a single element:
# whichever match appears first in the document, regardless of the order of the list.
# Here that is the <h1>, because it comes before any <h2> in the HTML.
first_header = soup.find(['h1','h2'])
first_header

<h1>HTML Webpage</h1>

In [24]:
# find_all() with a list of tag names returns every <h1> and <h2>, in document order
headers = soup.find_all(['h1','h2'])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [27]:
# find/find_all accept an `attrs` dict to filter on attribute values.
# find_all still returns a list, even when only one element matches.
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [33]:
# find/find_all calls can be nested: each call searches only inside the
# element returned by the previous one (body -> div -> h1)

body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [42]:
# We can filter on an element's text in find/find_all calls.
# The standard-library `re` module is all that is needed for the pattern;
# no third-party regex package is required.
import re

print(soup.prettify())
# `string=` is the current name of the text filter (`text=` is the deprecated spelling)
paragraphs = soup.find_all('p', string=re.compile('Some'))
paragraphs

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [45]:
# Regex filter on the tag text: the (H|h) group matches both "Header" and "header"
headers = soup.find_all('h2', string=re.compile('(H|h)eader'))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [46]:
# select() takes a CSS selector and returns a list; 'p' matches every <p> element
content = soup.select('p')

In [47]:
# Display the selected paragraphs
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [51]:
# Pretty-print only the <body> subtree
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [52]:
# 'div p' (descendant combinator): <p> elements located inside a <div>
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

# CSS Selectors Reference Page: 
 https://www.w3schools.com/cssref/css_selectors.asp

In [54]:
# 'h2 ~ p' (general sibling combinator): <p> elements that come after an <h2>
# and share the same parent
paragraphs = soup.select('h2 ~ p')
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [63]:
# Combine selectors: <b> elements inside the <p id="paragraph-id"> that follows an <h2>
bold_text = soup.select('h2 ~ p#paragraph-id b')
bold_text

[<b>Some bold text</b>]

In [64]:
# 'body > p' (child combinator): only <p> elements that are direct children of <body>
paragraphs = soup.select('body > p')

In [65]:
# Display the direct-child paragraphs
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [67]:
# select() can be called on each result as well; a paragraph without an <i>
# child yields an empty list
for paragraph in paragraphs:
    print(paragraph.select('i'))

[<i>Some italicized text</i>]
[]


In [69]:
# Attribute selector: grab any element whose `align` attribute equals "middle"
soup.select('[align=middle]')

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

# get different properties of the html

In [71]:
# .string gives the text inside the header instead of the entire <h2> tag

soup.find('h2').string

'A Header'

In [76]:
# If an element has multiple child elements use get_text(); otherwise use .string
div = soup.find('div')
print(div.prettify())
# Print the combined text of everything nested inside the div
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [83]:
# Get a specific attribute from an element using dictionary-style access

# First <a> (link) tag in the document

link = soup.find('a')

link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [86]:
# 'p#paragraph-id': the <p> whose id is "paragraph-id" (select() still returns a list)
paragraph = soup.select('p#paragraph-id')

In [87]:
# Display the selected paragraph
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [100]:
# Path syntax: chain tag names as attributes to walk down the tree
# (body -> div -> h1), then read the text with .string

soup.body.div.h1.string

'HTML Webpage'

In [None]:
# Know the terms: Parent, Sibling, Child

In [105]:
# All sibling elements that come after the first <div> inside <body>
soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# Load the webpage


In [106]:
import requests
from bs4 import BeautifulSoup as bs

In [107]:
# Fetch the page used for the exercises.
# The timeout prevents an indefinite hang; raise_for_status() stops here on an
# HTTP error instead of letting an error page be parsed as the real content.
r = requests.get('https://keithgalli.github.io/web-scraping/webpage.html', timeout=10)
r.raise_for_status()

In [111]:
# Parse the exercise page. Naming the parser keeps the result independent of
# which optional parsers are installed and avoids bs4's guessed-parser warning.
webpage = bs(r.content, 'html.parser')

In [112]:
# Pretty-print the whole page to inspect its structure before scraping
print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

## Task 1: Grab all the social links from the webpage
Do this in at least 3 different ways:
at least one using find/find_all and another using the select method.


In [210]:
# Approach 1 (find_all): collect the <li> tags carrying the "social" class,
# then print the href of the first link inside each one
list_items = webpage.find_all('li', class_='social')

for entry in list_items:
    anchor = entry.find('a')
    print(anchor['href'])


https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [285]:
# Approach 2 (select): a CSS selector reaches the links inside li.social directly
social_anchors = webpage.select('li.social  a')
for anchor in social_anchors:
    print(anchor['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [287]:
# Same selector as above, but gather the hrefs into a list instead of printing them
actual_links = []
for anchor in webpage.select('li.social  a'):
    actual_links.append(anchor['href'])
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [274]:
# Inspect every <li> on the page to see how the social entries differ from the rest
webpage.find_all('li')
    

[<li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>,
 <li>Middle name is Ronald</li>,
 <li>Never had been on a plane until college</li>,
 <li>Dunkin Donuts coffee is better than Starbucks</li>,
 <li>A favorite book series of mine is <i>Ender's Game</i></li>,
 <li>Current video game of choice is <i>Rocket League</i></li>,
 <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>,
 <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>,
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>,
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>,
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a></li>,
 <li><a href="chal

In [284]:
# Approach 3: filter <a> tags on their visible text. Only links whose text
# contains "http" are kept, which here picks out the social links.
# `string=` is the current name of the text filter (`text=` is deprecated).
for item in webpage.find_all('a', string=re.compile('http')):
    print(item['href'])

https://www.instagram.com/keithgalli/
https://twitter.com/keithgalli
https://www.linkedin.com/in/keithgalli/
https://www.tiktok.com/@keithgalli


In [280]:
# Inspect every <a> on the page to compare link text against href
webpage.find_all('a')

[<a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a>,
 <a href="#footer"><sup>1</sup></a>,
 <a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016"> ACHA II </a>,
 <a href="https://www.elite

In [291]:
# Approach 4: locate the <ul class="socials"> container first, then take the
# href of every link inside it
links = [link['href'] for link in webpage.find('ul',attrs={'class':'socials'}).find_all('a')]
links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [292]:
import pandas as pd

In [296]:
# pd.read_html returns a list of DataFrames (one per table it finds), so `df` is a list here
df = pd.read_html('https://keithgalli.github.io/web-scraping/webpage.html')

In [307]:
# First table parsed from the page
df[0]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 9,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [449]:
# Scrape the table manually: start with the header cells (<th>) of the
# hockey-stats table to get the column names
stats_table = webpage.find('table', attrs={'class':'hockey-stats'})
list_columns = [header_cell.text for header_cell in stats_table.find_all('th')]
list_columns

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [453]:
# One inner list per body row of the hockey-stats table, holding the
# whitespace-stripped text of each <td> cell
table_rows = webpage.select('table.hockey-stats tbody tr')
my_list = [
    [cell.text.strip() for cell in row.select('td')]
    for row in table_rows
]
my_list


[['2014-15',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '17',
  '3',
  '9',
  '12',
  '20',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2015-16',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '9',
  '1',
  '1',
  '2',
  '2',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2016-17',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA II',
  '12',
  '5',
  '5',
  '10',
  '8',
  '0',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2017-18',
  'Did not play',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  ''],
 ['2018-19',
  'MIT (Mass. Inst. of Tech.)',
  'ACHA III',
  '8',
  '5',
  '10',
  '15',
  '8',
  '',
  '|',
  '',
  '',
  '',
  '',
  '',
  '',
  '']]

In [454]:
# Assemble the scraped rows and header names into a DataFrame
# (this rebinds `df`, which previously held the list returned by pd.read_html)
df = pd.DataFrame(data = my_list, columns = list_columns)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


In [481]:
# Collect the fun facts that contain the word "is".
# A word-boundary regex is used rather than a plain substring test, so words that
# merely contain the letters "is" (e.g. "this", "list") do not count as a match.
is_list = [
    sentence.text
    for sentence in webpage.select('ul.fun-facts li')
    if re.search(r'\bis\b', sentence.text)
]

In [482]:
# Display the matching fun facts
is_list

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

In [489]:
# Filtering find_all on `string` only matches <li> tags whose content is a single
# string, so facts that also contain a nested tag (e.g. <i>) are not returned here.
# `string=` is the current name of the text filter (`text=` is deprecated).
webpage.find_all('li', string=re.compile(' is '))

[<li>Middle name is Ronald</li>,
 <li>Dunkin Donuts coffee is better than Starbucks</li>]

In [523]:
# Keep each fun fact that has a text node containing the word "is".
# The text is read from the <li> itself, so the full sentence is returned even
# when the matching text node sits inside a nested tag such as <i>.
facts = webpage.select('ul.fun-facts li')
fact_list = [fact.get_text() for fact in facts if fact.find(string=re.compile(r'\bis\b'))]

In [524]:
# Display the matching fun facts
fact_list

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

In [532]:
# `src` attribute of the first <img> found inside a <div class="row">
img_url = webpage.select('div.row img')[0]['src']
img_url

'images/italy/lake_como.jpg'

In [538]:
# Base URL that the page's relative image and link paths are appended to
url = 'https://keithgalli.github.io/web-scraping/'

In [539]:
# Resolve the relative image path against the base URL.
# urljoin copes with a leading "./" or "/" and with already-absolute URLs,
# which plain string concatenation does not.
from urllib.parse import urljoin

full = urljoin(url, img_url)
full

'https://keithgalli.github.io/web-scraping/images/italy/lake_como.jpg'

In [541]:
import requests

In [542]:
# Download the image bytes.
# Check the status first so an HTTP error page is never saved to disk as if it
# were the image; the timeout prevents an indefinite hang.
img_response = requests.get(full, timeout=10)
img_response.raise_for_status()
img_data = img_response.content

In [550]:
# Write the downloaded bytes to a local file; 'wb' because the image data is binary
with open('image_name.jpg','wb') as handler:
    handler.write(img_data)

In [555]:
# Relative hrefs of the links listed inside <div class="block"> (the challenge files)
links = [link['href'] for link in webpage.select('div.block li a')]
links

['challenge/file_1.html',
 'challenge/file_2.html',
 'challenge/file_3.html',
 'challenge/file_4.html',
 'challenge/file_5.html',
 'challenge/file_6.html',
 'challenge/file_7.html',
 'challenge/file_8.html',
 'challenge/file_9.html',
 'challenge/file_10.html']

In [582]:
# Visit each challenge page and collect the text of its <p id="secret-word">
ans = []
for link in links:
    r = requests.get(url+link, timeout=10)
    r.raise_for_status()  # fail loudly instead of parsing an error page
    file = bs(r.content, 'html.parser')  # explicit parser: same result on every machine
    word = file.select('p#secret-word')[0].text
    ans.append(word)

In [583]:
# Display the collected words, in link order
ans

['Make',
 'sure',
 'to',
 'smash',
 'that',
 'like',
 'button',
 'and',
 'subscribe',
 '!!!']

In [579]:
# Fetch only the first challenge page, to inspect its structure
r = requests.get(url+links[0])

In [581]:
# Look at the secret-word element of that single page.
# The parser is named explicitly so the result does not depend on which
# optional parsers are installed.
file = bs(r.content, 'html.parser').select('p#secret-word')
file

[<p id="secret-word">Make</p>]

In [585]:
# Join the collected words into a single space-separated sentence
sentence = ' '.join(ans)

In [586]:
# Display the assembled sentence
sentence

'Make sure to smash that like button and subscribe !!!'