# Web Scraping Basics

### If you want to scrape a website
### 1. use an API
### 2. HTML web scraping using some tool
#### We will use the BeautifulSoup library to extract data

In [2]:
import requests 
from bs4 import BeautifulSoup as bs

In [3]:
## address of the example page that is downloaded and parsed in the steps below
url = "https://keithgalli.github.io/web-scraping/example.html"

#### step 1: Get the HTML

In [4]:
## download the page; the timeout stops the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=10)
r.raise_for_status()   ## fail loudly on a 4xx/5xx reply instead of parsing an error page

In [5]:
htmlContent = r.content    ## raw response body (bytes) fetched from the given url

print(htmlContent)

b'<html>\n<head>\n<title>HTML Example</title>\n</head>\n<body>\n\n<div align="middle">\n<h1>HTML Webpage</h1>\n<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>\n</div>\n\n<h2>A Header</h2>\n<p><i>Some italicized text</i></p>\n\n<h2>Another header</h2>\n<p id="paragraph-id"><b>Some bold text</b></p>\n\n</body>\n</html>\n'


#### step 2: Parse the HTML

In [6]:
soup =  bs(htmlContent , 'html.parser')
## prettify() renders the parsed tree one tag per line, indented by nesting depth
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



###  BeautifulSoup Methods
####  1. find and find_all methods

In [16]:
first_header = soup.find("h1")   ## find() returns the first matching tag
print(first_header)

<h1>HTML Webpage</h1>


In [17]:
print(soup.h1)   ## attribute access reaches the first "h1" tag without calling find()

<h1>HTML Webpage</h1>


In [18]:
ll = soup.find_all("a")
ll     ## every anchor ("a") tag in the document, returned as a list

[<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>]

In [19]:
headers = soup.find(['h1' , 'h2' ])
headers    ## find() with a list of names returns only the FIRST tag matching any of them; use find_all() to get every h1/h2

<h1>HTML Webpage</h1>

In [20]:
## find() returns only the first paragraph ("p" tag) in the document
paras = soup.find('p')
paras


<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>

In [21]:
## collect every "p" tag in the document and keep the result as a plain list
all_para = list(soup.find_all('p'))
all_para[1]                     ## only the last expression of a cell is displayed: the second paragraph

<p><i>Some italicized text</i></p>

In [22]:
## narrowing down the scraping: grab the "body" tag so that later lookups can start from it

bdy = soup.find("body")
bdy

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [27]:
## find() can be called on any tag, so this searches for a "div" inside the body only
div = bdy.find('div')
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [50]:
## looking for a button within the div, i.e. traversing further down the html tree;
## this div contains no "button" tag, so find() returns None and the cell displays nothing
btn = div.find('button')
btn


### Select (CSS selector)

In [31]:
## print the body subtree as a reference for the selector examples that follow
body = soup.body
print(body.prettify())


<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [32]:
cnt = soup.select('p')   ## CSS selector; like find_all, it returns a list of every "p" tag
cnt[1]                   ## second paragraph in document order

<p><i>Some italicized text</i></p>

In [33]:
## looking for paras in div (this way we can easily narrow down our scrape)
div_p = soup.select('div p')     ### a space is the descendant combinator: soup.select('ancestor descendant ...')
div_p          ## the same paragraph can also be reached by tree traversal as "div.p"

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [34]:
## "~" is the subsequent-sibling combinator: it matches every "p" that comes after an "h2"
## under the same parent, not only the one immediately after it (that would be 'h2 + p')
p_nextto_h2 = soup.select('h2 ~p')
p_nextto_h2

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [35]:
## descendant selector: every "p" anywhere inside body, including the one nested in the div
## note: this rebinds "paras", which earlier held a single tag, to a list
paras  = soup.select("body p")
paras


[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [36]:
paras  = soup.select("body >p")
paras   ## ">" is the child combinator: only "p" tags that are direct children of body (the one inside the div is excluded)

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Grabbing the text/String

In [38]:

hdr  = soup.find("h1")
hdr                    ## this gives the whole h1 tag, markup included

<h1>HTML Webpage</h1>

In [41]:
hdr.string            ## getting only the text held by the h1 tag

'HTML Webpage'

In [45]:
## when multiple strings are present in a tag, use the "get_text" method;
## this div is such a tag: it holds an h1 and a p that itself contains a link
div = soup.find('div')
print(div.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [52]:
all_text = div.get_text()
print(all_text)        ## all the text nested anywhere inside the div, with the tags stripped


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



## Finding Parent, Child, Siblings


In [58]:
## show the whole tree again as a reference for the parent/sibling lookups below
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [62]:

## start from the first "div" in the document; its relatives are looked up next
div = soup.div

print(div.prettify())



<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [64]:
### getting the next siblings of div

div_siblings = div.find_next_siblings()
print(div_siblings)   ## returns a list of the sibling tags that come AFTER div in the HTML tree

[<h2>A Header</h2>, <p><i>Some italicized text</i></p>, <h2>Another header</h2>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [66]:
 ## getting the parent of div, which is the body tag

div_parent = div.find_parent()
print(div_parent.prettify())  

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



#### ~ Aditya Mathur

#### source - Keith Galli