# Web Scraping

In [1]:
#!pip install requests
#!pip install lxml
#!pip install bs4

In [2]:
import requests
import bs4
import lxml

In [3]:
result=requests.get("http://www.example.com")
type(result)

requests.models.Response

In [4]:
result.text#gives us the html document

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [5]:
soup=bs4.BeautifulSoup(result.text, "lxml")

In [6]:
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

bs4 aka BeautifulSoup converts raw text to html document for vie3wing in Python

In [7]:
# grab title from html
soup.select("title")#here we have only one title tags

[<title>Example Domain</title>]

In [8]:
#grab paragraphs from html file
soup.select("p")#it might give us list as there can be many paragraphs

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [9]:
#get only title
soup.select("title")[0].getText()

'Example Domain'

In [10]:
#get string from first paragraph
soup.select("p")[0].getText()

'This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.'

In [11]:
#get string from second paragraph
soup.select("p")[1].getText()

'More information...'

## Grabbing all the elements associated with a class 

In [12]:
res=requests.get("https://en.wikipedia.org/wiki/Kenny_Omega")

In [13]:
res

<Response [200]>

In [14]:
soup=bs4.BeautifulSoup(res.text, "lxml")
#soup

In [15]:
soup.select(".toctext")#derive all the links of table of contents 

[<span class="toctext">Early life</span>,
 <span class="toctext">Professional wrestling career</span>,
 <span class="toctext">Independent circuit and WWE (2001–2011)</span>,
 <span class="toctext">DDT Pro-Wrestling (2008–2014)</span>,
 <span class="toctext">Pro Wrestling Guerrilla (2008–2014)</span>,
 <span class="toctext">New Japan Pro-Wrestling and Ring of Honor</span>,
 <span class="toctext">Sporadic appearances (2008–2014)</span>,
 <span class="toctext">Bullet Club membership (2014–2016)</span>,
 <span class="toctext">The Elite and various championship reigns (2016–2017)</span>,
 <span class="toctext">Dissension within Bullet Club and departure (2017–2019)</span>,
 <span class="toctext">All Elite Wrestling (2019–present)</span>,
 <span class="toctext">Lucha Libre AAA Worldwide (2019–present)</span>,
 <span class="toctext">Impact Wrestling (2020–present)</span>,
 <span class="toctext">Professional wrestling style and persona</span>,
 <span class="toctext">Personal life</span>,
 <spa

In [16]:
soup.select(".toctext")[0]

<span class="toctext">Early life</span>

In [17]:
type(soup.select(".toctext")[0])

bs4.element.Tag

In [18]:
first_item = soup.select(".toctext")[0]
first_item.text

'Early life'

In [19]:
#get all 
for item in soup.select(".toctext"):
    print(item.text)

Early life
Professional wrestling career
Independent circuit and WWE (2001–2011)
DDT Pro-Wrestling (2008–2014)
Pro Wrestling Guerrilla (2008–2014)
New Japan Pro-Wrestling and Ring of Honor
Sporadic appearances (2008–2014)
Bullet Club membership (2014–2016)
The Elite and various championship reigns (2016–2017)
Dissension within Bullet Club and departure (2017–2019)
All Elite Wrestling (2019–present)
Lucha Libre AAA Worldwide (2019–present)
Impact Wrestling (2020–present)
Professional wrestling style and persona
Personal life
Championships and accomplishments
References
External links


### Scraping an Image from web

In [20]:
soup.select("img")[0]
#see the src i.e. the source of the image

<img alt="This is a good article. Click here for more information." data-file-height="185" data-file-width="180" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/29px-Symbol_support_vote.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/39px-Symbol_support_vote.svg.png 2x" width="19"/>

In [21]:
omega_16 = soup.select(".image")[0]

<img src = "//upload.wikimedia.org/wikipedia/commons/thumb/b/b8/Kenny_Omega_2016.jpg/220px-Kenny_Omega_2016.jpg">

In [22]:
image_link=requests.get("https://upload.wikimedia.org/wikipedia/commons/thumb/b/b8/Kenny_Omega_2016.jpg/220px-Kenny_Omega_2016.jpg")

In [23]:
image_link.content#raw content of actual image

b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x01,\x01,\x00\x00\xff\xfe\x00KFile source: https://commons.wikimedia.org/wiki/File:Kenny_Omega_2016.jpg\xff\xdb\x00C\x00\x06\x04\x05\x06\x05\x04\x06\x06\x05\x06\x07\x07\x06\x08\n\x10\n\n\t\t\n\x14\x0e\x0f\x0c\x10\x17\x14\x18\x18\x17\x14\x16\x16\x1a\x1d%\x1f\x1a\x1b#\x1c\x16\x16 , #&\')*)\x19\x1f-0-(0%()(\xff\xdb\x00C\x01\x07\x07\x07\n\x08\n\x13\n\n\x13(\x1a\x16\x1a((((((((((((((((((((((((((((((((((((((((((((((((((\xff\xc0\x00\x11\x08\x011\x00\xdc\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1d\x00\x00\x00\x07\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x03\x04\x05\x06\x07\x01\x08\t\xff\xc4\x00L\x10\x00\x01\x03\x03\x03\x01\x05\x06\x01\t\x02\x0b\x08\x03\x01\x00\x01\x02\x03\x04\x00\x05\x11\x06\x12!1\x07\x13AQa\x14"2q\x81\x91\xa1\x15\x16#BRb\xb1\xc1\xd1\x08r\x17$34CSs\x82\x92\x93\xe1%D\x83\xa2\xb2\xc2\xf0\xf1&5E\xd2\xff\xc4\x00\x1a\x01\x00\x02\x03\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\

In [24]:
f=open("Kenny_Omega.jpg", "wb")#wb: write binary
f.write(image_link.content)
f.close()#image saves at the file's location

-----------------------------------------------------------------

#### Title of every book with 2* rating 

In [25]:
base_url = "http://books.toscrape.com/catalogue/page-{}.html"

In [26]:
page_no=12
base_url.format(page_no)

'http://books.toscrape.com/catalogue/page-12.html'

In [27]:
page_no=1
base_url.format(page_no)

'http://books.toscrape.com/catalogue/page-1.html'

In [28]:
result=requests.get(base_url.format(1))
soup=bs4.BeautifulSoup(result.text, "lxml")
len(soup.select(".product_pod"))

20

#### Method 1 to extract star ratings

In [29]:
star=soup.select(".product_pod")
star[0]                 

<article class="product_pod">
<div class="image_container">
<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [30]:
str(star[0])

'<article class="product_pod">\n<div class="image_container">\n<a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>\n</div>\n<p class="star-rating Three">\n<i class="icon-star"></i>\n<i class="icon-star"></i>\n<i class="icon-star"></i>\n<i class="icon-star"></i>\n<i class="icon-star"></i>\n</p>\n<h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>\n<div class="product_price">\n<p class="price_color">Â£51.77</p>\n<p class="instock availability">\n<i class="icon-ok"></i>\n    \n        In stock\n    \n</p>\n<form>\n<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>\n</form>\n</div>\n</article>'

In [31]:
"star-rating Three" in str(star[0])

True

#### Method 2

In [32]:
star=soup.select(".product_pod")
rat=star[0]
rat.select(".star-rating.Three")

[<p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>]

In [33]:
rat.select(".star-rating.Two")#check for something that isn't present

[]

In [34]:
rat.select("a")[1]["title"]

'A Light in the Attic'

In [35]:
#solving with a loop 
base_url = "http://books.toscrape.com/catalogue/page-{}.html"

two_star_titles=[]

for n in range(1,51):
    
    scrape_url=base_url.format(n)
    res=requests.get(scrape_url)
    
    soup=bs4.BeautifulSoup(res.text, "lxml")
    books=soup.select(".product_pod")
    
    for book in books:
        
        #if "star-rating Two" in str(book)
        #or,
        if len(book.select(".star-rating.Two"))!=0:
            book_title=book.select("a")[1]["title"]
            two_star_titles.append(book_title)

In [36]:
two_star_titles

['Starving Hearts (Triangular Trade Trilogy, #1)',
 'Libertarianism for Beginners',
 "It's Only the Himalayas",
 'How Music Works',
 'Maude (1883-1993):She Grew Up with the country',
 "You can't bury them all: Poems",
 'Reasons to Stay Alive',
 'Without Borders (Wanderlove #1)',
 'Soul Reader',
 'Security',
 'Saga, Volume 5 (Saga (Collected Editions) #5)',
 'Reskilling America: Learning to Labor in the Twenty-First Century',
 'Political Suicide: Missteps, Peccadilloes, Bad Calls, Backroom Hijinx, Sordid Pasts, Rotten Breaks, and Just Plain Dumb Mistakes in the Annals of American Politics',
 'Obsidian (Lux #1)',
 'My Paris Kitchen: Recipes and Stories',
 'Masks and Shadows',
 'Lumberjanes, Vol. 2: Friendship to the Max (Lumberjanes #5-8)',
 'Lumberjanes Vol. 3: A Terrible Plan (Lumberjanes #9-12)',
 'Judo: Seven Steps to Black Belt (an Introductory Guide for Beginners)',
 'I Hate Fairyland, Vol. 1: Madly Ever After (I Hate Fairyland (Compilations) #1-5)',
 'Giant Days, Vol. 2 (Giant Day

**Result:** List of all the 2 star rated books