In [2]:
import requests
import bs4

# Scraping a title

In [5]:
# Issue an HTTP GET for the page, then check what kind of object came back.
example_url = 'https://example.com/'
result = requests.get(example_url)
type(result)  # last expression of the cell, so its value is echoed

requests.models.Response

In [6]:
result.text # .text is the response body as a str, i.e. the page's HTML source

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [8]:
# Turn the raw HTML string into a BeautifulSoup object that can be queried;
# 'lxml' names the parser to use.
soup = bs4.BeautifulSoup(result.text, features='lxml')

In [9]:
# Echo the parsed document; it is displayed back as HTML.
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

In [29]:
# soup.select(selector) looks up elements in the parsed document and
# returns a list of the matching tags (markup plus enclosed text).
# NOTE: a notebook cell echoes only its last expression, so only the 'p'
# result appears in the output; the 'title' and 'h1' results are discarded.
soup.select('title')
soup.select('h1')
soup.select('p')

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [21]:
soup.select('title')[0] # select() returns a list, so [0] picks out the first match

<title>Example Domain</title>

In [22]:
# getText() drops the surrounding markup and returns just the text inside the tag.
soup.select('title')[0].getText()

'Example Domain'

In [23]:
# Text of the first <p>; the line break and indentation from the HTML source are kept as-is.
soup.select('p')[0].getText()

'This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.'

In [24]:
# The second <p> only wraps a link, so its text is the link's label.
soup.select('p')[1].getText()

'More information...'

In [32]:
# Same pattern applied to the page's <h1> heading.
soup.select('h1')[0].getText()

'Example Domain'

# Grabbing Elements of a Class

<table>

<thead >
<tr>
<th>
<p>Syntax to pass to the .select() method</p>
</th>
<th>
<p>Match Results</p>
</th>
</tr>
</thead>
<tbody>
<tr>
<td>
<p><code>soup.select('div')</code></p>
</td>
<td>
<p>All elements with the <code>&lt;div&gt;</code> tag</p>
</td>
</tr>
<tr>
<td>
<p><code>soup.select('#some_id')</code></p>
</td>
<td>
<p>The HTML element whose <code>id</code> attribute is <code>some_id</code></p>
</td>
</tr>
<tr>
<td>
<p><code>soup.select('.notice')</code></p>
</td>
<td>
<p>All the HTML elements with the CSS <code>class</code> named <code>notice</code></p>
</td>
</tr>
<tr>
<td>
<p><code>soup.select('div span')</code></p>
</td>
<td>
<p>Any elements named <code>&lt;span&gt;</code> that are within an element named <code>&lt;div&gt;</code></p>
</td>
</tr>
<tr>
<td>
<p><code>soup.select('div &gt; span')</code></p>
</td>
<td>
<p>Any elements named <code>&lt;span&gt;</code> that are <em>directly</em> within an element named <code>&lt;div&gt;</code>, with no other element in between</p>
</td>
</tr>
</tbody>
</table>

In [33]:
# Fetch a larger, real-world page to practise selecting elements by CSS class.
res = requests.get('https://en.wikipedia.org/wiki/Elizabeth_Olsen')

In [34]:
# NOTE: this rebinds `soup`; from here on it refers to the Wikipedia page,
# not the example.com document parsed earlier.
soup = bs4.BeautifulSoup(res.text,'lxml')

In [35]:
# Echo the parsed page to inspect its markup (the output is long).
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Elizabeth Olsen - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"1fc065d0-5ebe-4e74-9885-2a4fa87f07a8","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Elizabeth_Olsen","wgTitle":"Elizabeth Olsen","wgCurRevisionId":1067772216,"wgRevisionId":1067772216,"wgArticleId":29642053,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description matches Wikidata","Wikipedia indefinitely move-protected pages","Wikipedia pages semi-protecte

In [39]:
# Target: the elements with the CSS class "toctext".
# A leading '.' in the selector matches by class name.
# Each match is a bs4 Tag object, as the type check on the first one shows.
type(soup.select('.toctext')[0])

bs4.element.Tag

In [46]:
# Collect the text of every element with class "toctext".
# `list_toc` is bound here so the cell runs on a fresh kernel (Restart & Run All);
# without this line the comprehension raises NameError.
list_toc = soup.select('.toctext')
# Iterate over the tags directly rather than indexing with range(len(...)).
list_comp = [entry.getText() for entry in list_toc]

In [49]:
# Print the collected entries, one per line.
for heading in list_comp:
    print(heading)

Early life and education
Career
Early roles and acclaim (2011–2014)
Marvel Cinematic Universe and continued success (2015–present)
Personal life
Acting credits
Film
Television
Theatre
Awards and nominations
Notes
References
External links


In [48]:
# Print the text of every element with class "toctext", one per line.
toc_entries = soup.select('.toctext')
for entry in toc_entries:
    print(entry.text)

Early life and education
Career
Early roles and acclaim (2011–2014)
Marvel Cinematic Universe and continued success (2015–present)
Personal life
Acting credits
Film
Television
Theatre
Awards and nominations
Notes
References
External links


# Scraping an Image

In [57]:
import requests
import bs4

# Fetch and parse the Deep Blue article; `soup` is rebound to this page.
deep_blue_url = 'https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)'
page = requests.get(deep_blue_url)
soup = bs4.BeautifulSoup(page.text, features='lxml')

# Selecting by tag name returns the <img> elements found in the document.
soup.select('img')

[<img alt="This is a good article. Click here for more information." data-file-height="185" data-file-width="180" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/29px-Symbol_support_vote.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/39px-Symbol_support_vote.svg.png 2x" width="19"/>,
 <img alt="Deep Blue.jpg" data-file-height="601" data-file-width="400" decoding="async" height="331" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/330px-Deep_Blue.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/b/be/Deep_Blue.jpg 2x" width="220"/>,
 <img alt="Chess Programming.svg" data-file-height="60" data-file-width="60" decoding="async" height="150" src="//upload.wikimedia.org/

In [56]:
# Narrow the search: select only the elements carrying the CSS class "thumbimage"
# instead of every <img> tag.
soup.select('.thumbimage')

[<img alt="" class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/330px-Kasparov_Magath_1985_Hamburg-2.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/440px-Kasparov_Magath_1985_Hamburg-2.png 2x" width="220"/>,
 <img alt="" class="thumbimage" data-file-height="2756" data-file-width="2067" decoding="async" height="293" src="//upload.wikimedia.org/wikipedia/commons/thumb/8/83/One_of_Deep_Blue%27s_processors_%282586060990%29.jpg/220px-One_of_Deep_Blue%27s_processors_%282586060990%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/83/One_of_Deep_Blue%27s_processors_%282586060990%29.jpg/330px-One_of_Deep_Blue%27s_processors_%282586060990%29.jpg 1.5x, //upload.wiki

In [58]:
# Keep the first "thumbimage" match for a closer look.
img_1= soup.select('.thumbimage')[0]

In [60]:
# Check what kind of object a single match is.
type(img_1)

bs4.element.Tag

In [61]:
# A Tag supports dictionary-style indexing on its HTML attributes,
# so ['src'] returns the value of the image's src attribute.

img_1['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png'

# Use of Markdown: embedding the image with an `<img>` tag

<img
     src='//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png'>

In [62]:
# Second "thumbimage" match.
img_2=soup.select('.thumbimage')[1]

In [63]:
# The src value starts with '//' (no scheme), so a scheme has to be added before it can be requested.
img_2['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/8/83/One_of_Deep_Blue%27s_processors_%282586060990%29.jpg/220px-One_of_Deep_Blue%27s_processors_%282586060990%29.jpg'

<img
     src = '//upload.wikimedia.org/wikipedia/commons/thumb/8/83/One_of_Deep_Blue%27s_processors_%282586060990%29.jpg/220px-One_of_Deep_Blue%27s_processors_%282586060990%29.jpg'>

In [64]:
# Download the image itself: this is the src value of img_2 with 'https:' put in front,
# because the attribute holds a scheme-less '//upload.wikimedia.org/...' URL.
image_link = requests.get("https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/One_of_Deep_Blue%27s_processors_%282586060990%29.jpg/220px-One_of_Deep_Blue%27s_processors_%282586060990%29.jpg")

In [66]:
image_link.content # .content is the response body as raw bytes: the image data itself, not text

b'\xff\xd8\xff\xdb\x00C\x00\x04\x03\x03\x04\x03\x03\x04\x04\x03\x04\x05\x04\x04\x05\x06\n\x07\x06\x06\x06\x06\r\t\n\x08\n\x0f\r\x10\x10\x0f\r\x0f\x0e\x11\x13\x18\x14\x11\x12\x17\x12\x0e\x0f\x15\x1c\x15\x17\x19\x19\x1b\x1b\x1b\x10\x14\x1d\x1f\x1d\x1a\x1f\x18\x1a\x1b\x1a\xff\xdb\x00C\x01\x04\x05\x05\x06\x05\x06\x0c\x07\x07\x0c\x1a\x11\x0f\x11\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\x1a\xff\xc0\x00\x11\x08\x01%\x00\xdc\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1d\x00\x00\x00\x07\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x03\x04\x05\x06\x07\x08\x00\x01\t\xff\xc4\x00S\x10\x00\x02\x01\x02\x04\x03\x05\x05\x02\t\x07\x08\t\x04\x03\x00\x01\x02\x03\x04\x11\x00\x05\x12!\x061A\x07\x13"Qa\x142q\x81\x91#\xa1\x15$BRb\x92\xb1\xc1\xd1\x08\x16%3crs45CDS\xa2\xb2\xc2\x17ETt\x82\x83\x93\xd2\xf0d\x84\xe1\xe2\xa3\xc3\xf1\xff\xc4\x00\x

In [67]:
# Write the downloaded bytes to disk; 'wb' = write binary, needed because .content is bytes.
# The with-block closes the file even if write() fails. `f` stays bound afterwards,
# and calling close() on an already-closed file is a no-op.
with open('img_2.jpg', 'wb') as f:
    bytes_written = f.write(image_link.content)
bytes_written  # echo the number of bytes written, as the bare f.write() call did

17685

In [68]:
# Closing the handle flushes any buffered data and releases the file.
f.close()

In [69]:
# Download the first thumbnail image.
image_link2 = requests.get('https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png')

# Save the raw bytes; the with-block closes the file even if write() raises.
# NOTE(review): the URL points at a .png but the file is saved with a .jpg
# extension. The filename is kept unchanged here; confirm whether 'ppl.png' was intended.
with open('ppl.jpg', 'wb') as f:
    f.write(image_link2.content)

# Book Examples 1 and 2

In [78]:
import requests
import bs4

# GOAL: collect the title of every book that has a two-star rating.

# URL pattern: the catalogue pages differ only in the page number, e.g.

# https://books.toscrape.com/catalogue/page-1.html
# https://books.toscrape.com/catalogue/page-2.html


base_url = "https://books.toscrape.com/catalogue/page-{}.html"

# Fill the {} placeholder with base_url.format(page_number);
# looping over a range of page numbers visits each catalogue page.

# CSS classes to look for:
# - the rating is encoded as "star-rating One" / "Two" / "Three" / "Four" / "Five"
# - the star-rating element sits inside the "product_pod" element

'https://books.toscrape.com/catalogue/page-12.html'

In [85]:
# Start with page 1 only, to work out the extraction logic on a single page.
first_page_url = base_url.format(1)
first_page = requests.get(first_page_url)

# Parse that page's HTML.
soup = bs4.BeautifulSoup(first_page.text, features='lxml')

# Keep every element with the class "product_pod" in a variable.
products = soup.select('.product_pod')

In [86]:
# Work with a single product while figuring out the extraction logic.
example = products[0]

In [94]:
example

# What is needed from each product: its title, and whether its rating class is "star-rating Two".

# Quick and dirty way: look for the class text in the tag's string form.
# (Not echoed: only the last expression of a cell is displayed.)

'star-rating Three' in str(example)

# More reliable way: select by class and see whether anything matched.
# The echoed value is True when the result is empty, i.e. this product is NOT rated two stars.

[] == example.select('.star-rating.Two') # "star-rating Two" is two classes; in a selector, replace the space with '.'

True

In [102]:
# The second <a> inside the product carries the book title in its 'title' attribute.
example.select('a')[1]['title']

'A Light in the Attic'

In [109]:
# Walk every catalogue page and collect the titles of the two-star books.
LAST_PAGE = 50  # named bound for the page loop (previously the literal range(1,51))

two_star_titles = []

for n in range(1, LAST_PAGE + 1):
    scrape_url = base_url.format(n)
    result = requests.get(scrape_url)
    # Stop on an HTTP error status (404, 5xx, ...) instead of parsing an error
    # page, which would otherwise be processed as if it were a catalogue page.
    result.raise_for_status()
    soup = bs4.BeautifulSoup(result.text, 'lxml')
    products = soup.select('.product_pod')
    for book in products:
        # select() gives an empty (falsy) result when the product has no
        # element with both the "star-rating" and "Two" classes.
        if book.select('.star-rating.Two'):
            # The second <a> in the product carries the title attribute.
            two_star_titles.append(book.select('a')[1]['title'])

In [110]:
# Number of two-star titles collected by the page loop.
len(two_star_titles)

196

In [112]:
# List the collected titles, one per line.
for title in two_star_titles:
    print(title)

Starving Hearts (Triangular Trade Trilogy, #1)
Libertarianism for Beginners
It's Only the Himalayas
How Music Works
Maude (1883-1993):She Grew Up with the country
You can't bury them all: Poems
Reasons to Stay Alive
Without Borders (Wanderlove #1)
Soul Reader
Security
Saga, Volume 5 (Saga (Collected Editions) #5)
Reskilling America: Learning to Labor in the Twenty-First Century
Political Suicide: Missteps, Peccadilloes, Bad Calls, Backroom Hijinx, Sordid Pasts, Rotten Breaks, and Just Plain Dumb Mistakes in the Annals of American Politics
Obsidian (Lux #1)
My Paris Kitchen: Recipes and Stories
Masks and Shadows
Lumberjanes, Vol. 2: Friendship to the Max (Lumberjanes #5-8)
Lumberjanes Vol. 3: A Terrible Plan (Lumberjanes #9-12)
Judo: Seven Steps to Black Belt (an Introductory Guide for Beginners)
I Hate Fairyland, Vol. 1: Madly Ever After (I Hate Fairyland (Compilations) #1-5)
Giant Days, Vol. 2 (Giant Days #5-8)
Everydata: The Misinformation Hidden in the Little Data You Consume Every 

In [113]:
import numpy as np
import pandas as pd

# Put the collected titles into a one-column table and show the first ten rows.
column_label = 'Books with 2-Star Rating'
df = pd.DataFrame(data=two_star_titles, columns=[column_label])
df.head(10)

Unnamed: 0,Books with 2-Star Rating
0,"Starving Hearts (Triangular Trade Trilogy, #1)"
1,Libertarianism for Beginners
2,It's Only the Himalayas
3,How Music Works
4,Maude (1883-1993):She Grew Up with the country
5,You can't bury them all: Poems
6,Reasons to Stay Alive
7,Without Borders (Wanderlove #1)
8,Soul Reader
9,Security
