## Web_Scraping Introduction
    Techniques for automating the gathering of data from a website

In [1]:
import requests

In [2]:
import bs4

In [3]:
import lxml

### Grab a page title

In [4]:
# Fetch the example page. A timeout keeps the cell from hanging forever on a
# stalled connection (requests waits indefinitely by default).
result = requests.get("http://www.example.com", timeout=10)
# Stop here on a 4xx/5xx response instead of parsing an error page below.
result.raise_for_status()

In [5]:
type(result)

requests.models.Response

In [6]:
result.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [7]:
soup = bs4.BeautifulSoup(result.text, "lxml")

In [8]:
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

In [9]:
soup.select('title')

[<title>Example Domain</title>]

In [10]:
soup.select('p')

[<p>This domain is for use in illustrative examples in documents. You may use this
     domain in literature without prior coordination or asking for permission.</p>,
 <p><a href="https://www.iana.org/domains/example">More information...</a></p>]

In [11]:
soup.select('h1')

[<h1>Example Domain</h1>]

In [14]:
soup.select('title')[0].getText()

'Example Domain'

In [15]:
site_paragraphs = soup.select('p')

In [19]:
site_paragraphs[0]

<p>This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.</p>

In [20]:
type(site_paragraphs[0])

bs4.element.Tag

In [21]:
site_paragraphs[0].getText()

'This domain is for use in illustrative examples in documents. You may use this\n    domain in literature without prior coordination or asking for permission.'

### Grab a Class
        soup.select('div')  :  all elements with the div tag
        '#some_id'          :  elements with that id
        '.some_class'       :  elements with that class
        'div span'          :  any element named span within a div element
        'div > span'        :  any element named span directly within a div element, with nothing in between

In [22]:
# Fetch the Wikipedia article. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors before the body is parsed.
res = requests.get('https://en.wikipedia.org/wiki/Grace_Hopper', timeout=10)
res.raise_for_status()

In [23]:
soup = bs4.BeautifulSoup(res.text, 'lxml')

In [25]:
# soup

In [26]:
soup.select('.toctext')

[<span class="toctext">Early life and education</span>,
 <span class="toctext">Career</span>,
 <span class="toctext">World War II</span>,
 <span class="toctext">UNIVAC</span>,
 <span class="toctext">COBOL</span>,
 <span class="toctext">Standards</span>,
 <span class="toctext">Retirement</span>,
 <span class="toctext">Post-retirement</span>,
 <span class="toctext">Anecdotes</span>,
 <span class="toctext">Death</span>,
 <span class="toctext">Dates of rank</span>,
 <span class="toctext">Awards and honors</span>,
 <span class="toctext">Military awards</span>,
 <span class="toctext">Other awards</span>,
 <span class="toctext">Legacy</span>,
 <span class="toctext">Places</span>,
 <span class="toctext">Programs</span>,
 <span class="toctext">In popular culture</span>,
 <span class="toctext">Grace Hopper Celebration of Women in Computing</span>,
 <span class="toctext">See also</span>,
 <span class="toctext">Notes</span>,
 <span class="toctext">Obituary notices</span>,
 <span class="toctext">Re

In [27]:
first_item = soup.select('.toctext')[0]

In [30]:
first_item.text

'Early life and education'

In [31]:
# Print the text of every element carrying the 'toctext' class, one per line.
for toc_entry in soup.select('.toctext'):
    print(toc_entry.get_text())

Early life and education
Career
World War II
UNIVAC
COBOL
Standards
Retirement
Post-retirement
Anecdotes
Death
Dates of rank
Awards and honors
Military awards
Other awards
Legacy
Places
Programs
In popular culture
Grace Hopper Celebration of Women in Computing
See also
Notes
Obituary notices
References
Further reading
External links


### Grab an Image

In [32]:
# Fetch the Deep Blue article. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors before the body is parsed.
res = requests.get("https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)", timeout=10)
res.raise_for_status()

In [33]:
soup = bs4.BeautifulSoup(res.text, 'lxml')

In [35]:
soup.select('img')[0]

<img alt="" class="thumbimage" data-file-height="601" data-file-width="400" decoding="async" height="331" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/330px-Deep_Blue.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/b/be/Deep_Blue.jpg 2x" width="220"/>

In [36]:
soup.select('.thumbimage')

[<img alt="" class="thumbimage" data-file-height="601" data-file-width="400" decoding="async" height="331" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/330px-Deep_Blue.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/b/be/Deep_Blue.jpg 2x" width="220"/>,
 <img alt="" class="thumbimage" data-file-height="600" data-file-width="800" decoding="async" height="165" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/330px-Kasparov_Magath_1985_Hamburg-2.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/440px-Kasparov_Magath_1985_Hamburg-2.png 2x" width="220"/>]

In [37]:
computer = soup.select('.thumbimage')[0]

In [38]:
computer

<img alt="" class="thumbimage" data-file-height="601" data-file-width="400" decoding="async" height="331" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/330px-Deep_Blue.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/b/be/Deep_Blue.jpg 2x" width="220"/>

In [39]:
type(computer)

bs4.element.Tag

In [43]:
computer['class']

['thumbimage']

In [44]:
computer['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg'

<img src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg">

In [45]:
# Download the image bytes. The URL is the protocol-relative `src` value shown
# above with an explicit "https:" scheme prepended.
image_link = requests.get("https://upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg", timeout=10)
# Fail here on an HTTP error so an error page is never saved as a .jpg later.
image_link.raise_for_status()

In [47]:
# image_link.content

In [48]:
# WRITE THE IMAGE_CONTENT TO AN IMAGE FILE
# Binary mode ('wb') is required because image_link.content is raw bytes.
# The context manager closes the file even if the write raises, so the handle
# is not left open when a later "close" cell is skipped or never run.
with open('computer_image.jpg', mode='wb') as f:
    f.write(image_link.content)