# Scraping web data using Selenium

#### Description:

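This codebook covers methods for scraping web data using Selenium. The sections below start with the basics: fetching static pages with `requests` and parsing the returned HTML with `BeautifulSoup`.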

#### Skill level:

- Beginner

### Import the required libraries
-------------------------

In [None]:
import requests
from bs4 import BeautifulSoup

### Create a request for a simple URL
-------------------------

In [2]:
# Fetch the simple example page. The timeout stops the call from hanging
# indefinitely if the server never answers.
response = requests.get(
    'https://datakick.sfo2.digitaloceanspaces.com/datakick/media/platform/tutorials/scraping-web-data-using-selenium/simple.html',
    timeout=10,
)

# Raise immediately on a 4xx/5xx status rather than parsing an error page.
response.raise_for_status()

### Check the content of the raw URL response
-------------------------

In [3]:
# Keep the undecoded response body; it prints as a bytes literal (b'...')
content = response.content
print(f"content: {content}")

content: b'<!DOCTYPE html>\r\n<html>\r\n    <head>\r\n        <title>A simple example page</title>\r\n    </head>\r\n    <body>\r\n        <p>Here is some simple content for this page.</p>\r\n    </body>\r\n</html>'


### Create a parser for the raw URL response and extract specific content
-------------------------

In [4]:
# Build a parse tree from the response body using the 'html.parser' backend
parser = BeautifulSoup(content, features='html.parser')

In [5]:
# Pull the <body> element out of the parsed document
body_content = parser.body
print(f"<body> content:\n {body_content}")

<body> content:
 <body>
<p>Here is some simple content for this page.</p>
</body>


In [6]:
# Drill down from <body> to the <p> element inside it
p_content = parser.body.p
print(f"<p> content: {p_content}")

<p> content: <p>Here is some simple content for this page.</p>


In [7]:
# Extract only the text inside the <p> element, without the surrounding tag
p_text = parser.body.p.text
print(f"<p> text: {p_text}")

<p> text: Here is some simple content for this page.


### Create a request for a URL whose elements have id attributes
-------------------------

In [8]:
# Fetch the example page whose <p> elements carry id attributes. The timeout
# stops the call from hanging indefinitely if the server never answers.
response = requests.get(
    'https://datakick.sfo2.digitaloceanspaces.com/datakick/media/platform/tutorials/scraping-web-data-using-selenium/simple_ids.html',
    timeout=10,
)

# Raise immediately on a 4xx/5xx status rather than parsing an error page.
response.raise_for_status()

### Check the content of the raw URL response
-------------------------

In [9]:
# Keep the undecoded response body; it prints as a bytes literal (b'...')
content = response.content
print(f"content: {content}")

content: b'<html>\r\n    <head>\r\n        <title>A simple example page</title>\r\n    </head>\r\n    <body>\r\n        <div>\r\n            <p id="first">\r\n                First paragraph.\r\n            </p>\r\n        </div>\r\n        <p id="second">\r\n            <b>\r\n                Second paragraph.\r\n            </b>\r\n        </p>\r\n    </body>\r\n</html>'


### Create a parser for the raw URL response and extract specific content
-------------------------

In [10]:
# Build a parse tree from the response body using the 'html.parser' backend
parser = BeautifulSoup(content, features='html.parser')

In [11]:
# Only one element is wanted here, so find() (which returns the first match)
# is used instead of building a list with find_all() and indexing into it.
p_id_first_content = parser.find('p', id='first')

# .text keeps the whitespace from the HTML source, hence the indented output
p_id_first_text = p_id_first_content.text

print('<p id="first"> text:\n', p_id_first_text)

<p id="first"> text:
 
                First paragraph.
            


In [12]:
# Only one element is wanted here, so find() (which returns the first match)
# is used instead of building a list with find_all() and indexing into it.
p_id_second_content = parser.find('p', id='second')

# .text also includes the text of the nested <b> element, whitespace and all
p_id_second_text = p_id_second_content.text

print('<p id="second"> text:\n', p_id_second_text)

<p id="second"> text:
 

                Second paragraph.
            



### Create a request for a URL whose elements have class attributes
-------------------------

In [13]:
# Fetch the example page whose <p> elements carry class attributes. The timeout
# stops the call from hanging indefinitely if the server never answers.
response = requests.get(
    'https://datakick.sfo2.digitaloceanspaces.com/datakick/media/platform/tutorials/scraping-web-data-using-selenium/simple_classes.html',
    timeout=10,
)

# Raise immediately on a 4xx/5xx status rather than parsing an error page.
response.raise_for_status()

### Check the content of the raw URL response
-------------------------

In [14]:
# Keep the undecoded response body; it prints as a bytes literal (b'...')
content = response.content
print(f"content: {content}")

content: b'<html>\r\n    <head>\r\n        <title>A simple example page</title>\r\n    </head>\r\n    <body>\r\n        <div>\r\n            <p class="inner-text">\r\n                First paragraph.\r\n            </p>\r\n            <p class="inner-text">\r\n                Second paragraph.\r\n            </p>\r\n        </div>\r\n        <p class="outer-text">\r\n            <b>\r\n                First outer paragraph.\r\n            </b>\r\n        </p>\r\n        <p class="outer-text">\r\n            <b>\r\n                Second outer paragraph.\r\n            </b>\r\n        </p>\r\n    </body>\r\n</html>'


### Create a parser for the raw URL response and extract specific content
-------------------------

In [15]:
# Build a parse tree from the response body using the 'html.parser' backend
parser = BeautifulSoup(content, features='html.parser')

In [16]:
# Several <p> elements share this class, so find_all() collects every match
p_class_inner_content = parser.find_all('p', class_='inner-text')

# The loop variable is deliberately not called `content`: that name holds the
# raw response bytes and must not be overwritten with a Tag.
for p_tag in p_class_inner_content:
    p_class_inner_text = p_tag.text

    print('<p class="inner-text"> text:\n', p_class_inner_text)

<p class="inner-text"> text:
 
                First paragraph.
            
<p class="inner-text"> text:
 
                Second paragraph.
            
