In [1]:
# This example shows how to remove markup elements
# and extract only the relevant content.
#
# In this notebook, the BeautifulSoup component is used in
# the first two examples. The third example uses regular
# expressions to filter data. The last example combines BeautifulSoup,
# regular expressions and requests. All preprocessing related to
# markup removal is now wrapped by a function.
#
# Author: Fabrício Galende M. de Carvalho
#

from bs4 import BeautifulSoup

# Request response models a simple http request raw response. 
request_response_text = '''
<body>
<p>The product is good!</p>
<p>It was delivered on time. </p>
<p>Definetely I will buy it again! </p>
<script> var x = 1; </script>
<iframe> contents... </iframe>
<p>Thanks folks! </p>
</body>
'''

print("Raw response text: ")
print(request_response_text)

soup_obj = BeautifulSoup(request_response_text, "html.parser")
paragraphs = soup_obj.find_all("p")
print(paragraphs)
paragraphs_contents = [ p.text for p in paragraphs]
print(paragraphs_contents)


Raw response text: 

<body>
<p>The product is good!</p>
<p>It was delivered on time. </p>
<p>Definetely I will buy it again! </p>
<script> var x = 1; </script>
<iframe> contents... </iframe>
<p>Thanks folks! </p>
</body>

[<p>The product is good!</p>, <p>It was delivered on time. </p>, <p>Definetely I will buy it again! </p>, <p>Thanks folks! </p>]
['The product is good!', 'It was delivered on time. ', 'Definetely I will buy it again! ', 'Thanks folks! ']


In [2]:
# Now, suppose that the HTML is not well formed. In this case, one
# must extract all tags that do not carry relevant textual content
request_response_text_2 = '''
<body>
<p>The product is good!</p>
<p>It was delivered on time. </p>
<p>Definetely I will buy it again! </p>
<script> var x = 1; </script>
<div>This content is also poorly structured. </div>
<iframe> This content is not relevant. </iframe>
This paragraph is not inside any tag, so, simple tag text finding will not work here.
<p>Thanks folks! </p>
</body>
'''
soup_obj_2 = BeautifulSoup(request_response_text_2, "html.parser")
for element in soup_obj_2(["script", "style", "meta", "link", "iframe"]):
    element.decompose() #destroys elements in a recursive way

print("Get all elements after destroying irrelevant ones.")
print(soup_obj_2.find_all(True))

# Here we unwrap the content (remaining tag removal)
for element in soup_obj_2.find_all(True):
    element.unwrap()

print("\n\n\n *********** ")
print("Content after unwrapping")
print(soup_obj_2.get_text())



Get all elements after destroying irrelevant ones.
[<body>
<p>The product is good!</p>
<p>It was delivered on time. </p>
<p>Definetely I will buy it again! </p>

<div>This content is also poorly structured. </div>

This paragraph is not inside any tag, so, simple tag text finding will not work here.
<p>Thanks folks! </p>
</body>, <p>The product is good!</p>, <p>It was delivered on time. </p>, <p>Definetely I will buy it again! </p>, <div>This content is also poorly structured. </div>, <p>Thanks folks! </p>]



 *********** 
Content after unwrapping


The product is good!
It was delivered on time. 
Definetely I will buy it again! 

This content is also poorly structured. 

This paragraph is not inside any tag, so, simple tag text finding will not work here.
Thanks folks! 




In [5]:
# Now, we use regular expression to get the relevant content
import re
request_response_text_3 = '''
<body>
<p>The product is good!</p>
<p>It was delivered on time. </p>
<p>Definetely I will buy it again! </p>
<script> var x = 1; </script>
<div>This content is also poorly structured. </div>
<iframe> This content is not relevant. </iframe>
<p>Thanks folks! </p>
</body>
'''

# . any character except \n, * one or more occurence of previous element, 
# () captures only the data inside <p> </p>, i.e., its a group
regex_filter = r'<p>(.*?)</p>'  
relevant_content = re.findall(regex_filter, request_response_text_3)
print(relevant_content)


['The product is good!', 'It was delivered on time. ', 'Definetely I will buy it again! ', 'Thanks folks! ']


In [4]:
import requests
from bs4 import BeautifulSoup

# a function to get rid of html tags
def get_rid_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    # iframe and script nodes removal from doc tree
    [s.extract() for s in soup(['iframe','script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]','\n',stripped_text) #replace some special characters with new line
    return stripped_text

data = requests.get("http://gutenberg.org/cache/epub/8001/pg8001.html")
content = data.content
soup = BeautifulSoup(content, "html.parser")
clean_content = get_rid_html_tags(soup.get_text())
print(clean_content[1:1000])

The Project Gutenberg eBook of The Bible, King James version, Book 1: Genesis, by Anonymous















The Project Gutenberg eBook of The Bible, King James version, Book 1: Genesis
This ebook is for the use of anyone anywhere in the United States and

most other parts of the world at no cost and with almost no restrictions

whatsoever. You may copy it, give it away or re-use it under the terms

of the Project Gutenberg License included with this ebook or online

at www.gutenberg.org. If you are not located in the United States,

you will have to check the laws of the country where you are located

before using this eBook.
Title: The Bible, King James version, Book 1: Genesis

Author: Anonymous

Release date: April 1, 2005 [eBook #8001]

                Most recently updated: December 26, 2020
Language: English
Credits: This eBook was produced by David Widger with the help of Derek Andrew's text from January 1992 and the work of Bryan Taylor in November 2002

*** START OF THE PROJE
