In [6]:
# Loading Library
import requests
from bs4 import BeautifulSoup

### Send HTTP GET Request

In [7]:
# Use the 'requests' library to send an HTTP GET request
# to the 'travel' subreddit
url = 'https://old.reddit.com/r/travel/'

# Headers to mimic a browser visit
headers = {'User-Agent': 'Mozilla/5.0'}

# Returns a requests.models.Response object.
# The timeout (seconds) stops the cell from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=10)

# Fail loudly on an HTTP error status (e.g. 403 / 429) instead of
# silently handing an error page to the parser in the next cell
response.raise_for_status()

In [8]:
# Build the BeautifulSoup tree from the raw page source (a string).
# The HTML parser is named explicitly because BeautifulSoup can also make soup out of XML.
page_source = response.text
soup = BeautifulSoup(page_source, 'html.parser')

### Understanding BeautifulSoup (from https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

In [9]:
# Show the page's HTML in 'prettified' (re-indented) form
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <title>
   travel
  </title>
  <meta content=" reddit, reddit.com, vote, comment, submit " name="keywords"/>
  <meta content="r/travel is a community about exploring the world. Your pictures, questions, stories, or any good content is welcome. Clickbait, spam, memes,..." name="description"/>
  <meta content="always" name="referrer"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
   <link href="/static/opensearch.xml" rel="search" type="application/opensearchdescription+xml"/>
   <link href="https://www.reddit.com/r/travel/" rel="canonical"/>
   <meta content="width=1024" name="viewport"/>
   <link href="//out.reddit.com" rel="dns-prefetch"/>
   <link href="//out.reddit.com" rel="preconnect"/>
   <meta content="https://styles.redditmedia.com/t5_2qh41/styles/communityIcon_x4pa3xf5z4d11.jpg" property="og:image"/>
   <meta content="reddit" property="og:site_name"/>
   <meta

In [10]:
# Attribute access on the soup returns the FIRST tag with that name.
# Other options include soup.head, soup.body, soup.a, soup.p etc.
first_by_name = (
    soup.title,  # the title of the soup
    soup.div,    # the first tag named 'div'
)
for first_tag in first_by_name:
    print(first_tag)

<title>travel</title>
<div class="ad adsense-ad adsense-ads googad googads gemini-ad openx ad-banner ad-BANNER GoogleAd googleAd hasads LeftAd native-ad ad-300-250 adbar ads-area HeaderAd NavBarAd ad-medium post-ad promoad rectad sidebar-ad small-ad sponsorAd sponsorPost" id="adblock-test"></div>


In [11]:
# Collect ALL <a> tags; the resulting list is displayed as the cell's output
all_links = soup.find_all('a')
all_links

[<a href="#content" id="jumpToContent" tabindex="1">jump to content</a>,
 <a class="bottom-option choice" href="https://old.reddit.com/subreddits/">edit subscriptions</a>,
 <a class="choice" href="https://old.reddit.com/r/popular/">popular</a>,
 <a class="choice" href="https://old.reddit.com/r/all/">all</a>,
 <a class="random choice" href="https://old.reddit.com/r/random/">random</a>,
 <a class="choice" href="https://old.reddit.com/users/">users</a>,
 <a class="choice" href="https://old.reddit.com/r/AskReddit/">AskReddit</a>,
 <a class="choice" href="https://old.reddit.com/r/gaming/">gaming</a>,
 <a class="choice" href="https://old.reddit.com/r/pics/">pics</a>,
 <a class="choice" href="https://old.reddit.com/r/funny/">funny</a>,
 <a class="choice" href="https://old.reddit.com/r/news/">news</a>,
 <a class="choice" href="https://old.reddit.com/r/worldnews/">worldnews</a>,
 <a class="choice" href="https://old.reddit.com/r/mildlyinteresting/">mildlyinteresting</a>,
 <a class="choice" href=

#### Important: BeautifulSoup object has many layers (see output from .prettify()) - this introduces the concept of children and descendants

In [12]:
# Walking down from the header tag (soup.head) with .contents, .children and .descendants
head_tag = soup.head

# .contents is the list of a tag's direct children; the first element of soup.head is the title
title_tag = head_tag.contents[0]

# title_tag is itself a tag, so it has its own 'contents' attribute
# (nothing is shown for this line: only the last expression of a cell is echoed)
title_tag.contents

# .contents and .children only consider a tag's direct children
for direct_child in title_tag.children:
    print(f'title_tag child(ren): {direct_child}')

# .descendants goes through every level below the tag; title_tag also has only 1 descendant
for nested_node in title_tag.descendants:
    print(f'title_tag descendant(s): {nested_node}')


title_tag child(ren): travel
title_tag descendant(s): travel


In [13]:
# What about the fifth element of soup.head?
# It looks like it holds multiple icons (twitter, apple etc) - print misc_tag to confirm
misc_tag = head_tag.contents[4]

# Like title_tag, misc_tag exposes .children (direct children only)
# and .descendants (every level below it)
misc_children = list(misc_tag.children)
misc_descendants = list(misc_tag.descendants)

# misc_tag has more children than the title_tag ...
print(f'misc tag children: {len(misc_children)}')

# ... and many more descendants than children
print(f'misc tag descendants: {len(misc_descendants)}')

misc tag children: 17
misc tag descendants: 55


In [14]:
# The higher up the tree a tag sits, the more descendants it has.
# For example, count everything nested anywhere below head_tag:
head_descendant_count = sum(1 for _node in head_tag.descendants)
print(f'head_tag descendants:{head_descendant_count}')

head_tag descendants:61


In [15]:
# A tag with exactly one child makes that child available as .string
title_text = title_tag.string
title_text

'travel'

In [16]:
# When a tag contains more than one thing it is not clear what .string should refer to,
# so .string is defined to be None.
# Exception: a tag whose only child is another tag inherits that child's .string.
# Print .string for each child of the head tag:
for head_child in head_tag:
    child_string = head_child.string
    print(child_string)

# Note: only the first child (the title, which has a single child) prints a string;
# every other child prints None

travel
None
None
None
None


In [17]:
# If .string is None, look at .strings instead: it yields every string inside the tag.
# repr() makes whitespace and special characters (e.g. '\xa0') visible in the output.
for text_node in soup.strings:
    print(repr(text_node))

# Can remove extra whitespaces by using .stripped_strings

'travel'
'jump to content'
'my subreddits'
'edit subscriptions'
'popular'
'-'
'all'
'-'
'random'
'-'
'users'
'\xa0|\xa0'
'AskReddit'
'-'
'gaming'
'-'
'pics'
'-'
'funny'
'-'
'news'
'-'
'worldnews'
'-'
'mildlyinteresting'
'-'
'movies'
'-'
'todayilearned'
'-'
'explainlikeimfive'
'-'
'aww'
'-'
'videos'
'-'
'OldSchoolCool'
'-'
'TwoXChromosomes'
'-'
'tifu'
'-'
'dataisbeautiful'
'-'
'LifeProTips'
'-'
'science'
'-'
'books'
'-'
'Jokes'
'-'
'Showerthoughts'
'-'
'Music'
'-'
'space'
'-'
'UpliftingNews'
'-'
'askscience'
'-'
'IAmA'
'-'
'Futurology'
'-'
'gifs'
'-'
'sports'
'-'
'gadgets'
'-'
'nosleep'
'-'
'food'
'-'
'history'
'-'
'announcements'
'-'
'InternetIsBeautiful'
'-'
'WritingPrompts'
'-'
'GetMotivated'
'-'
'philosophy'
'-'
'Documentaries'
'-'
'creepy'
'-'
'EarthPorn'
'-'
'photoshopbattles'
'-'
'nottheonion'
'-'
'listentothis'
'-'
'blog'
'-'
'DIY'
'more »'
'\xa0'
'travel'
'hot'
'new'
'rising'
'controversial'
'top'
'wiki'
'Want to join? '
'Log in'
' or '
'sign up'
' in seconds.'
'|'
'English'
'l

In [18]:
# Going up - every tag and every string has a .parent, i.e. the tag that contains it
print(f'{title_tag}\n')
title_parent = title_tag.parent
title_parent

<title>travel</title>



<head><title>travel</title><meta content=" reddit, reddit.com, vote, comment, submit " name="keywords"/><meta content="r/travel is a community about exploring the world. Your pictures, questions, stories, or any good content is welcome. Clickbait, spam, memes,..." name="description"/><meta content="always" name="referrer"/><meta content="text/html; charset=utf-8" http-equiv="Content-Type"><link href="/static/opensearch.xml" rel="search" type="application/opensearchdescription+xml"/><link href="https://www.reddit.com/r/travel/" rel="canonical"/><meta content="width=1024" name="viewport"/><link href="//out.reddit.com" rel="dns-prefetch"/><link href="//out.reddit.com" rel="preconnect"/><meta content="https://styles.redditmedia.com/t5_2qh41/styles/communityIcon_x4pa3xf5z4d11.jpg" property="og:image"/><meta content="reddit" property="og:site_name"/><meta content="r/travel is a community about exploring the world. Your pictures, questions, stories, or any good content is welcome. Clickbait, 

In [19]:
# A top-level tag like <html> has the BeautifulSoup object itself as its parent ...
html_tag = soup.html
html_parent_type = type(html_tag.parent)
print(html_parent_type)

# ... and the .parent of the BeautifulSoup object is defined as None
print(soup.parent)

<class 'bs4.BeautifulSoup'>
None


In [20]:
# .parents iterates over all of an element's ancestors
link = soup.a
print(f'link: {link}')

# Travel from the first <a> tag up to the very top of the document,
# printing the name of each ancestor on the way
for ancestor in link.parents:
    print(ancestor.name)

link: <a href="#content" id="jumpToContent" tabindex="1">jump to content</a>
div
body
html
[document]


In [21]:
# Lastly, the concept of siblings: a tiny document where <a> contains <b> and <c>
sibling_markup = "<a><b>text1</b><c>text2</c></a>"
sibling_soup = BeautifulSoup(sibling_markup, 'html.parser')
print(sibling_soup.prettify())

<a>
 <b>
  text1
 </b>
 <c>
  text2
 </c>
</a>



In [22]:
# <b> and <c> are siblings here because they are both direct children of the same parent, <a>; prettify() shows siblings at the same indentation level

In [23]:
# soup.p is the first <p> tag in the document
first_p = soup.p
print(f'{first_p} \n')

# Navigate between siblings with .next_sibling and .previous_sibling (if they exist)
print(f'{first_p.next_sibling} \n')

# The first <p> tag does not have a previous sibling
print(f'{first_p.previous_sibling} \n')

# Two steps along: the sibling after the next sibling
print(f'{first_p.next_sibling.next_sibling} \n')

<p>use the following search parameters to narrow your results:</p> 

<dl><dt>subreddit:<i>subreddit</i></dt><dd>find submissions in "subreddit"</dd><dt>author:<i>username</i></dt><dd>find submissions by "username"</dd><dt>site:<i>example.com</i></dt><dd>find submissions from "example.com"</dd><dt>url:<i>text</i></dt><dd>search for "text" in url</dd><dt>selftext:<i>text</i></dt><dd>search for "text" in self post contents</dd><dt>self:yes (or self:no)</dt><dd>include (or exclude) self posts</dd><dt>nsfw:yes (or nsfw:no)</dt><dd>include (or exclude) results marked as NSFW</dd></dl> 

None 

<p>e.g. <code>subreddit:aww site:imgur.com dog</code></p> 



In [24]:
# .next_siblings yields every sibling that comes after the first <p> tag
for following in soup.p.next_siblings:
    print(f'{following!r}\n')

<dl><dt>subreddit:<i>subreddit</i></dt><dd>find submissions in "subreddit"</dd><dt>author:<i>username</i></dt><dd>find submissions by "username"</dd><dt>site:<i>example.com</i></dt><dd>find submissions from "example.com"</dd><dt>url:<i>text</i></dt><dd>search for "text" in url</dd><dt>selftext:<i>text</i></dt><dd>search for "text" in self post contents</dd><dt>self:yes (or self:no)</dt><dd>include (or exclude) self posts</dd><dt>nsfw:yes (or nsfw:no)</dt><dd>include (or exclude) results marked as NSFW</dd></dl>

<p>e.g. <code>subreddit:aww site:imgur.com dog</code></p>

<p><a href="https://www.reddit.com/wiki/search">see the search faq for details.</a></p>



In [25]:
# Lastly, .next_element and .previous_element step to whatever comes directly
# after / before an element in the document (not necessarily a sibling)
first_p = soup.p
print(f'First element with a <p> tag:\n{first_p}\n')
print(f'Previous element:\n {first_p.previous_element}\n')
print(f'Next, next element:\n {first_p.next_element.next_element}\n')

First element with a <p> tag:
<p>use the following search parameters to narrow your results:</p>

Previous element:
 <div id="moresearchinfo"><p>use the following search parameters to narrow your results:</p><dl><dt>subreddit:<i>subreddit</i></dt><dd>find submissions in "subreddit"</dd><dt>author:<i>username</i></dt><dd>find submissions by "username"</dd><dt>site:<i>example.com</i></dt><dd>find submissions from "example.com"</dd><dt>url:<i>text</i></dt><dd>search for "text" in url</dd><dt>selftext:<i>text</i></dt><dd>search for "text" in self post contents</dd><dt>self:yes (or self:no)</dt><dd>include (or exclude) self posts</dd><dt>nsfw:yes (or nsfw:no)</dt><dd>include (or exclude) results marked as NSFW</dd></dl><p>e.g. <code>subreddit:aww site:imgur.com dog</code></p><p><a href="https://www.reddit.com/wiki/search">see the search faq for details.</a></p></div>

Next, next element:
 <dl><dt>subreddit:<i>subreddit</i></dt><dd>find submissions in "subreddit"</dd><dt>author:<i>username</

#### Search the Tree

In [26]:
import re

# Small sample document used for all the search examples
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

test_soup = BeautifulSoup(html_doc, 'html.parser')

# Method signature: find_all(name, attrs, recursive, string, limit, **kwargs)
# Passing a tag name returns every tag with that name
print("All <a> tags \n")
print(test_soup.find_all('a'))
print("\n")
print("All <p> tags \n")
print(test_soup.find_all('p'))
print("\n")
# Note: the <p> tags that are found keep their <a> tags (as they are children)

# Alternatively, pass a regular expression - it is matched against the tag names.
# Tags whose name starts with the letter "b", i.e. <body> and <b>:
starts_with_b = re.compile("^b")
for match in test_soup.find_all(starts_with_b):
    print(match.name)
print("\n")

# Tags whose name contains the letter "t" anywhere:
contains_t = re.compile("t")
for match in test_soup.find_all(contains_t):
    print(match.name)

All <a> tags 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


All <p> tags 

[<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, <p class="story">...</p>]


body
b


html
title


In [23]:
# A list of names matches tags with ANY of those names
wanted_names = ["a", "b"]
test_soup.find_all(wanted_names)

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [25]:
# True matches every tag (but none of the text strings)
for any_tag in test_soup.find_all(True):
    print(any_tag.name)


html
head
title
body
p
b
p
a
a
a
p


In [26]:
# find_all also accepts a function: define one here and pass it through find_all
def has_class_but_no_id(tag):
    """Return True when `tag` has a 'class' attribute but no 'id' attribute."""
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

# Only the <p> tags are picked up: <html> and <title> don't define 'class',
# and the <a> tags are not returned on their own because they define both 'class' and 'id'
class_only_tags = test_soup.find_all(has_class_but_no_id)
class_only_tags

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [35]:
# Search on tag name plus class: <p> tags with the class "title"
title_paragraphs = test_soup.find_all("p", "title")
print(title_paragraphs)
print("\n")

# Search on an attribute value through a keyword argument
link2_tags = test_soup.find_all(id="link2")
print(link2_tags)
print("\n")

# Or use a regular expression to filter on the text itself
sisters_pattern = re.compile("sisters")
print(test_soup.find_all(string=sisters_pattern))

[<p class="title"><b>The Dormouse's story</b></p>]


[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]


['Once upon a time there were three little sisters; and their names were\n']


In [37]:
# attribute=True returns every tag that has that attribute, regardless of its value
tags_with_id = test_soup.find_all(id=True)
print(tags_with_id)
print("\n")
# Several attribute filters can be passed at once
elsie_links = test_soup.find_all(href=re.compile("elsie"), id="link1")
print(elsie_links)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]


In [42]:
# A keyword argument cannot always be used: 'name' is find_all's own first parameter
# (the tag name), so name="email" finds nothing here.
# Filter on the HTML attribute through attrs={...} instead.
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
by_keyword = name_soup.find_all(name="email")
print(by_keyword)
print("\n")
by_attrs = name_soup.find_all(attrs={"name": "email"})
print(by_attrs)

[]


[<input name="email"/>]


In [45]:
# The string argument can be combined with a tag name: <a> tags whose string is "Elsie"
elsie_tags = test_soup.find_all("a", string="Elsie")
print(elsie_tags)

# Filter function for strings that are the only child of their parent tag
def is_the_only_string_within_a_tag(s):
    """Return True if this string is the only child of its parent tag."""
    parent_string = s.parent.string
    return s == parent_string

# Pass the function as the string filter and print every string it accepts
lone_strings = test_soup.find_all(string=is_the_only_string_within_a_tag)
print(lone_strings)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
["The Dormouse's story", "The Dormouse's story", 'Elsie', 'Lacie', 'Tillie', '...']


In [46]:
# limit caps the number of results that are returned
first_two_links = test_soup.find_all("a", limit=2)
first_two_links

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [48]:
# By default find_all() examines all of a tag's descendants: its children, its children's children etc.
# Pass recursive=False to consider direct children only.
# Run the same search both ways: first with the default, then with recursive=False
for search_options in ({}, {"recursive": False}):
    print(test_soup.find_all("title", **search_options))

[<title>The Dormouse's story</title>]
[]


#### Modifying the Tree

In [49]:
# A tag can be renamed, and its attributes can be changed, added or deleted
rename_soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = rename_soup.b

# Rename the tag, overwrite its existing 'class' and add a new 'id'
tag.name = "blockquote"
tag['class'] = 'verybold'
tag['id'] = 1
print(tag)
# <blockquote class="verybold" id="1">Extremely bold</blockquote>

# Delete both attributes again
for attribute in ('class', 'id'):
    del tag[attribute]
print(tag)
# <blockquote>Extremely bold</blockquote>

<blockquote class="verybold" id="1">Extremely bold</blockquote>
<blockquote>Extremely bold</blockquote>


In [54]:
# Assigning to .string replaces a tag's contents with the given text
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
string_soup = BeautifulSoup(markup, features='html.parser')

tag = string_soup.a
tag.string = "New link text."
print(tag)

<a href="http://example.com/">New link text.</a>


In [70]:
from bs4 import Comment, NavigableString

# append(): add one item to the end of a tag's contents
foo_soup = BeautifulSoup("<a>Foo</a>", 'html.parser')
foo_link = foo_soup.a
foo_link.append("Bar")
print(foo_soup)

print(foo_link.contents)

# extend(): add every item of a list, in order
foo_link.extend(["'s"," ","on"])
print(foo_soup)
print(foo_link.contents)

# NavigableString: build the string object yourself before appending it
nav_soup = BeautifulSoup("<b></b>", 'html.parser')
tag = nav_soup.b
tag.append("Hello there")
new_string = NavigableString(" now")
tag.append(new_string)
print(tag)
print(tag.contents)

# Comment: appended the same way, rendered as <!--...-->
new_comment = Comment("Nice to see you.")
tag.append(new_comment)
print(tag)
print(tag.contents)

# new_tag(): create a new <a> tag and append it to the <b> tag
original_tag = nav_soup.b
new_tag = nav_soup.new_tag("a", href="http://www.example.com")
original_tag.append(new_tag)
print(original_tag)

# The new tag starts out empty; give it some text
new_tag.string = "Link text."
print(original_tag)


<a>FooBar</a>
['Foo', 'Bar']
<a>FooBar's on</a>
['Foo', 'Bar', "'s", ' ', 'on']
<b>Hello there now</b>
['Hello there', ' now']
<b>Hello there now<!--Nice to see you.--></b>
['Hello there', ' now', 'Nice to see you.']
<b>Hello there now<!--Nice to see you.--><a href="http://www.example.com"></a></b>
<b>Hello there now<!--Nice to see you.--><a href="http://www.example.com">Link text.</a></b>


### Using BeautifulSoup with Reddit

In [45]:
# 'find_all' returns a list of every object meeting the criteria;
# 'find' would return only one object.

# Posts on this subreddit: <div> tags with class 'thing' whose data-domain is 'self.travel'
attrs = {
    'class': 'thing',
    'data-domain': 'self.travel',
}
posts = soup.find_all('div', attrs=attrs)

# <span> tags with the class "domain" - these appear to mark the posts and can be explored too
domains = soup.find_all('span', class_="domain")


In [48]:
# HTML has different tags for different purposes:
#   <div> describes a container of data
#   <p> describes a paragraph of content
# The Reddit posts keep their title in a <p> tag with the class "title",
# so look that paragraph up inside each post and print its text
for post in posts:
    title_paragraph = post.find('p', class_="title")
    print(title_paragraph.text)

QuestionPassport Questions & Issues Megathread (2023) (self.travel)
AMAPlanning on taking a trip to a National Park? We’re Washington Post reporters, who have covered parks across the country. Ask us anything! (self.travel)
QuestionQuitting hostels?! (self.travel)
My AdviceI just got back to the States from traveling around Europe for 6 weeks with my wife and 1.5yo son. Here is what I learned. (self.travel)
QuestionWhat single item has hugely improved your travels? (self.travel)
QuestionWhat non-beach city would you recommend in Central/South America? (self.travel)
QuestionDo you take an extra day to recover from jet lag when you get home? (self.travel)
QuestionHow do I inspire my partner to want to travel? (self.travel)
ItineraryOne Country in Asia - Suggestions? (self.travel)
QuestionBest Arabic country for couples travel? (self.travel)
ItineraryHow is solo female travel in Jordan? (self.travel)
Third Party Horror StoryHave You Been Scammed By FlightHub? Here's My Story. (self.travel

In [34]:
# Alternative route to the posts: start at each post's domain label and climb back up.
# (Inspect the HTML on 'https://old.reddit.com/r/travel/' to see how the elements are nested.)
for domain in soup.find_all("span", class_="domain"):
    # 'domain' is a Tag, so compare its TEXT with the label: comparing the Tag itself
    # with a str is never equal, which would skip every post and print nothing
    if domain.get_text(strip=True) != "(self.travel)":
        continue

    # Four .parent steps up from the domain span
    # (assumed to reach the post's container <div> - confirm against the page's HTML)
    parent_div = domain.parent.parent.parent.parent
    print(parent_div.text)

In [24]:
# class_=True matches every element that has a class attribute, whatever its value
elements_with_classes = soup.find_all(class_=True)

# Print the class values of each of those elements
for element in elements_with_classes:
    print("Classes:", element['class'])

Classes: ['listing-page', 'hot-page']
Classes: ['ad', 'adsense-ad', 'adsense-ads', 'googad', 'googads', 'gemini-ad', 'openx', 'ad-banner', 'ad-BANNER', 'GoogleAd', 'googleAd', 'hasads', 'LeftAd', 'native-ad', 'ad-300-250', 'adbar', 'ads-area', 'HeaderAd', 'NavBarAd', 'ad-medium', 'post-ad', 'promoad', 'rectad', 'sidebar-ad', 'small-ad', 'sponsorAd', 'sponsorPost']
Classes: ['width-clip']
Classes: ['dropdown', 'srdrop']
Classes: ['selected', 'title']
Classes: ['drop-choices', 'srdrop']
Classes: ['bottom-option', 'choice']
Classes: ['sr-list']
Classes: ['flat-list', 'sr-bar', 'hover']
Classes: ['choice']
Classes: ['separator']
Classes: ['choice']
Classes: ['separator']
Classes: ['random', 'choice']
Classes: ['separator']
Classes: ['choice']
Classes: ['separator']
Classes: ['flat-list', 'sr-bar', 'hover']
Classes: ['choice']
Classes: ['separator']
Classes: ['choice']
Classes: ['separator']
Classes: ['choice']
Classes: ['separator']
Classes: ['choice']
Classes: ['separator']
Classes: ['cho