In [1]:
from bs4 import BeautifulSoup
html_doc = """<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ; and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
  
 </body>
</html>"""
soup = BeautifulSoup(html_doc, 'html.parser')

# print(soup.prettify())

In [2]:
soup.p.contents

['\n',
 <b>
     The Dormouse's story
    </b>,
 '\n']

In [3]:
soup.p.b.string


"\n    The Dormouse's story\n   "

In [4]:
[x.text for x in soup.head.children if x.name == 'title']

["\n   The Dormouse's story\n  "]

In [5]:
[x.text for x in soup.head.title.next_elements if x.name == 'b']

["\n    The Dormouse's story\n   "]

In [6]:
soup.a.find_all_next

<bound method PageElement.find_all_next of <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>>

In [7]:
[x for x in soup.body]

['\n',
 <p class="title">
 <b>
     The Dormouse's story
    </b>
 </p>,
 '\n',
 <p class="story">
    Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>
    ,
    <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>
    and
    <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>
    ; and they lived at the bottom of a well.
   </p>,
 '\n',
 <p class="story">
    ...
   </p>,
 '\n']

In [8]:
soup.p['class']

['title']

In [9]:
soup.p

<p class="title">
<b>
    The Dormouse's story
   </b>
</p>

In [10]:
soup.findAll('p')

[<p class="title">
 <b>
     The Dormouse's story
    </b>
 </p>,
 <p class="story">
    Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>
    ,
    <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>
    and
    <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>
    ; and they lived at the bottom of a well.
   </p>,
 <p class="story">
    ...
   </p>]

In [11]:
soup.p.find

<bound method Tag.find of <p class="title">
<b>
    The Dormouse's story
   </b>
</p>>

In [12]:
print(soup.find({'b','p'}).prettify())

<p class="title">
 <b>
  The Dormouse's story
 </b>
</p>



In [13]:
soup.p.b.string

"\n    The Dormouse's story\n   "

In [14]:
soup.get_text()

"\n\n\n   The Dormouse's story\n  \n\n\n\n\n    The Dormouse's story\n   \n\n\n   Once upon a time there were three little sisters; and their names were\n   \n    Elsie\n   \n   ,\n   \n    Lacie\n   \n   and\n   \n    Tillie\n   \n   ; and they lived at the bottom of a well.\n  \n\n   ...\n  \n\n"

In [15]:
soup.title

<title>
   The Dormouse's story
  </title>

In [16]:
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>

In [17]:
for link in soup.find_all('a'):
    print(link.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [18]:
soup.p['class']

['title']

In [19]:
tag = soup.b
type(tag)

bs4.element.Tag

In [20]:
type(soup)

bs4.BeautifulSoup

In [21]:
tag = BeautifulSoup('<b id="boldest">bold</b>', 'html.parser').b
tag['id']

'boldest'

In [22]:
soup.a.attrs


{'class': ['sister'], 'href': 'http://example.com/elsie', 'id': 'link1'}

In [23]:
soup.a['id'] = 'link4'
soup.a.attrs

{'class': ['sister'], 'href': 'http://example.com/elsie', 'id': 'link4'}

In [24]:
soup.a['style'] = 'bold'
soup.a.attrs

{'class': ['sister'],
 'href': 'http://example.com/elsie',
 'id': 'link4',
 'style': 'bold'}

In [25]:
del soup.a['style']
soup.a.attrs

{'class': ['sister'], 'href': 'http://example.com/elsie', 'id': 'link4'}

In [26]:
soup.a.get('id')

'link4'

In [27]:
soup.a['id']

'link4'

In [28]:
soup.a.get_attribute_list('class')

['sister']

In [29]:
from bs4.builder import builder_registry
builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES

{'*': ['class', 'accesskey', 'dropzone'],
 'a': ['rel', 'rev'],
 'link': ['rel', 'rev'],
 'td': ['headers'],
 'th': ['headers'],
 'form': ['accept-charset'],
 'object': ['archive'],
 'area': ['rel'],
 'icon': ['sizes'],
 'iframe': ['sandbox'],
 'output': ['for']}

In [30]:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
tag.string
# 'Extremely bold'
type(tag.string)


bs4.element.NavigableString

In [31]:
tag.string

'Extremely bold'

In [32]:
str(tag.string)

'Extremely bold'

In [33]:
tag.string.replace_with("No longer bold")

'Extremely bold'

In [34]:
doc = BeautifulSoup("<document><content/>INSERT FOOTER HERE</document", "xml")
footer = BeautifulSoup("<footer>Here's the footer</footer>", "xml")
doc.find(text="INSERT FOOTER HERE").replace_with(footer)
# 'INSERT FOOTER HERE'
print(doc)

<?xml version="1.0" encoding="utf-8"?>
<document><content/><footer>Here's the footer</footer></document>


In [35]:
head_tag = soup.head
head_tag = BeautifulSoup("<head><title>The Dormouse's story</title></head>",'html.parser')
type(head_tag.contents[0].contents[0].contents[0])


bs4.element.NavigableString

In [36]:
head_tag.contents[0].text

"The Dormouse's story"

In [37]:
text = head_tag.contents[0]
text.contents

[<title>The Dormouse's story</title>]

In [38]:
for child in head_tag.children:
    print(child)

<head><title>The Dormouse's story</title></head>


In [39]:
text = soup.children
# [x for x in text]

In [40]:
# for child in soup.descendants:
#     print(child)
# loop in loop

In [41]:
len(list(soup.children))


1

In [42]:

len(list(soup.descendants))

2

In [43]:
for string in soup.strings:
    print(repr(string))

'No longer bold'


In [44]:
for string in soup.stripped_strings:
    print(repr(string))

'No longer bold'


In [45]:
# soup.p.string.parent

In [46]:
soup.a.string.parent

AttributeError: 'NoneType' object has no attribute 'string'

In [None]:
link = soup.a
for parent in link.parents:
    print(parent.name)

AttributeError: 'NoneType' object has no attribute 'parents'

In [None]:
soup.prettify()

'<b class="boldest">\n No longer bold\n</b>'

In [None]:
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>", 'html.parser')
print(sibling_soup.prettify())

<a>
 <b>
  text1
 </b>
 <c>
  text2
 </c>
</a>


In [None]:
print(sibling_soup.b.string)
print(sibling_soup.b.string.next_sibling)

text1
None


In [None]:
for sibling in soup.a.next_siblings:
    print(repr(sibling))

AttributeError: 'NoneType' object has no attribute 'next_siblings'

In [None]:
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

In [None]:
last_a_tag = soup.find("a", id="link3")

In [None]:
last_a_tag.previous_element.next_element

In [None]:
last_a_tag.next_element

In [None]:
for element in last_a_tag.next_elements:
    print(repr(element))

In [None]:
import re
for tag in soup.find_all(re.compile("^b"),recursive=False):
    print(tag.name)

In [None]:
for tag in soup.find_all(re.compile("^p")):
    print(repr(tag.text))

In [None]:
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

In [None]:
for tag in soup.find_all(True):
    print(tag.name)

In [None]:
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')

In [None]:
soup.find_all(has_class_but_no_id,recursive=False)

In [None]:
def not_lacie(href):
    return href and not re.compile("lacie").search(href)

soup.find_all(href=not_lacie)

In [None]:
from bs4 import NavigableString
def surrounded_by_strings(tag):
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))

for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)

In [None]:
html_doc

In [None]:
soup.find(string=re.compile("sisters"))

NameError: name 're' is not defined

In [None]:
soup.find_all(id="link2")

In [None]:
soup.find("title")

In [None]:
soup.find_all(id='link2')[0]

In [None]:
soup.find_all(href=re.compile("elsie"))

In [None]:
soup.find_all(id=True)

In [None]:
soup.find_all("a", class_="sister")

In [None]:
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
print(name_soup.find_all(name="email"))
# []
print(name_soup.find_all(attrs={"name": "email"}))

In [None]:
soup.find_all(class_=re.compile("itl"))
# [<p class="title"><b>The Dormouse's story</b></p>]

def has_six_characters(css_class):
    return css_class is not None and len(css_class) == 6

soup.find_all(class_=has_six_characters)

In [None]:
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.find_all("p", class_="strikeout")

In [None]:
css_soup.find_all("p", class_="strikeout body")

In [None]:
css_soup.find_all("p", class_="body strikeout")

In [None]:
css_soup.select("p.body.strikeout")

In [None]:
(soup.find_all(string=re.compile("Dormouse")))

In [None]:
soup.find_all(string=re.compile("ie"))

In [None]:
def is_the_only_string_within_a_tag(s):
    """Return True if this string is the only child of its parent tag."""
    return (s == s.parent.string)


In [None]:
soup.find_all(string=is_the_only_string_within_a_tag)


In [None]:
soup.find_all("a", text="*Elsie*")

In [None]:
soup.find_all("a", limit=2)

In [None]:
soup.html.find_all("title")

### find_all(): name, attrs, string, limit,

In [None]:
soup = BeautifulSoup(html_doc,'html.parser')

In [None]:
soup.html.find_all("title",recursive=False)

[]

In [None]:
soup.find_all("a")
soup("a")[::-1]

[<a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>,
 <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>,
 <a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>]

In [None]:
soup.title.find_all(string=True)
soup.title(string=True)

["\n   The Dormouse's story\n  "]

### find(name, attrs, recursive, string, **kwargs)

In [None]:
soup.find("head").find("title")

<title>
   The Dormouse's story
  </title>

In [None]:
print(soup.find("nosuchtag"))

None


In [None]:
soup.find(string="Lacie")

In [None]:
a_string = soup.find(string="Lacie")

In [None]:
first_link = soup.a
first_link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

first_link.find_next_siblings("a")

[<a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>]

In [None]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")

<p class="title">
<b>
    The Dormouse's story
   </b>
</p>

In [None]:
soup.find("p", "story")

<p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ; and they lived at the bottom of a well.
  </p>

In [None]:
first_link = soup.a
first_link
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

first_link.find_all_next(string=True)

['\n    Elsie\n   ',
 '\n   ,\n   ',
 '\n    Lacie\n   ',
 '\n   and\n   ',
 '\n    Tillie\n   ',
 '\n   ; and they lived at the bottom of a well.\n  ',
 '\n',
 '\n   ...\n  ',
 '\n',
 '\n']

In [None]:
first_link.find_all_previous("p")

[<p class="story">
    Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>
    ,
    <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>
    and
    <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>
    ; and they lived at the bottom of a well.
   </p>,
 <p class="title">
 <b>
     The Dormouse's story
    </b>
 </p>]

In [None]:
soup.select("title")

[<title>
    The Dormouse's story
   </title>]

In [None]:
soup.select("p:nth-of-type(3)")

[<p class="story">
    ...
   </p>]

In [None]:
soup.select("body a")

[<a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>,
 <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>]

In [None]:
soup.select("html head title")

[<title>
    The Dormouse's story
   </title>]

In [None]:
soup.select("head > title")

[<title>
    The Dormouse's story
   </title>]

In [None]:
soup.select("p > a:nth-of-type(2)")

[<a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>]

In [None]:
soup.select("p > #link1")

[<a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>]

In [None]:
soup.select("body > a")

[]

In [None]:
soup.select("#link1 ~ .sister")

[<a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>]

In [None]:
soup.select("#link1 + .sister")

[<a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>]

In [None]:
soup.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>,
 <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>]

In [None]:
soup.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>,
 <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>]

In [None]:
soup.select('a[href]')

[<a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>,
 <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>]

In [None]:
soup.select('a[href^="http://example.com/"]')

[<a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>,
 <a class="sister" href="http://example.com/lacie" id="link2">
     Lacie
    </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>]

In [None]:
soup.select('a[href$="tillie"]')

[<a class="sister" href="http://example.com/tillie" id="link3">
     Tillie
    </a>]

In [None]:
soup.select('a[href*=".com/el"]')

[<a class="sister" href="http://example.com/elsie" id="link1">
     Elsie
    </a>]

In [None]:
soup.select_one(".sister")

<a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>

In [None]:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b

tag.name = "blockquote"
del tag['class']
soup

<blockquote>Extremely bold</blockquote>

In [None]:
soup.smooth()

In [None]:
str(soup.p.b)

AttributeError: 'NoneType' object has no attribute 'b'

In [None]:
soup.encode("utf8")

In [None]:
soup.prettify(formatter="minimal")

'<b class="boldest">\n No longer bold\n</b>'

In [None]:
print(soup.encode(formatter="html5"))

b'<b class="boldest">No longer bold</b>'


In [None]:
print(str(soup).replace('\n  ',''))

<b class="boldest">No longer bold</b>


In [None]:
from bs4 import UnicodeDammit
dammit = UnicodeDammit("Sacr\xa0 \n  bleu!")
print(dammit.unicode_markup)

Sacr  
  bleu!


In [None]:
html_doc

'<html>\n <head>\n  <title>\n   The Dormouse\'s story\n  </title>\n </head>\n <body>\n  <p class="title">\n   <b>\n    The Dormouse\'s story\n   </b>\n  </p>\n  <p class="story">\n   Once upon a time there were three little sisters; and their names were\n   <a class="sister" href="http://example.com/elsie" id="link1">\n    Elsie\n   </a>\n   ,\n   <a class="sister" href="http://example.com/lacie" id="link2">\n    Lacie\n   </a>\n   and\n   <a class="sister" href="http://example.com/tillie" id="link3">\n    Tillie\n   </a>\n   ; and they lived at the bottom of a well.\n  </p>\n  <p class="story">\n   ...\n  </p>\n  \n </body>\n</html>'

In [None]:
markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
UnicodeDammit(html_doc, ["windows-1252"], smart_quotes_to="ascii").unicode_markup


'<html>\n <head>\n  <title>\n   The Dormouse\'s story\n  </title>\n </head>\n <body>\n  <p class="title">\n   <b>\n    The Dormouse\'s story\n   </b>\n  </p>\n  <p class="story">\n   Once upon a time there were three little sisters; and their names were\n   <a class="sister" href="http://example.com/elsie" id="link1">\n    Elsie\n   </a>\n   ,\n   <a class="sister" href="http://example.com/lacie" id="link2">\n    Lacie\n   </a>\n   and\n   <a class="sister" href="http://example.com/tillie" id="link3">\n    Tillie\n   </a>\n   ; and they lived at the bottom of a well.\n  </p>\n  <p class="story">\n   ...\n  </p>\n  \n </body>\n</html>'

In [None]:
[x for x in soup.stripped_strings]

['No longer bold']

In [None]:
bidding_record = {"Angela": 123, "James": 321}
highest_bid = 0
for bidder in bidding_record:
    bid_amount = int(bidding_record[bidder])
    print(bidder,bid_amount)
    if bid_amount > highest_bid: 
        highest_bid = bid_amount
        winner = bidder
print(f"The winner is {winner} with a bid of ${highest_bid}")

Angela 123
James 321
The winner is James with a bid of $321
