# HTML 예시 파일로 실습

In [1]:
from bs4 import BeautifulSoup

In [2]:
# Sample HTML document used as the parsing target in the cells below.
# It contains two <h1> headings (id="heading" and id="footer"), a <span class="red">,
# nested <div> blocks (id="link" wrapping id="class1", plus id="text_id2"),
# and three <a> tags carrying the classes external_link / internal_link.
html_example = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>BeautifulSoup 활용</title>
</head>
<body>
    <h1 id="heading">Heading 1</h1>
    <p>Paragraph</p>
    <span class="red">BeautifulSoup Library Examples!</span>
    <div id="link">
        <a class="external_link" href="www.google.com">google</a>

        <div id="class1">
            <p id="first">class1's first paragraph</p>
            <a class="external_link" href="www.naver.com">naver</a>

            <p id="second">class1's second paragraph</p>
            <a class="internal_link" href="/pages/page1.html">Page1</a>
            <p id="third">class1's third paragraph</p>
        </div>
    </div>
    <div id="text_id2">
        Example page
        <p>g</p>
    </div>
    <h1 id="footer">Footer</h1>
</body>
</html>
'''

In [3]:
# Parse the sample markup with the 'html.parser' backend.
soup = BeautifulSoup(html_example, features='html.parser')

# Show the <title> tag itself, then its text via the .text property and via get_text().
title_tag = soup.title
print(title_tag)
print(title_tag.text)
print(title_tag.get_text())

<title>BeautifulSoup 활용</title>
BeautifulSoup 활용
BeautifulSoup 활용


In [4]:
# Attribute-style navigation: soup.head is the document's <head> element.
head_tag = soup.head
print(head_tag)

<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>BeautifulSoup 활용</title>
</head>


In [5]:
# Attribute-style navigation: soup.body is the document's <body> element.
body_tag = soup.body
print(body_tag)

<body>
<h1 id="heading">Heading 1</h1>
<p>Paragraph</p>
<span class="red">BeautifulSoup Library Examples!</span>
<div id="link">
<a class="external_link" href="www.google.com">google</a>
<div id="class1">
<p id="first">class1's first paragraph</p>
<a class="external_link" href="www.naver.com">naver</a>
<p id="second">class1's second paragraph</p>
<a class="internal_link" href="/pages/page1.html">Page1</a>
<p id="third">class1's third paragraph</p>
</div>
</div>
<div id="text_id2">
        Example page
        <p>g</p>
</div>
<h1 id="footer">Footer</h1>
</body>


In [6]:
# soup.h1 yields only the first <h1>; the later one (id="footer") is not reached this way.
first_h1 = soup.h1
print(first_h1.text)

Heading 1


In [7]:
# soup.p yields the first <p> in document order.
first_p = soup.p
print(first_p)

<p>Paragraph</p>


In [8]:
# find() returns the first matching tag, including everything nested inside it.
first_div = soup.find('div')
print(first_div)

<div id="link">
<a class="external_link" href="www.google.com">google</a>
<div id="class1">
<p id="first">class1's first paragraph</p>
<a class="external_link" href="www.naver.com">naver</a>
<p id="second">class1's second paragraph</p>
<a class="internal_link" href="/pages/page1.html">Page1</a>
<p id="third">class1's third paragraph</p>
</div>
</div>


In [9]:
# Narrow the search to the <div> whose id attribute is "text_id2".
text_div = soup.find('div', attrs={'id': 'text_id2'})
print(text_div)

<div id="text_id2">
        Example page
        <p>g</p>
</div>


In [10]:
# Fetch the nested <div id="class1"> and print the concatenated text of all its descendants.
a = soup.find('div', id='class1')
print(a.get_text())


class1's first paragraph
naver
class1's second paragraph
Page1
class1's third paragraph



In [11]:
# A class filter can be written either as an attrs dict ({'class': 'internal_link'})
# or with the class_ keyword (trailing underscore because `class` is reserved in Python).
# Both forms select the same tag, so perform the lookup once and check that the
# dict form agrees instead of assigning href_link twice.
href_link = soup.find('a', class_='internal_link')
assert href_link == soup.find('a', {'class': 'internal_link'})
print(href_link)

<a class="internal_link" href="/pages/page1.html">Page1</a>


In [12]:
# Subscript access reads a tag attribute directly (KeyError if the attribute is absent).
page_url = href_link['href']
print(page_url)

/pages/page1.html


In [13]:
# .get() reads the same attribute but yields None rather than raising when it is missing.
page_url_or_none = href_link.get('href')
print(page_url_or_none)

/pages/page1.html


In [14]:
# The visible label of the link is the tag's text content.
link_label = href_link.text
print(link_label)

Page1


In [15]:
# .attrs is a dict of the tag's attributes; .values() gives a view over its values.
attr_values_view = href_link.attrs.values()
print(attr_values_view)

dict_values([['internal_link'], '/pages/page1.html'])


In [16]:
# Materialise the attribute values into a list so they can be indexed by position.
values = [attr_value for attr_value in href_link.attrs.values()]
print(values)

# Positions follow the attribute order in the markup: class first, then href.
first_value, second_value = values[0], values[1]
print(first_value, second_value)

[['internal_link'], '/pages/page1.html']
['internal_link'] /pages/page1.html


In [17]:
# Search by attribute value alone, without constraining the tag name.
href_value = soup.find(href='www.google.com')
print(href_value.get_text())

google


In [18]:
# Inspect the first <span>: the tag, its attribute dict, and the class entry of that dict.
span_tag = soup.find('span')
span_attrs = span_tag.attrs

print('span tag:', span_tag)
print('attrs:', span_attrs)
print('values: ', span_attrs['class'])

span tag: <span class="red">BeautifulSoup Library Examples!</span>
attrs: {'class': ['red']}
values:  ['red']


In [19]:
# find_all() collects every <div>, so nested divs appear both inside their parent
# and again as their own entry.
div_tags = soup.find_all(name='div')
print(div_tags)

[<div id="link">
<a class="external_link" href="www.google.com">google</a>
<div id="class1">
<p id="first">class1's first paragraph</p>
<a class="external_link" href="www.naver.com">naver</a>
<p id="second">class1's second paragraph</p>
<a class="internal_link" href="/pages/page1.html">Page1</a>
<p id="third">class1's third paragraph</p>
</div>
</div>, <div id="class1">
<p id="first">class1's first paragraph</p>
<a class="external_link" href="www.naver.com">naver</a>
<p id="second">class1's second paragraph</p>
<a class="internal_link" href="/pages/page1.html">Page1</a>
<p id="third">class1's third paragraph</p>
</div>, <div id="text_id2">
        Example page
        <p>g</p>
</div>]


In [20]:
# The result of find_all() supports len() and positional indexing like a list.
div_count = len(div_tags)
print(div_count)
third_div = div_tags[2]
print(third_div)

3
<div id="text_id2">
        Example page
        <p>g</p>
</div>


In [21]:
# Walk over every <a> tag and report the tag, its href and its text, separated by blank lines.
links = soup.find_all('a')
for alink in links:
    print(alink)
    url = alink['href']
    label = alink.get_text()
    print('url:{0}, text:{1}'.format(url, label))
    print()

<a class="external_link" href="www.google.com">google</a>
url:www.google.com, text:google

<a class="external_link" href="www.naver.com">naver</a>
url:www.naver.com, text:naver

<a class="internal_link" href="/pages/page1.html">Page1</a>
url:/pages/page1.html, text:Page1



In [24]:
# Passing a list as the class filter matches tags carrying any of the listed classes.
wanted_classes = ['external_link', 'internal_link']
link_tags = soup.find_all('a', class_=wanted_classes)
print(link_tags)

[<a class="external_link" href="www.google.com">google</a>, <a class="external_link" href="www.naver.com">naver</a>, <a class="internal_link" href="/pages/page1.html">Page1</a>]


In [27]:
# A list of ids works the same way: keep the <p> tags whose id is either value.
wanted_ids = ['first', 'third']
p_tags = soup.find_all('p', id=wanted_ids)
for p in p_tags:
    print(p)

<p id="first">class1's first paragraph</p>
<p id="third">class1's third paragraph</p>


In [30]:
# CSS-selector lookup: a bare tag name selects the first element of that type.
head = soup.select_one(selector='head')
head

<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>BeautifulSoup 활용</title>
</head>

In [32]:
# select_one() stops at the first match, so only the heading <h1> is returned here.
h1 = soup.select_one(selector='h1')
h1

<h1 id="heading">Heading 1</h1>

In [36]:
# '#footer' is a CSS id selector: it picks the element whose id attribute is "footer".
# The result is the footer <h1>, so bind it to `footer` rather than reusing `head`,
# which would overwrite the <head> element stored under that name by an earlier cell.
footer = soup.select_one('#footer')
footer

<h1 id="footer">Footer</h1>

In [39]:
# 'a.internal_link' combines a tag name with a class selector.
class_link = soup.select_one(selector='a.internal_link')
class_link

<a class="internal_link" href="/pages/page1.html">Page1</a>

In [41]:
# Tags returned by select_one() expose text and attributes exactly like those from find().
link_text = class_link.get_text()
print(link_text)
link_href = class_link['href']
print(link_href)

Page1
/pages/page1.html
