# 3. BeautifulSoup 기초
* HTML과 XML 문서를 파싱하기 위한 파이썬 패키지
* 대표페이지 소개

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
requests.__version__

'2.26.0'

In [3]:
# Sample HTML document parsed by the cells that follow.
# NOTE(review): all three <a> tags carry id="link1"; ids are normally unique
# per document, so link1/link2/link3 was presumably intended (a later cell
# searches for id='link3') -- confirm before changing, since the recorded
# outputs below reflect the current markup.
html_doc = """
<html>
<head>
<title>My story</title>
</head>
<body>
<p class = "title">My story</p>
<p class="story">내가 좋아하는 음식
<a href="http://www.pizzahut.co.kr" class="food" id="link1">피자</a>
<a href="http://www.kyochon.co.kr" class="food" id="link1">치킨</a>
<a href="http://www.momstouch.co.kr" class="food" id="link1">버거</a>
</p>
</body>
</html>
"""

In [4]:
# Parse html_doc with the 'html.parser' backend; the bare `soup` expression
# on the last line makes the notebook display the parsed document.
soup = BeautifulSoup(html_doc, 'html.parser')
soup


<html>
<head>
<title>My story</title>
</head>
<body>
<p class="title">My story</p>
<p class="story">내가 좋아하는 음식
<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>
<a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>
<a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>
</p>
</body>
</html>

In [5]:
print(soup.prettify())

<html>
 <head>
  <title>
   My story
  </title>
 </head>
 <body>
  <p class="title">
   My story
  </p>
  <p class="story">
   내가 좋아하는 음식
   <a class="food" href="http://www.pizzahut.co.kr" id="link1">
    피자
   </a>
   <a class="food" href="http://www.kyochon.co.kr" id="link1">
    치킨
   </a>
   <a class="food" href="http://www.momstouch.co.kr" id="link1">
    버거
   </a>
  </p>
 </body>
</html>



### find 함수
* 조건에 맞는 첫 번째 tag 하나만 반환

In [6]:
soup.find('p')

<p class="title">My story</p>

In [7]:
# No <a> in html_doc has id="link3" (all three use id="link1"), so this
# find() call matches nothing and the cell shows no output.
soup.find('a',class_='food',id = 'link3')

In [8]:
type(soup)

bs4.BeautifulSoup

### find_all 함수
* 조건에 맞는 모든 tag를 리스트로 반환

In [9]:
soup


<html>
<head>
<title>My story</title>
</head>
<body>
<p class="title">My story</p>
<p class="story">내가 좋아하는 음식
<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>
<a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>
<a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>
</p>
</body>
</html>

In [10]:
soup.find_all('p')

[<p class="title">My story</p>,
 <p class="story">내가 좋아하는 음식
 <a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>
 <a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>
 <a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>
 </p>]

In [11]:
soup.find_all('a')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>,
 <a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>]

In [12]:
soup.find_all('a',id='link1')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>,
 <a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>]

In [13]:
for tag in soup.find_all('a'):
  print(tag)

<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>
<a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>
<a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>


### get_text 함수
* tag안의 value를 추출
* 부모 tag에 호출하면 자손 tag의 text까지 모두 합쳐서 추출

In [14]:
soup.get_text()

'\n\n\nMy story\n\n\nMy story\n내가 좋아하는 음식\n피자\n치킨\n버거\n\n\n\n'

In [15]:
print(soup.prettify())

<html>
 <head>
  <title>
   My story
  </title>
 </head>
 <body>
  <p class="title">
   My story
  </p>
  <p class="story">
   내가 좋아하는 음식
   <a class="food" href="http://www.pizzahut.co.kr" id="link1">
    피자
   </a>
   <a class="food" href="http://www.kyochon.co.kr" id="link1">
    치킨
   </a>
   <a class="food" href="http://www.momstouch.co.kr" id="link1">
    버거
   </a>
  </p>
 </body>
</html>



In [17]:
soup.find('p').get_text()

'My story'

In [19]:
# Calling get_text() on the find_all() result raises AttributeError (shown
# below): the result is a ResultSet of tags, not a single tag. Loop over it
# and call get_text() on each element instead.
soup.find_all('p').get_text()

AttributeError: ResultSet object has no attribute 'get_text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [20]:
for tag in soup.find_all('a'):
  print(tag.get_text())

피자
치킨
버거


### attribute값 출력하기
* 검색한 tag에서 attribute값을 추출
* tag['attribute명']

In [23]:
soup.find('a').attrs

{'href': 'http://www.pizzahut.co.kr', 'class': ['food'], 'id': 'link1'}

In [24]:
# The class attribute comes back as a list (['title'] here), not a plain string.
soup.find('p')['class']

['title']

In [28]:
for tag in soup.find_all('a'):
  print(tag['href'])

http://www.pizzahut.co.kr
http://www.kyochon.co.kr
http://www.momstouch.co.kr


### select 함수
* select는 CSS Selector로 tag 찾기
* 자손 태그 찾기 - tag1 tag2
* 직계 자식 태그 찾기 - tag1 > tag2
* id 선택자 - #id
* class 선택자 - .class
* 속성값 찾기 - [name = 'value']
  * 속성값 prefix 찾기 [name ^= 'value']
  * 속성값 suffix 찾기 [name $= 'value']
  * 속성값 포함문자열 찾기 [name *= 'value']

In [29]:
soup.select_one('p')

<p class="title">My story</p>

In [30]:
soup.select('html title')

[<title>My story</title>]

In [31]:
# id 선택자
soup.select('#link1')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>,
 <a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>]

In [32]:
# class 선택자
soup.select('.title')

[<p class="title">My story</p>]

In [33]:
# 속성값 찾기
soup.select('[href="http://www.pizzahut.co.kr"]')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>]

In [40]:
# 종료문자열
soup.select('[href$="kr"]')

[<a class="food" href="http://www.pizzahut.co.kr" id="link1">피자</a>,
 <a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>,
 <a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>]

In [38]:
# Substring match: select tags whose href contains "ch".
soup.select('[href*="ch"]')

[<a class="food" href="http://www.kyochon.co.kr" id="link1">치킨</a>,
 <a class="food" href="http://www.momstouch.co.kr" id="link1">버거</a>]