In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# Request the sample page; the returned response object is read and parsed in the next cell.
html = urlopen('http://pythonscraping.com/pages/page1.html')

In [3]:
# Read the response body and parse it with Python's built-in 'html.parser' backend.
bs = BeautifulSoup(html.read(), 'html.parser')
# Attribute-style access (bs.h1) looks up an <h1> tag in the parsed document.
print(bs.h1)

<h1>An Interesting Title</h1>


In [4]:
# Leaving the expression as the last line lets the notebook display the <body> tag.
bs.body

<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>

### 오류를 처리해야 할 때는?

In [5]:
# urlopen is already imported in the first cell.
# HTTPError and URLError are defined in urllib.error, so import them from their
# documented module instead of relying on urllib.request re-exporting them.
from urllib.error import HTTPError, URLError

try:
    # To exercise the URLError branch, request a host that does not exist, e.g.
    # 'https://pythonscrapingthisrurldoesnotexist.com'.
    html = urlopen("http://pythonscraping.com/pages/page1.html")
except HTTPError as e:
    # The server responded, but with an error status.
    # HTTPError is a subclass of URLError, so it has to be caught first.
    print(e)
except URLError:
    # No usable response was obtained from the server at all.
    print("The server could not be found!")
else:
    print('It Worked')

It Worked


### Tag를 제대로 가져오지 못하거나, None인 경우 처리는?

In [6]:
def getTitle(url):
    """Fetch ``url`` and return the ``<h1>`` tag found inside its ``<body>``.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<h1>`` tag under ``<body>``, or ``None`` when the page cannot be
        fetched (``HTTPError`` / ``URLError``), when the document has no
        ``<body>``, or when the body contains no ``<h1>``.
    """
    try:
        response = urlopen(url)
    except (HTTPError, URLError):
        # HTTPError is a subclass of URLError; both mean "no page to parse".
        return None

    try:
        # The context manager closes the connection as soon as the body has
        # been read, even if reading or parsing raises.
        with response:
            bs = BeautifulSoup(response.read(), 'html.parser')
        # bs.body is None when the document has no <body>, which makes the
        # ".h1" lookup raise AttributeError.
        title = bs.body.h1
    except AttributeError:
        return None

    return title

# This host name is intentionally bogus, so the lookup is expected to fail.
title = getTitle('https://pythonscrapingthisrurldoesnotexist.com')
# Use an identity check for None: it is the idiomatic form and cannot be
# influenced by an __eq__ method defined on the returned object.
if title is None:
    print('Title could not be found')
else:
    print(title)
        

Title could not be found


### CSS 이용법

* CSS는 HTML 요소를 구분해서 서로 다른 스타일을 적용
* span 태그에 CSS 클래스가 붙어 있는 예제를 살펴보자

In [7]:
# Download and parse the sample page; the context manager closes the
# connection once BeautifulSoup has consumed the response.
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# find_all is the current bs4 name of findAll (the camelCase alias is deprecated).
# Select every <span> whose CSS class is "green" and print its text.
nameList = bs.find_all('span', {'class': 'green'})
for name in nameList:
    print(name.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


* 여러 개의 태그 list도 넘길 수 있음

In [8]:
# A list of tag names matches any of them; find_all is the current name of findAll.
bs.find_all(['h1', 'h2', 'h3', 'h4', 'h5'])

[<h1>War and Peace</h1>, <h2>Chapter 1</h2>]

* 둘 중 하나라도 일치하는 것을 찾고 싶다면

In [9]:
# A list of attribute values matches tags carrying any one of them;
# find_all is the current name of findAll.
bs.find_all('span', {'class': ['green', 'red']})

[<span class="red">Well, Prince, so Genoa and Lucca are now just family estates of the
 Buonapartes. But I warn you, if you don't tell me that this means war,
 if you still try to defend the infamies and horrors perpetrated by
 that Antichrist- I really believe he is Antichrist- I will have
 nothing more to do with you and you are no longer my friend, no longer
 my 'faithful slave,' as you call yourself! But how do you do? I see
 I have frightened you- sit down and tell me all the news.</span>,
 <span class="green">Anna
 Pavlovna Scherer</span>,
 <span class="green">Empress Marya
 Fedorovna</span>,
 <span class="green">Prince Vasili Kuragin</span>,
 <span class="green">Anna Pavlovna</span>,
 <span class="green">St. Petersburg</span>,
 <span class="red">If you have nothing better to do, Count [or Prince], and if the
 prospect of spending an evening with a poor invalid is not too
 terrible, I shall be very charmed to see you tonight between 7 and 10-
 Annette Scherer.</span>,
 <span clas

* findAll(tag, attributes, recursive, text, limit, keywords)
    * recursive : True(기본값)이면 findAll 함수가 문서 전체를 재귀적으로 탐색해 매개변수와 일치하는 태그를 모두 찾고, False이면 최상위(바로 아래 자식) 태그만 찾음
    * text : 태그의 속성이 아니라 태그에 들어 있는 텍스트가 일치하는 것만 갖고 옴
    * limit : findAll에서만 쓰이고, find 함수는 findAll을 호출하면서 limit을 1로 지정한 것과 같음
    * keywords : 특정 속성이 포함된 태그를 선택할 때 사용
 

In [13]:
# `string=` is the current bs4 spelling of the older `text=` argument, and
# find_all is the current name of findAll.
nameList = bs.find_all(string='the prince')
print(len(nameList))

7


In [14]:
# Keyword arguments filter on attributes; `class_` carries a trailing underscore
# because `class` is a reserved word in Python.
# find_all is the current name of findAll.
title = bs.find_all(id='title', class_='text')
print(title)

[]


### 2.2.2 기타 BeautifulSoup 객체

* BeautifulSoup 객체: bs
* Tag 객체: bs.div.h1
* 이외 NavigableString, Comment 객체 존재

### 2.2.3 트리 이동

* findAll 함수는 이름과 속성에 따라 태그를 찾음. 하지만 문서 안에서의 위치를 기준으로 태그를 찾아야 할 때는?

In [17]:
# Download and parse the gift-list page; the context manager closes the
# connection once BeautifulSoup has consumed the response.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# .children iterates over the direct children of the table only.
for child in bs.find('table', {'id': 'giftList'}).children:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [18]:
# Working with siblings: start at the table's first row and visit every node
# that follows it at the same level.
header_row = bs.find('table', {'id': 'giftList'}).tr
for sibling in header_row.next_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [20]:
# Working with parents: locate the image, step up to its enclosing tag, then
# move to the node just before that tag and print its text.
image_tag = bs.find('img', {'src': '../img/gifts/img1.jpg'})
price_cell = image_tag.parent.previous_sibling
print(price_cell.get_text())


$15.00



* 위 부모 다루기에서 `$15.00`이 나온 과정은
    1. 먼저 src = '../img/gifts/img1.jpg'에 해당하는 이미지 태그 선택
    2. 부모 태그(`<td>`) 선택
    3. 2번에서 선택한 `<td>` 태그의 previous_sibling 선택 (가격이 들어가 있는 `<td>` 태그)
    4. 태그에 들어있는 텍스트인 `$15.00` 선택

## 2.3 정규표현식

* 정규표현식 test site: http://regexpal.com

* 예를 들어 이메일을 만드는 규칙을 아래와 같이 설정하는 정규표현식을 만든다면 어떨까?
    1. 이메일 주소의 첫 번째 부분에는 다음 중 최소한 하나가 포함돼야 합니다 (대문자, 소문자, 숫자, 마침표/플러스/밑줄 기호)
    2. 그 다음에 @이 나와야 합니다.
    3. 그 다음에 반드시 대문자, 소문자가 최소한 하나 있어야 합니다.
    4. 그 다음에 마침표가 옵니다.
    5. 마지막으로 이메일 주소는 com, org, edu, net 중 하나로 끝나야 합니다.

* 정답: `[A-Za-z0-9\._+]+@[A-Za-z]+\.(com|org|edu|net)`

## 2.4 정규표현식과 BeautifulSoup

* 제품 이미지를 갖고 온다고 할 때, 단순히 .findAll('img') 면 모든 이미지를 갖고 온다.
* 페이지 레이아웃이 바뀔 수도 있어서 이미지가 차지하는 위치를 정확한 태그 근거로 삼는 게 어려울 수도 있다.
* 원하는 이미지 유형의 태그 자체를 식별하는 무언가를 찾는 것. (예: 제품 이미지 파일 경로로 find 가능)

In [32]:
import re

# Use a raw string for the pattern: in a normal string literal '\.' and '\/'
# are invalid escape sequences, which newer Python versions warn about.
# '/' needs no escaping in a Python regex, so the pattern matches the same paths.
# find_all is the current name of findAll.
images = bs.find_all('img', {'src': re.compile(r'\.\./img/gifts/img.*\.jpg')})
for image in images:
    print(image['src'])

    # Extension: the price belonging to each image can be fetched like this
    # print(bs.find('img', {'src': image['src']}).parent.previous_sibling.get_text())

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


## 2.5 속성에 접근하기

* 태그 자체의 속성에 관심이 있을 경우 .attrs로 접근 가능

In [43]:
# NOTE: `image` is the loop variable left over from the for-loop in the previous
# code cell, so this inspects the last image that loop visited.
print(image)
# .attrs exposes the tag's attributes as a dictionary.
print(image.attrs)

<img src="../img/gifts/img6.jpg"/>
{'src': '../img/gifts/img6.jpg'}


## 2.6 람다 표현식

* BeautifulSoup에서는 특정 타입 함수를 findAll 함수에 매개변수로 넘길 수 있음
* 해당 함수는 '태그'객체를 매개변수로 받고, '불리언'만 반환
* BeautifulSoup은 모든 태그 객체를 이 함수로 평가해서, True로 평가된 태그는 반환하고 그렇지 않은 태그는 버림

In [48]:
# Keep only the tags that carry exactly two attributes.
# find_all is the current name of findAll.
bs.find_all(lambda tag: len(tag.attrs) == 2)

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img src="../img/gifts/img3.jpg"/>
 </td>

In [53]:
# The function filter returns the tag whose text matches ...
print(bs.find_all(lambda tag: tag.get_text() == 'Totally Normal Gifts'))
# ... while the string filter returns the matching text node itself.
# `string=` is the current spelling of `text=`; the empty tag name that used to
# be passed as the first argument was redundant and is dropped.
print(bs.find_all(string='Totally Normal Gifts'))

[<h1>Totally Normal Gifts</h1>]
['Totally Normal Gifts']
