# Webcrawling

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

attrs는 태그의 속성을 딕셔너리 형태로 리턴한다. 속성 이름을 key로 사용해서 속성값을 조회할 수 있고(예: `span_tag.attrs["class"]`), 이를 이용해 원하는 tag를 검색할 수 있다.

In [5]:
# Parse a small inline snippet and inspect the single <span> it contains.
html_text = '"<span class="red">Heavens! what a virulent attack!</span>"'
text = BeautifulSoup(html_text, 'html.parser')
span_tag = text.find('span')

# Each entry is (heading, value); every pair is printed as the heading, a
# newline, the value, and a trailing blank line.
for heading, value in (
	('span tag', span_tag),
	('span tag attributes ', span_tag.attrs),  # dict of attribute name -> value
	('span tag values', span_tag.attrs["class"]),
	('span tag text', span_tag.text),
):
	print(f'{heading}\n{value}\n')

span tag
<span class="red">Heavens! what a virulent attack!</span>

span tag attributes 
{'class': ['red']}

span tag values
['red']

span tag text
Heavens! what a virulent attack!


css 속성을 이용한 tag 검색

In [6]:
# Fetch the sample "War and Peace" page. The context manager closes the HTTP
# response as soon as BeautifulSoup has finished reading it, so the connection
# is not left open for the lifetime of the kernel.
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
	bs = BeautifulSoup(html, 'html.parser')

# `class_` (trailing underscore) filters on the CSS class attribute, because
# `class` is a reserved word in Python.
name_list = bs.find_all('span', class_='green')

for name in name_list:
	print(name.get_text())  # the text may contain embedded '\n' line breaks

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


특정 문자열을 찾을 때는 find_all(string='검색문자')를 사용한다 (구버전의 인자 이름은 text). 텍스트 노드 전체가 검색문자와 정확히 일치해야 검색된다.

In [22]:
# Find every text node whose content is exactly 'the prince'.
# `string=` is the current name of this filter; `text=` is its deprecated alias.
# NOTE(review): this cell relies on `bs` still being the War and Peace soup
# built in the previous code cell. An empty result here presumably means `bs`
# had been rebound to another page by out-of-order execution -- re-run the
# notebook top to bottom to confirm.
prince_list = bs.find_all(string='the prince')
print(f'prince_list length is {len(prince_list)}\n')
print(f'prince_list\n{prince_list}\n')

prince_list length is 0

prince_list
[]



---
### tree 이동

In [26]:
# Fetch the gift-table sample page. The context manager closes the HTTP
# response once BeautifulSoup has consumed it.
# Note: this rebinds `bs`, so every later cell navigates page3.html.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
	bs = BeautifulSoup(html, 'html.parser')

# Show the full markup of the row whose id is "gift2".
print(f'<tr id=gift2> contents\n{bs.find("tr", id="gift2")}\n')

<tr id=gift2> contents
<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>

<tr id=gift2>.previous_siblings is <tr id=gift1> contents


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>




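previous_sibling은 찾은 태그 바로 앞(위)에 있는 형제 노드 하나를 가져오고, previous_siblings는 앞에 있는 형제 노드를 가까운 것부터 차례로 모두 가져온다. 예를 들어 id가 gift2인 <tr> tag의 previous_siblings를 순회하면 그 위에 있는 id가 gift1인 <tr> tag와 제목 행(<tr><th>...)의 컨텐츠가, 사이사이의 줄바꿈 문자와 함께 출력된다.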

In [None]:
# Walk backwards from the gift2 row, printing each preceding sibling node
# (nearest first).
print('<tr id=gift2>.previous_siblings is <tr id=gift1> contents')
gift2_row = bs.find('tr', id='gift2')
for earlier_node in gift2_row.previous_siblings:
	print(earlier_node)

어떤 행을 선택하고 next_siblings를 하면 그 테이블의 다음 행들을 모두 가져온다.

In [27]:
# Start at the table's first <tr> and print every node that follows it at the
# same nesting level.
print('<table id="giftList">.tr.next_siblings')
gift_table = bs.find('table', id='giftList')
first_row = gift_table.tr
for following_node in first_row.next_siblings:
	print(following_node)

<table id="giftList">.tr.next_siblings


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
D

next_siblings가 아니라 next_sibling, previous_sibling을 사용하면 노드 하나만 반환한다. 이때 태그와 태그 사이에 whitespace('\n', '\r')가 있으면 다음 태그가 아니라 해당 공백 문자열이 반환된다.

In [28]:
# The node immediately after the gift3 row is a whitespace text node, not the
# next <tr>; printing its code point makes the invisible character visible.
gift3_row = bs.find('tr', id='gift3')
sibling1 = gift3_row.next_sibling
print(f'<tr id="gift3">.next_sibling\n{sibling1}')  # '\n'
# ord() only accepts a single character, so this relies on the node being
# exactly one character long.
print(f'<tr id="gift3">.next_sibling -> ord()\n{ord(sibling1)}')

<tr id="gift3">.next_sibling


<tr id="gift3">.next_sibling -> ord()
10


`<tr id='gift3'>`의 next_sibling인 `<tr id='gift4'>`만 가져오고 싶을때 next_sibling.next_sibling을 한다.

In [37]:
# Two hops are needed: the first lands on the node directly after gift3
# (a whitespace text node), the second on the node after that.
gift3_row = bs.find('tr', id='gift3')
node_after_gift3 = gift3_row.next_sibling
sibling2 = node_after_gift3.next_sibling
print(sibling2)

<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parrot! <span class="excitingNote">Or maybe he's only resting?</span>
</td><td>
$0.50
</td><td>
<img src="../img/gifts/img4.jpg"/>
</td></tr>


parent는 해당 요소를 감싸고 있는 박스, 거기서 previous_sibling하면 그 위의 tag 값

In [21]:
# Locate the first gift image, step up to its enclosing element, then sideways
# to the node just before that element.
img1 = bs.find('img', src='../img/gifts/img1.jpg')
img_cell = img1.parent
print(f'"../img/gifts/img1.jpg" parent\n{img_cell}\n')
print(f'"../img/gifts/img1.jpg" parent.previous_sibling\n{img_cell.previous_sibling}')

"../img/gifts/img1.jpg" parent
<td>
<img src="../img/gifts/img1.jpg"/>
</td>

"../img/gifts/img1.jpg" parent.previous_sibling
<td>
$15.00
</td>


In [40]:
# .children yields only the direct children of the table (one level deep).
gift_table = bs.find('table', id='giftList')
table_children = gift_table.children
print('<table id="giftList"> inner contents')
for child_node in table_children:
	print(child_node)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [44]:
# .descendants is a one-shot generator over every node nested under the table,
# at any depth. Materialize it once so the same nodes can be both counted and
# printed without a second find() and traversal, and so `table_dcd` is not left
# behind as an exhausted generator.
table_dcd = list(bs.find('table', id='giftList').descendants)
print(f'<table id="giftList"> descendants length is {len(table_dcd)}')
for dcd in table_dcd:
	print(dcd)

<table id="giftList"> descendants length is 86


<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<th>
Item Title
</th>

Item Title

<th>
Description
</th>

Description

<th>
Cost
</th>

Cost

<th>
Image
</th>

Image



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<td>
Vegetable Basket
</td>

Vegetable Basket

<td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td>

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!

<span class="excitingNote">Now with super-colorful bell peppers!</span>
Now with super-colorful bell peppers!


<td>
$15.00
</td>

---
# Regex