# 데이터 크롤링과 정제
## HTML 분석 및 정규식


## HTML 분석 개념
- 복잡한 웹페이지에서 필요한 정보 가져오기
    - 원하지 않는 콘텐츠 제거
    
    - 원하는 정보는 다양한 곳에 존재
        - 페이지 타이틀
        - 페이지 URL
        - 원하는 정보가 정형화되어 있지 않은 경우, 문제 발생

### HTML 분석
- https://www.pythonscraping.com/pages/warandpeace.html
    - 등장인물 대사: 빨간색
    - 등장인물 이름: 녹색
            
- `<span>`: 인라인 요소들을 하나로 묶을 때 사용
- `<span>`은 인라인 요소이므로 앞뒤로 줄 바꿈이 일어나지 않음

## CSS 속성을 이용한 검색
- 속성(attrs) 사용
    - find() 함수에 이름, 속성, 속성값을 이용하여 원하는 태그를 검색
        - 맨 처음 검색 결과만 리턴

In [2]:
from bs4 import BeautifulSoup

# A single <span> carrying a class attribute, wrapped in literal quote characters.
html_text = '"<span class="red">Heavens! what a virulent attack!</span>"'
soup = BeautifulSoup(html_text, 'html.parser')

# Look up the <span> tag and inspect its attributes and text.
object_tag = soup.find('span')
tag_attrs = object_tag.attrs  # attrs: the tag's attributes, returned as a dictionary
print('object_tag:', object_tag)
print('attrs:', tag_attrs)
print('value:', tag_attrs['class'])
print('text:', object_tag.text)

object_tag: <span class="red">Heavens! what a virulent attack!</span>
attrs: {'class': ['red']}
value: ['red']
text: Heavens! what a virulent attack!


In [3]:
# Search for tags by CSS attribute.
# get_text(): returns only the text, with the tags stripped.
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Use the response as a context manager so the HTTP connection is closed
# once the page has been parsed.
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html, "html.parser")

# Character names are all green: collect every <span class='green'> ... </span>.
nameList = bs.find_all('span', {'class': 'green'})
for name in nameList:
    print(name.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


## 특정 단어 찾기
- find_all(text='검색어') — Beautiful Soup 4.4.0부터는 find_all(string='검색어') 사용을 권장 (text는 이전 이름)
    - 대소문자 구분
    - 검색어:'the prince'
    

In [6]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Use the response as a context manager so the HTTP connection is closed
# once the page has been parsed.
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html, "html.parser")

# `string=` is the current name of the keyword formerly called `text=`
# (renamed in Beautiful Soup 4.4.0). The search is case-sensitive.
princeList = bs.find_all(string='the prince')
print(princeList)
print('the prince count:', len(princeList))

['the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince', 'the prince']
the prince count: 7


## 트리 이동
- 트리 이동
    - 문서 내부에서 특정 위치를 기준으로 태그를 찾을 때
    - 단방향으로 트리 이동
    - bs.tag.subTag.anotherSubTag
- 온라인 쇼핑 사이트 구성 및 트리 이동
    - https://www.pythonscraping.com/pages/page3.html

## 온라인 쇼핑몰 테이블 구성 현황

In [9]:
# Tree navigation: children and descendants
# Children: .children
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Use the response as a context manager so the HTTP connection is closed
# once the page has been parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Print each direct child of the gift table in turn.
table_tag = bs.find('table', {'id': 'giftList'})
for child in table_tag.children:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [10]:
# Tree navigation: descendants
# Descendants: .descendants
gift_table = bs.find('table', attrs={'id': 'giftList'})
desc = gift_table.descendants
# Consume the iterator to count how many items it yields.
descendant_count = sum(1 for _ in desc)
print('descendants 개수:', descendant_count)

descendants 개수: 86


In [12]:
gift_table = bs.find('table', attrs={'id': 'giftList'})
# Each <tr>, each <th> inside it, and the contents of each <th> are printed separately.
for child in gift_table.descendants:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
<th>
Item Title
</th>

Item Title

<th>
Description
</th>

Description

<th>
Cost
</th>

Cost

<th>
Image
</th>

Image



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
<td>
Vegetable Basket
</td>

Vegetable Basket

<td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td>

This vegetable basket is the perfect gift for your health conscious (or overweight) friends!

<span class="excitingNote">Now with super-colorful bell peppers!</span>
Now with super-colorful bell peppers!


<td>
$15.00
</td>

$15.00

<td>
<img src="../img/gifts/img1.jpg"

## 트리 이동: 형제 다루기
- 형제: next_siblings 속성
    - 임의의 행을 선택하고 next_siblings를 사용하면,
        - 그 행 뒤에 오는 형제들을 모두 선택 (선택한 행 자신과 이전 형제는 제외)
       

In [13]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Use the response as a context manager so the HTTP connection is closed
# once the page has been parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Start at the table's first <tr> and print every sibling that follows it.
for sibling in bs.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [14]:
# previous_siblings attribute
# Selects the items that come before the chosen row.
second_gift = bs.find('tr', attrs={'id': 'gift2'})
for sibling in second_gift.previous_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


