# Beautiful Soup Crawling

In [1]:
from bs4 import BeautifulSoup 

### 파일로 부터 읽기

In [2]:
with open('00_Example.html', encoding='utf8') as html:
    soup = BeautifulSoup(html, 'html.parser')

In [3]:
print(soup)

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Web Crawling Example</title>
</head>
<body>
<div>
<p>a</p><p>b</p><p>c</p>
</div>
<div class="ex_class">
<p>1</p><p>2</p><p>3</p>
</div>
<div id="ex_id">
<p>X</p><p>Y</p><p>Z</p>
</div>
<h1>This is a heading.</h1>
<p>This is a paragraph.</p>
<p>This is another paragraph.</p>
<a class="a" href="www.naver.com">Naver</a>
</body>
</html>


### 인터넷에서 가져오기

In [4]:
import urllib.request
import urllib.parse

In [5]:
web_url = 'http://200.1.220.217:3000/bbs/list/1'
with urllib.request.urlopen(web_url) as response:
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

In [6]:
print(soup)

<!DOCTYPE html>

<html lang="ko">
<head>
<title>My BBS</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="/fontawesome/css/all.min.css" rel="stylesheet"/>
<script src="/jquery/jquery.min.js"></script>
<script src="/popper/popper.min.js"></script>
<script src="/bootstrap/js/bootstrap.min.js"></script>
</head>
<body>
<nav class="navbar navbar-expand-lg bg-dark navbar-dark fixed-top">
<a class="navbar-brand" href="#">
<img alt="호서직업능력개발원" src="/img/hoseo.png" style="height: 40px; margin-left: 50px; margin-right: 80px;"/>
</a>
<ul class="nav mr-auto">
<li class="nav-item nav-light">
<a class="nav-link" href="/"><i class="fas fa-home"></i>홈</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/bbs/write"><i class="far fa-edit"></i>글쓰기</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/user/dispatch"><i class="far fa-user"></i>사용자</a>
</li>
<li class="n

### Beautiful Soup 사용법

In [24]:
with open('00_Example.html', encoding='utf8') as html:
    soup = BeautifulSoup(html, 'html.parser')

#### 태그를 이용해서 찾기

In [8]:
first_div = soup.find('div')
first_div

<div>
<p>a</p><p>b</p><p>c</p>
</div>

In [9]:
all_divs = soup.find_all('div')
print(all_divs)

[<div>
<p>a</p><p>b</p><p>c</p>
</div>, <div class="ex_class">
<p>1</p><p>2</p><p>3</p>
</div>, <div id="ex_id">
<p>X</p><p>Y</p><p>Z</p>
</div>]


In [12]:
all_ps = soup.find_all('p')
print(all_ps)

[<p>a</p>, <p>b</p>, <p>c</p>, <p>1</p>, <p>2</p>, <p>3</p>, <p>X</p>, <p>Y</p>, <p>Z</p>, <p>This is a paragraph.</p>, <p>This is another paragraph.</p>]


In [13]:
some_ps = first_div.find_all('p')
print(some_ps)

[<p>a</p>, <p>b</p>, <p>c</p>]


#### 태그와 속성을 이용해서 가져오기
- find('태그명', {'속성명1': '값1', ...}})
- find_all('태그명', {'속성명1': '값1', ...}})

In [14]:
ex_id = soup.find('div', {'id': 'ex_id'})
ex_id

<div id="ex_id">
<p>X</p><p>Y</p><p>Z</p>
</div>

In [15]:
ex_id = soup.find('div', id='ex_id')
ex_id

<div id="ex_id">
<p>X</p><p>Y</p><p>Z</p>
</div>

In [19]:
ex_class = soup.find('div', {'class': 'ex_class'})
ex_class

<div class="ex_class">
<p>1</p><p>2</p><p>3</p>
</div>

In [20]:
ex_class = soup.find('div', 'ex_class')
ex_class

<div class="ex_class">
<p>1</p><p>2</p><p>3</p>
</div>

In [22]:
ex_class = soup.find(class_ = 'ex_class')
ex_class

<div class="ex_class">
<p>1</p><p>2</p><p>3</p>
</div>

#### CSS Selector를 이용해서 가져오기
- select_one('#id')         --> 객체 하나
- select('.class1.class2')  --> 객체 리스트

In [18]:
# id
ex_id = soup.select('#ex_id')
print(ex_id)

[<div id="ex_id">
<p>X</p><p>Y</p><p>Z</p>
</div>]


In [23]:
# class
ex_class = soup.select('.ex_class')
print(ex_id)

[<div id="ex_id">
<p>X</p><p>Y</p><p>Z</p>
</div>]


In [25]:
soup.select('.sample')

[<div class="ex_class sample">
 <p>1</p><p>2</p><p>3</p>
 </div>,
 <a class="a sample" href="www.naver.com">Naver</a>]

In [26]:
soup.select('.a.sample')

[<a class="a sample" href="www.naver.com">Naver</a>]

In [27]:
soup.select_one('.a.sample')

<a class="a sample" href="www.naver.com">Naver</a>

#### 결과 가져오기

In [29]:
ex_id

[<div id="ex_id">
 <p>X</p><p>Y</p><p>Z</p>
 </div>]

In [33]:
first_p = ex_id[0].find('p')
first_p.get_text()

'X'

In [34]:
first_p.string

'X'

In [30]:
ex_class

[<div class="ex_class">
 <p>1</p><p>2</p><p>3</p>
 </div>]

In [35]:
class_ps = ex_class[0].find_all('p')
for one_p in class_ps:
    print(one_p.string)

1
2
3


#### 속성값 가져오기

In [37]:
a_tag = soup.find('a')
a_tag.get_text()

'Naver'

In [38]:
a_tag.attrs['href']

'www.naver.com'

### BBS 사이트 크롤링하기

In [39]:
web_url = 'http://200.1.220.217:3000/bbs/list/1'
with urllib.request.urlopen(web_url) as response:
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

In [42]:
table = soup.find('table')
rows = table.select('.d-flex')
first_row = rows[1]
first_row

<tr class="d-flex">
<td class="col-1" style="text-align: center;">1013</td>
<td class="col-6"><a href="/bbs/bid/1013/inc/1"><strong>리눅스 글쓰기</strong></a></td>
<td class="col-2" style="text-align: center;">홍길동</td>
<td class="col-2" style="text-align: center;">2020-10-27</td>
<td class="col-1" style="text-align: center;">3</td>
</tr>

In [44]:
tds = first_row.find_all('td')
tds

[<td class="col-1" style="text-align: center;">1013</td>,
 <td class="col-6"><a href="/bbs/bid/1013/inc/1"><strong>리눅스 글쓰기</strong></a></td>,
 <td class="col-2" style="text-align: center;">홍길동</td>,
 <td class="col-2" style="text-align: center;">2020-10-27</td>,
 <td class="col-1" style="text-align: center;">3</td>]

In [45]:
for td in tds:
    print(td.string)

1013
리눅스 글쓰기
홍길동
2020-10-27
3


#### 데이터 프레임으로 만들기

In [60]:
trs = soup.find_all('tr')
bids = []; titles = []; names = []
times = []; view_counts = []; reply_counts = []

In [61]:
for tr in trs[1:]:
    tds = tr.find_all('td')
    span = tds[1].find('span')
    reply_count = span.string[1:-1] if span else '0'
    index = tds[1].get_text().find('[')
    title = tds[1].get_text()[:index] if span else tds[1].get_text()
    #print(reply_count, title)
    bids.append(tds[0].string)
    titles.append(title)
    names.append(tds[2].string)
    times.append(tds[3].string)
    view_counts.append(tds[4].string)
    reply_counts.append(reply_count)

In [62]:
titles

['리눅스 글쓰기',
 '마이크로소프트 엣지에서의 동작',
 '역적 - 백성을 훔친 도적',
 '홍길동전',
 '대조영',
 'Linux server를 원격 Windows에서 접속하여 글쓰기',
 'ubuntu에서 글쓰기',
 '슬기로운 의사생활',
 '파리의 연인',
 '시크릿 가든']

In [64]:
import pandas as pd 
bbs = pd.DataFrame({
        'bid': bids, 
        'title': titles,
        'name': names,
        'time': times,
        'view_count': view_counts,
        'reply_count': reply_counts
    })
bbs = bbs.set_index('bid')
bbs

Unnamed: 0_level_0,title,name,time,view_count,reply_count
bid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1013,리눅스 글쓰기,홍길동,2020-10-27,3,0
1012,마이크로소프트 엣지에서의 동작,김은숙,2020-10-23,14,2
1011,역적 - 백성을 훔친 도적,홍길동,2020-10-23,3,1
1010,홍길동전,홍길동,2020-10-23,3,0
1009,대조영,대조영,2020-10-23,9,2
1008,Linux server를 원격 Windows에서 접속하여 글쓰기,관리자,2020-10-23,7,3
1007,ubuntu에서 글쓰기,관리자,2020-10-22,5,1
1006,슬기로운 의사생활,이우정,2020-10-22,11,3
1005,파리의 연인,김은숙,2020-10-22,8,4
1004,시크릿 가든,김은숙,2020-10-22,4,0


In [65]:
base_url = 'http://200.1.220.217:3000/bbs/list/'
page_url = '1'
web_url = base_url + page_url
with urllib.request.urlopen(web_url) as response:
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

In [66]:
lis = soup.select('.page-item')     # soup.find_all('li')
lis

[<li class="page-item">
 <a aria-label="Previous" class="page-link active" href="#">
 <span aria-hidden="true">«</span></a>
 </li>,
 <li aria-current="page" class="page-item active">
 <span class="page-link">
                                 1<span class="sr-only">(current)</span>
 </span>
 </li>,
 <li class="page-item"><a class="page-link" href="/bbs/list/2">2</a></li>,
 <li class="page-item">
 <a aria-label="Next" class="page-link" href="#">
 <span aria-hidden="true">»</span></a>
 </li>]

In [68]:
page = int(lis[-2].string)
page

2

In [69]:
bids = []; titles = []; names = []
times = []; view_counts = []; reply_counts = []

In [70]:
for i in range(page):
    page_url = str(i+1)
    web_url = base_url + page_url
    with urllib.request.urlopen(web_url) as response:
        html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        trs = soup.find_all('tr')
        for tr in trs[1:]:
            tds = tr.find_all('td')
            span = tds[1].find('span')
            reply_count = span.string[1:-1] if span else '0'
            index = tds[1].get_text().find('[')
            title = tds[1].get_text()[:index] if span else tds[1].get_text()
            #print(reply_count, title)
            bids.append(tds[0].string)
            titles.append(title)
            names.append(tds[2].string)
            times.append(tds[3].string)
            view_counts.append(tds[4].string)
            reply_counts.append(reply_count)

In [71]:
bbs = pd.DataFrame({
        'bid': bids, 
        'title': titles,
        'name': names,
        'time': times,
        'view_count': view_counts,
        'reply_count': reply_counts
    })
bbs = bbs.set_index('bid')
bbs

Unnamed: 0_level_0,title,name,time,view_count,reply_count
bid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1013,리눅스 글쓰기,홍길동,2020-10-27,3,0
1012,마이크로소프트 엣지에서의 동작,김은숙,2020-10-23,14,2
1011,역적 - 백성을 훔친 도적,홍길동,2020-10-23,4,1
1010,홍길동전,홍길동,2020-10-23,3,0
1009,대조영,대조영,2020-10-23,9,2
1008,Linux server를 원격 Windows에서 접속하여 글쓰기,관리자,2020-10-23,7,3
1007,ubuntu에서 글쓰기,관리자,2020-10-22,5,1
1006,슬기로운 의사생활,이우정,2020-10-22,12,3
1005,파리의 연인,김은숙,2020-10-22,9,4
1004,시크릿 가든,김은숙,2020-10-22,4,0
