In [2]:
from bs4 import BeautifulSoup
import re
import json
from IPython.display import Image

Always use BeautifulSoup to parse HTML-formatted strings into a searchable soup object (here with the 'html.parser' backend)

# soup.find() to find a certain tag

In [22]:
# Small hand-written HTML snippet used by the examples below: one <div> wrapping two <p> tags.
example_tag = "<div class = 'large_div'><p class='myclass1' id='myid1'> Sample Sentence 1 </p><p class='myclass2' id='myid2'> Sample Sentence 2 </p></div>"
# Parse the string with the "html.parser" backend to get a searchable soup object.
example_soup = BeautifulSoup(example_tag, "html.parser")
# Printing the soup serializes the parsed markup back to HTML text.
print(example_soup)

<div class="large_div"><p class="myclass1" id="myid1"> Sample Sentence 1 </p><p class="myclass2" id="myid2"> Sample Sentence 2 </p></div>


In [4]:
example_soup.find("p")

<p class="myclass1" id="myid1"> Sample Sentence 1 </p>

In [5]:
example_soup.find_all("p")

[<p class="myclass1" id="myid1"> Sample Sentence 1 </p>,
 <p class="myclass2" id="myid2"> Sample Sentence 2 </p>]

* find_all() returns all matching tags, while find() returns only one (the first match)
* You can narrow the search with attributes, like 'id', 'class', etc...

Here, if you believe the web page is built under 'highly strict rules', so that the order and structure of all the contents & tags are uniform, you may use 'indexing' on the result of find_all() to get the info you want. 

For example, if first "p" tag always include title, and second "p" tag always include contents...

However, in most cases there are lots of variations & exceptions, so it is better to find well-defined attributes for the data you are looking for

In [6]:
example_soup.find("p", {"class" : "myclass1"})

<p class="myclass1" id="myid1"> Sample Sentence 1 </p>

In [7]:
example_soup.find("p", {"class" : "myclass2"})

<p class="myclass2" id="myid2"> Sample Sentence 2 </p>

You can also find the enclosing (outer) tag first, and then narrow down within it to the information you are looking for

In [8]:
divtag = example_soup.find("div", {"class" : "large_div"})
print (divtag)

<div class="large_div"><p class="myclass1" id="myid1"> Sample Sentence 1 </p><p class="myclass2" id="myid2"> Sample Sentence 2 </p></div>


In [9]:
data = divtag.find("p", {"id" : "myid2"})
data.text

' Sample Sentence 2 '

You can also search by attributes alone, such as 'class' or 'id', without specifying a tag name

In [10]:
# Three ways to filter by attribute:
# 1) keyword argument -- spelled `class_` because `class` is a reserved word in Python
print (divtag.find(class_ = "myclass1").text)
# 2) an explicit attrs dictionary, where the plain key "class" can be used
print (divtag.find(attrs = {"class" : "myclass2"}).text)
# 3) a tag name combined with a keyword argument for `id`
print (divtag.find('p', id = "myid1").text)

 Sample Sentence 1 
 Sample Sentence 2 
 Sample Sentence 1 


Also, you can extract 'attribute value' from tags

In [11]:
example_soup.find("p", {"class" : "myclass1"})["id"]

'myid1'

# soup.select() and CSS Selector

You can also use select(), instead of find()

* find() >> select_one()
* find_all() >> select()

In [12]:
example_soup.select('p')

[<p class="myclass1" id="myid1"> Sample Sentence 1 </p>,
 <p class="myclass2" id="myid2"> Sample Sentence 2 </p>]

In [13]:
example_soup.select_one('p')

<p class="myclass1" id="myid1"> Sample Sentence 1 </p>

In [14]:
# Chained find(): take the first <div>, then the first <p> found inside it.
print (example_soup.find('div').find('p'))
# CSS child combinator: the first <p> that is a direct child of a <div>.
print (example_soup.select_one('div > p'))

<p class="myclass1" id="myid1"> Sample Sentence 1 </p>
<p class="myclass1" id="myid1"> Sample Sentence 1 </p>


You can specify class with '.', id with '#'

In [15]:
# Any tag whose class is "myclass1"
print (example_soup.select_one('.myclass1'))
# A <p> tag whose class is "myclass1"
print (example_soup.select_one('p.myclass1'))
# A <p> tag whose id is "myid1"
print (example_soup.select_one('p#myid1'))
# A <p> tag that has both id "myid1" and class "myclass1"
print (example_soup.select_one('p#myid1.myclass1'))

<p class="myclass1" id="myid1"> Sample Sentence 1 </p>
<p class="myclass1" id="myid1"> Sample Sentence 1 </p>
<p class="myclass1" id="myid1"> Sample Sentence 1 </p>
<p class="myclass1" id="myid1"> Sample Sentence 1 </p>


# Example with Naver News

In [16]:
# Open the saved page in binary mode so BeautifulSoup receives the raw bytes
# and detects the document's encoding itself, instead of depending on the
# platform-specific default text encoding that open() would otherwise use.
with open("news.html", "rb") as f:
    # BeautifulSoup accepts an open file handle; parsing happens while the
    # file is still open inside this `with` block.
    soup = BeautifulSoup(f, 'html.parser')

Let's try to get the title and link of each article. 
We can see that the articles are contained in the following structure:
```
<div class = "content">
    <div...>
        <ul...>
            <li>
            <li>
            ...
        </ul>
    </div>
</div>

```
Multiple articles are included as list items ('li' tags)

![title](img/1.png)

![title](img/2.png)

Inside each 'li' tag, the title is located in an 'a' tag, which holds both the title text and the hyperlink (as its 'href' attribute)

![title](img/3.png)

In [17]:
# Locate the main container: the <div> whose class is "content" and whose id is "main_content".
contents_box = soup.find("div", class_="content", id="main_content")
# Collect every <li> nested inside that container.
article_lists = contents_box.find_all("li")

## Question: Here, why is it safe (or okay) to use find_all("li") without specifying any attributes?

In [18]:
print ("Number of articles: {}".format(len(article_lists)))

Number of articles: 20


In [19]:
def get_info(info):
    """Build a one-entry ``{title: link}`` dict from a single article element.

    The title is the whitespace-stripped text of the LAST ``<a>`` found under
    ``info``; the link is the ``href`` attribute of the FIRST ``<a>``.

    Raises ``IndexError`` when ``info`` contains no ``<a>`` tag at all.
    """
    anchors = info.find_all("a")
    headline = anchors[-1].text.strip()
    url = anchors[0]["href"]
    return {headline: url}

## Question: Here, why do we index [-1] for "a", and why is this a safe measure here?

In [20]:
# Map each article's position in the list to its {title: link} record.
collection = {}
for i, info in enumerate(article_lists):
    collection[i] = get_info(info)

In [21]:
collection

{0: {'SPAIN FOREST FIRE': 'https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=001&oid=091&aid=0008873298'},
 1: {'SPAIN FOREST FIRE': 'https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=001&oid=091&aid=0008873297'},
 2: {'[마감]지수선물 보합..431.50(0.00p)': 'https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=001&oid=018&aid=0004993139'},
 3: {'[마감]국채선물 상승..110.30 +4틱': 'https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=001&oid=018&aid=0004993138'},
 4: {'경제계가 이재용의 ‘가석방’ 아닌 ‘사면’ 원하는 이유': 'https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=001&oid=119&aid=0002513957'},
 5: {'동양생명, 우리금융지주 지분 전량 매각…왜?': 'https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=001&oid=032&aid=0003087433'},
 6: {'美 셔먼 "한반도 비핵화는 중국과 협조할 영역"': 'https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=001&oid=002&aid=0002201455'},
 7: {"전주지법, 8월 6일까지 2주간 '하계 법정 휴정'": 'https://news.naver.com/main/read.naver?mode=LSD&mid=sec&sid1=001&oid=421&aid=0005497685'},
 