In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.font_manager as fm
font_name = fm.FontProperties(fname="C:\\Windows\\Fonts\\malgun.ttf").get_name()
plt.rc("font", family=font_name)

import matplotlib as mlp
mlp.rcParams["axes.unicode_minus"] = False

# 1. XML

+ 특징
    - 메타 언어 : GML -> SGML -> XML
    - 데이터를 위한 언어
    - DBMS
    - 데이터 표준화 : 이기종 시스템간의 정보교환, 웹서비스, 유비쿼터스, 사물인터넷 등등 ...
    
+ 구성 요소
    - dtd
    - xml schema
    - xml
    - xsl, xslt
    
+ https://docs.python.org/3/library/xml.etree.elementtree.html

In [2]:
import xml.etree.ElementTree as et

##### XML api 사용 방법
# 1. xml이 파일로 존재하는 경우 : parse()
# 2. xml이 메모리에 로드된 경우 : fromstring()

## (1) XML 데이터 불러오기

In [3]:
##### xml이 파일로 존재하는 경우
tree1 = et.parse("data/users.xml")
tree1

<xml.etree.ElementTree.ElementTree at 0x1c208423640>

In [4]:
##### xml이 이미 메모리에 로드되어 있는 경우

xmlstr = """<?xml version="1.0" encoding="utf-8" ?>
<users>
    <user grade="gold">
            <name>Kim Cheol Soo</name>
            <age>25</age>
            <birthday>19940215</birthday>
        </user>
        <user grade="diamond">
            <name>Kim Yoo Mee</name>
            <age>21</age>
            <birthday>19980417</birthday>
        </user>
</users>
"""
tree2 = et.fromstring(xmlstr)
tree2

<Element 'users' at 0x000001C233609540>

## (2) XML 데이터 다루기

### 1) 태그명 검색

In [5]:
print(tree1.find("user"))
print(tree1.find("user[1]"))
print(tree1.find("user[2]"))

<Element 'user' at 0x000001C2335FBEF0>
<Element 'user' at 0x000001C2335FBEF0>
<Element 'user' at 0x000001C233609090>


In [6]:
data = tree1.find("user[2]")
print(type(data))
dir(data)

print(data.tag)
print(data.attrib)
print(data.get("grade"))

print("------------------------------------------------")

username = data.find("name")
print(username.tag)
print(username.attrib)
print(username.text)

<class 'xml.etree.ElementTree.Element'>
user
{'grade': 'diamond'}
diamond
------------------------------------------------
name
{}
Kim Yoo Mee


In [9]:
data = tree1.find("./user[@grade='diamond']")

print(data)
print(data.attrib)
print(data.keys())
print(data.items())

<Element 'user' at 0x000001C233609090>
{'grade': 'diamond'}
['grade']
[('grade', 'diamond')]


### 3) 여러 개의 태그를 한꺼번에 가져오기

In [11]:
users = tree1.findall("./user")
users

for i in users:
    print(i.attrib)
    print(i.find("name").text)

{'grade': 'gold'}
Kim Cheol Soo
{'grade': 'diamond'}
Kim Yoo Mee


# 2. JSON

In [12]:
import json

# dumps() : 데이터 저장
# loads() : 데이터 불러올 때

In [13]:
j1 = {"name" : "홍길동", "birth" : "0101", "age" : 20}
print(type(j1))
print(j1)

print("---------------------------------------------------------")

j2 = json.dumps(j1)
print(type(j2))
print(j2)

print("---------------------------------------------------------")

j3 = json.dumps((1, 2, 3))
print(type(j3))
print(j3)

print("---------------------------------------------------------")

j4 = json.loads(j2)
print(type(j4))
print(j4)


<class 'dict'>
{'name': '홍길동', 'birth': '0101', 'age': 20}
---------------------------------------------------------
<class 'str'>
{"name": "\ud64d\uae38\ub3d9", "birth": "0101", "age": 20}
---------------------------------------------------------
<class 'str'>
[1, 2, 3]
---------------------------------------------------------
<class 'dict'>
{'name': '홍길동', 'birth': '0101', 'age': 20}


In [14]:
obj = """
{
	"id": "0001",
	"type": "donut",
	"name": "Cake",
	"ppu": 0.55,
	"batters":
		{
			"batter":
				[
					{ "id": "1001", "type": "Regular" },
					{ "id": "1002", "type": "Chocolate" },
					{ "id": "1003", "type": "Blueberry" },
					{ "id": "1004", "type": "Devil's Food" }
				]
		},
	"topping":
		[
			{ "id": "5001", "type": "None" },
			{ "id": "5002", "type": "Glazed" },
			{ "id": "5005", "type": "Sugar" },
			{ "id": "5007", "type": "Powdered Sugar" },
			{ "id": "5006", "type": "Chocolate with Sprinkles" },
			{ "id": "5003", "type": "Chocolate" },
			{ "id": "5004", "type": "Maple" }
		]
}
"""

print(type(obj))


<class 'str'>


# 3. BeautifulSoup

+ https://www.crummy.com/software/BeautifulSoup/

## (1) 웹 소스 가져오기

In [7]:
import urllib.request as req
from urllib.error import HTTPError, URLError

In [8]:
google = req.urlopen("https://google.com")
html = google.read()
print(type(html))

<class 'bytes'>


In [15]:
try:
    google = req.urlopen("https://google.com")
except HTTPError as e:
    print("HTTPError : " , e)
except ULRError as e:
    print("URLError : " , e)
else:
    html = google.read()

In [16]:
req.urlretrieve("https://www.google.co.kr/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png", "data/google1.png")


('data/google1.png', <http.client.HTTPMessage at 0x1c2347c4910>)

In [17]:
img = req.urlopen("https://www.google.co.kr/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png")
img_data = img.read()
img_data

f = open("data/google2.png", "wb")
f.write(img_data)
f.close()

## (2) BeautifulSoup 사용법

+ pip install beautifulsoup4

In [20]:
from bs4 import BeautifulSoup

In [27]:
page = open("data/test_first.html")
page

<_io.TextIOWrapper name='data/test_first.html' mode='r' encoding='cp949'>

In [22]:
page.read()

'<!DOCTYPE html>\n<html><head>\n        <title>Very Simple HTML Code by Netsong7</title>\n    </head><body>\n        <div>\n            <p class="inner-text first-item" id="first">\n                Happy PinkWink.\n                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>\n            </p>\n            <p class="inner-text second-item">\n                Happy Data Science.\n                <a href="https://www.python.org" id="py-link">Python</a>\n            </p>\n        </div>\n        <p class="outer-text first-item" id="second">\n            <b>\n                Data Science is funny.\n            </b>\n        </p>\n        <p class="outer-text">\n            <b>\n                All I need is Love.\n            </b>\n        </p>\n    </body>\n</html>'

In [28]:
soup = BeautifulSoup(page, "html.parser")

In [29]:
soup

<!DOCTYPE html>

<html><head>
<title>Very Simple HTML Code by Netsong7</title>
</head><body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>
</html>

In [30]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Very Simple HTML Code by Netsong7
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    Happy PinkWink.
    <a href="http://netsong7.synology.me" id="pw-link">
     PinkWink
    </a>
   </p>
   <p class="inner-text second-item">
    Happy Data Science.
    <a href="https://www.python.org" id="py-link">
     Python
    </a>
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    Data Science is funny.
   </b>
  </p>
  <p class="outer-text">
   <b>
    All I need is Love.
   </b>
  </p>
 </body>
</html>


## 1)원하는 위치에 접근하는 방법(순차적인 방식)

In [46]:
print(list(soup.children))
print("---------------------------------------------")
print(list(soup.children)[0])
print("---------------------------------------------")
print(list(soup.children)[1])
print("---------------------------------------------")
print(list(soup.children)[2])

['html', '\n', <html><head>
<title>Very Simple HTML Code by Netsong7</title>
</head><body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>
</html>]
---------------------------------------------
html
---------------------------------------------


---------------------------------------------
<html><head>
<title>Very Simple HTML Code by Netsong7</title>
</head><body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">P

In [41]:
html = list(soup.children)[2]
print(list(html.children))
print("---------------------------------------------")
body = list(html.children)[1]
print(body)

[<head>
<title>Very Simple HTML Code by Netsong7</title>
</head>, <body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>, '\n']
---------------------------------------------
<body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer

## 2)태그명으로 접근하는 방법

In [45]:
print(soup.head)
print("---------------------------------------------")
print(soup.body)
print("---------------------------------------------")
print(soup.body.div)
print("---------------------------------------------")
print(soup.body.div.p)
print("---------------------------------------------")
print(soup.p)

<head>
<title>Very Simple HTML Code by Netsong7</title>
</head>
---------------------------------------------
<body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>
---------------------------------------------
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</

## 3) 함수로 접근하는 방법

+ find(), find_all()

In [47]:
soup.find("p")

<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>

In [48]:
soup.find_all("p")

[<p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [49]:
soup.find("p", class_="outer-text")

<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>

In [50]:
soup.find("p", id="second")

<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>

## 4) 다양한 순차 접근 방식

In [56]:
soup.head.nextSibling   # 형제
soup.body.previous_sibling
list(soup.body.div.next_siblings)
soup.body.div.parent

<body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://netsong7.synology.me" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>

## 5) Text Node 가져오기

+ get_text()
+ text

In [58]:
soup.html.get_text()
soup.head.get_text()

'\nVery Simple HTML Code by Netsong7\n'

In [59]:
soup.head.title.get_text()

'Very Simple HTML Code by Netsong7'

In [63]:
soup.div.get_text()

'\n\n                Happy PinkWink.\n                PinkWink\n\n\n                Happy Data Science.\n                Python\n\n'

In [64]:
soup.p.get_text()

'\n                Happy PinkWink.\n                PinkWink\n'

In [66]:
for p in soup.find_all("p"):
    print(p.get_text())


                Happy PinkWink.
                PinkWink


                Happy Data Science.
                Python



                Data Science is funny.
            



                All I need is Love.
            



## 6) 속성에 접근하기

In [70]:
a = soup.find("a")
a

a["href"]

for n in soup.find_all("a"):
    print(n["href"])

http://netsong7.synology.me
https://www.python.org


# (3) 실습

## 1) 네이버에서 환율 정보 가져오기
+ https://finance.naver.com/marketindex/


In [71]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [73]:
# 웹 소스 준비
url = "https://finance.naver.com/marketindex/"
page = urlopen (url)

# 가져온 웹 소스를 트리 구조로 변환
soup = BeautifulSoup(page, "html.parser")

In [81]:
span = soup.find_all("span", class_="value")

print("미 환율:", span[0].get_text())

미 환율: 1,290.60


In [95]:
# div 태그로 접근
div = soup.find_all("div", class_="head_info point_dn")
print("미 환율:", div[0].span.get_text())
for span in div[0]:
    print(span.get_text())

미 환율: 1,290.60


1,290.60


원


 2.40


하락




In [98]:
# CSS

# soup.select_one(".value")
soup.select_one("div.head_info")

<div class="head_info point_dn">
<span class="value">1,290.60</span>
<span class="txt_krw"><span class="blind">원</span></span>
<span class="change"> 2.40</span>
<span class="blind">하락</span>
</div>

In [100]:
span = soup.select_one("div.head_info > span.value")
print(span.get_text())

1,290.60


In [101]:
spans = soup.select("div.head_info > span.value")
for span in spans:
    print(span.get_text())

1,290.60
955.61
1,359.26
193.18
134.9600
1.0536
1.2256
104.4800
107.99
2114.43
1835.6
76346.99


## 2) 스크래핑 연습

In [102]:
##### 웹 소스 가져오기
url = "http://www.pythonscraping.com/pages/warandpeace.html"
page = urlopen (url)

# 가져온 웹 소스를 트리 구조로 변환
soup = BeautifulSoup(page, "html.parser")

In [109]:
##### 녹색 단어만 골라오기
greens = soup.find_all("span", class_="green")
for green in greens:
    print(green.get_text())
    
print("------------------------------------------------")

span = soup.find_all("span", {"class" : "green"})
for s in span:
    print(s.get_text())
    
print("------------------------------------------------")

# greens = soup.select(".green")
greens = soup.select("div#text > span.green")
for green in greens:
    print(green.get_text())

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna
------------------------------------------------
Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the b

In [115]:
##### 제목(h1, h2)

h1 = soup.find("h1")
print(h1.get_text())

h2 = soup.find("h2")
print(h2.get_text())

titles = soup.find_all(["h1", "h2"])
print([t.get_text() for t in titles])

War and Peace
Chapter 1
['War and Peace', 'Chapter 1']


In [116]:
##### 웹 소스 가져오기
url = "http://www.pythonscraping.com/pages/page3.html"
page = urlopen (url)

# 가져온 웹 소스를 트리 구조로 변환
soup = BeautifulSoup(page, "html.parser")

In [None]:
##### 제목 행은 건너뛰고 나머지 모든 행을 수집 (클래스 속성이 없다고 가정)

# id 속성을 이용
tables = soup.find_all("tr", {"id":["gift1", "gift2", "gift3", "gift4", "gift5"]})
for text in tables:
    print(text.get_text())


In [None]:
# id 속성이 매우 많다면?(즉, 행의 갯수가 매우 많다면?)
tables = []
for i in range(1, 6):
    tables.extend(soup.find_all("tr", {"id":["gift{}".format(i)]}))
    
for text in tables:
    print(text.get_text())

In [None]:
# tables = list(soup.html.body.div.table.tr.next_siblings)
# for text in tables:
#     print(text.get_text())

tb = soup.find("table", {"id":"giftList"})
contents = tb.tr.next_siblings


for content in contents:
    print(type(content))

In [None]:
tb = soup.find("table", {"id":"giftList"})
contents = tb.tr.next_siblings
# tr태그 값만 가져오기
for content in contents:
    if(type(content) == bs4.element.Tag):
        for i in range(3):
            print(list(content.children)[int("{}".format(i))].get_text())
        print(list(content.children)[3].img["src"])
        print("-------------------------------------------------")

In [None]:
##### 가격 중에 $15.00 수집
contents = list(soup.html.body.div.table.tr.next_siblings)
for content in contents:
    if(type(content) == bs4.element.Tag):
        print(list(content.children)[2].get_text())
        break

In [None]:
print(soup.select("td")[2].get_text().replace("\n", ""))

In [None]:
print(soup.find_all("td")[2].get_text().replace("\n", ""))

In [None]:
price = soup.find("img", src="../img/gifts/img1.jpg").parent.previous_sibling
print(price.get_text().replace("\n", ""))

In [None]:
table = soup.find("table", id="giftList")
list(list(table.tr.next_siblings)[1].td.next_siblings)[1].get_text()


In [None]:
soup.find("table", id="giftList").td.next_sibling.next_sibling.get_text()