## 데이터 다운로드하기

In [1]:
# Fetch a remote PNG and store it on disk under a fixed file name.
import urllib.request

url = "http://uta.pw/shodou/img/28/214.png"  # remote image to download
savename = "test.png"  # local file the image is written to

# urlretrieve copies the object at `url` directly into the file `savename`.
urllib.request.urlretrieve(url, savename)

print("저장되었습니다.")

저장되었습니다.


urllib.request.urlretrieve(url, filename=None, reporthook=None, data=None): [Link](https://docs.python.org/ko/3/library/urllib.request.html?highlight=urlretrieve#urllib.request.urlretrieve)
- Copy a network object denoted by a URL to a local file. If the URL points to a local file, the object will not be copied unless filename is supplied. Return a tuple (filename, headers) where filename is the local file name under which the object can be found, and headers is whatever the info() method of the object returned by urlopen() returned (for a remote object). Exceptions are the same as for urlopen().

- The second argument, if present, specifies the file location to copy to (if absent, the location will be a tempfile with a generated name). The third argument, if present, is a callable that will be called once on establishment of the network connection and once after each block read thereafter. The callable will be passed three arguments; a count of blocks transferred so far, a block size in bytes, and the total size of the file. The third argument may be -1 on older FTP servers which do not return a file size in response to a retrieval request.

In [5]:
# Read the raw response body of a small INI-style API endpoint into memory.
import urllib.request

url = "http://api.aoikujira.com/ip/ini"
# The context manager closes the HTTP response (and its socket) as soon as
# the body has been read, instead of leaving it open until garbage collection.
with urllib.request.urlopen(url) as res:
    data = res.read()  # bytes, not yet decoded

data

b'[ip]\nAPI_URI=http://api.aoikujira.com/ip/get.php\nREMOTE_ADDR=223.38.62.202\nREMOTE_HOST=223.38.62.202\nREMOTE_PORT=60150\nHTTP_HOST=api.aoikujira.com\nHTTP_USER_AGENT=Python-urllib/3.8\nHTTP_ACCEPT_LANGUAGE=\nHTTP_ACCEPT_CHARSET=\nSERVER_PORT=80\nFORMAT=ini\n\n'

In [6]:
# `data` holds the bytes read from the HTTP response in the preceding cell;
# decode it as UTF-8 so it prints as readable text.
text = data.decode('utf-8')
print(text)

[ip]
API_URI=http://api.aoikujira.com/ip/get.php
REMOTE_ADDR=223.38.62.202
REMOTE_HOST=223.38.62.202
REMOTE_PORT=60150
HTTP_HOST=api.aoikujira.com
HTTP_USER_AGENT=Python-urllib/3.8
HTTP_ACCEPT_LANGUAGE=
HTTP_ACCEPT_CHARSET=
SERVER_PORT=80
FORMAT=ini




In [9]:
# Build a query string from a dict, request the feed and print it as text.
import urllib.request
import urllib.parse
API = "http://www.kma.go.kr/weather/forecast/mid-term-rss3.jsp"
values = {
    'stnId': '108'  # query parameter sent to the feed (presumably a station/region id)
}
# urlencode escapes keys and values and joins them as key=value pairs.
params = urllib.parse.urlencode(values)
url = API + "?" + params
print("url=", url)
# Close the HTTP response deterministically once the body has been read.
with urllib.request.urlopen(url) as res:
    data = res.read()
text = data.decode("utf-8")
print(text)

url= http://www.kma.go.kr/weather/forecast/mid-term-rss3.jsp?stnId=108
<?xml version="1.0" encoding="utf-8" ?>
<rss version="2.0">
<channel>
<title>기상청 육상 중기예보</title>
<link>http://www.kma.go.kr/weather/forecast/mid-term_01.jsp</link>
<description>기상청 날씨 웹서비스</description>
<language>ko</language>
<generator>기상청</generator>
<pubDate>2020년 01월 14일 (화)요일 06:00</pubDate>
 <item>
<author>기상청</author>
<category>육상중기예보</category>
<title>전국 육상 중기예보 - 2020년 01월 14일 (화)요일 06:00 발표</title>
<link>http://www.kma.go.kr/weather/forecast/mid-term_01.jsp</link>
<guid>http://www.kma.go.kr/weather/forecast/mid-term_01.jsp</guid>
<description>
	<header>
		<title>전국 육상중기예보</title>
		<tm>202001140600</tm>
		<wf><![CDATA[기압골의 영향으로 19일은 서울.경기도와 강원영서, 충청도, 전북, 제주도에 비 또는 눈이 오겠고, 22~24일은 제주도에 비가 오겠습니다.<br />그 밖의 날은 고기압의 가장자리에 들어 가끔 구름많겠습니다. <br />기온은 평년(최저기온: -12~0℃, 최고기온: 1~8℃)보다 조금 높겠습니다.<br />강수량은 평년(0~3mm)보다 적겠으나, 서울.경기도와 강원영서, 충청도, 전북은 비슷하겠고, 제주도는 조금 많겠습니다.]]></wf>
	</header>
	<body>
				

		<location wl_ve

## 스크레이핑하기

In [14]:
from bs4 import BeautifulSoup

# Minimal in-memory document used to demonstrate element lookup.
html = """
<html><body>
<h1 id="title">웹 스크레이핑이란?</h1>
<p id="body">웹 페이지를 분석하는 것</p>
<p>원하는 부분을 추출하는 것</p>
</body>
</html>
"""

soup = BeautifulSoup(html, 'html.parser')

# Alternative kept for reference: walking the tree by tag name and sibling
# links instead of looking elements up by id.
# h1 = soup.html.body.h1
# p1 = soup.html.body.p
# p2 = p1.next_sibling.next_sibling

# Look each element up by its id attribute.
h1 = soup.find(id="title")
body = soup.find(id="body")

# .string yields the text content of each element (see the output below).
print(f"#title = {h1.string}")
print(f"#body = {body.string}")
# print(f"p2 = {p2.string}")

#title = 웹 스크레이핑이란?
#body = 웹 페이지를 분석하는 것


In [15]:
from bs4 import BeautifulSoup

# Small document containing an unordered list of two links.
html = """
<html><body>
<ul>
<li><a href="http://www.naver.com">naver</a></li>
<li><a href="http://www.daum.net">daum</a></li>
</ul>
</body>
</html>
"""

soup = BeautifulSoup(html, 'html.parser')

# Collect every anchor, then print "<link text> > <target URL>" for each one.
links = soup.find_all("a")
for a in links:
    print(a.string, ">", a.attrs['href'])

naver > http://www.naver.com
daum > http://www.daum.net


In [18]:
from bs4 import BeautifulSoup
import urllib.request as req

url = "http://www.kma.go.kr/weather/forecast/mid-term-rss3.jsp"

# Parse while the response is still open; the context manager then closes
# the connection instead of leaving it open for the rest of the session.
with req.urlopen(url) as res:
    soup = BeautifulSoup(res, "html.parser")

# find() returns the first matching element in document order.
title = soup.find("title").string
wf = soup.find("wf").string
print(title)
print(wf)

기상청 육상 중기예보
기압골의 영향으로 19일은 서울.경기도와 강원영서, 충청도, 전북, 제주도에 비 또는 눈이 오겠고, 22~24일은 제주도에 비가 오겠습니다.<br />그 밖의 날은 고기압의 가장자리에 들어 가끔 구름많겠습니다. <br />기온은 평년(최저기온: -12~0℃, 최고기온: 1~8℃)보다 조금 높겠습니다.<br />강수량은 평년(0~3mm)보다 적겠으나, 서울.경기도와 강원영서, 충청도, 전북은 비슷하겠고, 제주도는 조금 많겠습니다.


In [19]:
# from bs4 import BeautifulSoup
# import urllib.request as req

# url = "http://info.finance.naver.com/marketindex/"
# res = req.urlopen(url)

# soup = BeautifulSoup(res, "html.parser")

# price = soup.select_one("div.head_info > span.value").string

# print("usd/krw =", price)

KeyboardInterrupt: 

## CSS 선택자

In [34]:
from bs4 import BeautifulSoup
import urllib.request as req
import urllib.parse

url = "https://ko.m.wikisource.org/wiki/"
author = "저자:윤동주"
# The page title contains non-ASCII characters and must be percent-encoded
# before it can be appended to the URL.
url = url + urllib.parse.quote(author)
# Parse while the response is open, then close the connection deterministically.
with req.urlopen(url) as res:
    soup = BeautifulSoup(res, "html.parser")

# Full element path to the target links, kept for reference (appears to be copied from a browser inspector):
# "html.client-js.mf-font-size-null.issues-group-B body.mediawiki.ltr.sitedir-ltr.mw-hide-empty-elt.ns-100.ns-subject.mw-editable.page-저자_윤동주.rootpage-저자_윤동주.stable.skin-minerva.action-view.animations div#mw-mf-viewport div#mw-mf-page-center main#content.mw-body div#bodyContent.content div#mw-content-text.mw-content-ltr div.mw-parser-output section#content-collapsible-block-0.mf-section-1.collapsible-block.open-block ul li ul li a"

# Shortened selector: any <a> below an <li> of a <ul> that is a direct child
# of the section carrying class "mf-section-1".
a_list = soup.select("section.mf-section-1 > ul > li a")

# soup

for a in a_list:
    name = a.string
    print("-", name)

- 하늘과 바람과 별과 시
- 증보판
- 서시
- 자화상
- 소년
- 눈 오는 지도
- 돌아와 보는 밤
- 병원
- 새로운 길
- 간판 없는 거리
- 태초의 아침
- 또 태초의 아침
- 새벽이 올 때까지
- 무서운 시간
- 십자가
- 바람이 불어
- 슬픈 족속
- 눈감고 간다
- 또 다른 고향
- 길
- 별 헤는 밤
- 흰 그림자
- 사랑스런 추억
- 흐르는 거리
- 쉽게 씌어진 시
- 봄
- 참회록
- 간(肝)
- 위로
- 팔복
- 못자는밤
- 달같이
- 고추밭
- 아우의 인상화
- 사랑의 전당
- 이적
- 비오는 밤
- 산골물
- 유언
- 창
- 바다
- 비로봉
- 산협의 오후
- 명상
- 소낙비
- 한난계
- 풍경
- 달밤
- 장
- 밤
- 황혼이 바다가 되어
- 아침
- 빨래
- 꿈은 깨어지고
- 산림
- 이런날
- 산상
- 양지쪽
- 닭
- 가슴 1
- 가슴 2
- 비둘기
- 황혼
- 남쪽 하늘
- 창공
- 거리에서
- 삶과 죽음
- 초한대
- 산울림
- 해바라기 얼굴
- 귀뚜라미와 나와
- 애기의 새벽
- 햇빛·바람
- 반디불
- 둘 다
- 거짓부리
- 눈
- 참새
- 버선본
- 편지
- 봄
- 무얼 먹구 사나
- 굴뚝
- 햇비
- 빗자루
- 기왓장 내외
- 오줌싸개 지도
- 병아리
- 조개껍질
- 겨울
- 트루게네프의 언덕
- 달을 쏘다
- 별똥 떨어진 데
- 화원에 꽃이 핀다
- 종시


## 링크에 있는 것을 한꺼번에 내려받기

In [35]:
from urllib.parse import urljoin

base = "http://example.com/html/a.html"

# Resolve several relative references against the same base URL and print
# the absolute URL each one yields.
for relative in (
    "b.html",
    "sub/c.html",
    "../index.html",
    "../img/hoge.png",
    "../css/hoge.css",
):
    print(urljoin(base, relative))

http://example.com/html/b.html
http://example.com/html/sub/c.html
http://example.com/index.html
http://example.com/img/hoge.png
http://example.com/css/hoge.css


urllib.parse.urljoin(base, url, allow_fragments=True): [Link](https://docs.python.org/ko/3/library/urllib.parse.html#urllib.parse.urljoin)
- Construct a full ("absolute") URL by combining a "base URL" (base) with another URL (url). Informally, this uses components of the base URL, in particular the addressing scheme, the network location and (part of) the path, to provide missing components in the relative URL.

In [40]:
from bs4 import BeautifulSoup
from urllib.request import *
from urllib.parse import *
from os import makedirs
import os.path, time, re

# Local save paths of HTML files that analyze_html has already processed
# (path -> True); consulted so that no page is analyzed twice.
proc_files = {}

def enum_links(html, base):
    """Return the absolute URLs of the stylesheets and anchors found in *html*.

    Args:
        html: HTML markup to scan.
        base: URL the document was loaded from; relative hrefs are resolved
            against it with ``urljoin``.

    Returns:
        A list of absolute URL strings: stylesheet ``<link>`` targets first,
        then ``<a href>`` targets.
    """
    soup = BeautifulSoup(html, "html.parser")
    tags = list(soup.select("link[rel='stylesheet']"))
    tags.extend(soup.select("a[href]"))
    result = []
    for tag in tags:
        # The stylesheet selector does not require an href attribute, so a
        # <link> without one is skipped instead of raising KeyError.
        href = tag.get("href")
        if href is None:
            continue
        result.append(urljoin(base, href))
    return result

def download_file(url):
    """Download *url* into a local mirror tree and return the saved path.

    The local path is ``./<host><path>``. A URL whose path is empty or ends
    in ``/`` is stored as ``index.html`` inside the corresponding directory.
    A path that already exists locally is returned without downloading again.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The local file path as a string, or ``None`` if the download failed.
    """
    o = urlparse(url)
    # An empty path ("http://host") must map to host/index.html as well;
    # otherwise a *file* named after the host is written (or the host
    # directory itself is returned), which breaks every later ./host/... path.
    savepath = "./" + o.netloc + (o.path or "/")
    if savepath.endswith("/"):
        savepath += "index.html"
    savedir = os.path.dirname(savepath)
    if os.path.exists(savepath):
        return savepath
    if not os.path.exists(savedir):
        print("mkdir=", savedir)
        # exist_ok covers the directory appearing between the check and the call.
        makedirs(savedir, exist_ok=True)
    try:
        print("download=", url)
        urlretrieve(url, savepath)
    except Exception as err:
        # Best-effort: report the failure and carry on. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate so a
        # long crawl can still be interrupted.
        print("다운 실패", url, err)
        return None
    # Pause after each successful download (presumably to throttle requests
    # to the remote server).
    time.sleep(1)
    return savepath
    
def analyze_html(url, root_url):
    """Download the HTML page at *url* and recursively mirror what it links to.

    Links are followed only when they start with *root_url*; stylesheets
    (``.css``) are downloaded regardless of where they live. Linked
    ``.html``/``.htm`` pages are analyzed recursively, everything else is
    only downloaded. Pages already recorded in ``proc_files`` are skipped.

    Args:
        url: URL of the HTML page to process.
        root_url: URL prefix that bounds the crawl.

    Returns:
        None.
    """
    savepath = download_file(url)
    if savepath is None:
        return
    if savepath in proc_files:
        return
    proc_files[savepath] = True
    print("analyze_html=", url)
    # Close the file as soon as it has been read; the recursion below could
    # otherwise keep many handles open at once.
    with open(savepath, "r", encoding="utf-8") as f:
        html = f.read()
    for link_url in enum_links(html, url):
        # Literal suffix/prefix tests: the previous regexes left "." unescaped,
        # so names such as "foocss" or "xhtml" matched as well.
        if not link_url.startswith(root_url) and not link_url.endswith(".css"):
            continue
        if link_url.endswith((".html", ".htm")):
            analyze_html(link_url, root_url)
            continue
        download_file(link_url)
        
# Start the crawl at the library index page and use that same URL as the
# root, so only links underneath it (plus stylesheets) are followed.
url = "https://docs.python.org/3.8/library/"
analyze_html(url, url)

analyze_html= https://docs.python.org/3.8/library/
analyze_html= https://docs.python.org/3.8/library/intro.html
analyze_html= https://docs.python.org/3.8/library/functions.html
analyze_html= https://docs.python.org/3.8/library/constants.html
analyze_html= https://docs.python.org/3.8/library/stdtypes.html
analyze_html= https://docs.python.org/3.8/library/exceptions.html
analyze_html= https://docs.python.org/3.8/library/text.html
analyze_html= https://docs.python.org/3.8/library/string.html
analyze_html= https://docs.python.org/3.8/library/re.html
analyze_html= https://docs.python.org/3.8/library/difflib.html
analyze_html= https://docs.python.org/3.8/library/textwrap.html
analyze_html= https://docs.python.org/3.8/library/unicodedata.html
analyze_html= https://docs.python.org/3.8/library/stringprep.html
analyze_html= https://docs.python.org/3.8/library/readline.html
analyze_html= https://docs.python.org/3.8/library/rlcompleter.html
analyze_html= https://docs.python.org/3.8/library/binary.

download= https://docs.python.org/3.8/library/ctypes.html
analyze_html= https://docs.python.org/3.8/library/ctypes.html
download= https://docs.python.org/3.8/library/concurrency.html
analyze_html= https://docs.python.org/3.8/library/concurrency.html
download= https://docs.python.org/3.8/library/threading.html
analyze_html= https://docs.python.org/3.8/library/threading.html
download= https://docs.python.org/3.8/library/multiprocessing.html
analyze_html= https://docs.python.org/3.8/library/multiprocessing.html
download= https://docs.python.org/3.8/library/multiprocessing.shared_memory.html
analyze_html= https://docs.python.org/3.8/library/multiprocessing.shared_memory.html
download= https://docs.python.org/3.8/library/concurrent.html
analyze_html= https://docs.python.org/3.8/library/concurrent.html
download= https://docs.python.org/3.8/library/concurrent.futures.html
analyze_html= https://docs.python.org/3.8/library/concurrent.futures.html
download= https://docs.python.org/3.8/library/su

analyze_html= https://docs.python.org/3.8/library/uu.html
download= https://docs.python.org/3.8/library/markup.html
analyze_html= https://docs.python.org/3.8/library/markup.html
download= https://docs.python.org/3.8/library/html.html
analyze_html= https://docs.python.org/3.8/library/html.html
download= https://docs.python.org/3.8/library/html.parser.html
analyze_html= https://docs.python.org/3.8/library/html.parser.html
download= https://docs.python.org/3.8/library/html.entities.html
analyze_html= https://docs.python.org/3.8/library/html.entities.html
download= https://docs.python.org/3.8/library/xml.html
analyze_html= https://docs.python.org/3.8/library/xml.html
download= https://docs.python.org/3.8/library/xml.etree.elementtree.html
analyze_html= https://docs.python.org/3.8/library/xml.etree.elementtree.html
download= https://docs.python.org/3.8/library/xml.dom.html
analyze_html= https://docs.python.org/3.8/library/xml.dom.html
download= https://docs.python.org/3.8/library/xml.dom.mi

analyze_html= https://docs.python.org/3.8/library/development.html
download= https://docs.python.org/3.8/library/typing.html
analyze_html= https://docs.python.org/3.8/library/typing.html
download= https://docs.python.org/3.8/library/pydoc.html
analyze_html= https://docs.python.org/3.8/library/pydoc.html
download= https://docs.python.org/3.8/library/doctest.html
analyze_html= https://docs.python.org/3.8/library/doctest.html
download= https://docs.python.org/3.8/library/unittest.html
analyze_html= https://docs.python.org/3.8/library/unittest.html
download= https://docs.python.org/3.8/library/unittest.mock.html
analyze_html= https://docs.python.org/3.8/library/unittest.mock.html
download= https://docs.python.org/3.8/library/unittest.mock-examples.html
analyze_html= https://docs.python.org/3.8/library/unittest.mock-examples.html
download= https://docs.python.org/3.8/library/2to3.html
analyze_html= https://docs.python.org/3.8/library/2to3.html
download= https://docs.python.org/3.8/library/t

analyze_html= https://docs.python.org/3.8/library/winreg.html
download= https://docs.python.org/3.8/library/winsound.html
analyze_html= https://docs.python.org/3.8/library/winsound.html
download= https://docs.python.org/3.8/library/unix.html
analyze_html= https://docs.python.org/3.8/library/unix.html
download= https://docs.python.org/3.8/library/posix.html
analyze_html= https://docs.python.org/3.8/library/posix.html
download= https://docs.python.org/3.8/library/pwd.html
analyze_html= https://docs.python.org/3.8/library/pwd.html
download= https://docs.python.org/3.8/library/spwd.html
analyze_html= https://docs.python.org/3.8/library/spwd.html
download= https://docs.python.org/3.8/library/grp.html
analyze_html= https://docs.python.org/3.8/library/grp.html
download= https://docs.python.org/3.8/library/crypt.html
analyze_html= https://docs.python.org/3.8/library/crypt.html
download= https://docs.python.org/3.8/library/termios.html
analyze_html= https://docs.python.org/3.8/library/termios.h