# HTTP 

In [58]:
from urllib import robotparser

In [59]:
robot = robotparser.RobotFileParser()
robot.set_url("https://www.google.com/robots.txt")
robot.read()
robot.can_fetch('*', '/search/')

False

In [60]:
from urllib import request

resp = request.urlopen("https://www.google.com")
resp.geturl() # response header에 있는 url
resp.reason  # 받았으면 ok, 못받으면 message 출력 / not found
resp.getcode() # 200 을 받아야 제대로 응답 / 404
print(resp.info()) # meta 정보들

resp.getheaders() # response의 header 출력

Date: Thu, 11 Jul 2019 04:13:53 GMT
Expires: -1
Cache-Control: private, max-age=0
Content-Type: text/html; charset=ISO-8859-1
P3P: CP="This is not a P3P policy! See g.co/p3phelp for more info."
Server: gws
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN
Set-Cookie: 1P_JAR=2019-07-11-04; expires=Sat, 10-Aug-2019 04:13:53 GMT; path=/; domain=.google.com
Set-Cookie: NID=187=DJTK3UA92CZlZfic0Raj6ykhYl-dkLuJAse4mt0XeHti6qbC_jzsLFAjZjGShvI4tQbf62RRFGWAOXCjPsrCQzNNYUH1SN8jndQ-XehuiJekMB6Yu090EDcaZnsGZtO4Hakt8J5vj0GyrFuy6ZwroVUJtVq9eWA9a3w3o2vc0d4; expires=Fri, 10-Jan-2020 04:13:53 GMT; path=/; domain=.google.com; HttpOnly
Alt-Svc: quic=":443"; ma=2592000; v="46,43,39"
Accept-Ranges: none
Vary: Accept-Encoding
Connection: close




[('Date', 'Thu, 11 Jul 2019 04:13:53 GMT'),
 ('Expires', '-1'),
 ('Cache-Control', 'private, max-age=0'),
 ('Content-Type', 'text/html; charset=ISO-8859-1'),
 ('P3P', 'CP="This is not a P3P policy! See g.co/p3phelp for more info."'),
 ('Server', 'gws'),
 ('X-XSS-Protection', '0'),
 ('X-Frame-Options', 'SAMEORIGIN'),
 ('Set-Cookie',
  '1P_JAR=2019-07-11-04; expires=Sat, 10-Aug-2019 04:13:53 GMT; path=/; domain=.google.com'),
 ('Set-Cookie',
  'NID=187=DJTK3UA92CZlZfic0Raj6ykhYl-dkLuJAse4mt0XeHti6qbC_jzsLFAjZjGShvI4tQbf62RRFGWAOXCjPsrCQzNNYUH1SN8jndQ-XehuiJekMB6Yu090EDcaZnsGZtO4Hakt8J5vj0GyrFuy6ZwroVUJtVq9eWA9a3w3o2vc0d4; expires=Fri, 10-Jan-2020 04:13:53 GMT; path=/; domain=.google.com; HttpOnly'),
 ('Alt-Svc', 'quic=":443"; ma=2592000; v="46,43,39"'),
 ('Accept-Ranges', 'none'),
 ('Vary', 'Accept-Encoding'),
 ('Connection', 'close')]

In [61]:
print(resp.getcode())
print(resp.reason)

200
OK


In [None]:
resp = request.urlopen('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&')
resp.code, resp.reason
# 헤더를 지정하지 않아서 google에서 frobidden

## error handling

In [None]:
from urllib import error

In [None]:
try:
    resp = request.urlopen('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&')
except error.HTTPError as e:
        print(e.code, e.reason, e.headers)

## User-Agent
bot 이 아니라 클라이언트가 접속한 것 처럼 보이게 해야 한다.

user-agent를 수정한다

In [None]:
header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

In [None]:
try:
    req = request.Request('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&', headers=header)
    resp = request.urlopen(req)
except error.HTTPError as e:
    print(e.code, e.reason, e.headers)

In [None]:
resp.code, resp.reason, req.headers

In [63]:
tmp = resp.read().decode('utf-8')


In [47]:
from urllib import parse

In [None]:
parse.urlparse('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&')
parse.urljoin('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&', '/search/about')
parse.urlencode({'q':'파이썬'})
parse.quote_plus('파이썬')
parse.unquote_plus('%EB%B0%95%EB%B3%B4%EC%98%81')              

In [65]:
import requests

In [69]:
url = 'http://httpbin.org/get'
param = {'key':'value'}
resp = requests.request('GET', url, params=param)

In [71]:
print(resp.text)

{
  "args": {
    "key": "value"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.20.0"
  }, 
  "origin": "163.152.3.141, 163.152.3.141", 
  "url": "https://httpbin.org/get?key=value"
}



In [76]:
url = 'http://httpbin.org/post'
param = {'key':'value'}
resp = requests.request('post', url, data=param)
print(resp.text)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "key": "value"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Content-Length": "9", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.20.0"
  }, 
  "json": null, 
  "origin": "163.152.3.141, 163.152.3.141", 
  "url": "https://httpbin.org/post"
}



In [140]:
import time

In [162]:
def download(method, url, header=None, param=None, data=None, timeout=1, maxretries=3):
    try:
        resp = requests.request(method, url, headers=header, params=param, data=data)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= e.response.status_code < 600 and maxretries>0:
            time.sleep(timeout)
            print('시도 {}'.format(maxretries))
            download(method, url, header, param, data, timeout, maxretries-1)
        else:
            print(e.response.status_code)
            print(e.response.reason)
    return resp

In [163]:
download('get', 'https://www.google.com', header)

<Response [200]>

In [164]:
resp = download('get', 'https://www.crawler-test.com/status_codes/status_403', header=header)

403
Forbidden


In [165]:
resp = download('get', 'https://www.crawler-test.com/status_codes/status_500', header=header)

시도 3
시도 2
시도 1
500
Internal Server Error


In [171]:
import json
resp = download('get', 'https://www.httpbin.org/get', param={"key":"value"})
obj = json.loads(resp.text)

In [172]:
obj

{'args': {'key': 'value'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Host': 'www.httpbin.org',
  'User-Agent': 'python-requests/2.20.0'},
 'origin': '163.152.3.141, 163.152.3.141',
 'url': 'https://www.httpbin.org/get?key=value'}

# 예제

In [210]:
url = "http://openapi.airkorea.or.kr/openapi/services/rest/ArpltnInforInqireSvc/getCtprvnRltmMesureDnsty"
# C4W9%252Fd8IlGwY%252Bevhcji%252B7%252FvFdLfOATYrbASRCle%252FAfnz%252BnNhefYyUkc1vibsttgJ1vKr8xBTktP%252BjoR35OGHYA%253D%253D
# 으로 자동 인코딩됨. 중복 인코딩된다
# 따라서 역으로 변환해줘야 함

params ={
    "ServiceKey" : "C4W9%2Fd8IlGwY%2Bevhcji%2B7%2FvFdLfOATYrbASRCle%2FAfnz%2BnNhefYyUkc1vibsttgJ1vKr8xBTktP%2BjoR35OGHYA%3D%3D",    
#     "ServiceKey" : requests.compat.unquote("C4W9%2Fd8IlGwY%2Bevhcji%2B7%2FvFdLfOATYrbASRCle%2FAfnz%2BnNhefYyUkc1vibsttgJ1vKr8xBTktP%2BjoR35OGHYA%3D%3D"),
    "sidoName" : "서울",
    "_returnType" : "JSON",
}

In [212]:
resStr = download('get', url, param=params)

In [220]:
resStr.headers

{'Date': 'Thu, 11 Jul 2019 05:58:33 GMT', 'Set-Cookie': 'WMONID=M0BM_ei4Ki9; Expires=Fri, 10-Jul-2020 14:58:35 GMT; Path=/, JSESSIONID=Nz01GFhkusUzfOle9vAb4vo3; Path=/openapi', 'Content-Language': 'ko-KR', 'Content-Length': '190', 'Connection': 'close', 'Content-Type': 'text/xml;charset=utf-8'}

In [208]:
resObj = resStr.json()

In [209]:
[{items['stationName']:items['pm25Value']} for items in resObj['list']]

[{'중구': '4'},
 {'한강대로': '3'},
 {'종로구': '5'},
 {'청계천로': '5'},
 {'종로': '-'},
 {'용산구': '2'},
 {'광진구': '-'},
 {'성동구': '6'},
 {'강변북로': '-'},
 {'중랑구': '-'}]

-------------

In [230]:
## get
url = "https://search.naver.com/search.naver"
params = {'query':'박보영'}
resStr = download('get', url, param=params)

In [288]:
## post
url = "http://www.kyobobook.co.kr/search/SearchCommonMain.jsp"
params = {'vPstrCategory':'TOT', 'vPstrKeyWord':'박보영'.encode('euc-kr'), 'vPplace':'top'}
resStr = download('post', url, data=params)

In [289]:
resStr.request.body

'vPstrCategory=TOT&vPstrKeyWord=%B9%DA%BA%B8%BF%B5&vPplace=top'

In [290]:
resStr.text.find('아트와')

108224

In [333]:
url = "http://pythonscraping.com/pages/cookies/login.html"
requests.compat.urljoin(url, "welcome.php")
# requests.compat.urlparse(url)

'http://pythonscraping.com/pages/cookies/welcome.php'

In [350]:
url = "http://pythonscraping.com/pages/cookies/login.html"
url = requests.compat.urljoin(url, "welcome.php")
data = {
    "username" : "asdadaaaas",
    "password" : "password"
}
html = download('post', url, data=data)
pprint(html.text)

('\n'
 '<h2>Welcome to the Website!</h2>\n'
 'Whoops! You logged in wrong. Try again with any username, and the password '
 '"password"<br><a href="login.html">Log in here</a>')


In [344]:
session = requests.Session()

In [351]:
html = session.post(requests.compat.urljoin(url, "welcome.php"), data)
html.text

'\n<h2>Welcome to the Website!</h2>\nYou have logged in successfully! <br><a href="profile.php">Check out your profile!</a>'

In [352]:
session.post(requests.compat.urljoin(url, "welcome.php"))
html.text

'\n<h2>Welcome to the Website!</h2>\nYou have logged in successfully! <br><a href="profile.php">Check out your profile!</a>'

In [353]:
url = "https://cyber.inu.ac.kr/login.php"
url = requests.compat.urljoin(url, "login/index.php")
data = {
    "username" : "201401438",
    "password" : "cksdl951!!"
}
html = download('post', url, data=data)
pprint(html.text)

('<!DOCTYPE html>\n'
 '<html  dir="ltr" lang="ko" xml:lang="ko">\n'
 '<head>\n'
 '    <title>인천대학교 학습관리시스템(INU LMS)</title>\n'
 '    <link rel="shortcut icon" '
 'href="http://cyber.inu.ac.kr/theme/image.php?theme=coursemosv2&amp;component=theme&amp;rev=1559009818&amp;image=favicon" '
 '/>\n'
 '    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n'
 '<meta name="keywords" content="moodle, 인천대학교 학습관리시스템(INU LMS)" />\n'
 '<script type="text/javascript">\n'
 '//<![CDATA[\n'
 'var M = {}; M.yui = {};\n'
 'M.pageloadstarttime = new Date();\n'
 'M.cfg = '
 '{"wwwroot":"http:\\/\\/cyber.inu.ac.kr","sesskey":"0ryuJxJLFn","loadingicon":"http:\\/\\/cyber.inu.ac.kr\\/theme\\/image.php?theme=coursemosv2&component=core&rev=1559009818&image=i%2Floading_small","themerev":"1559009818","slasharguments":0,"theme":"coursemosv2","jsrev":"1559009818","admin":"admin","svgicons":true};var '
 'yui1ConfigFn = function(me) '
 "{if(/-skin|reset|fonts|grids|base/.test(me.name)){me.type='css'

In [362]:
url = "https://lms.sunde41.net/auth/login"
url = requests.compat.urljoin(url, "/auth/login")
data = {
    "email" : "skarnd9511@hanmail.net",
    "password" : "cksdl95"
}
html = download('post', url, data=data)
pprint(html.text)

('<!DOCTYPE html>\n'
 '<html lang="ko">\n'
 '<head>\n'
 '    <title>대시보드 :\n'
 '        빅데이터 청년인재 고려대학교 과정 학습관리시스템</title>\n'
 '    <meta charset="UTF-8">\n'
 '    <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n'
 '    <meta name="viewport" content="width=device-width, initial-scale=1, '
 'maximum-scale=1, shrink-to-fit=no">\n'
 '    <meta name="robots" content="noindex,nofollow">\n'
 '    <link href="/static/vendors/bootstrap.css?v=1.1" rel="stylesheet">\n'
 '    <link href="/static/vendors/main.css?v=1.1" rel="stylesheet">\n'
 '    <link href="/static/vendors/icon.css?v=1.0" rel="stylesheet">\n'
 '    <link href="/static/vendors/preloaders.css?v=1.0" rel="stylesheet">    '
 '<link href="/static/vendors/fullcalendar.css?v=1.0" rel="stylesheet">\n'
 '    <link href="/static/vendors/style.css?v=1.8" rel="stylesheet">\n'
 '    <link rel="icon" href="/static/favicon/cropped-m_logo-32x32.png" '
 'sizes="32x32">\n'
 '    <link rel="icon" href="/static/favicon/cropped-m_logo

 '                                    </a>\n'
 '                                </li>\n'
 '                                <li class="nav-item m-tabs__item">\n'
 '                                    <a class="nav-link m-tabs__link" '
 'data-toggle="tab" href="#table2_content"\n'
 '                                       role="tab">\n'
 '                                        완료\n'
 '                                    </a>\n'
 '                                </li>\n'
 '                            </ul>\n'
 '                        </div>\n'
 '                    </div>\n'
 '                    <div class="m-portlet__body m--padding-15">\n'
 '                        <div class="tab-content">\n'
 '                            <div class="tab-pane active" '
 'id="table1_content">\n'
 '                                <div class="m-widget2">\n'
 '                                    <table id="todolist1" class="table '
 'table-no-bordered">\n'
 '                                        <thead

-------------------

In [364]:
url = "https://www.google.com/"
url = requests.compat.urljoin(url, "/search")
param = {
    "q" : "박보영",
}
html = download('get', url, param=param)
pprint(html.text)

('<!doctype html><html lang="ko"><head><meta charset="UTF-8"><meta '
 'content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" '
 'itemprop="image"><title>&#48149;&#48372;&#50689; - Google '
 '&#44160;&#49353;</title><script '
 'nonce="KBmrFUwetCjlWqVe3oKXVw==">(function(){var '
 'a=window.performance;window.start=(new Date).getTime();a:{var '
 'b=window;if(a){var c=a.timing;if(c){var '
 'd=c.navigationStart,e=c.responseStart;if(e>d&&e<=window.start){window.start=e;b.wsrt=e-d;break '
 'a}}a.now&&(b.wsrt=Math.floor(a.now()))}}window.google=window.google||{};google.aft=function(f){f.setAttribute("data-iml",+new '
 'Date)};}).call(this);(function(){var '
 'c=[],e=0;window.ping=function(b){-1==b.indexOf("&zx")&&(b+="&zx="+(new '
 'Date).getTime());var a=new '
 'Image,d=e++;c[d]=a;a.onerror=a.onload=a.onabort=function(){delete '
 'c[d]};a.src=b};}).call(this);</script><style>body{margin:0 '
 'auto;max-width:736px;padding:0 '
 '8px}a{color:#1967D2;text-decoration:none;tap-high