# Libraries

In [1]:
import httplib2
import requests
import urllib.request

import json
from pprint import pprint

# Requests
The easiest way to get started

## Get

In [2]:
res = requests.get('http://jsonplaceholder.typicode.com/todos/1')
type(res)

requests.models.Response

In [3]:
res.url

'http://jsonplaceholder.typicode.com/todos/1'

In [4]:
res.status_code

200

In [5]:
%%html
<iframe src="https://en.wikipedia.org/wiki/List_of_HTTP_status_codes"/ width='100%', height='600'>

In [6]:
res.headers

{'Date': 'Thu, 21 Oct 2021 22:36:36 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'x-powered-by': 'Express', 'x-ratelimit-limit': '1000', 'x-ratelimit-remaining': '999', 'x-ratelimit-reset': '1631494143', 'vary': 'Origin, Accept-Encoding', 'access-control-allow-credentials': 'true', 'cache-control': 'max-age=43200', 'pragma': 'no-cache', 'expires': '-1', 'x-content-type-options': 'nosniff', 'etag': 'W/"53-hfEnumeNh6YirfjyjaujcOPPT+s"', 'via': '1.1 vegur', 'CF-Cache-Status': 'HIT', 'Age': '24123', 'Report-To': '{"endpoints":[{"url":"https:\\/\\/a.nel.cloudflare.com\\/report\\/v3?s=whDqcnCA8p6cTZArPtFwPkmJAholr%2FDAQGAFZHUED8h%2FZFgZWqHgO%2FXzJbxs5MBLhYmFsNehVYkJn4qne22qu8w3cS%2FLmOSBaF2efnZsDMEzpF0EzybzHJwVmOzUGXP2v6pu1RI8gjHhV1LvmJxj"}],"group":"cf-nel","max_age":604800}', 'NEL': '{"success_fraction":0,"report_to":"cf-nel","max_age":604800}', 'Server': 'cloudflare', 'CF-RAY': '6a1df7392ec41074-ATL', 'Content-Encodin

In [7]:
data = res.text
data

'{\n  "userId": 1,\n  "id": 1,\n  "title": "delectus aut autem",\n  "completed": false\n}'

In [8]:
data = json.loads(res.text)
data

{'completed': False, 'id': 1, 'title': 'delectus aut autem', 'userId': 1}

### with query parameters

In [9]:
%%html
<iframe src="https://en.wikipedia.org/wiki/Special:Search"/ width='100%', height='600'>

In [10]:
query_params = {'search': 'eurostat'}

res = requests.get(url='https://en.wikipedia.org/wiki/Special:Search',
                   params=query_params)

### ❓ Exercise
What's the response's url, status code, headers and body?

In [11]:
## your code

##Post 

In [12]:
post_data = {
    'organization': 'Eurostat',
    'course': 'Data for science, How to scrape the Web'
    }

res = requests.post('http://jsonplaceholder.typicode.com/posts',
                    data=post_data)

print(res)

<Response [201]>


In [13]:
json.loads(res.text)

{'course': 'Data for science, How to scrape the Web',
 'id': 101,
 'organization': 'Eurostat'}

## Redirects

In [14]:
res = requests.get('https://nghttp2.org/httpbin/absolute-redirect/5')

In [15]:
list(map(lambda r : (r.status_code, r.url), res.history))

[(302, 'https://nghttp2.org/httpbin/absolute-redirect/5'),
 (302, 'https://nghttp2.org/httpbin/absolute-redirect/4'),
 (302, 'https://nghttp2.org/httpbin/absolute-redirect/3'),
 (302, 'https://nghttp2.org/httpbin/absolute-redirect/2'),
 (302, 'https://nghttp2.org/httpbin/absolute-redirect/1')]

In [16]:
(res.status_code, res.url)

(200, 'https://nghttp2.org/httpbin/get')

In [17]:
res = requests.get('https://nghttp2.org/httpbin/absolute-redirect/5',
                   allow_redirects=False)

In [18]:
(res.status_code, res.url)

(302, 'https://nghttp2.org/httpbin/absolute-redirect/5')

## Options

In [19]:
res = requests.options('https://nghttp2.org/httpbin/')
res.headers['Access-Control-Allow-Methods']

'GET, POST, PUT, DELETE, PATCH, OPTIONS'

In [20]:
res = requests.options('https://google.com')
res.headers

{'Allow': 'GET, HEAD', 'Date': 'Thu, 21 Oct 2021 22:36:41 GMT', 'Content-Type': 'text/html; charset=UTF-8', 'Server': 'gws', 'Content-Length': '1592', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN'}

# UrlLib


## Get

In [21]:
res = urllib.request.urlopen('http://google.com')
res

<http.client.HTTPResponse at 0x7f55e3a3a590>

In [22]:
res.geturl()

'http://www.google.com/'

In [23]:
res.getcode()

200

In [24]:
print(res.info())

Date: Thu, 21 Oct 2021 22:36:41 GMT
Expires: -1
Cache-Control: private, max-age=0
Content-Type: text/html; charset=ISO-8859-1
P3P: CP="This is not a P3P policy! See g.co/p3phelp for more info."
Server: gws
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN
Set-Cookie: 1P_JAR=2021-10-21-22; expires=Sat, 20-Nov-2021 22:36:41 GMT; path=/; domain=.google.com; Secure
Set-Cookie: NID=511=UMgG1ThaRiNNVr6p0uYrI0LtJPS2YijQ-XZnV3gWcZVdn-wp7YrX22jYJAdvVVOKOyJhMmtmr33DpPsfHax75ygZ8EJkb30BhVVd6B6ykOwqNJ7oA7DXeo_mYBcW4aOS55J7yeQ_aejdcQJQwFVZQJ7SgF_olIL9d8UVr4l-eLs; expires=Fri, 22-Apr-2022 22:36:41 GMT; path=/; domain=.google.com; HttpOnly
Accept-Ranges: none
Vary: Accept-Encoding
Connection: close
Transfer-Encoding: chunked




In [25]:
data = res.read()

In [26]:
pprint(data.decode('ISO-8859-1'))

('<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" '
 'lang="en"><head><meta content="Search the world\'s information, including '
 'webpages, images, videos and more. Google has many special features to help '
 'you find exactly what you\'re looking for." name="description"><meta '
 'content="noodp" name="robots"><meta content="text/html; charset=UTF-8" '
 'http-equiv="Content-Type"><meta '
 'content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" '
 'itemprop="image"><title>Google</title><script '
 'nonce="IhIpBXNP5rfJnxl2SDY9gQ==">(function(){window.google={kEI:\'eetxYZ3DHdiDwbkPx42dsAk\',kEXPI:\'0,1302534,56875,6058,207,4804,2316,383,246,5,1354,4936,314,1122516,1197731,658,25,316646,12207,51223,16115,28684,17572,4859,1361,9290,3021,2823,14765,4020,978,13228,3847,4192,6431,1141,13620,4282,2780,917,5081,1593,1279,2212,239,291,149,1103,840,1983,214,4100,108,3406,606,2025,2296,14677,3219,2845,7,12354,5096,14396,1926,906,2,940,15325,432,3,1590,1,5445

## Options

In [27]:
req = urllib.request.Request('https://httpbin.org', method='OPTIONS')
res = urllib.request.urlopen(req)
res

<http.client.HTTPResponse at 0x7f55e3a4a350>

In [28]:
print(res.info())

Date: Thu, 21 Oct 2021 22:36:41 GMT
Content-Type: text/html; charset=utf-8
Content-Length: 0
Connection: close
Server: gunicorn/19.9.0
Allow: GET, OPTIONS, HEAD
Access-Control-Allow-Origin: *
Access-Control-Allow-Credentials: true
Access-Control-Allow-Methods: GET, POST, PUT, DELETE, PATCH, OPTIONS
Access-Control-Max-Age: 3600




## Post
parameters need to be encoded

In [29]:
post_data = urllib.parse.urlencode({
    'organization': 'Eurostat',
    'course': 'Data for science, How to scrape the Web'
    }).encode('ascii')

print(post_data)

req = urllib.request.Request('https://nghttp2.org/httpbin/post',
                             method='POST',
                             data=post_data,
                             headers={'content-type': 'application/json'})

res = urllib.request.urlopen(req)

print(res.info())

pprint(res.read().decode('utf-8'))

b'organization=Eurostat&course=Data+for+science%2C+How+to+scrape+the+Web'
Date: Thu, 21 Oct 2021 22:36:42 GMT
Content-Type: application/json
Content-Length: 448
Access-Control-Allow-Origin: *
Access-Control-Allow-Credentials: true
X-Backend-Header-Rtt: 0.003703
Strict-Transport-Security: max-age=31536000
Connection: close
Alt-Svc: h3=":443"; ma=3600, h3-29=":443"; ma=3600
Server: nghttpx
Via: 1.1 nghttpx
x-frame-options: SAMEORIGIN
x-xss-protection: 1; mode=block
x-content-type-options: nosniff


('{\n'
 '  "args": {}, \n'
 '  "data": '
 '"organization=Eurostat&course=Data+for+science%2C+How+to+scrape+the+Web", \n'
 '  "files": {}, \n'
 '  "form": {}, \n'
 '  "headers": {\n'
 '    "Accept-Encoding": "identity", \n'
 '    "Connection": "close", \n'
 '    "Content-Length": "70", \n'
 '    "Content-Type": "application/json", \n'
 '    "Host": "nghttp2.org", \n'
 '    "User-Agent": "Python-urllib/3.7"\n'
 '  }, \n'
 '  "json": null, \n'
 '  "origin": "35.190.180.210", \n'
 '  "url": "https

## Error Handling

In [30]:
from urllib.error import HTTPError

try:
  with urllib.request.urlopen('https://google.com/search?q=eurostat') as res:
    pprint(res.read().decode('ISO-8859-1'))
except HTTPError as e:
  print(e.reason, e.code)

Forbidden 403


In [31]:
try:
  with urllib.request.urlopen('https://google.co/search') as res:
    pprint(res.read().decode('ISO-8859-1'))
except HTTPError as e:
  print(e.reason, e.code)

Not Found 404


## Url Parsing

In [32]:
urllib.parse.urlparse('http://google.com/search?q=eurostat')

ParseResult(scheme='http', netloc='google.com', path='/search', params='', query='q=eurostat', fragment='')

# Http Client
For advanced users

In [33]:
http = httplib2.Http()

## Get

In [34]:
res, data = http.request('https://nghttp2.org/httpbin/')

### Response Header

In [35]:
pprint(res)

{'access-control-allow-credentials': 'true',
 'access-control-allow-origin': '*',
 'alt-svc': 'h3=":443"; ma=3600, h3-29=":443"; ma=3600',
 'connection': 'close',
 'content-length': '9649',
 'content-location': 'https://nghttp2.org/httpbin/',
 'content-type': 'text/html; charset=utf-8',
 'date': 'Thu, 21 Oct 2021 22:36:43 GMT',
 'server': 'nghttpx',
 'status': '200',
 'strict-transport-security': 'max-age=31536000',
 'via': '1.1 nghttpx',
 'x-backend-header-rtt': '0.004149',
 'x-content-type-options': 'nosniff',
 'x-frame-options': 'SAMEORIGIN',
 'x-xss-protection': '1; mode=block'}


### Resonse Body

In [36]:
print(len(data), type(data))

9649 <class 'bytes'>


In [37]:
html = data.decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <title>httpbin.org</title>
    <link href="https://fonts.googleapis.com/css?family=Open+Sans:400,700|Source+Code+Pro:300,600|Titillium+Web:400,600,700"
        rel="stylesheet">
    <link rel="stylesheet" type="text/css" href="/httpbin/flasgger_static/swagger-ui.css">
    <link rel="icon" type="image/png" href="/httpbin/static/favicon.ico" sizes="64x64 32x32 16x16" />
    <style>
        html {
            box-sizing: border-box;
            overflow: -moz-scrollbars-vertical;
            overflow-y: scroll;
        }

        *,
        *:before,
        *:after {
            box-sizing: inherit;
        }

        body {
            margin: 0;
            background: #fafafa;
        }
    </style>
</head>

<body>
    <a href="https://github.com/requests/httpbin" class="github-corner" aria-label="View source on Github">
        <svg width="80" height="80" viewBox="0 0 250 250" style="fill:#151513; color:#fff; pos

## Options

In [38]:
res, _ = http.request('https://nghttp2.org/httpbin/', method='OPTIONS')
res

{'access-control-allow-credentials': 'true',
 'access-control-allow-methods': 'GET, POST, PUT, DELETE, PATCH, OPTIONS',
 'access-control-allow-origin': '*',
 'access-control-max-age': '3600',
 'allow': 'HEAD, OPTIONS, GET',
 'alt-svc': 'h3=":443"; ma=3600, h3-29=":443"; ma=3600',
 'connection': 'close',
 'content-length': '0',
 'content-location': 'https://nghttp2.org/httpbin/',
 'content-type': 'text/html; charset=utf-8',
 'date': 'Thu, 21 Oct 2021 22:36:44 GMT',
 'server': 'nghttpx',
 'status': '200',
 'strict-transport-security': 'max-age=31536000',
 'via': '1.1 nghttpx',
 'x-backend-header-rtt': '0.003916',
 'x-content-type-options': 'nosniff',
 'x-frame-options': 'SAMEORIGIN',
 'x-xss-protection': '1; mode=block'}

## Post

In [39]:
post_data = json.dumps({
    'organization': 'Eurostat',
    'course': 'Data for science, How to scrape the Web'
})

res, data = http.request('https://nghttp2.org/httpbin/post',
                         method='POST',
                         body=post_data,
                         headers={'content-type': 'application/json'})

res

{'access-control-allow-credentials': 'true',
 'access-control-allow-origin': '*',
 'alt-svc': 'h3=":443"; ma=3600, h3-29=":443"; ma=3600',
 'connection': 'close',
 'content-length': '546',
 'content-type': 'application/json',
 'date': 'Thu, 21 Oct 2021 22:36:44 GMT',
 'server': 'nghttpx',
 'status': '200',
 'strict-transport-security': 'max-age=31536000',
 'via': '1.1 nghttpx',
 'x-backend-header-rtt': '0.004245',
 'x-content-type-options': 'nosniff',
 'x-frame-options': 'SAMEORIGIN',
 'x-xss-protection': '1; mode=block'}

In [40]:
print(data.decode('utf-8'))

{
  "args": {}, 
  "data": "{\"organization\": \"Eurostat\", \"course\": \"Data for science, How to scrape the Web\"}", 
  "files": {}, 
  "form": {}, 
  "headers": {
    "Accept-Encoding": "gzip, deflate", 
    "Content-Length": "81", 
    "Content-Type": "application/json", 
    "Host": "nghttp2.org", 
    "User-Agent": "Python-httplib2/0.17.4 (gzip)"
  }, 
  "json": {
    "course": "Data for science, How to scrape the Web", 
    "organization": "Eurostat"
  }, 
  "origin": "35.190.180.210", 
  "url": "https://nghttp2.org/httpbin/post"
}



## Redirects

In [41]:
res, data = http.request('https://nghttp2.org/httpbin/absolute-redirect/1',
                         method='GET')
pprint(res)

{'access-control-allow-credentials': 'true',
 'access-control-allow-origin': '*',
 'alt-svc': 'h3=":443"; ma=3600, h3-29=":443"; ma=3600',
 'connection': 'close',
 'content-length': '233',
 'content-location': 'https://nghttp2.org/httpbin/get',
 'content-type': 'application/json',
 'date': 'Thu, 21 Oct 2021 22:36:45 GMT',
 'server': 'nghttpx',
 'status': '200',
 'strict-transport-security': 'max-age=31536000',
 'via': '1.1 nghttpx',
 'x-backend-header-rtt': '0.003065',
 'x-content-type-options': 'nosniff',
 'x-frame-options': 'SAMEORIGIN',
 'x-xss-protection': '1; mode=block'}


In [42]:
res.previous

{'access-control-allow-credentials': 'true',
 'access-control-allow-origin': '*',
 'alt-svc': 'h3=":443"; ma=3600, h3-29=":443"; ma=3600',
 'connection': 'close',
 'content-length': '269',
 'content-location': 'https://nghttp2.org/httpbin/absolute-redirect/1',
 'content-type': 'text/html; charset=utf-8',
 'date': 'Thu, 21 Oct 2021 22:36:45 GMT',
 'location': 'https://nghttp2.org/httpbin/get',
 'server': 'nghttpx',
 'status': '302',
 'strict-transport-security': 'max-age=31536000',
 'via': '1.1 nghttpx',
 'x-backend-header-rtt': '0.003734',
 'x-content-type-options': 'nosniff',
 'x-frame-options': 'SAMEORIGIN',
 'x-xss-protection': '1; mode=block'}

In [43]:
http.follow_redirects = False

In [44]:
res, data = http.request('https://nghttp2.org/httpbin/absolute-redirect/1',
                         method='GET')
pprint(res)

{'access-control-allow-credentials': 'true',
 'access-control-allow-origin': '*',
 'alt-svc': 'h3=":443"; ma=3600, h3-29=":443"; ma=3600',
 'connection': 'close',
 'content-length': '269',
 'content-type': 'text/html; charset=utf-8',
 'date': 'Thu, 21 Oct 2021 22:36:46 GMT',
 'location': 'https://nghttp2.org/httpbin/get',
 'server': 'nghttpx',
 'status': '302',
 'strict-transport-security': 'max-age=31536000',
 'via': '1.1 nghttpx',
 'x-backend-header-rtt': '0.003311',
 'x-content-type-options': 'nosniff',
 'x-frame-options': 'SAMEORIGIN',
 'x-xss-protection': '1; mode=block'}


In [45]:
http.follow_redirects = True

In [46]:
res, data = http.request('https://nghttp2.org/httpbin/redirect-to?url=https://ec.europa.eu/eurostat',
                         method='GET')
pprint(res)

{'-content-encoding': 'gzip',
 'cache-control': 'private, no-cache, no-store, must-revalidate',
 'connection': 'close',
 'content-length': '146582',
 'content-location': 'https://ec.europa.eu/eurostat',
 'content-type': 'text/html;charset=UTF-8',
 'date': 'Thu, 21 Oct 2021 22:36:47 GMT',
 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT',
 'liferay-portal': 'Liferay Digital Experience Platform 7.2.10 GA1 (Mueller / '
                   'Build 7210 / May 13, 2019)',
 'pragma': 'no-cache',
 'server': 'Apache',
 'server-timing': 'dtRpid;desc="1548897386"',
 'set-cookie': 'JSESSIONID=BFA6DEC4DE47176305A25B16DB9DA2C0; Path=/eurostat; '
               'HttpOnly, '
               'dtCookie=v_4_srv_25_sn_96BE576F9E9201C3C853DFF20B010136_perc_100000_ol_0_mul_1_app-3Ae63eebabdf39f376_1; '
               'Path=/; Domain=.europa.eu',
 'status': '200',
 'x-content-type-options': 'nosniff',
 'x-frame-options': 'SAMEORIGIN',
 'x-oneagent-js-injection': 'true',
 'x-rate-limit-remaining': '119',
 'x-xss-prote

In [47]:
res.previous

{'access-control-allow-credentials': 'true',
 'access-control-allow-origin': '*',
 'alt-svc': 'h3=":443"; ma=3600, h3-29=":443"; ma=3600',
 'connection': 'close',
 'content-length': '0',
 'content-location': 'https://nghttp2.org/httpbin/redirect-to?url=https://ec.europa.eu/eurostat',
 'content-type': 'text/html; charset=utf-8',
 'date': 'Thu, 21 Oct 2021 22:36:47 GMT',
 'location': 'https://ec.europa.eu/eurostat',
 'server': 'nghttpx',
 'status': '302',
 'strict-transport-security': 'max-age=31536000',
 'via': '1.1 nghttpx',
 'x-backend-header-rtt': '0.002922',
 'x-content-type-options': 'nosniff',
 'x-frame-options': 'SAMEORIGIN',
 'x-xss-protection': '1; mode=block'}

# ❓ Exercise

Request the content from "https://ec.europa.eu/eurostat" with the library of your choice.

- Does the site set cookies?
- What is the url to the "grants" subpage?

In [48]:
## your code