In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import requests

In [11]:
# Fetch the GitHub API root document. requests applies no timeout by default,
# so one is set explicitly to keep the cell from hanging if the server never answers.
response = requests.get('https://api.github.com', timeout=10)

In [14]:
# HTTP status of the reply: 200 means success, while e.g. 404 means "not found".
response.status_code

# The Response class overloads __bool__, so the object can be used directly in a
# condition. __bool__ does not just compare the status to 200: other status codes
# are considered a success as well.
if response:
    "Success"

200

'Success'

In [16]:
# The body of the reply.
raw_body = response.content
raw_body

# The content is delivered as bytes, not as a str.
type(raw_body)

b'{\n  "current_user_url": "https://api.github.com/user",\n  "current_user_authorizations_html_url": "https://github.com/settings/connections/applications{/client_id}",\n  "authorizations_url": "https://api.github.com/authorizations",\n  "code_search_url": "https://api.github.com/search/code?q={query}{&page,per_page,sort,order}",\n  "commit_search_url": "https://api.github.com/search/commits?q={query}{&page,per_page,sort,order}",\n  "emails_url": "https://api.github.com/user/emails",\n  "emojis_url": "https://api.github.com/emojis",\n  "events_url": "https://api.github.com/events",\n  "feeds_url": "https://api.github.com/feeds",\n  "followers_url": "https://api.github.com/user/followers",\n  "following_url": "https://api.github.com/user/following{/target}",\n  "gists_url": "https://api.github.com/gists{/gist_id}",\n  "hub_url": "https://api.github.com/hub",\n  "issue_search_url": "https://api.github.com/search/issues?q={query}{&page,per_page,sort,order}",\n  "issues_url": "https://api.

bytes

In [18]:
# The body of a request or response message is called its payload.
# requests infers the text encoding on its own, but it can also be set
# explicitly; this has to happen *before* .text is read to have any effect.
response.encoding = 'utf-8'

response.text  # the payload decoded from bytes into a str

'{\n  "current_user_url": "https://api.github.com/user",\n  "current_user_authorizations_html_url": "https://github.com/settings/connections/applications{/client_id}",\n  "authorizations_url": "https://api.github.com/authorizations",\n  "code_search_url": "https://api.github.com/search/code?q={query}{&page,per_page,sort,order}",\n  "commit_search_url": "https://api.github.com/search/commits?q={query}{&page,per_page,sort,order}",\n  "emails_url": "https://api.github.com/user/emails",\n  "emojis_url": "https://api.github.com/emojis",\n  "events_url": "https://api.github.com/events",\n  "feeds_url": "https://api.github.com/feeds",\n  "followers_url": "https://api.github.com/user/followers",\n  "following_url": "https://api.github.com/user/following{/target}",\n  "gists_url": "https://api.github.com/gists{/gist_id}",\n  "hub_url": "https://api.github.com/hub",\n  "issue_search_url": "https://api.github.com/search/issues?q={query}{&page,per_page,sort,order}",\n  "issues_url": "https://api.g

In [19]:
# The payload is serialized JSON. json.loads() could turn it into a dict,
# but the Response object offers a simpler shortcut.
api_index = response.json()
api_index

{'current_user_url': 'https://api.github.com/user',
 'current_user_authorizations_html_url': 'https://github.com/settings/connections/applications{/client_id}',
 'authorizations_url': 'https://api.github.com/authorizations',
 'code_search_url': 'https://api.github.com/search/code?q={query}{&page,per_page,sort,order}',
 'commit_search_url': 'https://api.github.com/search/commits?q={query}{&page,per_page,sort,order}',
 'emails_url': 'https://api.github.com/user/emails',
 'emojis_url': 'https://api.github.com/emojis',
 'events_url': 'https://api.github.com/events',
 'feeds_url': 'https://api.github.com/feeds',
 'followers_url': 'https://api.github.com/user/followers',
 'following_url': 'https://api.github.com/user/following{/target}',
 'gists_url': 'https://api.github.com/gists{/gist_id}',
 'hub_url': 'https://api.github.com/hub',
 'issue_search_url': 'https://api.github.com/search/issues?q={query}{&page,per_page,sort,order}',
 'issues_url': 'https://api.github.com/issues',
 'keys_url': '

In [20]:
# Metadata about the reply lives in its headers, e.g. the content type.
content_type = response.headers['Content-Type']
content_type

'application/json; charset=utf-8'

In [23]:
# Add query-string parameters to a GET request via `params`.

# Search GitHub's repositories for "requests" restricted to Python.
# Search terms are separated by a plain space: requests URL-encodes the values
# itself, so a literal '+' would be sent as '%2B' instead of acting as a separator.
response = requests.get(
    'https://api.github.com/search/repositories',
    params={'q': 'requests language:python'},
    timeout=10,  # requests has no default timeout; avoid hanging forever
)

POST, PUT, and the less common PATCH requests pass their data through the message body rather than through parameters in the query string. Using requests, you’ll pass the payload to the corresponding function’s data parameter.

data takes a dictionary, a list of tuples, bytes, or a file-like object. You’ll want to adapt the data you send in the body of your request to the specific needs of the service you’re interacting with.

In [26]:
# Send a POST request; a dict passed to `data` goes into the message body.
response = requests.post(
    'https://httpbin.org/post',
    data={'key': 'value'},
    timeout=10,  # requests has no default timeout; avoid hanging forever
)
response.json()  # a POST response can be inspected in the same way as a GET response

{'args': {},
 'data': '',
 'files': {},
 'form': {'key': 'value'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Content-Length': '9',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.24.0',
  'X-Amzn-Trace-Id': 'Root=1-5f257bba-e3f25412a49c0c98da1c4600'},
 'json': None,
 'origin': '83.218.144.179',
 'url': 'https://httpbin.org/post'}

In [28]:
# To send JSON data, use the `json` argument instead of `data`.
response = requests.post(
    'https://httpbin.org/post',
    json={'key': 'value'},
    timeout=10,  # requests has no default timeout; avoid hanging forever
)
response.json()

{'args': {},
 'data': '{"key": "value"}',
 'files': {},
 'form': {},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Content-Length': '16',
  'Content-Type': 'application/json',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.24.0',
  'X-Amzn-Trace-Id': 'Root=1-5f257c9d-bec776989e45d508902f1cd0'},
 'json': {'key': 'value'},
 'origin': '83.218.144.179',
 'url': 'https://httpbin.org/post'}

In [None]:
# Authentication: pass a (username, password) tuple to `auth`.
# The values below are placeholders; never hard-code real credentials in a notebook.
requests.get('https://api.github.com/user', auth=('username', 'password'), timeout=10)

In [None]:
# Exercise: extend the requests Response and Session classes with a .pandas() method and retry logic (see Han's code)

## Web scraping

### Beautiful soup

In [2]:
from bs4 import BeautifulSoup

In [3]:
# Download the page content using the requests library. An explicit timeout
# keeps the cell from hanging if the site never answers.
page = requests.get('https://www.iban.com/country-codes', timeout=10)

# Create an instance of the BeautifulSoup class. The response body is HTML,
# hence the 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# The soup object lets us navigate the parsed HTML page.
# soup.prettify() gives a clearer printout of the HTML than page.content or page.text.

# Tags are nested, so the children of the global (top-level) object can be listed.
children = [*soup.children]
len(children)

# The child elements do not all have the same type.
[(type(node), node.name) for node in children]

7

[(bs4.element.Doctype, None),
 (bs4.element.NavigableString, None),
 (bs4.element.Comment, None),
 (bs4.element.NavigableString, None),
 (bs4.element.Comment, None),
 (bs4.element.Tag, 'html'),
 (bs4.element.NavigableString, None)]

In [5]:
# Tag is the most important object type. Grab the <html> tag by name rather than
# by its position among the top-level nodes, which shifts whenever the page
# gains or loses a leading comment or whitespace node.
html = soup.html

# .children can be called again, since a Tag can itself contain other tags.
[(type(item), item.name) for item in html.children]

[(bs4.element.Comment, None),
 (bs4.element.NavigableString, None),
 (bs4.element.Tag, 'head'),
 (bs4.element.NavigableString, None),
 (bs4.element.Tag, 'body'),
 (bs4.element.NavigableString, None)]

In [6]:
# Every instance of the <p> tag in the document.
soup.find_all(name='p')

# Only the first <p> tag.
soup.find(name='p')

[<p>
 This is a complete list of all country ISO codes as described in the ISO 3166 international standard.<br/>
 These codes are used throughout the IT industry by computer systems and software to ease the identification of country names.<br/>
 We have compiled them in the quick reference table below in order to help our clients do quick conversions from the numeric or 2 letter code to any country name.
 </p>]

<p>
This is a complete list of all country ISO codes as described in the ISO 3166 international standard.<br/>
These codes are used throughout the IT industry by computer systems and software to ease the identification of country names.<br/>
We have compiled them in the quick reference table below in order to help our clients do quick conversions from the numeric or 2 letter code to any country name.
</p>

In [7]:
# Class or id information can be used to narrow the search.

# First five links on the page, for comparison.
links = soup.find_all('a')
links[:5]

# `class_` (with a trailing underscore) is used because the word `class` is already taken.
soup.find_all('a', class_='no-border')

[<a class="no-border" href="/contact" title="Contact Us"><i class="fa fa-envelope"></i> Contact</a>,
 <a class="language" href="#"><img alt="en" class="flag flag-gb" height="11" src="/images/icon/blank.png" width="16"/> EN</a>,
 <a data-lang-id="de_DE" href="https://de.iban.com/" title="Deutsch"><img alt="de" class="flag flag-de" height="11" src="/images/icon/blank.png" width="16"/> Deutsch</a>,
 <a data-lang-id="fr_FR" href="https://fr.iban.com/" title="Français"><img alt="fr" class="flag flag-fr" height="11" src="/images/icon/blank.png" width="16"/> Français</a>,
 <a data-lang-id="it_IT" href="https://it.iban.com/" title="Italiano"><img alt="it" class="flag flag-it" height="11" src="/images/icon/blank.png" width="16"/> Italiano</a>]

[<a class="no-border" href="/contact" title="Contact Us"><i class="fa fa-envelope"></i> Contact</a>]

In [8]:
# Any other tag property can be used as a search filter as well, e.g. `title`.
soup.find_all('a', attrs={'title': 'Italiano'})

[<a data-lang-id="it_IT" href="https://it.iban.com/" title="Italiano"><img alt="it" class="flag flag-it" height="11" src="/images/icon/blank.png" width="16"/> Italiano</a>]

In [19]:
# Use the Chrome dev tools to find the tag and properties (like class or id) of
# the element of interest; in Chrome, Ctrl+Shift+C opens the dev tools panel.

table = soup.find('table', id='myTable')


# pandas, with the help of the lxml package, can parse HTML tables into DataFrames.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas versions; file-like objects are accepted.
from io import StringIO

import pandas as pd

df = pd.read_html(StringIO(str(table)))[0]
df

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric
0,Afghanistan,AF,AFG,4
1,Åland Islands,AX,ALA,248
2,Albania,AL,ALB,8
3,Algeria,DZ,DZA,12
4,American Samoa,AS,ASM,16
...,...,...,...,...
244,Wallis and Futuna,WF,WLF,876
245,Western Sahara,EH,ESH,732
246,Yemen,YE,YEM,887
247,Zambia,ZM,ZMB,894


#### Tests from Scrapy example

In [20]:
# Fetch the demo product page; the timeout keeps the cell from hanging.
response = requests.get('https://clever-lichterman-044f16.netlify.com/products/taba-cream.1/', timeout=10)
# Name the parser explicitly (the same one used for the country-codes page):
# without it BeautifulSoup picks whichever parser happens to be installed,
# so the parse tree can differ between machines.
soup = BeautifulSoup(response.content, 'html.parser')

In [27]:
# NOTE(review): '::text' is not standard CSS — presumably carried over from the
# Scrapy example, whose selectors support it. BeautifulSoup's select_one() rejects
# the pseudo-element, which is what the NotImplementedError recorded below reports.
soup.select_one('.my-4 span::text')

NotImplementedError: Psuedo-element found at position 10

### Scrapy