# RESTful APIs

In [None]:
import requests
resp = requests.get('http://www.elpais.com/')
resp.content[0:500]

b'<!DOCTYPE html><html lang="es"><head><title>EL PA\xc3\x8dS Edici\xc3\xb3n Am\xc3\xa9rica: el peri\xc3\xb3dico global</title><meta name="lang" content="es"/><meta name="author" content="Ediciones El Pa\xc3\xads"/><meta name="robots" content="index,follow"/><meta name="description" content="Noticias de \xc3\xbaltima hora de Am\xc3\xa9rica Latina, M\xc3\xa9xico, Estados Unidos y la actualidad internacional: pol\xc3\xadtica, econom\xc3\xada, deportes, cultura, sociedad, tecnolog\xc3\xada, gente, opini\xc3\xb3n, viajes, moda, televisi\xc3\xb3n, los blogs y las firmas de EL PA\xc3'

In [None]:
resp.encoding

'utf-8'

In [None]:
resp.text



In [None]:
resp.status_code

200

In [None]:
resp = requests.get('http://api.open-notify.org/iss-now.json')
resp.status_code

200

In [None]:
resp.content

b'{"message": "success", "iss_position": {"latitude": "40.4186", "longitude": "178.9887"}, "timestamp": 1610189306}'

In [None]:
import json
pos = json.loads(resp.content)
pos

{'iss_position': {'latitude': '40.4186', 'longitude': '178.9887'},
 'message': 'success',
 'timestamp': 1610189306}

In [None]:
resp.json()

{'iss_position': {'latitude': '40.4186', 'longitude': '178.9887'},
 'message': 'success',
 'timestamp': 1610189306}

In [None]:
import pandas as pd
pd.read_json('http://api.open-notify.org/iss-now.json')

Unnamed: 0,message,iss_position,timestamp
latitude,success,46.561,2021-01-09 10:51:32
longitude,success,-167.4579,2021-01-09 10:51:32


In [None]:
my_dict = {
    'Chicago' : "Illinois", 
    "Kansas City" : ["Kansas", "Missouri"]
}

In [None]:
def get_iss(lat, long, passes):
  resp = requests.get(f'http://api.open-notify.org/iss-pass.json?lat={lat}&lon={long}&n={passes}')
  content = resp.json()
  return content['response']
get_iss(40.0, 3.5, 5)

[{'duration': 253, 'risetime': 1610222336},
 {'duration': 635, 'risetime': 1610227902},
 {'duration': 631, 'risetime': 1610233713},
 {'duration': 554, 'risetime': 1610239610},
 {'duration': 572, 'risetime': 1610245476}]

In [None]:
pd.DataFrame(get_iss(40.0, 3.5, 5))

Unnamed: 0,duration,risetime
0,253,1610222336
1,635,1610227902
2,631,1610233713
3,554,1610239610
4,572,1610245476


# Web Scrapping

In [None]:
 la_url = 'https://aflcio.org/what-unions-do/social-economic-justice/advocacy/legislative-alerts'

## BeautifulSoup

In [None]:
from bs4 import BeautifulSoup
r = requests.get(la_url)
page = r.content
page[:1000]

b'<!DOCTYPE html>\n<html lang="en" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">\n  <head>\n    <meta charset="utf-8" /><script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={licenseKey:"5d4af6f314",applicationID:"54360456"};window.NREUM||(NREUM={}),__nr_require=function(e,t,n){function r(n){if(!t[n]){var i=t[n]={exports:{}};e[n][0].call(i.exports,function(t){var i=e[n][1][t];return r(i||t)},i,i.exports)}return t[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<n.length;i++)r(n[i]);return r}({1:[function(e,t,n){function r(){}function i(e,t,n){return function(){return o(e,[u.now()].concat(c(arguments)),t?nu

### Parsers
```
html.parser
lxml
html5lib
```

In [None]:
soup = BeautifulSoup(page, 'html5lib')
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).loader_config={licenseKey:"5d4af6f314",applicationID:"54360456"};window.NREUM||(NREUM={}),__nr_require=function(e,t,n){function r(n){if(!t[n]){var i=t[n]={exports:{}};e[n][0].call(i.exports,function(t){var i=e[n][1][t];return r(i||t)},i,i.exports)}return t[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<n.length;i++)r(n[i]);return r}({1:[function(e,t,n){function r(){}function i(e,t,n){return function(){return o(e,[u.now()].concat(c(arguments)),t?null

In [None]:
help(soup.find_all)

Help on method find_all in module bs4.element:

find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) method of bs4.BeautifulSoup instance
    Extracts a list of Tag objects that match the given
    criteria.  You can specify the name of the Tag and any
    attributes you want the Tag to have.
    
    The value of a key-value pair in the 'attrs' map can be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the
    string matches for some custom definition of 'matches'. The
    same is true of the tag name.



In [None]:
alerts = soup.find_all('div', class_ = 'content-details')

In [None]:
type(alerts)

bs4.element.ResultSet

In [None]:
a = []
for alert in alerts:
  a.append({'link': 'http://www.aflcio.org' + alert.a['href'], 
   'title': alert.find('h2', class_ = 'content-title').span.get_text(),
   'time': alert.find('time').get_text()
   })
pd.DataFrame(a)

Unnamed: 0,link,title,time
0,http://www.aflcio.org/about/advocacy/legislati...,House Letter Opposing Broad COVID Liability Sh...,"December 14, 2020"
1,http://www.aflcio.org/about/advocacy/legislati...,Senate Letter Opposing Broad COVID Liability S...,"December 14, 2020"
2,http://www.aflcio.org/about/advocacy/legislati...,Letter Opposing Nomination of Thomas Kirsch to...,"December 10, 2020"
3,http://www.aflcio.org/about/advocacy/legislati...,Letter Supporting the National Defense Authori...,"December 9, 2020"
4,http://www.aflcio.org/about/advocacy/legislati...,Letter Opposing Nomination of Stephen S. Schwa...,"December 9, 2020"
5,http://www.aflcio.org/about/advocacy/legislati...,Letter in Support of the FY 2021 National Defe...,"December 8, 2020"
6,http://www.aflcio.org/about/advocacy/legislati...,Letter Opposing the Nomination of Stephen S. S...,"December 4, 2020"
7,http://www.aflcio.org/about/advocacy/legislati...,Letter Supporting Legislation That Would Moder...,"November 19, 2020"
8,http://www.aflcio.org/about/advocacy/legislati...,Letter Opposing Nomination of Stephen Vaden to...,"November 17, 2020"
9,http://www.aflcio.org/about/advocacy/legislati...,Letter Opposing Nomination of Kathryn Kimball ...,"November 17, 2020"


## Pandas

In [None]:
tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_accidents_and_disasters_by_death_toll')
tables[4].head()

Unnamed: 0,Deaths,Date,Incident
0,20000,30 May 1626,"Wanggongchang Explosion in Beijing, China in t..."
1,3000,18 August 1769,A lightning bolt caused the Brescia explosion ...
2,"3,000?",1 November 1948,Boiler and ammunition explosion aboard an unid...
3,"1,400–2,280",6 March 1862,Ammunition warehouse explodes and kills almost...
4,1950,6 December 1917,"Halifax Explosion in Nova Scotia, Canada[68]"


## Selenium

http://books.toscrape.com/

In [1]:
!pip install kora -q

[K     |████████████████████████████████| 61kB 3.2MB/s 
[K     |████████████████████████████████| 61kB 4.4MB/s 
[?25h

In [2]:
import re
from kora.selenium import wd

In [3]:
def parseBook(book_link):
  wd.get(book_link)
  text_to_num = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5}
  book = {
      "Title": wd.find_element_by_xpath("//*[@id='content_inner']/article/div[1]/div[2]/h1").text,
      "Description": wd.find_element_by_xpath("//*[@id='content_inner']/article/p").text,
      "Rating": text_to_num[re.search('.* (One|Two|Three|Four|Five)', wd.find_element_by_xpath("//*[@id='content_inner']/article/div[1]/div[2]/p[3]").get_attribute("class")).group(1)],
      "Stock Status": wd.find_element_by_xpath("//*[@id='content_inner']/article/div[1]/div[2]/p[2]").text,
      "Price": wd.find_element_by_xpath("//*[@id='content_inner']/article/div[1]/div[2]/p[1]").text,
      "Tax": wd.find_element_by_xpath("//*[@id='content_inner']/article/table/tbody/tr[5]/td").text,
      "UPC": wd.find_element_by_xpath("//*[@id='content_inner']/article/table/tbody/tr[1]/td").text,
      "Category": wd.find_element_by_xpath("//*[@id='default']/div/div/ul/li[3]/a").text
  }
  return book

In [None]:
def parsePage(page_link):
  books = []
  books_link = []
  wd.get(page_link)
  product_pods = wd.find_elements_by_class_name('product_pod')
  for product_pod in product_pods:
    books_link.append(product_pod.find_element_by_tag_name('h3').find_element_by_tag_name('a').get_property('href'))
  for book_link in books_link:
    books.append(parseBook(book_link))
  return books

books = parsePage('http://books.toscrape.com/catalogue/category/books_1/page-1.html')
books

In [13]:
import requests
def parseSite(pages = -1):
  index = 1
  link_template = 'http://books.toscrape.com/catalogue/category/books_1/page-{index}.html'
  books = []
  while True:
    if pages > 0 and index > pages:
      break
    page_link = link_template.format(index = index)
    print(f'Processing: {page_link}')
    response = requests.get(page_link)
    if not response.ok:
      break
    books = books + parsePage(page_link)
    index += 1
  return books
   
books = parseSite()
print(f'Total books: {len(books)}')

Processing: http://books.toscrape.com/catalogue/category/books_1/page-1.html
Processing: http://books.toscrape.com/catalogue/category/books_1/page-2.html
Processing: http://books.toscrape.com/catalogue/category/books_1/page-3.html
Processing: http://books.toscrape.com/catalogue/category/books_1/page-4.html
Processing: http://books.toscrape.com/catalogue/category/books_1/page-5.html


KeyboardInterrupt: ignored