# Extract data from the GitHub public-apis page
- https://github.com/toddmotto/public-apis

## Import modules 

In [1]:
import datetime
import io

import bs4
import numpy
import pandas
import requests

##  Read in data using requests and pandas

In [2]:
# URL of the GitHub page that lists the public APIs.
path = "https://github.com/toddmotto/public-apis"

In [3]:
# Download the page. The explicit timeout keeps the cell from hanging
# indefinitely if the server does not respond (requests sets no timeout by default).
request = requests.get(path, timeout=30)

In [4]:
# Show the HTTP status code of the response (200 means the request succeeded).
request.status_code

200

In [5]:
# Body of the response as text, i.e. the HTML of the page.
requesttext = request.text


In [6]:
# Parse every <table> in the HTML into its own DataFrame. The markup is wrapped
# in StringIO because passing a literal HTML string to read_html is deprecated
# (pandas >= 2.1); file-like input is the supported form.
data = pandas.read_html(io.StringIO(requesttext))

## Check shape of each dataframe
- Every dataframe has 6 columns except the first one, so let's remove it.

### Found 47 tables

In [7]:
# Number of tables pandas parsed from the page.
len(data)

47

In [8]:
# Print the (rows, columns) shape of each parsed table, one per line.
for table_shape in (frame.shape for frame in data):
    print(table_shape)

(6, 4)
(11, 6)
(4, 6)
(6, 6)
(8, 6)
(7, 6)
(9, 6)
(9, 6)
(6, 6)
(3, 6)
(20, 6)
(6, 6)
(8, 6)
(39, 6)
(3, 6)
(10, 6)
(5, 6)
(3, 6)
(9, 6)
(13, 6)
(6, 6)
(33, 6)
(35, 6)
(23, 6)
(11, 6)
(15, 6)
(6, 6)
(19, 6)
(8, 6)
(23, 6)
(3, 6)
(3, 6)
(8, 6)
(11, 6)
(16, 6)
(4, 6)
(4, 6)
(24, 6)
(16, 6)
(14, 6)
(5, 6)
(3, 6)
(50, 6)
(4, 6)
(4, 6)
(13, 6)
(9, 6)


In [9]:
# Drop the first parsed table: the shape listing shows it is the only one
# that does not have 6 columns.
data_list = data[1:]

In [10]:
# Stack the remaining tables into a single frame. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each table's own 0-based
# labels, so every row label is unique.
full_data = pandas.concat(data_list, axis=0, ignore_index=True)

In [11]:
# (rows, columns) of the combined table.
full_data.shape

(549, 6)

### Inspecting the full data

In [31]:
# Preview the first rows of the combined table.
full_data.head()

Unnamed: 0,API,Description,Auth,HTTPS,CORS,Link
0,Cats,Pictures of cats from Tumblr,No,Yes,Unknown,Go!
1,Dogs,Based on the Stanford Dogs Dataset,No,Yes,Unknown,Go!
2,HTTPCat,Cat for every HTTP Status,No,Yes,Unknown,Go!
3,IUCN,IUCN Red List of Threatened Species,apiKey,No,Unknown,Go!
4,Movebank,Movement and Migration data of animals,No,Yes,Unknown,Go!


### Take the subset of the full data where no auth is required

In [13]:
# Keep only the rows whose Auth column is exactly "No".
no_auth_mask = full_data["Auth"].eq("No")
only_no_auth_data = full_data.loc[no_auth_mask]

In [14]:
# (rows, columns) of the no-auth subset.
only_no_auth_data.shape

(257, 6)

### Links do not work using the pandas method (only the "Go!" anchor text is kept), so switch to the bs4 method

In [32]:
# Preview the no-auth subset; the Link column holds only the anchor text, not the URL.
only_no_auth_data.head()

Unnamed: 0,API,Description,Auth,HTTPS,CORS,Link
0,Cats,Pictures of cats from Tumblr,No,Yes,Unknown,Go!
1,Dogs,Based on the Stanford Dogs Dataset,No,Yes,Unknown,Go!
2,HTTPCat,Cat for every HTTP Status,No,Yes,Unknown,Go!
4,Movebank,Movement and Migration data of animals,No,Yes,Unknown,Go!
6,RandomCat,Random pictures of cats,No,Yes,Yes,Go!


### Create bs4 soup object

In [16]:
# Parse the downloaded HTML with BeautifulSoup, using the lxml parser.
soup = bs4.BeautifulSoup(requesttext, "lxml")

In [17]:
# Print only the beginning of the prettified HTML: the full page is very large
# and would flood the notebook output.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <link href="https://assets-cdn.github.com" rel="dns-prefetch"/>
  <link href="https://avatars0.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://avatars1.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://avatars2.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://avatars3.githubusercontent.com" rel="dns-prefetch"/>
  <link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
  <link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
  <link crossorigin="anonymous" href="https://assets-cdn.github.com/assets/frameworks-73f533b7cc08a9d040e601cfd38fa585.css" integrity="sha512-pCRDtdb3GlUU48h+oRJVA8f0GddrLnU97wB7mHQ7q6c40vMbMMZsFdk0IMhkUFRqw1M/y4EkWxtaKwfeFezOkQ==" media="all" rel="stylesheet"/>
  <link crossorigin="anonymous" href="https://assets-cdn.github.com/assets/github-97d6f28221b9753a523a134a3f7f3a3b.css" integrity="sha512-nUvkp2PexYbw8

In [18]:
# Collect every <table> element in the parsed page.
tables = soup.find_all("table")

### Found 47 tables 

In [19]:
# Number of <table> elements found.
len(tables)

47

### The first table does not contain API information, so let's remove it again

In [22]:
# Skip the first <table>, which holds no API rows. Slicing a fresh find_all()
# result (instead of `tables` itself) keeps this cell idempotent: re-running it
# cannot drop one more table each time.
tables = soup.find_all("table")[1:]

### Create a dictionary to store the table data from the HTML soup, using nested loops to extract the cells of each row of each table

In [26]:
# Column name -> list of cell values; every list receives one entry per API row.
dictionary = {"API": [],
              "DESC": [],
              "AUTH": [],
              "HTTPS": [],
              "CORS": [],
              "LINK": []}

# The first five cells of a row are plain text, in this order; the sixth holds the link.
text_columns = ("API", "DESC", "AUTH", "HTTPS", "CORS")
expected_cells = len(text_columns) + 1

for dataset in tables:
    # Fall back to the table itself when the markup has no explicit <tbody>.
    body = dataset.find("tbody")
    if body is None:
        body = dataset

    for row in body.find_all("tr"):
        cells = row.find_all("td")

        # Skip rows without all six cells (e.g. header rows made of <th>):
        # appending only some columns would leave the lists with unequal
        # lengths and make the later DataFrame construction fail.
        if len(cells) < expected_cells:
            continue

        for column, cell in zip(text_columns, cells):
            dictionary[column].append(cell.text)

        # Store the URL itself rather than the anchor text; None when the
        # cell contains no <a> element.
        anchor = cells[len(text_columns)].find("a")
        dictionary["LINK"].append(anchor.get("href") if anchor is not None else None)
                




In [27]:
# Build a DataFrame with one column per key of `dictionary`.
api_data = pandas.DataFrame(dictionary)

In [28]:
# Preview the first rows rather than rendering the whole frame.
api_data.head(10)

Unnamed: 0,API,DESC,AUTH,HTTPS,CORS,LINK
0,Cats,Pictures of cats from Tumblr,No,Yes,Unknown,https://thecatapi.com/docs.html
1,Dogs,Based on the Stanford Dogs Dataset,No,Yes,Unknown,https://dog.ceo/dog-api/
2,HTTPCat,Cat for every HTTP Status,No,Yes,Unknown,https://http.cat/
3,IUCN,IUCN Red List of Threatened Species,apiKey,No,Unknown,http://apiv3.iucnredlist.org/api/v3/docs
4,Movebank,Movement and Migration data of animals,No,Yes,Unknown,https://github.com/movebank/movebank-api-doc
5,Petfinder,Adoption,apiKey,Yes,Unknown,https://www.petfinder.com/developers/api-docs/
6,RandomCat,Random pictures of cats,No,Yes,Yes,https://aws.random.cat/meow
7,RandomDog,Random pictures of dogs,No,Yes,Yes,https://random.dog/woof.json
8,RandomFox,Random pictures of foxes,No,Yes,Yes,https://randomfox.ca/floof/
9,RescueGroups,Adoption,No,Yes,Unknown,https://userguide.rescuegroups.org/display/API...


### Grab subset of api_data where auth is not required

In [30]:
# Keep only the APIs whose AUTH value is exactly "No", then preview the first
# rows rather than rendering the whole subset.
subset_no = api_data[api_data["AUTH"] == "No"]
subset_no.head(10)

Unnamed: 0,API,DESC,AUTH,HTTPS,CORS,LINK
0,Cats,Pictures of cats from Tumblr,No,Yes,Unknown,https://thecatapi.com/docs.html
1,Dogs,Based on the Stanford Dogs Dataset,No,Yes,Unknown,https://dog.ceo/dog-api/
2,HTTPCat,Cat for every HTTP Status,No,Yes,Unknown,https://http.cat/
4,Movebank,Movement and Migration data of animals,No,Yes,Unknown,https://github.com/movebank/movebank-api-doc
6,RandomCat,Random pictures of cats,No,Yes,Yes,https://aws.random.cat/meow
7,RandomDog,Random pictures of dogs,No,Yes,Yes,https://random.dog/woof.json
8,RandomFox,Random pictures of foxes,No,Yes,Yes,https://randomfox.ca/floof/
9,RescueGroups,Adoption,No,Yes,Unknown,https://userguide.rescuegroups.org/display/API...
10,Shibe.Online,"Random pictures of Shibu Inu, cats or birds",No,No,Unknown,http://shibe.online/
12,Jikan,Unofficial MyAnimeList API,No,Yes,Yes,https://jikan.moe
