In [1]:
import requests
import pandas as pd
import numpy as np

class Records:
    """
    Fetch GBIF occurrence records for a queried taxon between a range of
    years and store them in a pandas DataFrame.
    Parameters:
    -----------
    q: str
        Query taxonomic name.
    interval: tuple
        Range of years to return results for. Should be (min, max) tuple.
    **kwargs:
        Extra GBIF search parameters. They are merged over the defaults, so
        they can add new filters or override existing ones (e.g. 'limit').
    Attributes:
    -----------
    baseurl: The REST API URL for GBIF.org.
    params: The parameter dictionary to filter GBIF search.
    timeout: Seconds to wait for each HTTP response before giving up.
    df: Pandas DataFrame with returned records.
    sdf: A copy of the 'df' DataFrame selecting only four summary columns.
    """
    # columns returned by the .sdf summary view
    _summary_columns = ["order", "species", "country", "stateProvince"]

    # per-request timeout (seconds) so a stalled connection cannot hang forever
    timeout = 60

    def __init__(self, q, interval, **kwargs):
        # the API url for searching GBIF occurrences
        self.baseurl = "http://api.gbif.org/v1/occurrence/search?"

        # the default REST API options plus user entered args
        self.params = {
            "q": q,
            'year': ",".join(str(i) for i in interval),
            'basisOfRecord': "PRESERVED_SPECIMEN",
            'kingdom': "Animalia",
            'phylum': "Cnidaria",
            'order': "Scleractinia",
            'hasCoordinate': "true",
            'hasGeospatialIssue': "false",
            "offset": "0",
            "limit": "300",
        }

        # allow users to enter or modify other params using kwargs
        self.params.update(kwargs)

        # run the request query until all records are obtained
        self.df = pd.DataFrame(self._get_all_records())

    @property
    def sdf(self):
        """
        Return a copy of the current .df dataframe selecting only the four
        most generally relevant columns: order, species, country, and
        stateProvince.
        This is only meant for viewing: it is a new DataFrame, so changes made
        to it are not written back to .df. A column that is absent from .df
        (for example when the search returned no records) is included and
        filled with missing values instead of raising a KeyError.
        """
        return self.df.reindex(columns=self._summary_columns)

    def _get_all_records(self):
        """
        Page through the search results until the end of records is reached.

        Returns a list of dicts, one per occurrence record. Paging is done on
        a private copy of 'params', so the 'offset' stored on the instance is
        left untouched and calling this again starts from the same place.

        Raises requests.HTTPError if any page request returns an error status.
        """
        # work on a copy so that paging state never leaks into self.params
        params = dict(self.params)
        offset = int(params["offset"])

        data = []
        while True:
            # make request and store results
            params["offset"] = str(offset)
            res = requests.get(
                url=self.baseurl,
                params=params,
                timeout=self.timeout,
            )

            # check for errors
            res.raise_for_status()

            # get data as json list of dicts and add to 'data' list
            idata = res.json()
            results = idata["results"]
            data += results

            # stop when end of record is reached; an empty page is also treated
            # as the end so a malformed response cannot cause an endless loop
            if idata["endOfRecords"] or not results:
                break

            # advance by the number of records actually received instead of a
            # fixed 300, so a user-supplied 'limit' does not skip records
            offset += len(results)
        return data

In [2]:
# Query GBIF for "Scleractinia" over the 1990-2017 year range; the Records
# constructor issues the paged requests and stores the result in rec.df.
rec = Records(q="Scleractinia", interval=(1990, 2017))

In [3]:
# Start from the four-column summary view of the fetched records.
cleaning = rec.sdf

# Keep only records that have a species name. Missing names are real missing
# values rather than the text "NaN", so test for them directly; notna() also
# works when the whole column is missing, where the .str accessor would raise.
clean = cleaning[cleaning["species"].notna()]

# Keep only records whose order is exactly Scleractinia; a missing order
# compares unequal and is therefore dropped.
cleanest = clean[clean["order"] == "Scleractinia"]
cleanest

Unnamed: 0,order,species,country,stateProvince
423,Scleractinia,Anomocora fecunda,,
474,Scleractinia,Madracis profunda,,
486,Scleractinia,Anomocora fecunda,,
564,Scleractinia,Anomocora fecunda,,
578,Scleractinia,Anomocora fecunda,,
642,Scleractinia,Flabellum chunii,,
661,Scleractinia,Acropora pulchra,Australia,
662,Scleractinia,Acropora pulchra,Australia,
663,Scleractinia,Acropora hyacinthus,Australia,
667,Scleractinia,Acropora pulchra,Australia,


In [64]:
# number of records remaining after the species filter
len(clean)

5819

In [75]:
# Show the summary view of every fetched record (no filtering applied).
# NOTE(review): the saved output below lists family/year columns, which the
# sdf property defined in this notebook does not select, so the output looks
# stale; confirm by re-running the notebook from a fresh kernel.
rec.sdf

Unnamed: 0,family,species,year,country,stateProvince
0,,,1967,United States,
1,,,1965,United States,
2,,,2012,United States,
3,,,1988,United States,
4,,,1978,United States,
5,,,2011,United States,Florida
6,,,1978,United States,
7,,,1988,United States,
8,,,1978,United States,
9,,,2005,United States,
