## Search the Hengduan Database 
We hope to find many species for which collections are available from multiple disjunct regions sampled between 1997 and 2004. 

In [375]:
from concurrent.futures import ThreadPoolExecutor
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [16]:
# specimen search page
# http://hengduan.huh.harvard.edu/fieldnotes/specimens/search/search.zpt?
#    dna_collection=on&
#    st=pedicularis&
#    action=search&
#    submit_button=Search

In [15]:
# individual specimen page
# http://hengduan.huh.harvard.edu/fieldnotes/specimens/search/specimen_detail.zpt?
#    specimen_id=718

In [440]:
class Hengduan:
    """Search the Hengduan database and return a dataframe.

    Creating an instance immediately runs the specimen search for ``taxon``
    and then requests the detail page of every specimen found in order to
    read its coordinates, so construction performs network requests.

    Parameters
    ----------
    taxon : str
        Search term, sent as the ``st`` query parameter.
    dna : bool
        If true (the default) ``dna_collection=on`` is sent with the search,
        otherwise ``dna_collection=off``.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Attributes
    ----------
    data : pandas.DataFrame
        One row per search hit with the columns ``family``, ``taxon``,
        ``cid``, ``sid`` (the specimen id taken from the detail link),
        ``year``, ``shortname``, ``latitude`` and ``longitude``.
    """

    def __init__(self, taxon, dna=True, timeout=30):
        # endpoints
        baseurl = "http://hengduan.huh.harvard.edu/fieldnotes/"
        self.search_url = baseurl + "specimens/search/search.zpt"
        self.specimen_url = baseurl + "specimens/search/specimen_detail.zpt"
        self._soup = None
        self.timeout = timeout

        # search settings
        self.taxon = taxon
        # honour the caller's flag (it used to be "on" unconditionally)
        self.dna = "on" if dna else "off"

        # run the search, then look up coordinates for every hit
        self.get_search_data()
        self.fill_data_coords()

    def _fetch_soup(self, url, params):
        """GET ``url`` with ``params`` and return the parsed HTML.

        Raises ``requests.HTTPError`` on a non-success status code.
        """
        res = requests.get(url=url, params=params, timeout=self.timeout)
        res.raise_for_status()
        return BeautifulSoup(res.text, "html5lib")

    def search_request(self):
        """Request the specimen search page for ``self.taxon`` and parse it."""
        return self._fetch_soup(
            self.search_url,
            {
                "dna_collection": self.dna,
                "st": self.taxon,
                "action": "search",
                "submit_button": "Search",
            },
        )

    def get_search_data(self):
        """Parse the search results table into ``self.data``.

        Raises
        ------
        ValueError
            If the results page contains no ``angio_table`` listing table.
        """
        soup = self.search_request()
        table = soup.find('table', attrs={"class": "listing", "id": "angio_table"})
        if table is None:
            raise ValueError(
                "no 'angio_table' results table found for search "
                "{!r}".format(self.taxon)
            )

        rows = []
        # the first <tr> is skipped (presumably the header row)
        for tr in table.find_all('tr')[1:]:
            cells = tr.find_all('td')
            values = [cell.text.strip() for cell in cells]
            # the link in the third cell ends in "...=<specimen id>"
            values.append(cells[2].a.attrs['href'].split("=")[-1])
            rows.append(values)

        self.data = pd.DataFrame(
            rows,
            columns=["family", "taxon", "cid", "cdate", "", "sid"]
        )
        # the year is the last whitespace-separated token of the date text
        self.data['year'] = self.data["cdate"].str.split().str[-1]
        self.data = self.data.drop(["cdate", ""], axis=1)
        # add shortname ref: the taxon's words joined by "-"
        self.data["shortname"] = self.data["taxon"].str.split().str.join("-")

    def specimen_request(self, spid):
        """Request the detail page of specimen ``spid`` and parse it."""
        return self._fetch_soup(self.specimen_url, {"specimen_id": spid})

    @staticmethod
    def _dms_token(text):
        """Rewrite text such as ``30°2'25"N,`` as ``30-2-25N``.

        Surrounding whitespace and a trailing ``,`` or ``;`` are dropped so
        the same code serves both the latitude and the longitude line.
        """
        degrees, rest = text.strip().split("°")
        minutes, seconds = rest.split("'")
        seconds = seconds.strip(" ,;").replace('"', '')
        return "-".join([degrees, minutes, seconds])

    def get_coordinates(self, specid):
        """Return ``(latitude, longitude)`` in decimal degrees for a specimen.

        The values are read from the second cell of the page's ``locality``
        element: the longitude from its third-to-last line and the latitude
        from its fourth-to-last line.
        """
        soup = self.specimen_request(specid)
        lines = soup.find(id="locality").find_all("td")[1].text.split("\n")
        eastwest = self.convert_gps(self._dms_token(lines[-3]))
        northsouth = self.convert_gps(self._dms_token(lines[-4]))
        return (northsouth, eastwest)

    @staticmethod
    def convert_gps(tude):
        """Convert ``"deg-min-sec"`` plus a hemisphere letter to decimal degrees.

        ``tude`` looks like ``"30-2-25N"``. The result is positive when the
        final character is ``N`` or ``E`` and negative otherwise.
        """
        multiplier = 1 if tude[-1] in ['N', 'E'] else -1
        parts = tude[:-1].split('-')
        # each successive field is worth 1/60 of the one before it
        return multiplier * sum(float(x) / 60 ** n for n, x in enumerate(parts))

    def fill_data_coords(self):
        """Add ``latitude`` and ``longitude`` columns to ``self.data``.

        The specimen pages are requested from a pool of four threads;
        results keep the order of ``self.data.sid``.
        """
        with ThreadPoolExecutor(max_workers=4) as executor:
            coords = list(executor.map(self.get_coordinates, self.data.sid))

        self.data['latitude'] = [pair[0] for pair in coords]
        self.data['longitude'] = [pair[1] for pair in coords]

    def filter_by_year(self, year):
        """Count records per shortname collected in or before ``year``.

        Rows whose year cannot be read as a number are left out. Returns a
        Series indexed by shortname, largest count first, limited to 20
        entries, or ``None`` if a ``TypeError`` occurs (for example when
        ``year`` cannot be compared with numbers).
        """
        try:
            years = pd.to_numeric(self.data["year"], errors="coerce")
            return (
                self.data[years <= year]
                .groupby('shortname')
                .size()
                .sort_values(ascending=False)
            ).head(20)
        except TypeError:
            return None

### Working example for one species

In [377]:
# Genus-wide search: building the object runs the query and the per-specimen
# coordinate lookups, so this cell performs network requests.
peds = Hengduan("Pedicularis")
peds.data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis,27501,384,1997,30.040278,101.838333
1,Orobanchaceae,Pedicularis,27567,452,1997,30.015278,101.859444
2,Orobanchaceae,Pedicularis,28595,1486,1998,29.108056,99.907778
3,Orobanchaceae,Pedicularis,29339,2234,1998,27.375,99.966667
4,Orobanchaceae,Pedicularis,34055,20686,2005,31.403889,99.966111


In [432]:
# Same genus-wide search for Saussurea; show the first rows of the result.
sauss = Hengduan("Saussurea")
sauss.data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Asteraceae,Saussurea,31125,15963,2004,29.710833,98.0025
1,Asteraceae,Saussurea,31932,16770,2004,31.400278,96.679722
2,Asteraceae,Saussurea,31948,16786,2004,31.400278,96.679722
3,Asteraceae,Saussurea,33370,19953,2005,29.153056,101.406944
4,Asteraceae,Saussurea,33510,20168,2005,33.14,97.498056


In [441]:
# Search for a single species (genus plus epithet) instead of a whole genus.
coll = Hengduan("Lonicera rupicola")
coll.data.head()

Unnamed: 0,family,taxon,cid,sid,year,shortname,latitude,longitude
0,Caprifoliaceae,Lonicera\n rupicola,27512,395,1997,Lonicera-rupicola,30.040278,101.838333
1,Caprifoliaceae,Lonicera\n rupicola,27770,656,1997,Lonicera-rupicola,31.66,100.712778
2,Caprifoliaceae,Lonicera\n rupicola,28126,1015,1998,Lonicera-rupicola,29.14,100.157222
3,Caprifoliaceae,Lonicera\n rupicola,29648,5500,2000,Lonicera-rupicola,30.171111,97.333611
4,Caprifoliaceae,Lonicera\n rupicola,30723,15561,2004,Lonicera-rupicola,29.1,99.692222


In [444]:
# Per-shortname record counts for collections made in or before 2000.
coll.filter_by_year(2000)

shortname
Lonicera-rupicola    4
dtype: int64

### Now search many species and filter for early-year collections

In [362]:
# Run one Hengduan search per Pedicularis species, keyed by specific epithet.
records = dict()
for taxon in ("tricolor", "longiflora", "siphonantha", "rex", "thamnophila"):
    records[taxon] = Hengduan("Pedicularis {}".format(taxon))
    

In [366]:
records['tricolor'].data

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n tricolor,34238,20898,2005,31.72,100.719444
1,Orobanchaceae,Pedicularis\n tricolor,34438,21087,2005,32.581944,100.4625
2,Orobanchaceae,Pedicularis\n tricolor,34579,21240,2005,31.975556,100.593889
3,Orobanchaceae,Pedicularis\n tricolor,36572,54938,2006,31.913333,98.815


In [369]:
records["longiflora"].data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n longiflora,27508,391,1997,30.040278,101.838333
1,Orobanchaceae,Pedicularis\n longiflora,28270,1161,1998,28.746389,100.269167
2,Orobanchaceae,Pedicularis\n longiflora,29506,5358,2000,29.646667,98.133611
3,Orobanchaceae,Pedicularis\n longiflora,31601,16439,2004,31.644722,98.451667
4,Orobanchaceae,Pedicularis\n longiflora,33585,20243,2005,33.052778,98.003611


In [371]:
records["siphonantha"].data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n siphonantha\n ...,28187,1078,1998,29.133889,100.0425
1,Orobanchaceae,Pedicularis\n siphonantha,28272,1163,1998,28.746389,100.269167
2,Orobanchaceae,Pedicularis\n siphonantha,30695,15533,2004,29.1025,99.670833
3,Orobanchaceae,Pedicularis\n siphonantha,32708,19229,2005,28.96,102.103611
4,Orobanchaceae,Pedicularis\n siphonantha,33216,19821,2005,29.218889,101.516389


In [373]:
records["rex"].data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n rex,28416,1307,1998,29.141111,99.928333
1,Orobanchaceae,Pedicularis\n rex,30524,15367,2004,29.075833,99.918889
2,Orobanchaceae,Pedicularis\n rex,32648,19303,2005,28.940278,102.103056
3,Orobanchaceae,Pedicularis\n rex,32831,19418,2005,28.94,102.248889
4,Orobanchaceae,Pedicularis\n rex,32951,19533,2005,28.916389,102.216389


In [374]:
records["thamnophila"].data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n thamnophila,28967,1862,1998,29.044722,99.711389
1,Orobanchaceae,Pedicularis\n thamnophila,33136,19734,2005,29.213611,101.538611
2,Orobanchaceae,Pedicularis\n thamnophila,33413,19996,2005,29.153056,101.406944
3,Orobanchaceae,Pedicularis\n thamnophila,35236,53600,2006,27.253611,100.153056
