## Search the Hengduan Database 
We hope to find many species for which collections are available from multiple disjunct regions sampled between 1997 and 2004.

In [178]:
# conda install pandas
# conda install beautifulsoup4
# pip install folium

In [179]:
from concurrent.futures import ThreadPoolExecutor
import requests
import pandas as pd
from bs4 import BeautifulSoup
import folium

### The Hengduan API structure

In [2]:
# specimen search page
# http://hengduan.huh.harvard.edu/fieldnotes/specimens/search/search.zpt?
#    dna_collection=on&
#    st=pedicularis&
#    action=search&
#    submit_button=Search

In [3]:
# individual specimen page
# http://hengduan.huh.harvard.edu/fieldnotes/specimens/search/specimen_detail.zpt?
#    specimen_id=718

### A class object to search and parse Hengduan data

In [353]:
class Hengduan:
    """Search the Hengduan specimen database and collect the hits in a DataFrame.

    Instantiating the class sends one search request for `taxon`, parses the
    result table into ``self.data`` and then requests the detail page of every
    specimen found to fill in decimal ``latitude`` / ``longitude`` columns.

    Parameters
    ----------
    taxon : str
        Search term, sent as the ``st`` query parameter.
    dna : bool
        Sent as ``dna_collection=on`` when true, ``dna_collection=off`` otherwise.

    Attributes
    ----------
    data : pandas.DataFrame
        Columns: family, taxon, cid, sid, year, shortname, latitude, longitude.
        All scraped values are strings; the coordinates are floats (NaN when
        the specimen page has no parsable coordinates).
    """

    # seconds to wait on each HTTP request before giving up
    timeout = 60

    def __init__(self, taxon, dna=True):
        # endpoints
        baseurl = "http://hengduan.huh.harvard.edu/fieldnotes/"
        self.search_url = baseurl + "specimens/search/search.zpt"
        self.specimen_url = baseurl + "specimens/search/specimen_detail.zpt"

        # search settings
        self.taxon = taxon
        # NOTE(review): confirm the server treats "off" as an unchecked box.
        self.dna = "on" if dna else "off"

        # do search and fill database
        self._get_search_data()
        self._fill_data_coords()

    def _search_request(self):
        """Request the search page and return it parsed as BeautifulSoup.

        Raises requests.HTTPError on a non-2xx response.
        """
        res = requests.get(
            url=self.search_url,
            params={
                "dna_collection": self.dna,
                "st": self.taxon,
                "action": "search",
                "submit_button": "Search",
            },
            timeout=self.timeout,
        )
        res.raise_for_status()
        return BeautifulSoup(res.text, "html5lib")

    def _get_search_data(self):
        """Parse the search result table into ``self.data``.

        Each table row contributes its cell texts plus the specimen id taken
        from the link in the third cell. If the result table is missing from
        the page, ``self.data`` is an empty frame with the usual columns.
        """
        soup = self._search_request()
        table = soup.find('table', attrs={"class": "listing", "id": "angio_table"})

        rows = []
        if table is not None:
            # first <tr> is the header row
            for tr in table.find_all('tr')[1:]:
                tds = tr.find_all('td')
                values = [td.text.strip() for td in tds]
                # specimen id is the value after the last "=" in the link
                values.append(tds[2].a.attrs['href'].split("=")[-1])
                rows.append(values)

        data = pd.DataFrame(
            rows,
            columns=["family", "taxon", "cid", "cdate", "", "sid"]
        )
        # the year is the last whitespace-separated token of the date cell
        data['year'] = data["cdate"].str.split().str[-1]
        data = data.drop(["cdate", ""], axis=1)
        # shortname: taxon words joined by "-" (the cell text contains newlines)
        data["shortname"] = data["taxon"].str.split().str.join("-")
        self.data = data

    def _specimen_request(self, spid):
        """Request one specimen detail page and return it parsed as BeautifulSoup.

        Raises requests.HTTPError on a non-2xx response.
        """
        res = requests.get(
            url=self.specimen_url,
            params={
                "specimen_id": spid,
            },
            timeout=self.timeout,
        )
        res.raise_for_status()
        return BeautifulSoup(res.text, "html5lib")

    def _get_coordinates(self, specid):
        """Return ``(latitude, longitude)`` in decimal degrees for one specimen.

        The coordinates are read from the second cell of the element with
        id "locality": the fourth-from-last text line holds the latitude and
        the third-from-last the longitude. When that layout is not found or
        the values cannot be parsed, ``(nan, nan)`` is returned so a single
        odd record does not abort the whole search.
        """
        soup = self._specimen_request(specid)
        try:
            lines = soup.find(id="locality").find_all("td")[1].text.split("\n")
            northsouth = self._parse_dms(lines[-4])
            eastwest = self._parse_dms(lines[-3])
        except (AttributeError, IndexError, ValueError):
            missing = float("nan")
            return (missing, missing)
        return (northsouth, eastwest)

    @staticmethod
    def _parse_dms(raw):
        """Convert text such as ``101°16'36"E;`` to signed decimal degrees.

        Surrounding whitespace and leading/trailing ";" or "," on the seconds
        part are ignored. Raises ValueError when the text does not contain
        exactly one "°" and one "'" or when a component is not a number.
        """
        degrees, rest = raw.strip().split("°")
        minutes, rest = rest.split("'")
        seconds_hemi = rest.strip().strip(";,").replace('"', '')
        return Hengduan._convert_gps("-".join([degrees, minutes, seconds_hemi]))

    @staticmethod
    def _convert_gps(tude):
        """Convert ``"D-M-S<hemisphere>"`` (e.g. ``"101-16-36E"``) to decimal degrees.

        The last character is the hemisphere: N and E give a positive result,
        anything else a negative one.
        """
        multiplier = 1 if tude[-1] in ['N', 'E'] else -1
        return multiplier * sum(float(x) / 60 ** n for n, x in enumerate(tude[:-1].split('-')))

    def _fill_data_coords(self):
        """Add ``latitude`` and ``longitude`` columns, one lookup per ``sid``."""
        # executor.map yields results in input order, matching the frame rows
        with ThreadPoolExecutor(max_workers=4) as executor:
            res = list(executor.map(self._get_coordinates, self.data.sid))

        self.data['latitude'] = [i[0] for i in res]
        self.data['longitude'] = [i[1] for i in res]

    def count_by_maxyear(self, year):
        """Count records per shortname among those dated `year` or earlier.

        Returns a Series indexed by shortname, largest count first, or None
        when the year comparison raises TypeError.
        """
        subset = self.filter_by_max_year(year)
        if subset is None:
            return None
        return subset.groupby('shortname').size().sort_values(ascending=False)

    def filter_by_max_year(self, year):
        """Return the records dated `year` or earlier, sorted by year then shortname.

        Returns None when the year comparison raises TypeError.
        """
        try:
            keep = self.data.year.astype(int) <= year
        except TypeError:
            return None
        return self.data[keep].sort_values(by=["year", "shortname"])


### Example usage

In [277]:
# search for "Ped cranolopha" with the DNA-collection option on; the parsed records (with coordinates) end up in coll.data
coll = Hengduan(taxon="Ped cranolopha", dna=True)


In [458]:
# records dated 2004 or earlier, sorted by year and then shortname
coll.filter_by_max_year(2004)

Unnamed: 0,family,taxon,cid,sid,year,shortname,latitude,longitude
0,Orobanchaceae,Pedicularis\n cranolopha,27709,595,1997,Pedicularis-cranolopha,30.850278,101.276667
1,Orobanchaceae,Pedicularis\n cranolopha,27821,707,1997,Pedicularis-cranolopha,31.66,100.712778
2,Orobanchaceae,Pedicularis\n cranolopha,28522,1413,1998,Pedicularis-cranolopha,29.141111,99.928333
3,Orobanchaceae,Pedicularis\n cranolopha,28746,1640,1998,Pedicularis-cranolopha,29.103056,99.631944
4,Orobanchaceae,Pedicularis\n cranolopha,30698,15536,2004,Pedicularis-cranolopha,29.1025,99.670833
5,Orobanchaceae,Pedicularis\n cranolopha,31237,16075,2004,Pedicularis-cranolopha,31.332778,98.128611


### A class object to map lat long data

In [496]:
class Map:
    """Folium map of two sets of collection localities.

    Parameters
    ----------
    hlocs : pandas.DataFrame or None
        Records drawn with red markers.
    nlocs : pandas.DataFrame or None
        Records drawn with blue markers (added before the red ones).

    Both frames need the columns latitude, longitude, shortname, sid and
    year. At least one frame must be given; pandas.concat raises ValueError
    when both are None.
    """

    def __init__(self, hlocs=None, nlocs=None):

        # store attributes
        self.nlocs = nlocs
        self.hlocs = hlocs
        self.alldata = pd.concat([self.nlocs, self.hlocs])

        # centre the map on the mean coordinate of all records
        # NOTE(review): recent folium releases may no longer ship the
        # 'Stamen Terrain' tiles as a built-in; confirm for your version.
        self.map = folium.Map(
            location=[self.alldata.latitude.mean(), self.alldata.longitude.mean()],
            tiles='Stamen Terrain'
        )

        # add points
        if isinstance(nlocs, pd.DataFrame):
            self.add_nlocs()
        if isinstance(hlocs, pd.DataFrame):
            self.add_hlocs()

    def draw(self):
        """Fit the map bounds to all located records and return the folium map."""
        # take the coordinates positionally: the concatenated index can hold
        # duplicate or non-contiguous labels
        latlongtuples = (
            self.alldata[["latitude", "longitude"]].dropna().values.tolist()
        )
        self.map.fit_bounds(latlongtuples)
        return self.map

    def _add_markers(self, locs, color):
        """Add one marker per row of `locs` that has both coordinates."""
        located = locs.dropna(subset=["latitude", "longitude"])
        # iterate over rows themselves so any index labelling works
        for _, rec in located.iterrows():
            folium.Marker(
                location=[rec["latitude"], rec["longitude"]],
                # str() because sid/year are not guaranteed to be strings
                popup=" | ".join(str(rec[col]) for col in ("shortname", "sid", "year")),
                icon=folium.Icon(color=color, icon="circle", prefix="fa"),
            ).add_to(self.map)

    def add_nlocs(self):
        """Add the `nlocs` records as blue markers."""
        self._add_markers(self.nlocs, "blue")

    def add_hlocs(self):
        """Add the `hlocs` records as red markers."""
        self._add_markers(self.hlocs, "red")
 

In [497]:
# all records in blue, with those dated 2004 or earlier overlaid in red
# (red markers are added after the blue ones)
Map(hlocs=coll.filter_by_max_year(2004), nlocs=coll.data).draw()


In [481]:
co = Hengduan("ped rhinanth", True)
# all records in blue, with those dated 2002 or earlier overlaid in red
Map(hlocs=co.filter_by_max_year(2002), nlocs=co.data).draw()


### Load our 2018 collections into a DataFrame

In [482]:
class New:
    """The 2018 field collections, reshaped to match ``Hengduan.data``.

    Parameters
    ----------
    df : str or path-like
        Path of the CSV file to load (passed to ``pandas.read_csv``). The
        file must have the columns accession, locality, date, latitude,
        longitude, genus and species_epithet.

    Attributes
    ----------
    data : pandas.DataFrame
        Columns: locality, shortname, latitude, longitude, year, sid.
        Rows without a species epithet are left out; the index keeps the
        row labels of the CSV.
    """

    def __init__(self, df):
        self._load_2018_dataframe(df)

    def subset(self, taxon):
        """Return the rows whose shortname contains `taxon` (plain substring).

        The result has a fresh 0..n-1 index; the previous row labels are kept
        in an ``index`` column.
        """
        mask = self.data.shortname.str.contains(taxon, regex=False, na=False)
        return self.data[mask].reset_index()

    @staticmethod
    def _convert_gps(tude):
        """Convert ``D°M'S"`` text to decimal degrees.

        Anything after the closing ``"`` (such as a hemisphere letter) is
        ignored. A leading "-" on the degrees makes the whole value negative.
        """
        deg, rest = tude.split("°")
        minu, rest = rest.split("'")
        seco = rest.split('"')[0]
        magnitude = abs(float(deg)) + float(minu) / 60 + float(seco) / 3600
        return -magnitude if deg.strip().startswith("-") else magnitude

    def _load_2018_dataframe(self, df):
        """Read the CSV at `df` and store the reshaped table in ``self.data``."""
        raw = pd.read_csv(df)

        # drop unidentified
        raw = raw.loc[raw["species_epithet"].notna()]

        # build a new frame (rather than assigning into a filtered slice)
        # with names matching the Hengduan object
        self.data = pd.DataFrame({
            "locality": raw["locality"],
            "shortname": raw["genus"].str.cat(raw["species_epithet"], sep="-"),
            # convert lat longs to decimals
            "latitude": raw["latitude"].apply(self._convert_gps),
            "longitude": raw["longitude"].apply(self._convert_gps),
            # year is the text after the last "/" of the date
            "year": raw["date"].str.split("/").str[-1],
            "sid": raw["accession"],
        })

In [483]:
# create an instance to hold all the 2018 data
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists. Consider a configurable data path.
newdata = New("/home/deren/Downloads/Fieldnotes-2018 - Sheet1.csv")

In [485]:
# get a subset data set for a specific taxon
newdata.subset("siphonantha")

Unnamed: 0,index,locality,shortname,latitude,longitude,year,sid
0,5,2,Pedicularis-siphonantha,27.959194,99.707167,2018,DE6
1,19,6,Pedicularis-siphonantha,28.585861,99.837222,2018,DE20
2,44,12,Pedicularis-siphonantha,29.123889,99.996667,2018,DE45
3,48,14,Pedicularis-siphonantha,29.144722,100.034778,2018,DE49
4,74,18,Pedicularis-siphonantha,30.010972,100.316167,2018,DE75
5,86,21,Pedicularis-siphonantha,30.168917,100.584333,2018,DE87
6,159,35,Pedicularis-siphonantha,30.711833,101.364333,2018,DE160
7,175,38,Pedicularis-siphonantha,30.826778,101.282278,2018,DE176
8,316,65,Pedicularis-siphonantha,31.887111,99.033778,2018,DE317
9,344,71,Pedicularis-siphonantha,31.044306,98.908806,2018,DE345


In [488]:
# 2018 "longiflora" collections (blue) over Hengduan "Ped longiflora" records dated 2006 or earlier (red)
Map(
    nlocs=newdata.subset("longiflora"), 
    hlocs=Hengduan("Ped longiflora", True).filter_by_max_year(2006),
).draw()

In [489]:
# 2018 "cranolopha" collections (blue) over Hengduan "Ped crano" records dated 2006 or earlier (red)
Map(
    nlocs=newdata.subset("cranolopha"), 
    hlocs=Hengduan("Ped crano", True).filter_by_max_year(2006),
).draw()

In [490]:
# 2018 "siphonantha" collections (blue) over Hengduan "Ped siphonantha" records dated 2006 or earlier (red)
Map(
    nlocs=newdata.subset("siphonantha"), 
    hlocs=Hengduan("Ped siphonantha", True).filter_by_max_year(2006),
).draw()

In [493]:
# 2018 collections whose shortname contains "rhinanth" (blue) over Hengduan "Ped rhinanth" records dated 2006 or earlier (red)
Map(
    nlocs=newdata.subset("rhinanth"), 
    hlocs=Hengduan("Ped rhinanth", True).filter_by_max_year(2006),
).draw()

In [491]:
# 2018 collections whose shortname contains "rex" (blue) over Hengduan "Ped rex" records dated 2006 or earlier (red)
Map(
    nlocs=newdata.subset("rex"), 
    hlocs=Hengduan("Ped rex", True).filter_by_max_year(2006),
).draw()

-------------------------------------------------------

### Query all Pedicularis data (takes a little while)

In [377]:
peds = Hengduan("Pedicularis ")
peds.data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis,27501,384,1997,30.040278,101.838333
1,Orobanchaceae,Pedicularis,27567,452,1997,30.015278,101.859444
2,Orobanchaceae,Pedicularis,28595,1486,1998,29.108056,99.907778
3,Orobanchaceae,Pedicularis,29339,2234,1998,27.375,99.966667
4,Orobanchaceae,Pedicularis,34055,20686,2005,31.403889,99.966111


In [432]:
sauss = Hengduan("Saussurea")
sauss.data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Asteraceae,Saussurea,31125,15963,2004,29.710833,98.0025
1,Asteraceae,Saussurea,31932,16770,2004,31.400278,96.679722
2,Asteraceae,Saussurea,31948,16786,2004,31.400278,96.679722
3,Asteraceae,Saussurea,33370,19953,2005,29.153056,101.406944
4,Asteraceae,Saussurea,33510,20168,2005,33.14,97.498056


In [100]:
# NOTE(review): rebinds `coll`, which the earlier example cells use for the "Ped cranolopha" search
coll = Hengduan(taxon="Tibetia", dna=True)
coll.data  # displays the whole table

Unnamed: 0,family,taxon,cid,sid,year,shortname,latitude,longitude
0,Fabaceae,Tibetia,817,56444,2006,Tibetia,29.150833,85.981944
1,Fabaceae,Tibetia,44027,119427,2017,Tibetia,30.181111,99.978056
2,Fabaceae,Tibetia\n himalaica,28103,992,1998,Tibetia-himalaica,29.271111,100.082222
3,Fabaceae,Tibetia\n himalaica,28254,1145,1998,Tibetia-himalaica,28.746389,100.269167
4,Fabaceae,Tibetia\n himalaica,28770,1664,1998,Tibetia-himalaica,29.103056,99.631944
5,Fabaceae,Tibetia\n himalaica,37,56507,2006,Tibetia-himalaica,29.703333,92.121389
6,Fabaceae,Tibetia\n himalaica,92,56561,2006,Tibetia-himalaica,30.119167,92.159444
7,Fabaceae,Tibetia\n himalaica,39991,59782,2007,Tibetia-himalaica,33.108611,102.631944
8,Fabaceae,Tibetia\n himalaica,41302,64870,2009,Tibetia-himalaica,31.405833,97.471944
9,Fabaceae,Tibetia\n himalaica,41510,65079,2009,Tibetia-himalaica,30.961111,98.306944


In [152]:
coll.data

Unnamed: 0,family,taxon,cid,sid,year,shortname,latitude,longitude
0,Fabaceae,Tibetia,817,56444,2006,Tibetia,29.150833,85.981944
1,Fabaceae,Tibetia,44027,119427,2017,Tibetia,30.181111,99.978056
2,Fabaceae,Tibetia\n himalaica,28103,992,1998,Tibetia-himalaica,29.271111,100.082222
3,Fabaceae,Tibetia\n himalaica,28254,1145,1998,Tibetia-himalaica,28.746389,100.269167
4,Fabaceae,Tibetia\n himalaica,28770,1664,1998,Tibetia-himalaica,29.103056,99.631944
5,Fabaceae,Tibetia\n himalaica,37,56507,2006,Tibetia-himalaica,29.703333,92.121389
6,Fabaceae,Tibetia\n himalaica,92,56561,2006,Tibetia-himalaica,30.119167,92.159444
7,Fabaceae,Tibetia\n himalaica,39991,59782,2007,Tibetia-himalaica,33.108611,102.631944
8,Fabaceae,Tibetia\n himalaica,41302,64870,2009,Tibetia-himalaica,31.405833,97.471944
9,Fabaceae,Tibetia\n himalaica,41510,65079,2009,Tibetia-himalaica,30.961111,98.306944


### Now search many species and filter for early-year collections

In [362]:
# one Hengduan search per species epithet, stored under the epithet
epithets = ("tricolor", "longiflora", "siphonantha", "rex", "thamnophila")
records = dict()
for taxon in epithets:
    records[taxon] = Hengduan(f"Pedicularis {taxon}")
    

In [366]:
records['tricolor'].data

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n tricolor,34238,20898,2005,31.72,100.719444
1,Orobanchaceae,Pedicularis\n tricolor,34438,21087,2005,32.581944,100.4625
2,Orobanchaceae,Pedicularis\n tricolor,34579,21240,2005,31.975556,100.593889
3,Orobanchaceae,Pedicularis\n tricolor,36572,54938,2006,31.913333,98.815


In [369]:
records["longiflora"].data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n longiflora,27508,391,1997,30.040278,101.838333
1,Orobanchaceae,Pedicularis\n longiflora,28270,1161,1998,28.746389,100.269167
2,Orobanchaceae,Pedicularis\n longiflora,29506,5358,2000,29.646667,98.133611
3,Orobanchaceae,Pedicularis\n longiflora,31601,16439,2004,31.644722,98.451667
4,Orobanchaceae,Pedicularis\n longiflora,33585,20243,2005,33.052778,98.003611


In [371]:
records["siphonantha"].data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n siphonantha\n ...,28187,1078,1998,29.133889,100.0425
1,Orobanchaceae,Pedicularis\n siphonantha,28272,1163,1998,28.746389,100.269167
2,Orobanchaceae,Pedicularis\n siphonantha,30695,15533,2004,29.1025,99.670833
3,Orobanchaceae,Pedicularis\n siphonantha,32708,19229,2005,28.96,102.103611
4,Orobanchaceae,Pedicularis\n siphonantha,33216,19821,2005,29.218889,101.516389


In [373]:
records["rex"].data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n rex,28416,1307,1998,29.141111,99.928333
1,Orobanchaceae,Pedicularis\n rex,30524,15367,2004,29.075833,99.918889
2,Orobanchaceae,Pedicularis\n rex,32648,19303,2005,28.940278,102.103056
3,Orobanchaceae,Pedicularis\n rex,32831,19418,2005,28.94,102.248889
4,Orobanchaceae,Pedicularis\n rex,32951,19533,2005,28.916389,102.216389


In [374]:
records["thamnophila"].data.head()

Unnamed: 0,family,taxon,cid,sid,year,latitude,longitude
0,Orobanchaceae,Pedicularis\n thamnophila,28967,1862,1998,29.044722,99.711389
1,Orobanchaceae,Pedicularis\n thamnophila,33136,19734,2005,29.213611,101.538611
2,Orobanchaceae,Pedicularis\n thamnophila,33413,19996,2005,29.153056,101.406944
3,Orobanchaceae,Pedicularis\n thamnophila,35236,53600,2006,27.253611,100.153056


In [None]:
# NOTE(review): no cell shown in this notebook defines a top-level `data`, so this raises NameError on a fresh kernel; presumably leftover scratch. Confirm, then remove or replace.
data.map()