# Fetch data from Biodiversity of Hengduan Mountains
Get all Pedicularis records from [Biodiversity of the Hengduan Mountains](https://www.hengduan-biodiversity.net/). We have to scrape this site because it doesn't have an API.

Creates a file called "../data/fieldnotes/bhm.net-Pedicularis.csv" containing Name, Latitude, and Longitude

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Fetch the search results for 'Pedicularis' (~2300)

In [2]:
# Search URL that lists every specimen record matching 'Pedicularis'
searchurl = "https://www.hengduan-biodiversity.net/fieldnotes/specimens/search/search.zpt?st=Pedicularis&action=search&submit_button=Search"

# requests waits forever by default, so set a timeout to keep a stalled
# connection from hanging the notebook, and stop on HTTP errors instead of
# silently parsing an error page.
response = requests.get(url=searchurl, timeout=60)
response.raise_for_status()
res = response.text
soup = BeautifulSoup(res, 'html.parser')

In [85]:
# Get the element with 'id' for the results table
t = soup.find(id="angio_table")
if t is None:
    # Fail with a clear message rather than an AttributeError on None below
    raise ValueError("Could not find the results table (id='angio_table') in the search page")

# Walk every table row and extract a value from the last link of each cell.
# Each row becomes one list in `rows`; cells without a link contribute nothing.
rows = []
for tr in t.find_all("tr"):
    cells = []
    for td in tr.find_all(['td', 'th']):
        links = td.find_all('a')
        if not links:
            continue
        a = links[-1]
        if a.has_attr('onclick'):
            # Keep the text between the first pair of single quotes in the onclick attribute
            cells.append(a['onclick'].split("'")[1])
        else:
            # Plain link: keep its text and, when it has one, its href
            cells.append(a.text)
            if a.has_attr('href'):
                cells.append(a['href'])
    rows.append(cells)

In [90]:
# Build the search-results table from the parsed rows.
# rows[0] is left out (presumably the table's header row -- confirm against the page).
result_columns = ['Family', 'Binomial', 'Coll. No.', 'Link']
search_results = pd.DataFrame(data=rows[1:], columns=result_columns)

# Show the last few records as a quick sanity check
search_results.tail()

Unnamed: 0,Family,Binomial,Coll. No.,Link
2355,Orobanchaceae,Pedicularis verticillata,JJ128,specimen_detail.zpt?specimen_id=124703
2356,Orobanchaceae,Pedicularis verticillata,46415,specimen_detail.zpt?specimen_id=123584
2357,Orobanchaceae,Pedicularis vialii,26759,specimen_detail.zpt?specimen_id=84070
2358,Orobanchaceae,Pedicularis yunnanensis,1054,specimen_detail.zpt?specimen_id=14253
2359,Orobanchaceae,Pedicularis zayuensis,44551,specimen_detail.zpt?specimen_id=121194


## Get latlong from specimen records

Each specimen record has a `div` with id='locality' that contains a text description of the site, including degree/minute/second coordinates. There are also hidden `input` tags holding the latitude and longitude in decimal degrees, which are more convenient to use. Elevation needs to be parsed directly from the text.

In [None]:
## This cell takes a while to run (several minutes)
specimen_urlbase = "https://www.hengduan-biodiversity.net/fieldnotes/specimens/search/"

# One list per field; values are appended in the same order as the rows of search_results
sp_dat = {"lat":[], "long":[], "elev":[]}

# Use a single session for all of the specimen page requests
with requests.Session() as session:
    for idx in search_results.index:
        specimen_url = specimen_urlbase + search_results["Link"][idx]
        # Time out instead of hanging on a stalled request, and stop on HTTP
        # errors rather than parsing an error page.
        response = session.get(url=specimen_url, timeout=60)
        response.raise_for_status()
        res = response.text
        soup = BeautifulSoup(res, 'html.parser')
        # Coordinates are read from the `value` attribute of the elements
        # with id 'latitude' and 'longitude'
        sp_dat["lat"].append(soup.find(id="latitude").get("value"))
        sp_dat["long"].append(soup.find(id="longitude").get("value"))
        # This is hackish. Locality text is inside a <td> with this exact styling, which is the only instance on the page.
        elev = soup.find(style="padding:0.25em")
        # The elevation is always in the last line of the locality info.
        # Convert to a plain str so the stored value does not keep a reference
        # to the whole parse tree of every page.
        sp_dat["elev"].append(str(elev.contents[-1]))

# Number of specimen records fetched (len(sp_dat) would only count the three keys)
len(sp_dat["lat"])

## Merge locality info with search results and parse/format lat/long/elev
The lat/long values can be cast to float directly. Elevation values are surrounded by
extra text that has to be stripped away (including a non-breaking space, which is the `\xa0` in the first split).

In [189]:
# Put the scraped locality columns next to the search results (aligned on the index)
bhm_df = pd.concat([search_results,
           pd.DataFrame(sp_dat)], axis=1)
bhm_df["lat"] = bhm_df["lat"].astype(float)
bhm_df["long"] = bhm_df["long"].astype(float)

# Clean text from edges of the elevation info: keep what precedes "\xa0m"
# (non-breaking space + 'm'), then take the last space-separated token of that.
bhm_df["elev"] = bhm_df["elev"].apply(lambda x: x.split("\xa0m")[0].split(" ")[-1])
mask = bhm_df["elev"] == ''

# The mask flags records whose elevation came out empty, so report it as such
print(f"Removing # samples w/ no elevation: {sum(mask)}")
# .copy() so the column assignment below works on an independent frame and
# does not trigger SettingWithCopyWarning
bhm_df = bhm_df[~mask].copy()
bhm_df["elev"] = bhm_df["elev"].astype(int)

# Drop records whose name ends with 'sp' or 'None'
mask = bhm_df["Binomial"].str.endswith(('sp', 'None'))
bhm_df = bhm_df[~mask]
print(f"Removed samples w/o species id: {sum(mask)}")

# Drop records whose collection number starts with 'DE' or 'JJ'
mask = bhm_df["Coll. No."].str.startswith(('DE', 'JJ'))
bhm_df = bhm_df[~mask].copy()
print(f"Removed Eaton Lab samples: {sum(mask)}")

# Clean names, underscore between genus & species, and remove subspecies identifiers.
# regex=False keeps the replacement a literal one regardless of the pandas default.
bhm_df["Binomial"] = bhm_df["Binomial"].str.replace(" ", "_", n=1, regex=False)
bhm_df["Binomial"] = bhm_df["Binomial"].str.split(" ").str[0]
bhm_df

Removing # samples w/ no latlong: 1
Removed samples w/o species id: 208
Removed Eaton Lab samples: 858


Unnamed: 0,Family,Binomial,Coll. No.,Link,lat,long,elev
203,Orobanchaceae,Pedicularis_alaschanica,67,specimen_detail.zpt?specimen_id=9226,34.683333,100.683333,3200
204,Orobanchaceae,Pedicularis_alaschanica,301,specimen_detail.zpt?specimen_id=9461,34.676667,100.643333,3160
205,Orobanchaceae,Pedicularis_alaschanica,620,specimen_detail.zpt?specimen_id=9780,34.707778,100.244167,3500
206,Orobanchaceae,Pedicularis_alaschanica,1103,specimen_detail.zpt?specimen_id=10265,33.727778,99.350278,4040
207,Orobanchaceae,Pedicularis_alaschanica,1144,specimen_detail.zpt?specimen_id=10305,33.727778,99.350278,4040
...,...,...,...,...,...,...,...
2345,Orobanchaceae,Pedicularis_verticillata,44792,specimen_detail.zpt?specimen_id=121437,33.127222,97.488611,4700
2356,Orobanchaceae,Pedicularis_verticillata,46415,specimen_detail.zpt?specimen_id=123584,28.073333,98.760833,3950
2357,Orobanchaceae,Pedicularis_vialii,26759,specimen_detail.zpt?specimen_id=84070,27.182500,98.720278,3050
2358,Orobanchaceae,Pedicularis_yunnanensis,1054,specimen_detail.zpt?specimen_id=14253,25.700000,100.083333,3900


### Check names

In [182]:
# List every distinct cleaned name in alphabetical order for a visual check
print(sorted(bhm_df["Binomial"].unique()))

['Pedicularis_alaschanica', 'Pedicularis_aloensis', 'Pedicularis_alopecuros', 'Pedicularis_anas', 'Pedicularis_anas / cheilanthifolia', 'Pedicularis_angustiloba', 'Pedicularis_anthemifolia', 'Pedicularis_armata', 'Pedicularis_axillaris', 'Pedicularis_batangensis', 'Pedicularis_bella', 'Pedicularis_bidentata', 'Pedicularis_binaria', 'Pedicularis_bomiensis', 'Pedicularis_brachycrania', 'Pedicularis_brevilabris', 'Pedicularis_cephalantha', 'Pedicularis_cernua', 'Pedicularis_cheilanthifolia', 'Pedicularis_chenocephala', 'Pedicularis_chinensis', 'Pedicularis_chumbica', 'Pedicularis_cinerascens', 'Pedicularis_comptoniifolia', 'Pedicularis_confertiflora', 'Pedicularis_cranolopha', 'Pedicularis_cristatella', 'Pedicularis_croizatiana', 'Pedicularis_cryptantha', 'Pedicularis_curvituba', 'Pedicularis_cyathophylla', 'Pedicularis_cyathophylloides', 'Pedicularis_daucifolia', 'Pedicularis_davidii', 'Pedicularis_debilis', 'Pedicularis_decora', 'Pedicularis_decorissima', 'Pedicularis_delavayi', 'Pedicu

## Dump results to csv
Rename the columns to agree with the names of the other datasets

In [190]:
# Rename the scraped columns to the names used by the other datasets
column_names = {"Binomial": "Name", "lat": "Latitude", "long": "Longitude"}
bhm_df.rename(columns=column_names, inplace=True)

# Write only the name and coordinates; the row index is not exported
bhm_df.to_csv(
    "../data/fieldnotes/bhm.net-Pedicularis.csv",
    columns=["Name", "Latitude", "Longitude"],
    index=False,
)