Data Wrangling Notebook for VertNet Bats Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [1]:
import pandas as pd
import numpy as np
import multiprocessing
import re
import uuid 

Silencing warnings that are unnecessary

In [2]:
# Silence every warning for the rest of the session.
# `warnings` ships with the standard library, so the import cannot fail; the
# former bare `except: pass` could only have hidden a genuine error.
import warnings

warnings.filterwarnings('ignore')

Import VertNet Bats Data

In [3]:
# Load the bats CSV; the path is relative to the notebook's working directory.
bats_csv_path = "./../Original_Data/bats_2020-08-11b.csv"
bats_df = pd.read_csv(bats_csv_path)

Notes

In [28]:
scientificName
catalogNumber
collectionCode
continent
coordinateprecision
coordinateuncertaintyinmeters
country
countrycode
county
decimalLatitude
decimalLongitude
dynamicproperties
eventdate
fieldnotes
habitat
institutionCode
verbatimLocality
locationaccordingto
locationremarks
maximumElevationInMeters
minimumElevationInMeters
municipality
occurrenceid
occurrenceremarks
recordedby
reproductivecondition
scientificname
sex
verbatimcoordinates
verbatimcoordinatesystem
verbatimelevation
verbatimeventdate
verbatimlatitude
verbatimlocality
verbatimlongitude
verbatimsrs
waterbody
lifeStage
samplingProtocol
measurementMethod
basisOfRecord
verbatimEventDate

scientificName
body_mass.1.estimated_value
body_mass.1.is_shorthand
body_mass.1.location
body_mass.1.units
body_mass.1.units_inferred
body_mass.1.value
body_mass.2.estimated_value
body_mass.2.is_shorthand
body_mass.2.location
body_mass.2.units
body_mass.2.units_inferred
body_mass.2.value
body_mass.3.is_shorthand
body_mass.3.location
body_mass.3.units
body_mass.3.units_inferred
body_mass.3.value
catalogNumber
collectionCode
continent
coordinateprecision
coordinateuncertaintyinmeters
country
countrycode
county
decimalLatitude
decimalLongitude
dynamicproperties
ear_length.1.ambiguous_key
ear_length.1.estimated_value
ear_length.1.is_shorthand
ear_length.1.location
ear_length.1.measured_from
ear_length.1.units
ear_length.1.units_inferred
ear_length.1.value
ear_length.2.ambiguous_key
ear_length.2.is_shorthand
ear_length.2.location
ear_length.2.measured_from
ear_length.2.units
ear_length.2.units_inferred
ear_length.2.value
ear_length.3.is_shorthand
ear_length.3.location
ear_length.3.units
ear

Clean up country column 

In [11]:
# Build verbatimLocality as "<locality>,<country>".
# NOTE(review): with `+`, a row missing either part should come out as NaN -
# confirm that is acceptable for this column.
locality_with_country = bats_df["locality"] + "," + bats_df["country"]
bats_df["verbatimLocality"] = locality_with_country

# Load the GEOME country list used by country_correction
geome_countries = pd.read_csv("./../Mapping Files/geome_country_list.csv")

# Create dictionary after initial GEOME run (create csv file first as a dictionary)
#bats_df["country"].unique()

def country_correction(country):
    """Map a country value onto the GEOME country list.

    Returns the value unchanged when it appears anywhere in the
    ``geome_countries`` table, the mapped name when it is a key of
    ``country_dictionary``, and "Unknown" otherwise.

    NOTE(review): ``country_dictionary`` is not defined in the visible part of
    this notebook; it must exist before this function is applied.
    """
    # Guard clauses instead of an if/elif/else ladder
    if country in geome_countries.values:
        return country
    if country in country_dictionary:
        return country_dictionary[country]
    return "Unknown"

#bats_df['country'] = bats_df['country'].apply(country_correction)

Create verbatimEventDate column

In [25]:
# Copy the eventdate column into a new verbatimEventDate column
bats_df = bats_df.assign(verbatimEventDate=bats_df['eventdate'])

Clean yearCollected column (TODO:Needs to be standardized first)

In [None]:
# Filling N/As with "Unknown" so every value is a string for the filter below
bats_df["eventdate"] = bats_df["eventdate"].fillna("Unknown")

# Creating event date variable
verbatim_date = bats_df['eventdate']

# Establishing vertnet filter TODO:see how relevant this filter is to this set
# One regex alternative per entry. Joining a list (instead of wrapping one long
# triple-quoted string) keeps line breaks and indentation out of the pattern;
# previously "and", "#", "\d{4}'[S]" and "1260" were prefixed by a newline plus
# spaces and therefore could never match.
vertnet_date_patterns = [
    "IV", "0000", "September", "<", "NW", "latter", "unknown", "MCZ", "MSU",
    "present", "and", ";", "&", "mainly", "between", "Between", "BETWEEN",
    "OR", "Unknown", "UNKNOWN", "#", "TO", r"\?", "'", "----", "19--",
    "No Date", ",", r"\d{4}-\d{4}",
    "/n /d",  # kept as the literal text it always matched - TODO confirm intent
    r"\d{4}[s]", r"\d{4}'[S]",
    "1075-07-29", "975-07-17", "2088", "9999", "0201", "1197", "1260",
    "4560", "1024", "1119", "1192", "1072", "1186",
]

# na=True: anything that is not a string is treated as unusable, which also
# guarantees a plain boolean mask that can be inverted with `~`.
vertnet_date_filter = verbatim_date.str.contains(
    "|".join(vertnet_date_patterns), na=True
)

# Grabbing clean data
verbatim_date_clean = verbatim_date[~vertnet_date_filter]


# Captures year within string
def year_search(year):
    """Return the four-digit year at the end or the start of a date string.

    A year at the end of the string (e.g. "05/12/1998") takes precedence over
    one at the start (e.g. "1998-05-12"). Surrounding whitespace is stripped
    first so a trailing newline cannot end up inside the extracted year.

    Returns None when ``year`` is not a string or when neither end of the
    string consists of four digits.
    """
    if not isinstance(year, str):
        return None
    year = year.strip()
    if re.search(r'\d{4}$', year):
        return year_cleaner_front(year)
    if re.search(r'^\d{4}', year):
        return year_cleaner_back(year)
    return None


def year_cleaner_front(year):
    """Return the last four characters of the string (a trailing year)."""
    # A plain negative slice; the old len()-based start index wrapped around
    # for three-character input and returned only the final character.
    return year[-4:]


def year_cleaner_back(year):
    """Return the first four characters of the string (a leading year)."""
    return year[:4]

# Extract a year for every date that passed the filter; the assignment aligns
# on the index, so rows that were filtered out are left missing ...
year_by_row = verbatim_date_clean.apply(year_search)
bats_df["yearCollected"] = year_by_row

# ... and every missing year is then labelled "Unknown"
bats_df["yearCollected"] = bats_df["yearCollected"].fillna("Unknown")

Clean up lifestage column

In [12]:
bats_df["lifestage"].unique()

# TODO: Confirm all these are correct
# TODO: preserve information with a verbatim column
# TODO: figure out what "fr" means
# TODO: how do we handle "?". Put in unknown?
# TODO: how do we handle things like "adult; embryo"
# TODO: how to handle things like "subadult/adult"
# TODO: how to handle "young of year"
# TODO: how to handle "larvae"

# Match whole verbatim values instead of regex substrings. As substrings,
# "adult" also hit "subadult", "mature" hit "immature", and "n"/"U" hit almost
# every value; the wrapped pattern strings additionally carried newlines and
# indentation inside some alternatives. Values not listed in any group below
# stay unclassified until the TODOs above are resolved.
lifestage_values = bats_df["lifestage"]

adult_filter = lifestage_values.isin([
    "adult", "Adult", "ADULT", "AD - T.", "AD - T.A", "87", "adullt",
    "mature", "Ad", "3 adult", "adult.", "Adult.", "adulture",
])

subadult_filter = lifestage_values.isin([
    "subadult", "SUBADULT", "subadult?", "Sub-adult",
])

juv_filter = lifestage_values.isin([
    "juvenile", "Juvenile", "JUVENILE", "Juv", "juvenile; 23 day",
    "52 hours old", "juvenile ?", "juv.", "JUVELINE", "juveniles",
])

# Missing values count as "not available" alongside the explicit labels
na_filter = lifestage_values.isna() | lifestage_values.isin([
    "unknown", "U", "undetermined", "n", "not recorded", "other", "N.P.",
])

imm_filter = lifestage_values.isin([
    "embryo", "neonate", "Immature", "immature", "IMMATURE", "fetus",
    "NEONATE", "yearlings",
])

array(['adult', nan, 'young', 'juvenile', 'subadult', 'unknown', 'U',
       'Adult', 'YOUNG', 'ADULT', 'undetermined', 'SUBADULT', 'Juvenile',
       'embryo', 'AD - T.', 'neonate', 'fr', 'Young', 'Immature',
       'mature', 'adullt', 'immature', 'JUVENILE', 'Juv', 'subadult?',
       'Unknown', 'Subadult', 'young adult', '87', 'juvenile; 23 day',
       'n', 'juv', '52 hours old', 'juvenile ?', 'not recorded',
       'IMMATURE', 'adult; embryo', 'Fr', 'juv.', 'Sub-adult', 'YG', 'Ad',
       'fetus', 'Adult.', 'adult.', '3 adult', 'AD - T.A', 'JUVELINE',
       'NEONATE', 'adulture', 'other', 'subadult/adult', 'juveniles',
       'young of year', 'N.P.', 'adult; foetus', 'Imm', 'larvae',
       'yearlings'], dtype=object)

Cleaning reproductivecondition column

In [6]:
bats_df["reproductivecondition"].unique()

# TODO: check these
# TODO: how to handle things like "T=4.1 x 4.1 mm"

repro_condition = bats_df["reproductivecondition"]

# Single-line raw patterns: the wrapped string used to put a newline and
# indentation in front of "no lact.", so that alternative never matched.
# na=False turns missing values into False instead of NaN.
norep_filter = repro_condition.str.contains(
    r"non-reproductive|not lactating|no lact\.", na=False
)

# "1 emb X 1 mm" was redundant: any value containing it also contains "1 emb".
preg_filter = repro_condition.str.contains(r"parous|1 emb", na=False)

# The old pattern "nan|...|" matched every non-null value ("..." is three
# wildcards and the trailing "|" adds an empty alternative). Flag only values
# that are actually missing.
na_filter = repro_condition.isna()


array(['not pregnant; not lactating', 'parous', nan, ...,
       '1 emb, 1L X 4 mm', 'T=4.1 x 4.1 mm', '1 emb X 1 mm, no lact.'],
      dtype=object)

Clean sex column 

In [20]:
bats_df["sex"].unique()

# Flag rows whose sex is missing or recorded as undetermined.
# The previous regex matched every string: "f?" can match the empty string and
# the pattern ended in an empty alternative. Exact-value matching keeps the
# listed labels literal ("f?" means the text f?, not an optional "f").
# NOTE(review): assumes whole-value matches were intended - confirm.
sex_values = bats_df["sex"]
sex_filter = sex_values.isna() | sex_values.isin([
    "undetermined", "f?", "x", "in question", " in question",
    "not determined",
])


Adding additional required GEOME columns

In [23]:
# Constant-valued columns, added in one call (keyword order = column order)
bats_df = bats_df.assign(
    samplingProtocol="Unknown",
    measurementMethod="Unknown",
    basisOfRecord="PreservedSpecimen",
)

Rearrange columns so that template columns are first, followed by measurement values

In [None]:
# Template columns come first ...
# NOTE(review): 'lifestage_cor' and 'binomial' are not created in the visible
# part of this notebook - confirm they exist before this cell runs.
template_cols = [
    'catalognumber',
    'collectioncode',
    'decimallatitude',
    'decimallongitude',
    'verbatimElevation',
    'institutioncode',
    'verbatimEventDate',
    'locality',
    'samplingProtocol',
    'measurementMethod',
    'country',
    'sex',
    'lifestage_cor',
    'binomial',
    'basisOfRecord',
    'yearCollected',
]

# ... followed by the `.1.value` measurement columns
trait_cols = [
    'body_mass.1.value',
    'ear_length.1.value',
    'hind_foot_length.1.value',
    'tail_length.1.value',
    'total_length.1.value',
]

cols = template_cols + trait_cols

# Subset dataframe to exactly these columns, in this order
bats_df = bats_df.loc[:, cols]

Matching template and column terms

In [26]:
# Map the dataframe's current column names onto the template's terms;
# keys that are not present in the dataframe are ignored by rename().
template_names = {
    'catalognumber': 'catalogNumber',
    'collectioncode': 'collectionCode',
    'decimallatitude': 'decimalLatitude',
    'decimallongitude': 'decimalLongitude',
    'maximumelevationinmeters': 'maximumElevationInMeters',
    'minimumelevationinmeters': 'minimumElevationInMeters',
    'institutioncode': 'institutionCode',
    'locality': 'verbatimLocality',
    'lifestage_cor': 'lifeStage',
    'binomial': 'scientificName',
}
bats_df = bats_df.rename(columns=template_names)

Matching trait and ontology terms

In [None]:
# Map each measurement column onto its trait term
trait_names = {
    'body_mass.1.value': 'body mass',
    'ear_length.1.value': 'ear length to notch',
    'hind_foot_length.1.value': 'pes length',
    'tail_length.1.value': 'tail length',
    'total_length.1.value': 'body length',
}
bats_df = bats_df.rename(columns=trait_names)

Create materialSampleID, a UUID for each specimen record (generated before the data is reshaped to long format, so every measurement from a record shares it)

In [None]:
# One random UUID (32-character hex string) per row of the dataframe
row_count = len(bats_df.index)
bats_df = bats_df.assign(
    materialSampleID=[uuid.uuid4().hex for _ in range(row_count)]
)

Create eventID and populate it with materialSampleID

In [None]:
# eventID starts out as a copy of materialSampleID
bats_df["eventID"] = bats_df["materialSampleID"]

Add required GEOME column locality after reassigning locality to verbatimLocality

In [None]:
# Add a locality column holding the placeholder value "Unknown" in every row
bats_df["locality"] = "Unknown"

Creating long version, first specifying keep variables, then naming type and value

In [None]:
      longVers = bats_df.melt(
          # Identifier columns repeated on every output row; all remaining
          # columns are unpivoted into measurementType / measurementValue.
          id_vars=[
              'catalogNumber',
              'collectionCode',
              'decimalLatitude',
              'decimalLongitude',
              'verbatimElevation',
              'yearCollected',
              'basisOfRecord',
              'verbatimEventDate',
              'institutionCode',
              'lifeStage',
              'verbatimLocality',
              'locality',
              'samplingProtocol',
              'measurementMethod',
              'country',
              'sex',
              'scientificName',
              'materialSampleID',
              'eventID',
          ],
          var_name='measurementType',
          value_name='measurementValue',
      )

Populating measurementUnit column with appropriate measurement units in long version

In [None]:
# Create measurementUnit column
longVers = longVers.assign(measurementUnit="")

#Create filters
long_body_mass_filter = longVers['measurementType'] == "body mass"
long_no_body_filter = longVers['measurementType'] != "body mass"

#Assign units using filters
# A single .loc[mask, column] write replaces the chained
# df[column][mask] = value form, which may write to a temporary copy and has
# no effect under pandas Copy-on-Write.
longVers.loc[long_body_mass_filter, 'measurementUnit'] = "g"
longVers.loc[long_no_body_filter, 'measurementUnit'] = "mm"

Create diagnosticID which is a unique number for each measurement

In [None]:
# Number the rows 0..n-1 in their current order
longVers = longVers.assign(diagnosticID=np.arange(len(longVers)))

If the measurement value is N/A, delete the entire row. Drop range values (TODO: not yet implemented below).

In [None]:
# Keep only the rows that carry a measurement value
has_measurement = longVers['measurementValue'].notna()
longVers = longVers[has_measurement]