Data Wrangling Notebook for VertNet Bats Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [1]:
import pandas as pd
import numpy as np
import multiprocessing
import re
import uuid 

Silencing warnings that are unnecessary

In [2]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import VertNet Bats Data

In [3]:
# Load the bats CSV export into the working dataframe. The path is relative,
# so it resolves against the directory the notebook kernel is started in.
bats_df = pd.read_csv("./../Original_Data/bats_2020-08-11b.csv")

Clean up country column 

In [11]:
# Read the GEOME country list; country_correction() checks raw country values
# against the contents of this dataframe.
geome_countries = pd.read_csv("./../Mapping Files/geome_country_list.csv")

# TODO: create `country_dictionary` after the initial GEOME run (create the csv
# file first, then load it as a dictionary). It is not defined in this cell yet.
#bats_df["country"].unique()

def country_correction(country):
    """Map a raw country value onto the GEOME country list.

    Returns `country` unchanged when it is found among the values of
    `geome_countries`, the mapped value from `country_dictionary` when it is
    a key there, and the string "Unknown" otherwise.

    NOTE(review): `country_dictionary` is not defined in this cell; it has to
    exist before a value outside the GEOME list is passed in.
    """
    # Already present in the GEOME list: keep it as is.
    if country in geome_countries.values:
        return country

    # Known variant: return the value it is mapped to.
    if country in country_dictionary.keys():
        return country_dictionary[country]

    # Anything else cannot be resolved.
    return "Unknown"

#bats_df['country'] = bats_df['country'].apply(country_correction)

Create verbatimEventDate column

In [25]:

verbatimeventdate

verbatimEventDate


# Copy eventdate into a new verbatimEventDate column. This runs before the
# year-cleaning cell overwrites missing eventdate values, so the copy keeps
# the values exactly as they were loaded.
bats_df = bats_df.assign(verbatimEventDate=bats_df['eventdate'])

Clean yearCollected column (TODO: needs to be standardized first)

In [None]:
# Filling N/As with "Unknown"
bats_df["eventdate"] = bats_df["eventdate"].fillna("Unknown")

# Creating event date variable
verbatim_date = bats_df['eventdate']

# Establishing vertnet filter TODO:see how relevant this filter is to this set
#
# Each alternative is its own raw string and the list is joined with "|".
# Writing the pattern as one multi-line triple-quoted string put the line
# breaks and indentation *inside* the regex, so the first alternative on every
# continuation line ("and", "#", "\d{4}'[S]", "1260") could never match.
# The parentheses around MCZ / MSU / "/n" were regex groups, not literal
# parentheses; they are dropped because str.contains warns about match groups
# and the matched text is the same without them.
# NOTE(review): "/n /d" is kept as written; it may have been meant as a
# newline/digit escape -- confirm against the data.
vertnet_date_terms = [
    r"IV", r"0000", r"September", r"<", r"NW", r"latter", r"unknown",
    r"MCZ", r"MSU", r"present",
    r"and", r";", r"&", r"mainly", r"between", r"Between", r"BETWEEN",
    r"OR", r"Unknown", r"UNKNOWN",
    r"#", r"TO", r"\?", r"'", r"----", r"19--", r"No Date", r",",
    r"\d{4}-\d{4}", r"/n /d", r"\d{4}[s]",
    r"\d{4}'[S]", r"1075-07-29", r"975-07-17", r"2088", r"9999", r"0201",
    r"1197",
    r"1260", r"4560", r"1024", r"1119", r"1192", r"1072", r"1186",
]

# na=True: a value that cannot be checked is treated as not clean, which is
# what the previous `== False` comparison did for missing results.
vertnet_date_filter = verbatim_date.str.contains("|".join(vertnet_date_terms), na=True)

# Grabbing clean data
verbatim_date_clean = verbatim_date[~vertnet_date_filter]


# Captures year within string
def year_search(year):
    """Pick the four-digit year out of a date string.

    A string that ends in four digits is handed to year_cleaner_front, which
    returns its last four characters. Otherwise, a string that starts with
    four digits is handed to year_cleaner_back, which returns its first four
    characters. Returns None when neither pattern matches.
    """
    ends_with_year = re.search(r'\d{4}$', year)
    if ends_with_year:
        return year_cleaner_front(year)

    starts_with_year = re.search(r'^\d{4}', year)
    if starts_with_year:
        return year_cleaner_back(year)

    return None

def year_cleaner_front(year):
    """Return the last four characters of `year`.

    year_search calls this when the string ends in a four-digit year, so
    despite the function's name the slice is taken from the END of the string.
    Strings shorter than four characters are returned whole.
    """
    # A plain negative slice avoids the wrap-around of `year[len(year)-4:...]`,
    # whose start index turns negative for short strings (a 3-character input
    # used to yield only its last character).
    return year[-4:]

def year_cleaner_back(year):
    """Return the first four characters of `year`.

    year_search calls this when the string starts with a four-digit year, so
    despite the function's name the slice is taken from the START of the string.
    """
    return year[:4]

# Extract a year from every date string that passed the filter. Rows that
# were filtered out are absent from this series and therefore get no value
# when it is aligned back onto the dataframe.
year_collected = verbatim_date_clean.apply(year_search)
bats_df = bats_df.assign(yearCollected=year_collected)

# Rows left without a year (filtered out, or no four-digit match) become "Unknown".
bats_df["yearCollected"] = bats_df["yearCollected"].fillna("Unknown")

Clean up lifestage column

In [6]:
# Fill in NA
bats_df["lifestage_cor"] = bats_df['lifestage_cor'].fillna("Not Collected")

# Create Filters
adult = bats_df['lifestage_cor'] == "Adult"
juvenile = bats_df['lifestage_cor'] == "Juvenile"
ns = bats_df['lifestage_cor'] == "NS"

# Assign correct terms using filters
# .loc writes straight into bats_df. The chained form df[col][mask] = value
# assigns into an intermediate object, which pandas does not guarantee to
# propagate back to the dataframe (and does not under copy-on-write).
bats_df.loc[adult, 'lifestage_cor'] = "adult"
bats_df.loc[juvenile, 'lifestage_cor'] = "juvenile"
bats_df.loc[ns, 'lifestage_cor'] = "Not Collected"

array(['Adult', nan, 'Juvenile', 'NS'], dtype=object)

Cleaning reproductivecondition column

In [6]:
bats_df["reproductivecondition"].unique()

# TODO: check these
# TODO: how to handle things like "T=4.1 x 4.1 mm"

reproductive_condition = bats_df["reproductivecondition"]

# Patterns are single-line raw strings: a multi-line triple-quoted pattern
# carries its line break and indentation into the regex, which made the
# "no lact." alternative unmatchable. na=False turns missing values into
# False so the results can be used directly as boolean masks.
# "no lact" (without the trailing ".") matches everything the old "no lact."
# wildcard form was able to match.
norep_filter = reproductive_condition.str.contains(
    r"non-reproductive|not lactating|no lact", na=False)

# "1 emb X 1 mm" is already covered by "1 emb", so it is not repeated.
preg_filter = reproductive_condition.str.contains(r"parous|1 emb", na=False)

# Missing values are detected with isna(); literal "nan" / "..." strings are
# matched exactly. As a regex, "nan|...|" matched every string (three
# wildcard characters plus an empty alternative), and a substring search for
# "nan" also hits "pregnant".
# NOTE(review): "..." may only have been numpy's display ellipsis in the
# .unique() output rather than a real value -- confirm against the data.
na_filter = reproductive_condition.isna() | reproductive_condition.isin(["nan", "..."])


array(['not pregnant; not lactating', 'parous', nan, ...,
       '1 emb, 1L X 4 mm', 'T=4.1 x 4.1 mm', '1 emb X 1 mm, no lact.'],
      dtype=object)

Clean sex column 

In [20]:
bats_df["sex"].unique()

# Terms that mark a sex value as undetermined. They are escaped and joined so
# each one is matched literally: unescaped, "f?" means "optional f" and the
# trailing "|" added an empty alternative, both of which match every string.
# The line break inside the old triple-quoted pattern was also part of the
# regex. " in question" is covered by "in question" and is not repeated.
# NOTE(review): these are substring matches, so "x" and "nan" flag any value
# that merely contains them -- confirm whether exact matches were intended.
undetermined_sex_terms = [
    "undetermined",
    "nan",
    "f?",
    "x",
    "in question",
    "not determined",
]
sex_pattern = "|".join(re.escape(term) for term in undetermined_sex_terms)

# The "nan" term suggests missing values were meant to be flagged as well, so
# real NaN entries are included explicitly; na=False keeps the mask boolean.
sex_filter = bats_df["sex"].isna() | bats_df["sex"].str.contains(sex_pattern, na=False)


Adding additional required GEOME columns

In [23]:
# Add the extra GEOME columns in one call; every row gets the same constant
# value, and the columns are appended in the order listed.
bats_df = bats_df.assign(
    samplingProtocol="Unknown",
    measurementMethod="Unknown",
    basisOfRecord="PreservedSpecimen",
)

Rearrange columns so that template columns are first, followed by measurement values

In [None]:
# Specify desired columns: template columns first, measurement columns last.
# (The earlier `cols = bats_df.columns.tolist()` was overwritten immediately
# and has been removed.)
# NOTE(review): both 'verbatimElevation' and 'verbatimelevation', and both
# 'verbatimLocality' and 'locality', are listed here; the rename cell maps the
# second of each pair onto the first, which would produce duplicate column
# names -- confirm which spellings the raw file actually contains.
cols = ['catalogNumber',
        'collectionCode',
        'coordinateuncertaintyinmeters',
        'dynamicproperties',
        'decimalLatitude',
        'decimalLongitude',
        'verbatimElevation',
        'institutionCode',
        'verbatimEventDate',
        'verbatimelevation',
        'verbatimLocality',
        'maximumElevationInMeters',
        'minimumElevationInMeters',
        'reproductivecondition',
        'locality',
        # The comma after 'fieldnotes' was missing, so Python concatenated it
        # with the next literal into the single name 'fieldnotesscientificName'.
        'fieldnotes',
        'scientificName',
        'samplingProtocol',
        'occurrenceid',
        'measurementMethod',
        'country',
        'sex',
        'lifestage_cor',
        'basisOfRecord',
        'yearCollected',
        'body_mass.1.value',
        'ear_length.1.value',
        'hind_foot_length.1.value',
        'tail_length.1.value',
        'total_length.1.value']

# Subset dataframe (also fixes the column order)
bats_df = bats_df[cols]

Matching template and column terms

In [26]:
# Map the current column names onto the template's names, then apply the
# whole mapping in one rename. Keys that are not present in the dataframe are
# skipped by rename.
# NOTE(review): the 'maximumelevationinmeters' / 'minimumelevationinmeters'
# keys are lowercase, while the column-subset cell selects the camelCase
# spellings -- these two entries may never apply.
# NOTE(review): 'OccurenceID' is spelled with a single "r"; confirm the exact
# name the template expects.
template_column_names = {
    'coordinateuncertaintyinmeters': 'coordinateUncertaintyInMeters',
    'fieldnotes': 'eventRemarks',
    'occurrenceid': 'OccurenceID',
    'verbatimelevation': 'verbatimElevation',
    'reproductivecondition': 'reproductiveCondition',
    'maximumelevationinmeters': 'maximumElevationInMeters',
    'dynamicproperties': 'dynamicProperties',
    'minimumelevationinmeters': 'minimumElevationInMeters',
    'locality': 'verbatimLocality',
    'lifestage_cor': 'lifeStage',
}
bats_df = bats_df.rename(columns=template_column_names)

Matching trait and ontology terms

In [None]:
# Map each measurement column onto its trait/ontology term and rename them
# all at once.
trait_column_names = {
    'body_mass.1.value': 'body mass',
    'ear_length.1.value': 'ear length to notch',
    'hind_foot_length.1.value': 'pes length',
    'tail_length.1.value': 'tail length',
    'total_length.1.value': 'body length',
}
bats_df = bats_df.rename(columns=trait_column_names)

Create materialSampleID, which is a UUID for each record (row) of the table; it is assigned before reshaping, so all measurements of a record share it

In [None]:
# One random UUID (32-character hex string) per row. The values differ on
# every run because uuid4 is random.
sample_ids = [uuid.uuid4().hex for _ in range(len(bats_df.index))]
bats_df = bats_df.assign(materialSampleID=sample_ids)

Create eventID and populate it with materialSampleID

In [None]:
# eventID is a copy of materialSampleID, so each row's event identifier is
# the same string as its sample identifier.
bats_df = bats_df.assign(eventID = bats_df["materialSampleID"])

Add required GEOME column locality after reassigning locality to verbatimLocality

In [None]:
# The original 'locality' column was renamed to 'verbatimLocality' in the
# template rename cell; this adds a fresh 'locality' column holding the
# constant "Unknown" for every row.
bats_df = bats_df.assign(locality="Unknown")

Creating the long version: first specifying the keep (id) variables, then naming the type and value columns

In [None]:
      # Columns that hold measurements, under the names given by the
      # trait/ontology rename cell. Passing them as value_vars stops melt from
      # treating every other non-id column (e.g. dynamicProperties,
      # reproductiveCondition) as a measurement type.
      # NOTE(review): columns that are neither id_vars nor traits are dropped
      # from the long table -- add them to id_vars if they should be kept.
      trait_columns = ['body mass',
                       'ear length to notch',
                       'pes length',
                       'tail length',
                       'body length']

      # One output row per (record, trait); id_vars are repeated on each row.
      longVers=pd.melt(bats_df,
                      id_vars=['catalogNumber',
                      'collectionCode',
                      'decimalLatitude',
                      'decimalLongitude',
                      'maximumElevationInMeters',
                      'minimumElevationInMeters',
                      'verbatimElevation',
                      'yearCollected',
                      'basisOfRecord',
                      'verbatimEventDate',
                      'institutionCode',
                      'lifeStage',
                      'verbatimLocality',
                      'locality',
                      'samplingProtocol',
                      'measurementMethod',
                      'country',
                      'sex',
                      'scientificName',
                      'materialSampleID',
                      'eventID'],
                value_vars = trait_columns,
                var_name = 'measurementType',
                value_name = 'measurementValue')

Populating measurementUnit column with appropriate measurement units in long version

In [None]:
# Create measurementUnit column
longVers = longVers.assign(measurementUnit="")

#Create filters
long_body_mass_filter = longVers['measurementType'] == "body mass"
long_no_body_filter = longVers['measurementType'] != "body mass"

#Assign units using filters: body mass is recorded in "g", every other
# measurement type in "mm".
# .loc writes straight into longVers. The chained form df[col][mask] = value
# assigns into an intermediate object, which pandas does not guarantee to
# propagate back to the dataframe (and does not under copy-on-write).
longVers.loc[long_body_mass_filter, 'measurementUnit'] = "g"
longVers.loc[long_no_body_filter, 'measurementUnit'] = "mm"

Create diagnosticID which is a unique number for each measurement

In [None]:
# Number the rows of the long table 0..n-1 so each measurement row gets its
# own integer identifier.
row_numbers = np.arange(len(longVers))
longVers = longVers.assign(diagnosticID=row_numbers)

If the measurement value is N/A, delete the entire row. Drop range values (TODO: range values are not yet handled in the cell below).

In [None]:
#Drop N/A
# Removes every long-format row whose measurementValue is missing.
# diagnosticID was assigned before this step, so the remaining IDs stay
# unique but may no longer be contiguous.
longVers = longVers.dropna(subset=['measurementValue'])