Data Wrangling Notebook for VertNet Bats Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [85]:
import pandas as pd
import numpy as np
import multiprocessing
import re
import uuid 

Silencing warnings that are unnecessary

In [86]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import Vertnet Bats Data

In [87]:
df = pd.read_csv("./../Original_Data/bats_2020-08-11b.csv")

Clean up lifeStage

In [88]:
# Fill in NA
df["lifestage_cor"] = df['lifestage_cor'].fillna("Not Collected")

# Create Filters
adult = df['lifestage_cor']=="Adult"
juvenile = df['lifestage_cor']=="Juvenile"
ns = df['lifestage_cor']=="NS"

# Assign correct terms using filters
df['lifestage_cor'][adult] = "adult"
df['lifestage_cor'][juvenile] = "juvenile"
df['lifestage_cor'][ns] = "Not Collected"

Clean up country column TODO: Need to run with GEOME Validation

In [89]:
#Read GEOME country list
geome_countries = pd.read_csv("./../Mapping Files/geome_country_list.csv")

# Create dictionary after inital GEOME run (create csv file first as a dictionary)
df["country"].unique()

def country_correction(country): 
    """Corrects country column to geome specific country list"""
    if country in geome_countries.values:
        return country
    elif country in country_dictionary.keys():
        return country_dictionary[country]
    else:
        country = "Unknown"
        return country 

#df['country'] = df['country'].apply(country_correction)

Create yearCollected column TODO: Create varbatim filter after validation

In [90]:
# Filling N/As with "Unknown"
df["eventdate"] = df["eventdate"].fillna("Unknown")

# Create yearCollected Column
#df = df.assign(yearCollected = '')

# Creating event date variable
#verbatim_date = df['eventdate']

# Establishing vertnet filter
#vertnet_date_filter = verbatim_date.str.contains("""""")

# Grabbing clean data
#verbatim_date_clean= verbatim_date[vertnet_date_filter==False]


# Captures year within string
def year_search(year):
    """Search string for 4 digit number and pass to correct function"""
    if (re.search(r'\d{4}$', year)):
        return year_cleaner_front(year)
    elif (re.search(r'^\d{4}', year)):
        return year_cleaner_back(year)

def year_cleaner_front(year):
    """Isolate the year at the beginning of the string"""
    cleaned_year = year[len(year)-4:len(year)]
    return cleaned_year

def year_cleaner_back(year):
    """Isolate the year at the end of the string"""
    cleaned_year = year[0:4]
    return cleaned_year

df["yearCollected"] = df["eventdate"].apply(year_search)
df["yearCollected"] = df["yearCollected"].fillna("Unknown")

df = df.assign(verbatimEventDate = df['verbatimeventdate'])

Cleaning reproductivecondition column

In [91]:
df["reproductivecondition"].unique()

# TODO: how to handle things like "T=4.1 x 4.1 mm"

norep_filter = df["reproductivecondition"].str.contains("""non-reproductive|not lactating|no lact.""")
preg_filter = df["reproductivecondition"].str.contains("""parous|1 emb|1 emb X 1 mm""")

df["reproductivecondition"][norep_filter == True] = "non-reproductive"
df["reproductivecondition"][preg_filter == True] = "pregnant"
df["reproductivecondition"][(preg_filter == False) & (norep_filter == False)] = ""

Clean sex column 

In [92]:
# Clean up sex column 
female = df['sex'] == "female"
male = df['sex'] == "male"
df['sex'][(female == False) & (male==False)] = ""

Adding additional required GEOME columns

In [93]:
df = df.assign(samplingProtocol = "Unknown")
df = df.assign(basisOfRecord = "PreservedSpecimen")

Correcting Country Names

In [None]:
#Append countries to verbatim locality column
df["locality"] = df["locality"] + "," + df["country"]

#Read GEOME country list
geome_countries = pd.read_csv("./../Mapping Files/geome_country_list.csv")

country_dictionary = {"U S A":"USA", "United States":"USA",
                      "India, Nepal":"India",
                      "Philippine Islands":"Philippines",
                      "U.S. Virgin Islands":"Virgin Islands",
                      "Republic of South Africa":"South Africa",
                      "Commonwealth of the Northern Mariana Islands":"Northern Mariana Islands",
                      "Federated States of Micronesia":"Micronesia",
                      "ST VINCENT":"Saint Vincent and the Grenadines",
                      "ENGLAND":"United Kingdom",
                      "Trinidad & Tabago":"Trinidad and Tobago",
                      "TRINIDAD & TOBAGO":"Trinidad and Tobago",
                      "São Tomé & Principe":"Sao Tome and Principe"}
                      
                      
# 	Unapproved value(s):
#"ZOO" in column "country" not in list "country"
#"Saint Barthélemy" in column "country" not in list "country"
#"Trinidad" in column "country" not in list "country"
#"Rhodesia" in column "country" not in list "country"
#"Bonaire, Sint Eustatius and Saba" in column "country" not in list "country"
          

def country_correction(country): 
    """Corrects country column to geome specific country list"""
    if country in geome_countries.values:
        return country
    elif country in country_dictionary.keys():
        return country_dictionary[country]
    else:
        country = "Unknown"
        return country 

df['country'] = df['country'].apply(country_correction)

Rearrange columns so that template columns are first, followed by measurement values

In [94]:
# Create column list
cols = df.columns.tolist()

# Specify desired columns
cols = ['catalognumber',
        'collectioncode',
        'coordinateuncertaintyinmeters',
        'dynamicproperties',
        'decimallatitude',
        'decimallongitude',
        'verbatimlongitude',
        'verbatimlatitude',
        'verbatimelevation',
        'institutioncode',
        'verbatimEventDate',
        'verbatimelevation',
        'verbatimlocality',
        'maximumelevationinmeters',
        'minimumelevationinmeters',
        'reproductivecondition',
        'locality',
        'fieldnotes',
        'binomial',
        'samplingProtocol',
        'occurrenceid',
        'occurrenceremarks',
        'country',
        'sex',
        'lifestage_cor',
        'basisOfRecord',
        'yearCollected',
        'body_mass.1.value',
        'ear_length.1.value',
        'hind_foot_length.1.value',
        'tail_length.1.value',
        'total_length.1.value',
        'body_mass.1.units_inferred',
        'ear_length.1.units_inferred',
        'hind_foot_length.1.units_inferred',
        'tail_length.1.units_inferred',
        'total_length.1.units_inferred',
        'body_mass.1.estimated_value',
        'ear_length.1.estimated_value',
        'hind_foot_length.1.estimated_value',
        'tail_length.1.estimated_value',
        'total_length.1.estimated_value']

# Subset dataframe
df = df[cols]

Matching template and column terms

In [95]:
# Renaming columns 
df = df.rename(columns = {'catalognumber':'catalogNumber',
                          'collectioncode': 'collectionCode',
                          'institutioncode':'institutionCode',
                          'coordinateuncertaintyinmeters':'coordinateUncertaintyInMeters',
                          'fieldnotes':'eventRemarks',
                          'decimallongitude': 'decimalLongitude',
                          'decimallatitude':'decimalLatitude',
                          'occurrenceid':'occurrenceID',
                          'occurrenceremarks':'occurrenceRemarks',
                          'verbatimelevation':'verbatimElevation',
                          'binomial':'scientificName',
                          'reproductivecondition':'reproductiveCondition',
                          'maximumelevationinmeters':'maximumElevationInMeters',
                          'dynamicproperties':'dynamicProperties',
                          'minimumelevationinmeters':'minimumElevationInMeters',
                          'verbatimlocality':'verbatimLocality',
                          'verbatimlongitude':'verbatimLongitude',
                          'verbatimlatitude':'verbatimLatitude',
                          'lifestage_cor':'lifeStage'})

Create materialSampleID which is a UUID for each measurement

In [96]:
df = df.assign(materialSampleID = '')
df['materialSampleID'] = [uuid.uuid4().hex for _ in range(len(df.index))]

Create eventID and populate it with materialSampleID

In [97]:
df = df.assign(eventID = df["materialSampleID"])

Add required GEOME column locality after reassigning locality to verbatimLocality

In [98]:
df = df.assign(locality="Unknown")

In [99]:
len(df["verbatimElevation"])

74678

In [100]:
len(df["catalogNumber"])

74678

Creating long version, first specifiying keep variables, then naming type and value

In [75]:
melt_cols = ['catalogNumber', 'collectionCode', 'decimalLatitude','decimalLongitude',
             'yearCollected','basisOfRecord','verbatimEventDate',
            'institutionCode','lifeStage','verbatimLocality','locality', 'eventRemarks',
            'samplingProtocol','country','sex','scientificName','dynamicProperties',
            'materialSampleID','eventID','maximumElevationInMeters', 'verbatimLongitude',
            'minimumElevationInMeters','coordinateUncertaintyInMeters','verbatimLatitude',
            'occurrenceID','occurrenceRemarks','reproductiveCondition']

melt_cols = melt_cols + method_list

melt_cols

df_long = pd.melt(df,id_vars = melt_cols, var_name = 'measurementType', value_name = 'measurementValue')

#verbatim Elevation causes some errors (array not the same length)

# longVers=pd.melt(df,id_vars=['catalogNumber',
#                          'collectionCode',
#                           'decimalLatitude',
#                           'decimalLongitude',
#                           'verbatimElevation',
#                           'yearCollected',
#                           'basisOfRecord',
#                           'verbatimEventDate',
#                           'institutionCode',
#                           'lifeStage',
#                           'verbatimLocality',
#                           'locality',
#                           'eventRemarks',
#                           'samplingProtocol',
#                           'country',
#                           'sex',
#                           'scientificName',
#                           'dynamicProperties',
#                           'materialSampleID',
#                           'maximumElevationInMeters',
#                           'minimumElevationInMeters',
#                           'coordinateUncertaintyInMeters',
#                           'occurrenceRemarks',
#                           'reproductiveCondition',
#                           'occurrenceID',
#                           'verbatimLongitude',
#                           'verbatimLatitude',
#                          'eventID'], 
#                 var_name = 'measurementType', 
#                 value_name = 'measurementValue')

Pull corresponding column value in measurement_method etc and append it to offical measurementMethod

In [76]:
df_long = df_long.assign(measurementMethod = "")

def method_add(trait,ind):
    if trait == "body_mass.1.value":
        return df_long["measurementMethod_body_mass.1"][ind]
    elif trait == "ear_length.1.value":
        return df_long["measurementMethod_ear_length.1"][ind]
    elif trait == "hind_foot_length.1.value":
        return df_long["measurementMethod_hind_foot_length.1"][ind]
    elif trait == "tail_length.1.value":
        return df_long["measurementMethod_tail_length.1"][ind]
    elif trait == "total_length.1.value":
        return df_long["measurementMethod_total_length.1"][ind]

df_long['ind'] = np.arange(len(df_long))

df_long['measurementMethod'] = df_long.apply(lambda x: method_add(x.measurementType, x.ind), axis=1)

df_long['measurementMethod'] = df_long['measurementMethod'].fillna("Extracted with Traiter")

df_long = df_long.drop(columns = method_list)

Matching trait and ontology terms

In [77]:
# Create trait dictionary 
trait_dict = {'body_mass.1.value':'body mass',
              'ear_length.1.value': 'ear length to notch',
              'hind_foot_length.1.value':'pes length',
              'tail_length.1.value':'tail length',
              'total_length.1.value':'body length'}

def trait_rename(trait): 
    """
    Renames trait names with trait dictionary
    """
    if trait in trait_dict.keys():
        return trait_dict[trait]

df_long['measurementType'] = df_long['measurementType'].apply(trait_rename)

Populating measurementUnit column with appropriate measurement units in long version

In [78]:
# Create measurementUnit column
df_long = df_long.assign(measurementUnit="")

#Create filters
long_body_mass_filter = df_long['measurementType']=="body mass"
long_no_body_filter = df_long['measurementType']!="body mass"

#Assign units using filters
df_long['measurementUnit'][long_body_mass_filter] = "g"
df_long['measurementUnit'][long_no_body_filter] = "mm"

Create diagnosticID which is a unique number for each measurement

In [79]:
df_long = df_long.assign(diagnosticID = '')
df_long['diagnosticID'] = np.arange(len(df_long))

If measurement value equals N/A, delete entire row. Drop range values. 

In [80]:
#Drop N/A
df_long = df_long.dropna(subset=['measurementValue'])

In [81]:
len(df_long)

233339

Breaking up the data into more managable sizes for validation and DE storage

In [82]:
# Create chunks list
chunks = []

# Separating files into chunks of ~50,000
chunks = np.array_split(df_long, 4)

Creating data chunks

In [83]:
for i in range(len(chunks)):
    new=i+1
    chunks[i].to_csv('../Mapped_Data/FuTRES_Bats_VertNet_Global_Modern_'+ str(new) +'.csv', index=False)
    print("mapped_data",i, " done")

mapped_data 0  done
mapped_data 1  done
mapped_data 2  done
mapped_data 3  done


In [84]:
df_long

Unnamed: 0,catalogNumber,collectionCode,decimalLatitude,decimalLongitude,yearCollected,basisOfRecord,verbatimEventDate,institutionCode,lifeStage,verbatimLocality,...,verbatimLatitude,occurrenceID,occurrenceRemarks,reproductiveCondition,measurementType,measurementValue,measurementMethod,ind,measurementUnit,diagnosticID
18,18687,Mammals,7.12972,-73.12583,Unknown,PreservedSpecimen,6-Sep-60,LACM,Not Collected,Colombia | Santander Dept | Bucaramanga,...,07 07 47 N,b98b7c7d-92e8-4cf4-a873-b5630b2cc708,,,,1018,Extracted with Traiter,18,mm,18
29,2335,Mammals,,,Unknown,PreservedSpecimen,1/28/67,OMNH,Not Collected,,...,,urn:catalog:OMNH:Mammals:2335,,,,800 ft,Extracted with Traiter,29,mm,29
30,18914,Mammals,,,Unknown,PreservedSpecimen,1/1/76,OMNH,Not Collected,,...,,urn:catalog:OMNH:Mammals:18914,,,,1600 m,Extracted with Traiter,30,mm,30
441,70886,Mammals,,,Unknown,PreservedSpecimen,6-May-75,LACM,Not Collected,Mexico | Colima | Cerro Grande,...,,2752cee3-6c8b-479c-9dd4-cf56bd548459,,,,2377,Extracted with Traiter,441,mm,441
442,56081,Mammals,,,Unknown,PreservedSpecimen,7-May-75,LACM,Not Collected,Mexico | Colima | Cerro Grande,...,,7fd7ad74-f6e2-4742-b16d-beb6c19b5e32,,,,2377,Extracted with Traiter,442,mm,442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522733,267,Mamíferos,15.10140,-90.31860,Unknown,PreservedSpecimen,24-Jun-78,USAC,Not Collected,"GUATEMALA | Baja Verapaz | Salamá | 0.6 km N, ...",...,15.1014°N,urn:catalog:USAC:Mamíferos:267,,,body length,61.0,Extracted with Traiter,522733,mm,522733
522734,1541,Mamíferos,15.40530,-91.96250,Unknown,PreservedSpecimen,25-Aug-03,USAC,Not Collected,GUATEMALA | Huehuetenango | Cuilco | Sosí Chiq...,...,15.4053°N,urn:catalog:USAC:Mamíferos:1541,,,body length,94.0,Extracted with Traiter,522734,mm,522734
522735,3345,Mamíferos,17.69640,-89.53810,Unknown,PreservedSpecimen,15-Jun-04,USAC,Not Collected,GUATEMALA | Petén | Flores | Biotopo Dos Lagun...,...,17.6964°N,urn:catalog:USAC:Mamíferos:3345,,,body length,63.0,Extracted with Traiter,522735,mm,522735
522736,2943,Mamíferos,17.69640,-89.53810,Unknown,PreservedSpecimen,15-Jun-04,USAC,Not Collected,GUATEMALA | Petén | Flores | Biotopo Dos Lagun...,...,17.6964°N,urn:catalog:USAC:Mamíferos:2943,,,body length,63.0,Extracted with Traiter,522736,mm,522736
