Data Wrangling Notebook for VertNet Mammal Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [179]:
import pandas as pd
import numpy as np
import multiprocessing
import re
import uuid 

Silencing warnings that are unnecessary

In [180]:
# Suppress every warning for the rest of the session so cell output stays readable.
# `warnings` is part of the standard library, so the import cannot reasonably fail;
# the previous bare `except: pass` could only hide a genuine error.
# NOTE: this also hides pandas deprecation warnings - disable it while debugging.
import warnings

warnings.filterwarnings('ignore')

Import VertNet Mammal Data

In [181]:
# Load the raw VertNet mammal records; the path is relative to the notebook's
# working directory.
source_csv = "./../Original_Data/no_bats_2020-08-12b.csv"
mammal = pd.read_csv(source_csv)

Clean yearCollected column

In [182]:
# Replace missing event dates with "Unknown" so the string operations below
# only ever see strings.
mammal["eventdate"] = mammal["eventdate"].fillna("Unknown")

# Create yearCollected column (filled in after the year helpers are defined)
mammal = mammal.assign(yearCollected = '')

# Event date strings to be screened
verbatim_date = mammal['eventdate']

# Regex alternatives that mark an event date as unusable for year extraction.
# They are kept as a list and joined with "|" on purpose: the previous
# triple-quoted pattern spanned several source lines, so the line breaks and
# indentation became part of the alternatives `and`, `#`, `\d{4}'[S]` and
# `1260`, which therefore could never match. Raw strings are used for the
# regex escapes, and the former `(MCZ)` / `(MSU)` / `(/n)` groups are written
# without parentheses (same matches, no "match groups" warning).
# NOTE(review): "/n /d" is kept literally from the original pattern; it may
# have been meant as an escape sequence - confirm.
date_reject_patterns = [
    "IV", "0000", "September", "<", "NW", "latter", "unknown", "MCZ", "MSU",
    "present", "and", ";", "&", "mainly", "between", "Between", "BETWEEN",
    "OR", "Unknown", "UNKNOWN", "#", "TO", r"\?", "'", "----", "19--",
    "No Date", ",", r"\d{4}-\d{4}", "/n /d", r"\d{4}[s]", r"\d{4}'[S]",
    "1075-07-29", "975-07-17", "2088", "9999", "0201", "1197", "1260",
    "4560", "1024", "1119", "1192", "1072", "1186",
]

# True where the event date matches at least one reject pattern
vertnet_date_filter = verbatim_date.str.contains("|".join(date_reject_patterns))

# Keep only the dates that matched none of the reject patterns
verbatim_date_clean = verbatim_date[~vertnet_date_filter]


# Captures year within string
def year_search(year):
    """Return the 4-digit year found at either end of a date string.

    A string that ends in four digits is passed to year_cleaner_front; failing
    that, a string that starts with four digits is passed to year_cleaner_back.
    Returns None when neither end of the string is a run of four digits.
    """
    ends_with_year = re.search(r'\d{4}$', year) is not None
    if ends_with_year:
        return year_cleaner_front(year)

    starts_with_year = re.search(r'^\d{4}', year) is not None
    if starts_with_year:
        return year_cleaner_back(year)

    return None

def year_cleaner_front(year):
    """Return the last four characters of ``year``.

    Despite the name, this is the helper year_search uses when the year sits
    at the END of the string (e.g. "12/05/1998" -> "1998").
    """
    start = len(year) - 4
    return year[start:]

def year_cleaner_back(year):
    """Return the first four characters of ``year``.

    Despite the name, this is the helper year_search uses when the year sits
    at the START of the string (e.g. "1998-05-12" -> "1998").
    """
    return year[:4]

# Extract a year from every date that passed the filter. Rows whose date was
# filtered out are absent from verbatim_date_clean, so index alignment leaves
# them missing; those (and dates with no year at either end) become "Unknown".
extracted_years = verbatim_date_clean.apply(year_search)
mammal["yearCollected"] = extracted_years
mammal["yearCollected"] = mammal["yearCollected"].fillna("Unknown")


Clean up lifeStage

In [183]:
# Missing life stages are recorded as "Not Collected"
mammal["lifestage_cor"] = mammal['lifestage_cor'].fillna("Not Collected")

# Boolean masks for the source spellings that need to be re-coded
adult = mammal['lifestage_cor']=="Adult"
juvenile = mammal['lifestage_cor']=="Juvenile"
ns = mammal['lifestage_cor']=="NS"

# Re-code through .loc: the previous chained form `mammal[col][mask] = value`
# writes into an intermediate object and is not guaranteed to update `mammal`
# (it raises SettingWithCopyWarning and is a no-op under pandas Copy-on-Write).
mammal.loc[adult, 'lifestage_cor'] = "adult"
mammal.loc[juvenile, 'lifestage_cor'] = "juvenile"
mammal.loc[ns, 'lifestage_cor'] = "Not Collected"

Clean up sex column

In [184]:
# Keep only the values "female" and "male"; every other value (including
# missing ones) is blanked to an empty string.
female = mammal['sex']=="female"
male = mammal['sex']=="male"

# Single .loc assignment instead of chained indexing (`mammal['sex'][mask] = ""`),
# which is not guaranteed to write back to `mammal`.
mammal.loc[~(female | male), 'sex'] = ""

Fill missing scientific names with "Unknown"

In [185]:
# Records without a scientific name are labelled "Unknown"
binomial_filled = mammal["binomial"].fillna("Unknown")
mammal["binomial"] = binomial_filled

Adding additional required GEOME columns

In [186]:
# Constant-valued columns required by the GEOME template
mammal = mammal.assign(
    samplingProtocol="Unknown",
    basisOfRecord="PreservedSpecimen",
)

Create verbatimEventDate column

In [187]:
# verbatimEventDate carries the event date strings as they currently stand
# (missing values were already replaced with "Unknown" earlier in the notebook)
mammal = mammal.assign(verbatimEventDate=mammal['eventdate'])

Clean up country column 

In [188]:
# Append the (still uncorrected) country to the locality text, which later
# becomes verbatimLocality.
locality_with_country = mammal["locality"] + "," + mammal["country"]

# String concatenation yields NaN when either side is NaN. Fall back to the
# original locality so that a missing country no longer erases a locality that
# was present. Rows whose locality itself is missing stay missing.
mammal["locality"] = locality_with_country.fillna(mammal["locality"])

# Read the list of country names accepted by GEOME
geome_countries = pd.read_csv("./../Mapping Files/geome_country_list.csv")

# Maps country spellings found in the source data to the name used in the
# GEOME country list. Keys must match the source values exactly (including
# their capitalisation and spelling), so they are not normalised here.
country_dictionary = {
    "United States": "USA",
    "U S A": "USA",
    "Philippine Islands": "Philippines",
    "Indonesia; Borneo": "Indonesia",
    "Malaysia; Malaya": "Malaysia",
    "U.S. Virgin Islands": "Virgin Islands",
    "Republic of South Africa": "South Africa",
    "Ivory Coast": "Cote d'Ivoire",
    "Federated States of Micronesia": "Micronesia",
    "Lesser Antilles; Grenada": "Grenada",
    "Indonesia; Java": "Indonesia",
    "Lesser Antilles; Saint Vincent": "Saint Vincent and the Grenadines",
    "Lesser Antilles; Barbados": "Barbados",
    "ST VINCENT": "Saint Vincent and the Grenadines",
    "Lesser Antilles; Montserrat": "Montserrat",
    "Indonesia; Sumatra": "Indonesia",
    "Virgin Islands, US": "Virgin Islands",
    "Lesser Antilles; Antigua": "Antigua and Barbuda",
    "England": "United Kingdom",
    "Republic of Trinidad and Tobago": "Trinidad and Tobago",
    "Trinidad And Tobago; Trinidad": "Trinidad and Tobago",
    "COMMONWEALTH OF THE NORTHERN MARIANA ISLANDS": "Northern Mariana Islands",
    "Congo": "Democratic Republic of the Congo",
    "Malaysia; Sabah": "Malaysia",
    "Lesser Antilles; Martinique": "Martinique",
    "Republic of the Marshall Islands": "Marshall Islands",
    "Commonwealth of the Bahamas": "Bahamas",
    "Trinidad & Tabago": "Trinidad and Tobago",
    "United Kingdom; England": "United Kingdom",
    "United Kingdom; Scotland": "United Kingdom",
    "United Kingdom; Wales": "United Kingdom",
    "Lesser Antilles; Dominica": "Dominica",
    "Papua, New Guinea": "Papua New Guinea",
    "People's Republic of China": "China",
    "SCOTLAND": "United Kingdom",
}

def country_correction(country):
    """Map a source country value onto the GEOME country list.

    Values already present anywhere in ``geome_countries`` are returned
    unchanged; otherwise the value is translated through
    ``country_dictionary``; anything found in neither becomes "Unknown".
    """
    already_valid = country in geome_countries.values
    if already_valid:
        return country

    # Known alternate spelling -> GEOME name; unrecognised values -> "Unknown"
    return country_dictionary.get(country, "Unknown")

# Replace each country with its GEOME-compatible name (or "Unknown")
corrected_country = mammal['country'].apply(country_correction)
mammal['country'] = corrected_country

Create verbatimElevation Column

In [189]:
# verbatimElevation is "<max>,<min>" built from the string form of the two
# elevation columns (missing values therefore appear as the text "nan").
string_max = mammal["maximumelevationinmeters"].astype(str)
string_min = mammal["minimumelevationinmeters"].astype(str)
mammal['verbatimElevation'] = string_max.str.cat(string_min, sep=",")

Rearrange columns so that template columns are first, followed by measurement values

In [191]:
# Columns to keep, in output order: template columns first, then the trait
# values, then the per-trait "units inferred" / "estimated value" flags.
# (The former `cols = mammal.columns.tolist()` was a dead store - it was
# overwritten by this list before being used - and has been removed.)
# NOTE(review): 'occurenceremarks' / 'occurenceid' are spelled with a single
# "r"; presumably this matches the CSV headers - confirm.
cols = ['catalognumber',
        'collectioncode',
        'decimallatitude',
        'decimallongitude',
        'verbatimElevation',
        'institutioncode',
        'verbatimEventDate',
        'occurenceremarks',
        'occurenceid',
        'verbatimlongitude',
        'verbatimlatitude',
        'locality',
        'samplingProtocol',
        'country',
        'sex',
        'lifestage_cor',
        'binomial',
        'basisOfRecord',
        'yearCollected',
        'body_mass.1.value',
        'ear_length.1.value',
        'hind_foot_length.1.value',
        'tail_length.1.value',
        'total_length.1.value', 
        'body_mass.1.units_inferred',
        'ear_length.1.units_inferred',
        'hind_foot_length.1.units_inferred',
        'tail_length.1.units_inferred',
        'total_length.1.units_inferred',
        'body_mass.1.estimated_value',
        'ear_length.1.estimated_value',
        'hind_foot_length.1.estimated_value',
        'tail_length.1.estimated_value',
        'total_length.1.estimated_value']

# Subset (and reorder) the dataframe; every other column is discarded here
mammal = mammal[cols]

Matching template and column terms

In [192]:
# Source column name -> template column name. Keys that are not present in
# the frame are ignored by DataFrame.rename.
template_column_names = {
    'catalognumber': 'catalogNumber',
    'collectioncode': 'collectionCode',
    'decimallatitude': 'decimalLatitude',
    'decimallongitude': 'decimalLongitude',
    'maximumelevationinmeters': 'maximumElevationInMeters',
    'minimumelevationinmeters': 'minimumElevationInMeters',
    'institutioncode': 'institutionCode',
    'occurenceremarks': 'occurenceRemarks',
    'occurenceid': 'occurenceID',
    'verbatimlongitude': 'verbatimLongitude',
    'verbatimlatitude': 'verbatimLatitude',
    'locality': 'verbatimLocality',
    'lifestage_cor': 'lifeStage',
    'binomial': 'scientificName',
}

mammal = mammal.rename(columns=template_column_names)

Create materialSampleID, a UUID for each specimen record (assigned before the data is reshaped to long form, so all measurements of one record share it)

In [193]:
# One random UUID (32 hex characters) per row of the wide table
sample_ids = [uuid.uuid4().hex for _ in range(len(mammal.index))]
mammal = mammal.assign(materialSampleID=sample_ids)

Create eventID and populate it with materialSampleID

In [194]:
# eventID is a copy of materialSampleID
event_ids = mammal["materialSampleID"]
mammal = mammal.assign(eventID=event_ids)

Add required GEOME column locality after reassigning locality to verbatimLocality

In [195]:
# The original locality text now lives in verbatimLocality, so the required
# `locality` column is re-created with a constant placeholder.
mammal = mammal.assign(locality="Unknown")

Creates a unique measurementMethod column for each desired trait

In [196]:
# Traits to export; each has "<trait>.value", "<trait>.units_inferred" and
# "<trait>.estimated_value" columns in the wide table.
trait_name_list = [
    "body_mass.1",
    "ear_length.1",
    "hind_foot_length.1",
    "tail_length.1",
    "total_length.1",
]

# One (initially empty) measurementMethod_<trait> column per trait
method_list = ["measurementMethod_{}".format(trait) for trait in trait_name_list]
empty_method_columns = pd.DataFrame(index=mammal.index, columns=method_list)
mammal = mammal.join(empty_method_columns)

def trait_method(trait):
    """
    Fill the measurementMethod_<trait> column of the global ``mammal`` frame.

    A row is flagged when the string form of its "<trait>.units_inferred" or
    "<trait>.estimated_value" cell contains TRUE/True/true. Rows flagged as
    inferred, estimated, or both receive the matching "Extracted with Traiter"
    text; unflagged rows are left untouched.

    Modifies ``mammal`` in place and returns None.
    """

    column = "measurementMethod_" + trait

    inferred_column = trait + ".units_inferred"
    estimated_column = trait + ".estimated_value"

    inferred_filter = mammal[inferred_column].astype(str).str.contains("TRUE|True|true")
    estimated_filter = mammal[estimated_column].astype(str).str.contains("TRUE|True|true")

    # Write through .loc: chained indexing (`mammal[column][mask] = ...`) is
    # not guaranteed to update `mammal` and does nothing under Copy-on-Write.
    # Order matters - the combined case is written last so it wins for rows
    # that are both estimated and inferred.
    mammal.loc[inferred_filter, column] = "Extracted with Traiter ; inferred value"
    mammal.loc[estimated_filter, column] = "Extracted with Traiter ; estimated value"
    mammal.loc[estimated_filter & inferred_filter, column] = "Extracted with Traiter ; estimated value; inferred value"

# Populate the measurementMethod_<trait> column for every trait
for trait in trait_name_list:
    trait_method(trait)

# The flag columns have served their purpose; remove them from the table
inferred_flag_columns = [trait + ".units_inferred" for trait in trait_name_list]
estimated_flag_columns = [trait + ".estimated_value" for trait in trait_name_list]
mammal = mammal.drop(columns=inferred_flag_columns + estimated_flag_columns)

Creating the long version: first specifying the columns to keep, then naming the measurement type and value columns

In [197]:
# Identifier columns repeated on every measurement row
melt_cols = ['catalogNumber', 'collectionCode', 'decimalLatitude','decimalLongitude',
            'verbatimElevation','yearCollected','basisOfRecord','verbatimEventDate',
            'institutionCode','lifeStage','verbatimLocality','locality',
            'samplingProtocol','country','sex','scientificName',
            'materialSampleID','eventID']

# The per-trait method columns are carried along so the right one can be
# picked for each measurement row afterwards
melt_cols = melt_cols + method_list

# Only the trait value columns are measurements. When value_vars is omitted,
# pd.melt unpivots EVERY column that is not in id_vars, which also turned
# occurenceRemarks, occurenceID, verbatimLongitude and verbatimLatitude into
# bogus "measurement" rows.
# NOTE(review): those four columns are not part of the long table (they never
# were columns of it); add them to melt_cols if they should be retained.
value_cols = [trait + ".value" for trait in trait_name_list]

longVersMammal = pd.melt(mammal,
                         id_vars = melt_cols,
                         value_vars = value_cols,
                         var_name = 'measurementType',
                         value_name = 'measurementValue')

In [198]:
# Sanity check: distinct method strings carried over for body mass
pd.unique(longVersMammal["measurementMethod_body_mass.1"])

array([nan, 'Extracted with Traiter ; inferred value',
       'Extracted with Traiter ; estimated value',
       'Extracted with Traiter ; estimated value; inferred value'],
      dtype=object)

For each row, pull the value from the measurementMethod_&lt;trait&gt; column that matches its measurement type and store it in the official measurementMethod column

In [200]:
# Start with an empty measurementMethod column; it is filled in below
longVersMammal = longVersMammal.assign(measurementMethod="")

def method_add(trait, ind):
    """Look up the method text for one measurement row.

    ``trait`` is the row's measurementType (e.g. "body_mass.1.value") and
    ``ind`` the label used to index the matching measurementMethod_<trait>
    column of the global ``longVersMammal`` frame. Returns None when ``trait``
    is not one of the five known value columns.
    """
    known_value_columns = (
        "body_mass.1.value",
        "ear_length.1.value",
        "hind_foot_length.1.value",
        "tail_length.1.value",
        "total_length.1.value",
    )
    if trait in known_value_columns:
        # "body_mass.1.value" -> "measurementMethod_body_mass.1"
        method_column = "measurementMethod_" + trait[:-len(".value")]
        return longVersMammal[method_column][ind]
    return None

# Temporary running row number handed to method_add for the per-row lookup
longVersMammal['ind'] = np.arange(len(longVersMammal))

# Pick, for every row, the method text belonging to its measurement type
longVersMammal['measurementMethod'] = longVersMammal.apply(lambda x: method_add(x.measurementType, x.ind), axis=1)

# Rows without an inferred/estimated flag get the generic method text
longVersMammal['measurementMethod'] = longVersMammal['measurementMethod'].fillna("Extracted with Traiter")

# Remove the per-trait method columns AND the temporary `ind` helper.
# `ind` was previously left in the frame and therefore ended up as an extra
# column in the exported CSV files.
longVersMammal = longVersMammal.drop(columns = method_list + ['ind'])

Matching trait and ontology terms

In [203]:
# Source value-column name -> trait term written to measurementType
trait_dict = dict([
    ('body_mass.1.value', 'body mass'),
    ('ear_length.1.value', 'ear length to notch'),
    ('hind_foot_length.1.value', 'pes length'),
    ('tail_length.1.value', 'tail length'),
    ('total_length.1.value', 'body length'),
])

def trait_rename(trait):
    """
    Translate a value-column name into its trait term via ``trait_dict``.

    Returns None for names that are not keys of ``trait_dict``.
    """
    return trait_dict.get(trait)

# Replace the raw value-column names with their trait terms
renamed_types = longVersMammal['measurementType'].apply(trait_rename)
longVersMammal['measurementType'] = renamed_types

Populating measurementUnit column with appropriate measurement units in long version

In [204]:
# Create measurementUnit column
longVersMammal=longVersMammal.assign(measurementUnit="")

# Body mass rows versus all remaining rows (the complement, which also covers
# rows whose measurementType is missing)
long_body_mass_filter=longVersMammal['measurementType']=="body mass"
long_no_body_filter=~long_body_mass_filter

# Body mass is given in grams, everything else in millimetres. Assigned
# through .loc because chained indexing (`df[col][mask] = ...`) is not
# guaranteed to write back to the frame.
longVersMammal.loc[long_body_mass_filter, 'measurementUnit'] = "g"
longVersMammal.loc[long_no_body_filter, 'measurementUnit'] = "mm"


Create diagnosticID which is a unique number for each measurement

In [205]:
# Running number 0..n-1 identifying each row of the long table
row_numbers = np.arange(len(longVersMammal))
longVersMammal = longVersMammal.assign(diagnosticID=row_numbers)

If measurement value equals N/A, delete entire row. Drop range values. 

In [206]:
# Drop rows that have no measurement value
longVersMammal = longVersMammal.dropna(subset=['measurementValue'])

# Drop range values: any value whose text contains a comma or the substring
# "one". Non-string values never match (na=False).
range_value_filter=longVersMammal['measurementValue'].str.contains(",|one", na=False)

# Filter the rows out directly. The previous approach set the matches to NaN
# through chained indexing on an already-filtered frame (not guaranteed to
# take effect) and then called dropna a second time.
longVersMammal = longVersMammal[~range_value_filter]

Breaking up the data into more manageable sizes for validation and DE storage

In [209]:
# Split the long table into 10 consecutive pieces of (nearly) equal length.
# The row POSITIONS are split with np.array_split and each piece is taken with
# .iloc, which yields the same partition as np.array_split(longVersMammal, 10)
# but does not depend on DataFrame.swapaxes (deprecated in pandas), which
# np.array_split uses when handed a DataFrame directly.
chunk_positions = np.array_split(np.arange(len(longVersMammal)), 10)
chunks = [longVersMammal.iloc[positions] for positions in chunk_positions]

Creating data chunks

In [210]:
# Write each chunk to its own CSV; files are numbered from 1, while the
# progress message reports the 0-based chunk index.
for i, chunk in enumerate(chunks):
    new = i + 1
    output_path = '../Mapped_Data/FuTRES_Mammals_VertNet_Global_Modern_' + str(new) + '.csv'
    chunk.to_csv(output_path, index=False)
    print("mapped_data",i, " done")

mapped_data 0  done
mapped_data 1  done
mapped_data 2  done
mapped_data 3  done
mapped_data 4  done
mapped_data 5  done
mapped_data 6  done
mapped_data 7  done
mapped_data 8  done
mapped_data 9  done
