Data Wrangling Notebook for VertNet Deer Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [338]:
import pandas as pd
import re
import uuid
import numpy as np
import re

Silencing warnings that are unnecessary

In [339]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import VertNet Deer Data

In [340]:
# Load the VertNet deer records from the original-data folder into a DataFrame
raw_data_path = "../Original_Data/ODOVIRGCLEAN.csv"
deer = pd.read_csv(raw_data_path)

Add required GEOME columns

In [341]:
# Add the constant-valued columns required by GEOME in a single assign call.
# basisOfRecord is "PreservedSpecimen" rather than "FossilSpecimen": a later cell
# overwrites the column with 'PreservedSpecimen' and the final CSV is named
# ..._Global_Modern.csv, so the old value contradicted the rest of the notebook.
# NOTE(review): if the source CSV already has a `country` column, this replaces its
# values with "Unknown" -- confirm that is intended.
deer = deer.assign(basisOfRecord = "PreservedSpecimen",
                   samplingProtocol = "Unknown",
                   measurementMethod = "Unknown",
                   country = "Unknown")

Clean up lifestage column by separating values into ageUnit and ageValue

In [342]:
# Keep the lifestage text exactly as recorded before it is parsed and overwritten below
raw_lifestage = deer["lifestage"]
deer = deer.assign(verbatimAgeValue = raw_lifestage)

# Regular expression to help detect ageValue: digits with an optional decimal part.
# Written as a raw string so \d and \. are regex escapes instead of invalid Python
# string escapes (which trigger a DeprecationWarning/SyntaxWarning).
dig_reg = re.compile(r'\d+(\.\d+)?')

def find_unit(life_val):
    """Return the ageUnit implied by a lifestage value.

    The value is converted to a string and split on whitespace. A value made of
    exactly two tokens is reported as "year"; the tokens themselves are not
    inspected. Any other value yields None.
    """
    tokens = str(life_val).split()
    if len(tokens) == 2:
        return "year"
    return None

def find_age(life_val):
    """Return the ageValue token of a lifestage value, or "" when there is none.

    The value is converted to a string and split on whitespace. If the first
    token starts with a number (as recognised by dig_reg) that whole token is
    returned; otherwise "" is returned. Empty or whitespace-only values have no
    tokens and also return "" instead of raising IndexError.
    """
    tokens = str(life_val).split()
    # Guard: "" and "   " split into an empty list, so tokens[0] would raise IndexError.
    if not tokens:
        return ""
    first = tokens[0]
    # match() anchors only at the start, so the token is returned whole even when
    # extra characters follow the number.
    return first if dig_reg.match(first) else ""

# Derive ageUnit and ageValue from every lifestage entry
lifestage_values = deer["lifestage"]
deer = deer.assign(ageUnit = lifestage_values.apply(find_unit),
                   ageValue = lifestage_values.apply(find_age))

Remove ageValue and ageUnit values from lifestage

In [343]:
# Rows whose lifestage is exactly "Juvenile"
juv_filter = deer["lifestage"] == "Juvenile"

# Write through .loc on the frame itself. The former chained form
# deer['lifestage'][mask] = ... assigns into an intermediate Series, which is not
# guaranteed to update `deer` and does nothing under pandas Copy-on-Write.
deer.loc[juv_filter, "lifestage"] = "juvenile"
# Every other value (ages, blanks, other stages) is cleared
deer.loc[~juv_filter, "lifestage"] = ""
# NOTE(review): 'lifestage' is not in the column list selected for the final
# dataset further down, so this cleaned column never reaches the output -- confirm.

Parse the eventdate column, identify the year, and store it in a new yearCollected column

In [344]:
# Add an empty yearCollected column; it is filled in at the end of this cell
deer["yearCollected"] = ''

# Four digits at the very start that are not followed by another digit (e.g. 1978-05-12)
_YEAR_FIRST = re.compile(r'^\d{4}(?!\d)')
# Four digits at the very end that are not preceded by another digit (e.g. 5/12/1978)
_YEAR_LAST = re.compile(r'(?<!\d)\d{4}$')

def find_year(date):
    """Finds year within eventdate cell.

    The value is converted to a string and stripped of surrounding whitespace.
    A leading four-digit group is returned first, which covers year-first dates,
    bare years such as "1978", and intervals such as "1975-05-01/1975-05-10".
    Otherwise a trailing four-digit group is returned, which covers dates such
    as "5/12/1978". Anything else, including missing values and two-digit
    years, returns "unknown".

    Previously the year was sliced blindly from either end depending on whether
    the text contained "/" or "-", which returned fragments such as "5-10" for
    intervals and "unknown" for bare years.
    """
    date = str(date).strip()

    if _YEAR_FIRST.match(date):
        return date[:4]

    trailing = _YEAR_LAST.search(date)
    if trailing:
        return trailing.group()

    return "unknown"
    
# Fill yearCollected with the year extracted from each eventdate value
collected_years = deer["eventdate"].apply(find_year)
deer = deer.assign(yearCollected = collected_years)

Select specified columns for final dataset

In [346]:
# Columns to keep in the final dataset, in output order.
# (The former `cols = deer.columns.tolist()` was removed: it was overwritten by
# this list before ever being used.)
cols = ['catalognumber',
        'collectioncode',
        'country',
        'decimallatitude',
        'decimallongitude',
        'eventdate',
        'institutioncode',
        'verbatimAgeValue',
        'ageValue',
        'ageUnit',
        'locality',
        'sex',
        'scientificname',
        'yearCollected',
        'basisOfRecord',
        'samplingProtocol',
        'measurementMethod',
        '1st_body_mass',
        '1st_hind_foot_length',
        '1st_tail_length',
        '1st_total_length',
        '1st_ear_length']

# Subset dataframe; every column not listed above (including lifestage) is dropped
deer = deer[cols]

Matching template and column terms

In [347]:
# Lowercase source column names mapped to their camelCase template terms
template_names = {
    'catalognumber': 'catalogNumber',
    'collectioncode': 'collectionCode',
    'decimallatitude': 'decimalLatitude',
    'decimallongitude': 'decimalLongitude',
    'eventdate': 'verbatimEventDate',
    'institutioncode': 'institutionCode',
    'locality': 'verbatimLocality',
    'scientificname': 'scientificName',
}
deer = deer.rename(columns = template_names)

In [348]:
# Measurement columns mapped to the names that later become measurementType values
measurement_names = {
    '1st_body_mass': 'body mass',
    '1st_hind_foot_length': 'pes length',
    '1st_tail_length': 'tail length',
    '1st_total_length': 'body length with tail',
    '1st_ear_length': 'ear length to notch',
}
deer = deer.rename(columns = measurement_names)

Create new column individualID that holds a unique identifier (the concatenation of collectionCode, institutionCode, and catalogNumber)

In [349]:
# individualID = collectionCode + institutionCode + catalogNumber, joined without a separator
combined_id = deer['collectionCode'] + deer['institutionCode'] + deer['catalogNumber']
deer = deer.assign(individualID = combined_id)

Set the basisOfRecord column to "PreservedSpecimen" (this overwrites the value assigned when the required GEOME columns were added)

In [350]:
# Every record gets the same basisOfRecord value
deer['basisOfRecord'] = 'PreservedSpecimen'

Create new column locality and set to unknown

In [351]:
# The source locality text now lives in verbatimLocality; locality itself is a placeholder
deer['locality'] = "Unknown"

Create necessary materialSampleID column and populate with UUID (use hex to remove dashes). Create necessary eventID column and populate with materialSampleID column

In [352]:
# One random UUID per row, stored in its dash-free hex form
row_count = len(deer.index)
deer['materialSampleID'] = [uuid.uuid4().hex for _ in range(row_count)]

# eventID reuses the materialSampleID of the same row
deer['eventID'] = deer['materialSampleID']

Create a long version of the data frame

In [353]:
# Columns kept as identifiers on every long-format row
id_columns = ['catalogNumber',
              'individualID',
              'collectionCode',
              'country',
              'decimalLatitude',
              'decimalLongitude',
              'verbatimEventDate',
              'institutionCode',
              'verbatimAgeValue',
              'ageValue',
              'ageUnit',
              'verbatimLocality',
              'locality',
              'sex',
              'scientificName',
              'yearCollected',
              'basisOfRecord',
              'materialSampleID',
              'eventID',
              'measurementMethod',
              'samplingProtocol']

# Creating long version: every column not listed above is unpivoted, its name going
# into measurementType and its value into measurementValue
longVers = deer.melt(id_vars = id_columns,
                     var_name = 'measurementType',
                     value_name = 'measurementValue')


Populating measurementUnit column with appropriate measurement units in long version

In [354]:
# Start with an empty unit on every row
longVers = longVers.assign(measurementUnit = "")

#Create filters
long_body_mass_filter = longVers['measurementType'] == "body mass"
long_no_body_filter = longVers['measurementType'] != "body mass"

#Assign units using filters: body mass gets "g", every other measurement gets "mm".
# .loc writes directly into longVers. The former chained form
# longVers['measurementUnit'][mask] = ... assigns into an intermediate Series, which
# is not guaranteed to update the frame and does nothing under pandas Copy-on-Write.
longVers.loc[long_body_mass_filter, 'measurementUnit'] = "g"
longVers.loc[long_no_body_filter, 'measurementUnit'] = "mm"

Create diagnosticID which is a unique number for each measurement

In [355]:
# Number the measurement rows 0..n-1 so that each one has its own diagnosticID
row_numbers = np.arange(len(longVers))
longVers = longVers.assign(diagnosticID = row_numbers)

If measurementValue is missing (NaN), delete the entire row

In [356]:
# Keep only the rows that actually carry a measurement value
has_measurement = longVers['measurementValue'].notna()
longVers = longVers[has_measurement]

Writing long data csv file

In [357]:
# Write the long-format table to the mapped-data folder, leaving out the index column
output_path = '../Mapped_Data/FuTRES_Deer_VertNet_Global_Modern.csv'
longVers.to_csv(output_path, index = False)