Data Wrangling Notebook for EAP Deer Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [3]:
import pandas as pd
import numpy as np 
import uuid
import re

Import EAP Florida Modern Deer Data

In [7]:
# Read the raw EAP Florida modern deer measurements into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
raw_deer_csv = "../Original_Data/EAP_Florida_Modern_Deer_Measurements_FORFUTRES_1_23_2020.csv"
deerData = pd.read_csv(filepath_or_buffer=raw_deer_csv)

Create GEOME required ageUnit column and populate it with "year"

In [None]:
# Every row receives the same constant age unit.
age_unit = "year"
deerData = deerData.assign(ageUnit=age_unit)

Create yearCollected column and populate it with extracted year from eventDate

In [None]:
# Add yearCollected as a column of empty strings.
empty_year = ""
deerData = deerData.assign(yearCollected=empty_year)

def find_year(date):
    """Extract the year field from a slash-delimited date value.

    The value is converted to ``str`` and split on ``/``. The third field is
    returned, which is the field this notebook treats as the year.

    Parameters
    ----------
    date : object
        A single eventDate value. Anything ``str()`` accepts is allowed,
        including NaN.

    Returns
    -------
    str or None
        The third slash-separated field, or ``None`` when the value has fewer
        than three fields (no slashes, a partial date, or a missing value).
    """
    parts = str(date).split('/')

    # Fewer than three fields means there is no value in the year position;
    # return None instead of raising IndexError on a partial date.
    if len(parts) < 3:
        return None
    return parts[2]

# Fill yearCollected by running find_year over each eventDate value
event_dates = deerData["eventDate"]
deerData["yearCollected"] = event_dates.apply(find_year)

Standardize Sex Column

In [4]:
# Build both masks from the raw codes before any value is overwritten.
female = deerData['sex'] == "F"
male = deerData['sex'] == "M"

# Assign with a single .loc[row_mask, column] call per category. The chained
# form deerData['sex'][mask] = ... can write to a temporary copy
# (SettingWithCopyWarning) and leaves deerData unchanged when pandas
# copy-on-write is enabled.
deerData.loc[~(female | male), 'sex'] = "not collected"  # anything that is neither F nor M, including missing
deerData.loc[female, 'sex'] = "female"
deerData.loc[male, 'sex'] = "male"

Standardize Side Column

In [5]:
# Build both masks from the raw codes before any value is overwritten.
right = deerData['side'] == "R"
left = deerData['side'] == "L"

# Assign with .loc[row_mask, column]; the chained form deerData['side'][mask] = ...
# can write to a temporary copy (SettingWithCopyWarning) and leaves deerData
# unchanged when pandas copy-on-write is enabled.
deerData.loc[~(right | left), 'side'] = ""  # anything that is neither R nor L, including missing
deerData.loc[right, 'side'] = "right"
deerData.loc[left, 'side'] = "left"

Set up verbatimAgeValue column

In [8]:
# Copy the ageValue column into a new verbatimAgeValue column.
age_values = deerData['ageValue']
deerData = deerData.assign(verbatimAgeValue=age_values)

Standardize Reproductive Condition column

In [10]:
# Classify every raw value before anything is overwritten. na=False makes
# missing values count as "no match", so each mask is a plain boolean Series.
reproduction_data = deerData["reproductiveCondition"]
not_pregnant_filter = reproduction_data.str.contains("not|non", na=False)
pregnant_filter = reproduction_data.str.contains("fetus|several", na=False)
space_filter = reproduction_data.str.contains("--", na=False)

# Reassigning data
# .loc[row_mask, column] writes into deerData itself; the chained form
# deerData[col][mask] = ... can write to a temporary copy and does nothing
# under pandas copy-on-write.
# Order matters: when a value matches more than one mask, the later assignment wins.
deerData.loc[not_pregnant_filter, "reproductiveCondition"] = "non-reproductive"
deerData.loc[pregnant_filter, "reproductiveCondition"] = "pregnant"
deerData.loc[space_filter, "reproductiveCondition"] = ""

34                     
35                     
36                     
37                     
38                     
             ...       
243                    
244    non-reproductive
245    non-reproductive
246    non-reproductive
247    non-reproductive
Name: Reproductive Condition, Length: 214, dtype: object

Standardize lifeStage column

In [11]:
# NOTE(review): the column-selection and melt cells further down both call this
# column 'lifeStage', so that name is used here in place of '[Age] Life Stage'
# (presumably a header from an earlier version of the CSV) — confirm against the file.
young_adult = deerData['lifeStage'] == "young adult"
dash_age = deerData['lifeStage'] == "--"

# Assign with .loc[row_mask, column]; the chained form deerData[col][mask] = ...
# can write to a temporary copy and does nothing under pandas copy-on-write.
deerData.loc[young_adult, 'lifeStage'] = "adult"
deerData.loc[dash_age, 'lifeStage'] = ""  # "--" is the placeholder for no value

Select specified columns for final dataset

In [13]:
#Specify desired columns
# Only these columns are kept, in this order. The first group already carries
# template-style names; the second group still has the raw measurement headers,
# which are renamed in the following cells.
# NOTE(review): 'side' (lower case) matches the name used by the side
# standardization cell and by the later rename to measurementSide; the
# previous 'Side' entry matched neither — confirm against the CSV header.
cols = ['occurenceid',
        'references',
        'scientificName',
        'catalogNumber',
        'sex',
        'ageValue',
        'lifeStage',
        'occurrenceRemarks',
        'ageUnit',
        'country',
        'stateProvince',
        'locality',
        'verbatimLocality',
        'decimalLatitude',
        'decimalLongitude',
        'verbatimEventDate',
        'yearCollected',
        'samplingProtocol',
        'reproductiveCondition',
        'eventRemarks',

        'side',
        'Total Fresh Weight (g)',
        'Height (mm) [define?]',
        'TL (mm) [Total Length]',
        'HF (mm) [Hind Foot Length]',
        'Measurement Remarks',
        'Measurements by',
        'Measurement Date',
        'Measurement Method',
        'Measurement Accuracy',
        'otherCatalogNumbers']

#Subset dataframe
# Selecting with a list raises KeyError if any listed column is absent.
deerData = deerData[cols]

In [14]:
#Matching template and column terms

#Renaming columns
# Keys that are not present in the frame are silently ignored by rename().
# NOTE(review): 'ageValue' is renamed away here, yet the later melt cell lists
# both 'ageValue' and 'verbatimAgeValue' as id columns — confirm which is intended.
deerData = deerData.rename(columns = {'occurenceid':'occurrenceID',
                                      'ageValue':'verbatimAgeValue',
                                      'occurrenceRemarks':'ageEstimationMethod',
                                      'side': 'measurementSide',
                                      'Measurement Remarks': 'measurementRemarks',
                                      'Measurements by': 'measurementDeterminedBy',
                                      'Measurement Date': 'measurementDeterminedDate',
                                      'Measurement Method': 'measurementMethod',
                                      'Measurement Accuracy': 'measurementAccuracy'})

In [15]:
#Matching trait and ontology terms

# Raw measurement header -> trait term used as the column name from here on
trait_terms = {
    'Total Fresh Weight (g)': 'body mass',
    'Height (mm) [define?]': 'body height',
    'TL (mm) [Total Length]': 'body length',
    'HF (mm) [Hind Foot Length]': 'pes length',
}

#Renaming columns
deerData = deerData.rename(columns=trait_terms)


In [16]:
#Create materialSampleID which is a UUID for each row
#Create eventID and populate it with materialSampleID

# One random UUID4 per row, stored as text with every '-' replaced by '_'.
# The strings are built in one pass; the earlier per-row loop wrote through
# chained indexing (deerData[col][ind] = ...), which can hit a temporary copy
# and does nothing under pandas copy-on-write.
sample_ids = [str(uuid.uuid4()).replace("-", "_") for _ in range(len(deerData))]
deerData = deerData.assign(materialSampleID=sample_ids)

# eventID carries the same identifier as materialSampleID
deerData = deerData.assign(eventID=deerData["materialSampleID"])

In [17]:
#Add GEOME required columns
# Both columns hold one constant for every row.
# NOTE(review): if a measurementMethod column already exists at this point
# (an earlier cell renames 'Measurement Method' to that name), its values are
# replaced by "Unknown" — confirm this is intended.
deerData = deerData.assign(
    measurementMethod="Unknown",
    basisOfRecord="PreservedSpecimen",
)

In [18]:
#create long version so that each trait has its own row

# Columns that identify the record and are repeated on every measurement row.
template_id_columns = ['occurrenceID',
                       'eventID',
                       'references',
                       'scientificName',
                       'catalogNumber',
                       'otherCatalogNumbers',
                       'sex',
                       'ageValue',
                       'ageUnit',
                       'verbatimAgeValue',
                       'lifeStage',
                       'country',
                       'ageEstimationMethod',
                       'stateProvince',
                       'locality',
                       'verbatimLocality',
                       'verbatimLatitude',
                       'verbatimLongitude',
                       'decimalLatitude',
                       'decimalLongitude',
                       'minimumElevationInMeters',
                       'maximumElevationInMeters',
                       'verbatimElevation',
                       'minimumDepthInMeters',
                       'maximumDepthInMeters',
                       'verbatimDepth',
                       'verbatimEventDate',
                       'yearCollected',
                       'samplingProtocol',
                       'eventRemarks',
                       'reproductiveCondition',
                       'measurementRemarks',
                       'measurementSide',
                       'measurementMethod',
                       'measurementAccuracy',
                       'measurementDeterminedDate',
                       'measurementDeterminedBy',
                       'basisOfRecord',
                       'materialSampleID']

# pd.melt raises KeyError when an id_vars entry is missing from the frame, and
# this list names columns (e.g. the verbatim latitude/longitude, elevation and
# depth fields) that the column-selection cell does not keep. Restrict the
# list to columns that actually exist so the reshape can run.
id_columns = [col for col in template_id_columns if col in deerData.columns]

#creating long version, first specifying keep variables, then naming variable and value
# Every column not listed in id_vars is unpivoted: its header goes to
# measurementType and its cell value to measurementValue.
longVers = pd.melt(deerData,
                   id_vars=id_columns,
                   var_name='measurementType',
                   value_name='measurementValue')

In [19]:
#Populating measurementUnit column with appropriate measurement units in long version
# "body mass" rows are in grams; every other measurementType gets millimetres.
# np.where fills the whole column at once; the earlier row loop wrote through
# chained indexing (longVers[col][ind] = ...), which can hit a temporary copy
# and does nothing under pandas copy-on-write.
is_body_mass = longVers['measurementType'] == "body mass"
longVers = longVers.assign(measurementUnit=np.where(is_body_mass, "g", "mm"))

In [20]:
#Create diagnosticID which is a unique number for each measurement
# Rows are numbered 0..n-1 in their current order.
row_count = len(longVers)
longVers = longVers.assign(diagnosticID=np.arange(row_count))

In [25]:
#If measurement value is missing or equals "--", delete the entire row
with_values = longVers.dropna(subset=['measurementValue'])
is_placeholder = with_values.measurementValue == "--"
longVers = with_values[~is_placeholder]

#Drop first remaining row of data, it contains no measurementValue but is still retained
# NOTE(review): this removes whichever row is first each time the cell runs,
# so re-running the cell drops an additional row — run it once per kernel.
first_label = longVers.index[0]
longVers = longVers.drop(index=first_label)

In [26]:
#Writing long data csv file
# to_csv keeps its defaults here, so the DataFrame index is written as the
# first (unnamed) column of the file.
mapped_csv = '../Mapped Data/FuTRES_EAP_Deer_Emery.csv'
longVers.to_csv(path_or_buf=mapped_csv)