Data Wrangling Notebook for EAP Deer Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [24]:
import pandas as pd
import numpy as np 
import uuid
import re

Silencing warnings that are unnecessary

In [25]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import EAP Florida Modern Deer Data

In [26]:
# Read the raw EAP measurement spreadsheet into a DataFrame.
raw_deer_csv = "../Original_Data/EAP_Florida_Modern_Deer_Measurements_FORFUTRES_1_23_2020.csv"
deerData = pd.read_csv(raw_deer_csv)

Create GEOME required ageUnit column and populate it with "year"

In [27]:
# Every row gets the same constant unit string.
deerData["ageUnit"] = "year"

Create yearCollected column and populate it with extracted year from eventDate

In [28]:
# Placeholder column of empty strings; filled in once the years are extracted.
deerData["yearCollected"] = ""

def find_year(date):
    """Extract the year from a slash-delimited date value.

    The value is converted with ``str`` and split on '/'. The third
    component is returned, which is the year for a month/day/year string
    such as "1/23/2020".

    Parameters
    ----------
    date : any
        Raw date value; non-string inputs (e.g. NaN) are stringified first.

    Returns
    -------
    str or None
        The third '/'-separated component, or None when the value has fewer
        than three components (no slash at all, or only one slash).
    """
    parts = str(date).split('/')

    # Values without at least two slashes carry no third component; return
    # None instead of raising IndexError on something like "1/2020".
    if len(parts) < 3:
        return None

    return parts[2]

# Run find_year over every eventDate and store the result as yearCollected
event_dates = deerData["eventDate"]
deerData["yearCollected"] = event_dates.apply(find_year)

Create individualID column and populate it with UUID value

In [29]:
# One freshly generated UUID per row; .hex gives the 32-character form without dashes.
row_count = len(deerData.index)
deerData = deerData.assign(
    individualID=[uuid.uuid4().hex for _ in range(row_count)]
)

Standardize Sex Column

In [30]:
# Masks are computed once from the raw codes, before any value is rewritten.
female = deerData['sex'] == "F"
male = deerData['sex'] == "M"

# Use .loc so the assignment is written into deerData itself; the chained
# form deerData['sex'][mask] = ... may write to a temporary copy and is a
# no-op under pandas Copy-on-Write.
deerData.loc[~(female | male), 'sex'] = "not collected"
deerData.loc[female, 'sex'] = "female"
deerData.loc[male, 'sex'] = "male"

Standardize Side Column

In [31]:
# Masks are computed once from the raw codes, before any value is rewritten.
right = deerData['Side'] == "R"
left = deerData['Side'] == "L"

# Use .loc so the assignment is written into deerData itself; the chained
# form deerData['Side'][mask] = ... may write to a temporary copy and is a
# no-op under pandas Copy-on-Write.
deerData.loc[~(right | left), 'Side'] = ""
deerData.loc[right, 'Side'] = "right"
deerData.loc[left, 'Side'] = "left"

Set up verbatimAgeValue column (a copy of ageValue)

In [32]:
# Duplicate ageValue under the name verbatimAgeValue.
# NOTE(review): the column-selection cell further down does not list
# verbatimAgeValue, and the rename cell maps ageValue -> verbatimAgeValue,
# so this copy looks redundant -- confirm before removing.
deerData = deerData.assign(verbatimAgeValue=deerData['ageValue'])

Standardize Reproduction Condition column

In [33]:
reproduction_data = deerData["reproductiveCondition"]

# All three masks are built from the original text before anything is
# rewritten. na=False makes missing values count as "no match", so the masks
# are plain booleans and can be used directly for indexing.
not_pregnant_filter = reproduction_data.str.contains("not|non", na=False)
pregnant_filter = reproduction_data.str.contains("fetus|several", na=False)
space_filter = reproduction_data.str.contains("--", na=False)

# Reassigning data with .loc so the write lands in deerData itself (the
# chained form deerData[col][mask] = ... is a no-op under Copy-on-Write).
# Order matters: a value matching more than one pattern keeps the label
# assigned last.
deerData.loc[not_pregnant_filter, "reproductiveCondition"] = "non-reproductive"
deerData.loc[pregnant_filter, "reproductiveCondition"] = "pregnant"
deerData.loc[space_filter, "reproductiveCondition"] = ""

Standardize lifeStage column

In [34]:
# Masks are computed once from the raw values, before any value is rewritten.
young_adult = deerData['lifeStage'] == "young adult"
dash_age = deerData['lifeStage'] == "--"

# Use .loc so the assignment is written into deerData itself; the chained
# form deerData['lifeStage'][mask] = ... may write to a temporary copy and is
# a no-op under pandas Copy-on-Write.
deerData.loc[young_adult, 'lifeStage'] = "adult"   # fold "young adult" into "adult"
deerData.loc[dash_age, 'lifeStage'] = ""           # "--" placeholder becomes empty

Select specified columns for final dataset

In [35]:
# Columns kept for the final dataset, in output order.
# Commented-out entries are source measurement columns that are currently
# excluded from the selection.
cols = ['occurrenceId',
        'references',
        'individualID',
        'scientificName',
        'catalogNumber',
        'sex',
        'ageValue',
        'lifeStage',
        'occurrenceRemarks',
        'ageUnit',
        'country',
        'stateProvince',
        'locality',
        'verbatimLocality',
        'decimalLatitude',
        'decimalLongitude',
        'verbatimEventDate',
        'yearCollected',
        'samplingProtocol',
        'reproductiveCondition',
        'eventRemarks',
        'Side',
        'Total Fresh Weight (g)',
        'Height (mm) [define?]',
        'TL (mm) [Total Length]',
        'HF (mm) [Hind Foot Length]',
        'TA (mm) [Tail Length]',
        'En (mm) [Ear Notch = Ear Length]',
#        'Calcaneus GL, (greatest length, von den Driesch, 1976), mm',
#        'Calcaneus GB, (greatest breadth, von den Driesch 1976), mm',
        'Tibia GL, (greatest length, von den Driesch 1976), mm',
#        'Astragalus greatest depth medial',
#        'Astragalus Dm; Astragalus Dm (greatest depth of the medial side, von den Driesch 1976), mm',
#        'Astragalus GLI; Astragalus GLl, (greatest length of the lateral side, von den Driesch 1976), mm; Astragalus greatest length lateral',
#         'Maximal medial depth',
#         'Astragalus distal articular breadth',
#         'Breadth astragalus of facies articularis distalis',
#         'Breadth of astragalus distal articular surface',
#         'Tibia SD; Tibia SD, (smallest breadth of the diaphysis, von den Driesch 1976), mm',
#         'Tibia Ll; Tibia Ll, (length of the lateral side, von den Driesch 1976 *note she says only in horses), mm',
#         'Tibia Bp; Tibia Bp, (greatest breadth of the proximal end, von den Driesch 1976), mm',
#         'Tibia Bd; Tibia Bd, (greatest breadth of the distal end, von den Driesch 1976), mm',
#         'Tibia Dd; Tibia Dd, (greatest depth of the distal end, von den Driesch 1976), mm',
#          'Humerus.GLC',
#          'Astragalus GH',
#          'Astragalus Bfd',
#          'Astragalus distal articular depth',
        'Measurement Remarks',
        'Measurements by',
        'Measurement Date',
        'Measurement Method',
        'Measurement Accuracy',
        'otherCatalogNumbers']

# Subset dataframe; raises KeyError if any listed column is absent.
deerData = deerData[cols]

Matching column names to template 

In [36]:
# Map source column headers onto the template's column terms.
template_terms = {
    'occurrenceId': 'occurrenceID',
    'ageValue': 'verbatimAgeValue',
    'occurrenceRemarks': 'ageEstimationMethod',
    'Side': 'measurementSide',
    'Measurement Remarks': 'measurementRemarks',
    'Measurements by': 'measurementDeterminedBy',
    'Measurement Date': 'measurementDeterminedDate',
    'Measurement Method': 'measurementMethod',
    'Measurement Accuracy': 'measurementAccuracy',
}

deerData = deerData.rename(columns=template_terms)

In [37]:
# Map measurement column headers onto trait / ontology terms.
# Commented-out entries are mappings for measurement columns that are not
# currently renamed here.
trait_terms = {
    'Total Fresh Weight (g)': 'body mass',
    'Height (mm) [define?]': 'body height',
    'TL (mm) [Total Length]': 'body length with tail',
    'TA (mm) [Tail Length]': 'tail length',
    'HF (mm) [Hind Foot Length]': 'pes length',
    'En (mm) [Ear Notch = Ear Length]': 'ear length to notch',
    # 'Calcaneus GL, (greatest length, von den Driesch, 1976), mm':'calcaneus length',
    # 'Calcaneus GB, (greatest breadth, von den Driesch 1976), mm':'breadth of calcaneus body',
    # 'Humerus.GLC':'humerus length from trochlea to caput',
    # #'Astragalus greatest depth medial':'talus medial depth',
    # 'Maximal medial depth':'talus medial depth',
    # 'Astragalus GH':'talus length',
    # 'Tibia Bd; Tibia Bd, (greatest breadth of the distal end, von den Driesch 1976), mm':'tibia distal breadth',
    # 'Tibia Dd; Tibia Dd, (greatest depth of the distal end, von den Driesch 1976), mm':'tibia distal depth',
    # 'Tibia Bp; Tibia Bp, (greatest breadth of the proximal end, von den Driesch 1976), mm':'tibia proximal breadth',
    # 'Tibia SD; Tibia SD, (smallest breadth of the diaphysis, von den Driesch 1976), mm':'tibia diaphysis breadth',
    # 'Breadth of astragalus distal articular surface':'talus distal articular breadth (non-plantigrade)',
    # 'Astragalus Bfd':'talus distal articular breadth (non-plantigrade)',
    # 'Astragalus distal articular depth':'talus distal articular breadth (non-plantigrade)',
    # 'Astragalus distal articular breadth':'talus distal articular breadth (non-plantigrade)',
    # 'Tibia Ll; Tibia Ll, (length of the lateral side, von den Driesch 1976 *note she says only in horses), mm':'tibia lateral length',
    # 'Breadth astragalus of facies articularis distalis':'talus distal articular breadth (non-plantigrade)',
    # 'Astragalus Dm; Astragalus Dm (greatest depth of the medial side, von den Driesch 1976), mm':'talus medial depth',
    # 'Astragalus GLI; Astragalus GLl, (greatest length of the lateral side, von den Driesch 1976), mm; Astragalus greatest length lateral':'talus lateral length',
    'Tibia GL, (greatest length, von den Driesch 1976), mm': 'tibia length',
}

deerData = deerData.rename(columns=trait_terms)


Create necessary materialSampleID column and populate with UUID (use hex to remove dashes). Create necessary eventID column and populate with materialSampleID column


In [38]:
# One freshly generated UUID per row (.hex omits the dashes).
sample_ids = [uuid.uuid4().hex for _ in range(len(deerData.index))]
deerData = deerData.assign(materialSampleID=sample_ids)

# eventID mirrors materialSampleID row for row.
deerData = deerData.assign(eventID=deerData["materialSampleID"])

Add GEOME required columns

In [39]:
# Constant-valued columns, identical for every row.
# NOTE(review): measurementMethod already exists at this point when the
# rename cell has mapped 'Measurement Method' onto it, so this assignment
# replaces those source values with "Unknown" -- confirm that is intended.
deerData = deerData.assign(
    measurementMethod="Unknown",
    basisOfRecord="PreservedSpecimen",
)

Create a long version of the data frame

In [40]:
# Identifier columns repeated on every long-format row; every other column
# is unpivoted into a (measurementType, measurementValue) pair.
keep_columns = [
    'occurrenceID',
    'eventID',
    'references',
    'individualID',
    'scientificName',
    'catalogNumber',
    'otherCatalogNumbers',
    'sex',
    'verbatimAgeValue',
    'ageUnit',
    'lifeStage',
    'country',
    'ageEstimationMethod',
    'stateProvince',
    'locality',
    'verbatimLocality',
    'decimalLatitude',
    'decimalLongitude',
    'verbatimEventDate',
    'yearCollected',
    'samplingProtocol',
    'eventRemarks',
    'reproductiveCondition',
    'measurementRemarks',
    'measurementSide',
    'measurementMethod',
    'measurementAccuracy',
    'measurementDeterminedDate',
    'measurementDeterminedBy',
    'basisOfRecord',
    'materialSampleID',
]

longVers = deerData.melt(
    id_vars=keep_columns,
    var_name='measurementType',
    value_name='measurementValue',
)

Populating measurementUnit column with appropriate measurement units in long version

In [41]:
longVers = longVers.assign(measurementUnit="")

# Create filters: body mass rows versus every other measurement type
long_body_mass_filter = longVers['measurementType'] == "body mass"
long_no_body_filter = longVers['measurementType'] != "body mass"

# Assign units with .loc so the write lands in longVers itself; the chained
# form longVers['measurementUnit'][mask] = ... may write to a temporary copy
# and is a no-op under pandas Copy-on-Write.
longVers.loc[long_body_mass_filter, 'measurementUnit'] = "g"
longVers.loc[long_no_body_filter, 'measurementUnit'] = "mm"

Create diagnosticID which is a unique number for each measurement

In [42]:
# Sequential integer (0 .. n-1) per long-format row.
measurement_count = len(longVers)
longVers = longVers.assign(diagnosticID=np.arange(measurement_count))

If the measurement value is missing (NaN) or is a placeholder ("--", "unf.", or "UNF"), delete the entire row

In [43]:
# Remove rows with no measurement, then rows whose value is a known placeholder.
placeholder_values = ["--", "unf.", "UNF"]
longVers = longVers.dropna(subset=['measurementValue'])
is_placeholder = longVers['measurementValue'].isin(placeholder_values)
longVers = longVers[~is_placeholder]

Write file as csv for GEOME upload

In [44]:
# Write the long-format table to disk.
# NOTE(review): the DataFrame index is written as an unnamed first column
# (to_csv default) -- confirm the upload expects it.
output_csv = '../Mapped_Data/FuTRES_EAP_Deer_Emery.csv'
longVers.to_csv(output_csv)