Data Wrangling Notebook for Hopkins 2008 Appendix 1 Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [93]:
import pandas as pd
import numpy as np
import uuid

Silence unnecessary warnings

In [94]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import Hopkins 2008 Appendix 1 Data

In [95]:
# Relative path to the raw Hopkins 2008 Appendix 1 table.
hopkins_csv_path = "../Original_Data/Hopkins2008Appendix1.csv"

# Load the raw table into the working frame used by the following cells.
data = pd.read_csv(hopkins_csv_path)

Add individualID column and populate with UUID

In [96]:
# One random UUID per row; `.hex` gives the 32-character form without dashes.
individual_ids = [uuid.uuid4().hex for _ in range(len(data.index))]

# Append the identifiers as a new individualID column.
data = data.assign(individualID=individual_ids)

Copy the Species column into scientificName

In [97]:
# scientificName is a straight copy of the source "Species" column.
data = data.assign(scientificName=lambda frame: frame["Species"])

Adding additional required GEOME columns

In [98]:
# These columns are filled with the literal placeholder "Unknown" on every row.
placeholder_columns = ["country", "yearCollected", "locality"]
data = data.assign(**dict.fromkeys(placeholder_columns, "Unknown"))

Set samplingProtocol and measurementMethod 

In [99]:
# Full reference for the source publication; used verbatim as the value of
# both samplingProtocol and measurementMethod on every row.
citation = "Hopkins, S. S. (2008). Reassessing the Mass of Exceptionally Large Rodents Using Toothrow Length and Area as Proxies for Body Mass. Journal of Mammalogy, 89(1), 232–243. https://doi.org/10.1644/06-mamm-a-306.1"

citation_columns = ["samplingProtocol", "measurementMethod"]
data = data.assign(**dict.fromkeys(citation_columns, citation))

Change MVZ specimen # to specimenID

In [100]:
# specimenID is a copy of the source "MVZ specimen #" column; the original
# column is left in place by this step.
data = data.assign(specimenID=lambda frame: frame["MVZ specimen #"])

Adding GEOME required basisOfRecord column

In [101]:
# Every row receives the same basisOfRecord value.
basis_of_record = "FossilSpecimen"
data = data.assign(basisOfRecord=basis_of_record)

Rearrange columns so that template columns are first, followed by measurement values

In [102]:
# Create column list
cols = data.columns.tolist()

# Specify desired columns
cols = ['scientificName',
        'samplingProtocol',
        'measurementMethod',
        'individualID',
        'specimenID',
        'country',
        'yearCollected',
        'basisOfRecord',
        'locality',
        'mass',
        'M1 width (mm)',
        'LTRL (mm)']

# Subset dataframe
data = data[cols]

Matching template and column terms

In [105]:
# Source measurement column -> template term.
template_terms = {
    "mass": "body mass",
    "M1 width (mm)": "upper secondary molar tooth 1 occlusal surface width",
    "LTRL (mm)": "lower tooth row length",
}

data = data.rename(columns=template_terms)

Create a long version of the data frame

In [106]:
# Creating long version, first specifiying keep variables, then naming variable and value
longVers = pd.melt(data, 
                id_vars = ['scientificName',
                           'samplingProtocol',
                           'measurementMethod',
                           'individualID',
                           'specimenID',
                           'country',
                           'basisOfRecord',
                           'locality',
                           'yearCollected'], 
                            var_name = 'measurementType', 
                            value_name = 'measurementValue')

Create necessary materialSampleID column and populate with UUID (use hex to remove dashes). Create eventID. 

In [108]:
# One random UUID (hex form, no dashes) per long-format row.
sample_ids = [uuid.uuid4().hex for _ in range(len(longVers.index))]

# eventID reuses the materialSampleID value of the same row; `assign`
# evaluates keyword arguments in order, so the callable sees the new column.
longVers = longVers.assign(
    materialSampleID=sample_ids,
    eventID=lambda frame: frame['materialSampleID'],
)

Create diagnosticID

In [110]:
# diagnosticID is a running integer 0..n-1 in the current row order.
row_numbers = np.arange(len(longVers))
longVers = longVers.assign(diagnosticID=row_numbers)

Add measurementUnit column

In [112]:
# Start with an empty measurementUnit column, then fill it by measurement type.
longVers = longVers.assign(measurementUnit = "")

# Boolean row masks: body-mass rows versus every other measurement type.
long_body_mass_filter = longVers['measurementType'] == "body mass"
long_no_body_filter = longVers['measurementType'] != "body mass"

# Label units only: body mass is tagged "g" and all other measurements "mm".
# No numeric conversion of measurementValue happens here.
# Use a single .loc[rows, column] assignment instead of the chained
# longVers['measurementUnit'][mask] = ... form: chained assignment raises
# SettingWithCopyWarning and writes nothing under pandas Copy-on-Write.
longVers.loc[long_body_mass_filter, 'measurementUnit'] = "g"
longVers.loc[long_no_body_filter, 'measurementUnit'] = "mm"

If the measurement value is missing (NaN), drop the entire row

In [114]:
# Keep only the rows that actually carry a measurement value.
has_measurement = longVers['measurementValue'].notna()
longVers = longVers.loc[has_measurement]

Writing long data csv file

In [91]:
# Destination for the mapped long-format table.
output_csv_path = '../Mapped_Data/FuTRES_Hopkins_2008_Appendix_1_New_a.csv'

# Write without the DataFrame index so only the data columns are saved.
longVers.to_csv(output_csv_path, index=False)