Data Wrangling Notebook for Reuter Supplemental Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [65]:
import pandas as pd
import numpy as np
import uuid

Silence unnecessary warnings

In [66]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import Reuter Supplemental Data S1

In [67]:
# Load the raw supplemental table; the path is relative to this notebook's folder.
data = pd.read_csv(filepath_or_buffer="../Original_Data/Reuter_Supplemental Data S1.csv")

Add individualID column and populate with UUID

In [68]:
# Give every row its own random UUID; `.hex` is the 32-character form without dashes.
data = data.assign(individualID=[uuid.uuid4().hex for _ in range(len(data))])

Copy the Species column into the scientificName column

In [69]:
# scientificName mirrors the source "Species" column unchanged.
data = data.assign(scientificName=lambda frame: frame["Species"])

Adding additional required GEOME columns
<br />
TODO: Check publication for these 

In [70]:
# Placeholder values until the publication has been checked (see TODO above).
data = data.assign(**{"country": "Unknown", "yearCollected": "Unknown"})

Set samplingProtocol and measurementMethod 
<br />
TODO: Check publication associated with this data 

In [71]:
# Publication used for both samplingProtocol and measurementMethod.
# The ampersand is written literally: the HTML entity "&amp;" would otherwise
# be exported character-for-character into the mapped CSV.
citation = "Famoso, N. A., & Davis, E. B. (2014). Occlusal enamel complexity in Middle Miocene to HOLOCENE equids (equidae: Perissodactyla) of North America. PLoS ONE, 9(2). doi:10.1371/journal.pone.0090184"

# Every row receives the same citation in both columns.
data = data.assign(samplingProtocol = citation, measurementMethod = citation)

Change Museum to institutionCode and Specimen.Number to specimenID

In [72]:
# Copy the source identifiers into the template column names;
# the original "Specimen.Number" and "Museum" columns are left in place here.
data = data.assign(
    specimenID=lambda frame: frame["Specimen.Number"],
    institutionCode=lambda frame: frame["Museum"],
)

Adding GEOME required basisOfRecord column

In [73]:
# Same constant for every row.
data = data.assign(**{"basisOfRecord": "FossilSpecimen"})

Adding GEOME required locality column

In [74]:
# Locality is filled with a placeholder for every row.
data = data.assign(**{"locality": "Unknown"})

Rearrange columns so that template columns are first, followed by measurement values

In [75]:
# Template (metadata) columns, in the order they should appear in the output.
template_cols = ['scientificName',
                 'samplingProtocol',
                 'measurementMethod',
                 'individualID',
                 'specimenID',
                 'institutionCode',
                 'country',
                 'locality',
                 'yearCollected',
                 'basisOfRecord']

# Tooth measurement columns (lower-case = lower teeth, upper-case = upper teeth,
# following the names used in the rename mapping of the term-matching step).
measurement_cols = ['m1L',
                    'm1W',
                    'm2L',
                    'm2W',
                    'm3L',
                    'm3W',
                    'p1L',
                    'p1W',
                    'p2L',
                    'p2W',
                    'p3L',
                    'p3W',
                    'p4L',
                    'p4W',
                    'M1L',
                    'M1W',
                    'M2L',
                    'M2W',
                    'P1L',
                    'P1W',
                    'P2L',
                    'P2W',
                    'P3L',
                    'P3W',
                    'P4L',
                    'P4W']

# 'p3W' has an entry in the rename mapping but was missing from this list, so
# its values were silently discarded by the subset below. It is kept whenever
# the source file provides it; all other names stay mandatory and still raise
# a KeyError if absent.
cols = template_cols + [col for col in measurement_cols
                        if col != 'p3W' or col in data.columns]

# Subset dataframe (any column not listed above is dropped)
data = data[cols]

Matching template and column terms

In [76]:
# Map the short tooth codes to the template measurement terms.
# Every term follows the same pattern:
#   "<lower|upper> secondary <molar|premolar> tooth <n> occlusal surface <length|width>"
# The 'm3W' term previously lacked the word "surface", which made it the only
# entry that did not match the pattern.
# Codes that are not present as columns are ignored by rename().
data = data.rename(columns = {'m1L':'lower secondary molar tooth 1 occlusal surface length',
                              'm1W':'lower secondary molar tooth 1 occlusal surface width',
                              'm2L':'lower secondary molar tooth 2 occlusal surface length',
                              'm2W':'lower secondary molar tooth 2 occlusal surface width',
                              'm3L':'lower secondary molar tooth 3 occlusal surface length',
                              'm3W':'lower secondary molar tooth 3 occlusal surface width',
                              'p1L':'lower secondary premolar tooth 1 occlusal surface length',
                              'p1W':'lower secondary premolar tooth 1 occlusal surface width',
                              'p2L':'lower secondary premolar tooth 2 occlusal surface length',
                              'p2W':'lower secondary premolar tooth 2 occlusal surface width',
                              'p3L':'lower secondary premolar tooth 3 occlusal surface length',
                              'p3W':'lower secondary premolar tooth 3 occlusal surface width',
                              'p4L':'lower secondary premolar tooth 4 occlusal surface length',
                              'p4W':'lower secondary premolar tooth 4 occlusal surface width',
                              'M1L':'upper secondary molar tooth 1 occlusal surface length',
                              'M1W':'upper secondary molar tooth 1 occlusal surface width',
                              'M2L':'upper secondary molar tooth 2 occlusal surface length',
                              'M2W':'upper secondary molar tooth 2 occlusal surface width',
                              'P1L':'upper secondary premolar tooth 1 occlusal surface length',
                              'P1W':'upper secondary premolar tooth 1 occlusal surface width',
                              'P2L':'upper secondary premolar tooth 2 occlusal surface length',
                              'P2W':'upper secondary premolar tooth 2 occlusal surface width',
                              'P3L':'upper secondary premolar tooth 3 occlusal surface length',
                              'P3W':'upper secondary premolar tooth 3 occlusal surface width',
                              'P4L':'upper secondary premolar tooth 4 occlusal surface length',
                              'P4W':'upper secondary premolar tooth 4 occlusal surface width'})

Create a long version of the data frame

In [77]:
# Reshape wide -> long: the identifier columns below are repeated on every row,
# while each remaining (measurement) column becomes one row whose column name
# goes to measurementType and whose cell goes to measurementValue.
longVers = data.melt(
    id_vars=[
        'scientificName',
        'samplingProtocol',
        'measurementMethod',
        'individualID',
        'institutionCode',
        'locality',
        'specimenID',
        'basisOfRecord',
        'country',
        'yearCollected',
    ],
    var_name='measurementType',
    value_name='measurementValue',
)

Create necessary materialSampleID column and populate with UUID (use hex to remove dashes), then copy the same value into eventID.

In [78]:
# One dash-free random UUID per measurement row; eventID reuses that same value.
longVers = (
    longVers
    .assign(materialSampleID=[uuid.uuid4().hex for _ in range(len(longVers))])
    .assign(eventID=lambda frame: frame['materialSampleID'])
)

Create measurementUnit column and populate with "mm"

In [None]:
# Every measurement row is tagged with the same unit string.
longVers = longVers.assign(**{'measurementUnit': 'mm'})

Create diagnosticID and populate it with a sequential integer (0, 1, 2, ...) for each measurement row

In [None]:
# Number the rows 0..n-1 in their current order.
longVers = longVers.assign(diagnosticID=np.arange(len(longVers)))

Drop every row whose measurementValue is missing (NaN)

In [79]:
# Keep only the rows that actually carry a measurement.
longVers = longVers[longVers['measurementValue'].notna()]

Writing long data csv file

In [80]:
# Export the long table without the pandas row index.
longVers.to_csv(path_or_buf='../Mapped_Data/Reuter_Supplemental_Data_S1.csv', index=False)