Data Wrangling Notebook for Famoso and Davis 2014 Supplemental Data 1
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [2]:
import pandas as pd
import numpy as np
import uuid

Silence non-critical warnings

In [3]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import Famoso and Davis 2014 data

In [4]:
# Load Table S1 of the supplemental data into a DataFrame
# (path is relative to the notebook's working directory).
data = pd.read_csv(
    filepath_or_buffer = "../Original_Data/Famoso_and_Davis_2014_Table_S1.csv"
)

Notes to go over:

In [None]:
# TODO: The "Specimen #" column contains ID patterns not seen in earlier datasets; confirm what each pattern means before mapping it.
# Examples: JODA 1079, A ; AMNH F:AM 60618 ; UOMNCH B-9092 ; USNM 416338/UF 17570

Create verbatimScientificName

In [None]:
# Build the verbatim binomial from the source table's Genus and Species columns.
# scientificName starts out as a copy of that verbatim name; previously it was
# read from data["scientificName"], a column no earlier cell creates.
# NOTE(review): assumes the raw CSV has no scientificName column of its own — confirm.
verbatim_name = data["Genus"] + " " + data["Species"]
data = data.assign(verbatimScientificName = verbatim_name,
                   scientificName = verbatim_name)

Adding additional required GEOME columns

In [None]:
# Add the remaining GEOME-required columns: constant country and collection
# year, and the state copied from the source table's "State" column.
# The Darwin Core term is spelled `stateProvince`; the earlier spelling
# (`stateProvinence`) produced a column that matches no template field.
data = data.assign(country = "USA",
                   yearCollected = "Unknown",
                   stateProvince = data["State"])

Set samplingProtocol and measurementMethod 

In [None]:
# Full citation of the source publication; it is stored as both the sampling
# protocol and the measurement method for every row.
# The ampersand is written literally: an HTML entity ("&amp;") here would be
# copied verbatim into the output CSV.
# NOTE(review): the title casing ("HOLOCENE", "equidae") looks inconsistent with
# the published title — confirm against the paper before changing.
citation = "Famoso, N. A., & Davis, E. B. (2014). Occlusal enamel complexity in Middle Miocene to HOLOCENE equids (equidae: Perissodactyla) of North America. PLoS ONE, 9(2). doi:10.1371/journal.pone.0090184"

data = data.assign(samplingProtocol = citation, measurementMethod = citation)

Rearrange columns so that template columns are first, followed by measurement values

In [None]:
# Template columns that should lead the table, in output order.
template_cols = ['Formation',
                 'scientificName',
                 'samplingProtocol',
                 'measurementMethod',
                 'Member',
                 'institutionCode',
                 'collectionCode',
                 'catalogNumber',
                 'country']

# Fail with a message naming the absent columns instead of a bare KeyError.
missing_cols = [col for col in template_cols if col not in data.columns]
if missing_cols:
    raise KeyError("Template columns missing from data: " + ", ".join(missing_cols))

# Every other column (the measurement values among them) follows in its
# existing order. Previously the frame was subset to the template list alone,
# which discarded all measurement columns, and the initial
# `data.columns.tolist()` result was overwritten without being used.
# NOTE(review): raw source columns that are neither template fields nor
# measurements (e.g. Genus, Species, State) are retained here as well — drop
# them explicitly if they should not be carried forward.
remaining_cols = [col for col in data.columns if col not in template_cols]
cols = template_cols + remaining_cols

# Reorder dataframe
data = data[cols]

Matching template and column terms

In [None]:
# Lower-case the two source headers so they match the template term names.
data = data.rename(columns = dict(Formation = 'formation', Member = 'member'))

Create necessary materialSampleID column and populate with UUID (use hex to remove dashes). 

In [None]:
# One random UUID4 per row; `.hex` yields the 32-character form without dashes.
data = data.assign(
    materialSampleID = [uuid.uuid4().hex for _ in range(len(data.index))]
)

Create a long version of the data frame

In [None]:
# Reshape to long format. The identifier columns below are repeated on every
# output row; each remaining column becomes a (measurementType,
# measurementValue) pair, with the former column name as measurementType.
#
# materialSampleID is listed as an identifier so the per-row UUID stays
# attached to its measurements instead of being melted into a measurement row.
# `stateProvince` uses the Darwin Core spelling of the term.
# NOTE(review): 'bed' is not created anywhere in the visible notebook — confirm
# the column exists, otherwise pd.melt raises KeyError.
id_columns = ['materialSampleID',
              'country',
              'stateProvince',
              'yearCollected',
              'scientificName',
              'samplingProtocol',
              'measurementMethod',
              'institutionCode',
              'collectionCode',
              'catalogNumber',
              'formation',
              'member',
              'bed']

longVers = pd.melt(data,
                   id_vars = id_columns,
                   var_name = 'measurementType',
                   value_name = 'measurementValue')

Create diagnosticID which is a unique number for each measurement

In [None]:
# Number the measurement rows sequentially (0 .. n-1) to serve as diagnosticID.
longVers = longVers.assign(diagnosticID = np.arange(len(longVers)))

If the measurement value is missing (N/A), drop the entire row

In [None]:
# Discard every row whose measurementValue is missing (NaN).
longVers = longVers.dropna(axis = 'index',
                           how = 'any',
                           subset = ['measurementValue'])

Writing long data csv file

In [None]:
# Write the long-format table to disk without the DataFrame index column.
# NOTE(review): the file name says "David" although the notebook title names
# Famoso and Davis — looks like a typo; confirm nothing else reads this path
# before renaming.
longVers.to_csv(path_or_buf = '../Mapped_Data/Famoso_David_2014.csv',
                index = False)