Data Wrangling Notebook for JArroyo-Cabrales data 
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [115]:
import pandas as pd
import numpy as np
import uuid

Silence unnecessary errors

In [116]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import Alberdi Appendix 

In [117]:
data = pd.read_csv("../Original_Data/Alberdi_combined.csv")

Add individualID and populate with UUID

In [118]:
data = data.assign(individualID = '')
data['individualID'] = [uuid.uuid4().hex for _ in range(len(data.index))]

Set samplingProtocol and measurementMethod 

In [119]:
citation = "Alberdi, M.T., Arroyo-Cabrales, J., Marín-Leyva, A.H., Polaco, O.J., 2014, Study of Cedral Horses and their place in the Mexican Quaternary: Revista Mexicana de Ciencias Geológicas, v. 31, núm. 2, p. 221-237"

data = data.assign(samplingProtocol = citation, measurementMethod = citation)

Adding additional required GEOME columns

In [120]:
data = data.assign(country = "Mexico", yearCollected = "Unknown", locality = "Unknown", basisOfRecord = "FossilSpecimen")

Renaming columns

In [121]:
data = data.rename(columns = {'specimenType':'skeletalElement',
                              'side':'measurementSide',
                              'sitename':'locality',
                              'test': 'measurementType',
                              'reference':'measurementMethod', 
                              'ScienticName': 'scientificName'})

Create a long version of the data frame

In [122]:
id_vars = ['Bone', 'catalogNumber', 'scientificName', 'individualID',
           'samplingProtocol', 'measurementMethod', 'country', 'yearCollected', 
           'locality', 'basisOfRecord']

long_data = pd.melt(data, id_vars = id_vars, var_name = 'measurementType', value_name = 'measurementValue')

Combine bone data with measurementType column

In [123]:
long_data["measurementType"] = long_data["Bone"] + " " + long_data["measurementType"]
del long_data["Bone"]

In [130]:
long_data["measurementType"][long_data["measurementType"].str.contains('M1-2')].unique()

array(['M1-2 upper surface length', 'M1-2 upper surface breadth',
       'M1-2 upper base length', 'M1-2 upper base breadth',
       'M1-2 upper height', 'M1-2 upper protocone length',
       'M1-2 upper postflexid length', 'M1-2 upper double-knot length',
       'M1-2 upper m1', 'M1-2 upper m2', 'M1-2 upper m3', 'M1-2 upper m4',
       'M1-2 upper m5', 'M1-2 upper m6', 'M1-2 upper m7', 'M1-2 upper m8',
       'M1-2 upper m9', 'M1-2 upper m10', 'M1-2 upper m11',
       'M1-2 upper m12', 'M1-2 upper m13', 'M1-2 upper m14',
       'M1-2 upper m16', 'M1-2 upper body size McIII13',
       'M1-2 upper maximal breadth', 'M1-2 upper maximal height',
       'M1-2 upper maximal depth', 'M1-2 upper proximal length',
       'M1-2 upper proximal breadth', 'M1-2 upper distal length',
       'M1-2 upper distal breadth', 'M1-2 upper external height',
       'M1-2 upper Unnamed: 35'], dtype=object)

Renaming measurementType values

In [99]:
# Replace names of terms avaliable in GEOME
# NOTE: Make sure mapping file is up to date before reprocessing (git pull from FuTRES Repo)

# Read mapping file 
mapping_file = pd.read_csv("./../Mapping Files/ontology_codeBook.csv")

# Create subset of those within FOVT or OBA
map_subset = mapping_file[(mapping_file["Status"] == "in FOVT") | (mapping_file["Status"] == "in OBA") ]

# Create a subset of Joaquin data
joaquin_subset = map_subset[map_subset["name"] == "Joaquin"]

# Isolating necessary columns
joaquin_subset = joaquin_subset[["label", "term"]]

# Create dictionary of terms
map_dict = pd.Series(joaquin_subset.term.values,index = joaquin_subset.label).to_dict()



# Map the new terms onto the old terms in the dataframe 
long_data["measurementType"] = (pd.Series(long_data["measurementType"])).map(map_dict)
#long_data["measurementValue"] = long_data.measurementValue.dropna()

Create materialSampleID which is a UUID for each measurement. Populate eventID with materialSampleID

In [113]:
long_data['materialSampleID'] = [uuid.uuid4().hex for _ in range(len(long_data.index))]
long_data = long_data.assign(eventID = long_data["materialSampleID"])

Create diagnosticID

In [74]:
# Create diagnosticID which is a UUID for each measurement
long_data['diagnosticID'] = [uuid.uuid4() for _ in range(len(long_data.index))]

Write file to csv

In [75]:
#Writing the final dataframe as csv file
long_data.to_csv('../Mapped_Data/FuTRES_Equidae_Arroyo-Cabrales_Mexico_Paleo.csv')

In [77]:
long_data["measurementType"].unique()

array([nan], dtype=object)