Data Wrangling Notebook for J. Arroyo-Cabrales data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

NOTE: Waiting for the codebook to be updated before proceeding.

In [180]:
import pandas as pd
import numpy as np
import uuid

Silence unnecessary warnings

In [181]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import Alberdi Appendix 

In [182]:
# Load the combined Alberdi appendix table (path is relative to this notebook).
alberdi_csv_path = "../Original_Data/Alberdi_combined.csv"
data = pd.read_csv(alberdi_csv_path)

Add individualID and populate with UUID

In [183]:
# Give every row of the source table its own random UUID (32-char hex string)
# in a new individualID column.
row_uuids = [uuid.uuid4().hex for _ in range(len(data))]
data = data.assign(individualID = row_uuids)

Set samplingProtocol and measurementMethod 

In [184]:
# Bibliographic reference of the source publication.
citation = "Alberdi, M.T., Arroyo-Cabrales, J., Marín-Leyva, A.H., Polaco, O.J., 2014, Study of Cedral Horses and their place in the Mexican Quaternary: Revista Mexicana de Ciencias Geológicas, v. 31, núm. 2, p. 221-237"

# The same citation string populates both samplingProtocol and measurementMethod.
data = (
    data
    .assign(samplingProtocol = citation)
    .assign(measurementMethod = citation)
)

Adding additional required GEOME columns

In [185]:
# Constant values for the additional required GEOME columns; each value is
# broadcast to every row.
geome_constants = {
    "country": "Mexico",
    "yearCollected": "Unknown",
    "locality": "Unknown",
    "basisOfRecord": "FossilSpecimen",
}
data = data.assign(**geome_constants)

Renaming columns

In [186]:
# Map source column headers to template column names. With rename's default
# errors='ignore', keys that are not present in the frame are silently skipped.
# NOTE(review): 'locality' and 'measurementMethod' are already created by
# earlier cells; if the source CSV really contains 'sitename' or 'reference'
# columns, this rename would produce duplicate column names -- confirm.
# NOTE(review): 'ScienticName' presumably mirrors a misspelled header in the
# source CSV; verify against the file before changing it.
column_renames = {
    'specimenType': 'skeletalElement',
    'side': 'measurementSide',
    'sitename': 'locality',
    'test': 'measurementType',
    'reference': 'measurementMethod',
    'ScienticName': 'scientificName',
}
data = data.rename(columns = column_renames)

Create a long version of the data frame

In [187]:
# Identifier columns that are kept as-is and repeated on every output row.
id_vars = [
    'Bone',
    'catalogNumber',
    'scientificName',
    'individualID',
    'samplingProtocol',
    'measurementMethod',
    'country',
    'yearCollected',
    'locality',
    'basisOfRecord',
]

# Wide -> long: every column not listed in id_vars is unpivoted, its header
# going to measurementType and its cell value to measurementValue.
long_data = pd.melt(
    frame = data,
    id_vars = id_vars,
    var_name = 'measurementType',
    value_name = 'measurementValue',
)

Combine bone data with measurementType column

In [188]:
# Build "<Bone> <measurementType>" labels, then discard the Bone column since
# its content now lives inside measurementType.
bone_prefixed_type = long_data["Bone"] + " " + long_data["measurementType"]
long_data["measurementType"] = bone_prefixed_type
del long_data["Bone"]

Renaming measurementType values

In [189]:
# Replace measurementType labels with the terms available in GEOME.
# NOTE: Make sure the mapping file is up to date before reprocessing
# (git pull from FuTRES Repo).

# Load the ontology codebook.
mapping_file = pd.read_csv("./../Mapping Files/ontology_codeBook.csv")

# Keep only codebook rows whose Status is "in FOVT" or "in OBA".
status_is_fovt = mapping_file["Status"] == "in FOVT"
status_is_oba = mapping_file["Status"] == "in OBA"
map_subset = mapping_file[status_is_fovt | status_is_oba]

# Of those, keep the rows whose name is "Joaquin", and only the two columns
# needed for the lookup.
# NOTE(review): an earlier revision split ';'-separated labels at this point;
# that step is currently disabled -- confirm labels are one-per-row.
is_joaquin_row = map_subset["name"] == "Joaquin"
joaquin_subset = map_subset.loc[is_joaquin_row, ["label", "term"]]

# Lookup table: codebook label -> term (a later duplicate label overrides an
# earlier one).
map_dict = dict(zip(joaquin_subset["label"], joaquin_subset["term"]))

# Translate every measurementType; labels with no entry in map_dict become NaN.
long_data["measurementType"] = long_data["measurementType"].map(map_dict)

Create materialSampleID which is a UUID for each measurement. Populate eventID with materialSampleID

In [190]:
# One random UUID (hex string) per measurement row; eventID carries the same
# value as materialSampleID.
sample_uuids = [uuid.uuid4().hex for _ in range(len(long_data))]
long_data = long_data.assign(materialSampleID = sample_uuids, eventID = sample_uuids)

Create diagnosticID

In [191]:
# Create diagnosticID which is a UUID for each measurement
long_data['diagnosticID'] = [uuid.uuid4() for _ in range(len(long_data.index))]

Adding measurementUnit column

In [192]:
# Every measurement row gets the unit "mm".
long_data = long_data.assign(measurementUnit = "mm")

Dropping blank rows

In [193]:
# Discard rows that lack either a measurementType or a measurementValue
# (dropna's default how='any' removes a row if any listed column is missing).
required_columns = ['measurementType', 'measurementValue']
long_data = long_data.dropna(subset=required_columns)

Write file to csv

In [194]:
# Write the final long-format dataframe to a CSV file.
# index=False keeps the pandas row index out of the output; by default to_csv
# would write it as an extra, unnamed first column that is not part of the
# data.
long_data.to_csv('../Mapped_Data/FuTRES_Equidae_Arroyo-Cabrales_Mexico_Paleo.csv', index=False)