Data Wrangling Notebook for HMachado Equus Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [1]:
import pandas as pd
import uuid

Silencing warnings that are unnecessary

In [2]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import Horse Data

In [3]:
# Load the raw horse measurements (comma-delimited) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../Original_Data/Horse data_Helena.csv")

Tidy decimalLatitude and decimalLongitude 

In [4]:
# Convert hemisphere-suffixed coordinates into negative decimal strings:
# the trailing character is dropped and "-" is prefixed, for decimalLatitude
# (trailing "S") and decimalLongitude (trailing "W") alike.
# NOTE(review): every populated value is assumed to end in "S" / "W"; a value
# ending in any other character (e.g. "N" or "E") would be negated as well --
# confirm against the source file.
for coord_col in ('decimalLatitude', 'decimalLongitude'):
    # str() renders a missing value (NaN) as "nan", which turns into "-na"
    # after the last character is dropped and "-" is prefixed.
    negated = ["-" + str(value)[:-1] for value in data[coord_col]]

    # Blank out the "-na" residue left behind by missing values, and assign
    # the whole column in one step. Element-wise chained assignment
    # (data[col][i] = ...) may write to a temporary copy and be silently lost.
    data[coord_col] = ["" if text == "-na" else text for text in negated]


Combining the specimenType and measurementType columns

In [5]:
# Concatenate specimenType and measurementType (no separator) into a
# temporary 'test' column.
data['test'] = data['specimenType'].str.cat(others=data['measurementType'])

Standardizing basisOfRecord and measurementUnit columns

In [7]:
# (column, value found in the source data, standardized replacement)
standard_terms = [
    ('basisOfRecord', 'fossil', 'FossilSpecimen'),  # fossil -> FossilSpecimen
    ('measurementUnit', 'millimeters', 'mm'),       # millimeters -> mm
]

# Overwrite only the rows whose value matches the source spelling exactly.
for column, old_value, new_value in standard_terms:
    data.loc[data[column] == old_value, column] = new_value

Rearrange columns so that template columns are first, followed by measurement values

In [31]:
# Columns to keep, in output order: template columns first, followed by the
# measurement values and stratigraphy/reference fields.
cols = ['institutionCode',
        'collectionCode',
        'specimenID',
        'side',
        'test',
        'scientificName',
        'decimalLatitude',
        'decimalLongitude',
        'sitename',
        'verbatimLocality',
        'basisOfRecord',
        'measurementValue',
        'measurementUnit',
        'lithostratigraphicTerms',
        'formation',
        'member',
        'references']

# Subset the dataframe; every other column is dropped. Taking an explicit
# copy makes later column assignments operate on an independent frame
# rather than on a slice of the original.
data = data[cols].copy()

Matching template and column terms

In [32]:
# Source column name -> template term.
# NOTE(review): 'specimenType' and 'reference' are not among the columns kept
# by the preceding subset (which keeps 'references'), so rename() leaves
# those two entries without effect -- confirm whether 'references' was meant.
template_terms = {
    'specimenID': 'catalogNumber',
    'specimenType': 'skeletalElement',
    'side': 'measurementSide',
    'sitename': 'locality',
    'test': 'measurementType',
    'reference': 'measurementMethod',
}

# Apply the mapping; names that are not present are ignored.
data = data.rename(columns=template_terms)

Replace names of terms available in GEOME and subset data by available terms

In [33]:
# Replace names of terms available in GEOME
# NOTE(review): the argument below is a set containing only an empty string,
# not an old-name -> new-name mapping; this looks like an unfilled template
# placeholder -- TODO supply the actual term mapping.
data["measurementType"] = data["measurementType"].replace({''})

# Available terms list
# NOTE(review): the only entry is an empty string, so the filter below keeps
# only rows whose measurementType equals '' -- TODO fill in the GEOME terms.
avaliable_traits = ['']

# Subset by available terms: keep rows whose measurementType is in the list
data = data[data['measurementType'].isin(avaliable_traits)]

Create materialSampleID which is a UUID for each measurement. Populate eventID with materialSampleID

In [34]:
# Give every measurement row its own random UUID (hex form) as
# materialSampleID. The row count is taken from `data` itself, the frame
# being modified, so the list always matches its length.
data = data.assign(
    materialSampleID=[uuid.uuid4().hex for _ in range(len(data.index))]
)

# eventID mirrors materialSampleID for each row.
data = data.assign(eventID=data["materialSampleID"])

Create GEOME required columns

In [35]:
# Fill missing scientific names with "Unknown" and add the remaining
# GEOME-required columns, each holding the constant "unknown".
data = data.assign(
    scientificName=data["scientificName"].fillna("Unknown"),
    measurementMethod="unknown",
    country="unknown",
    yearCollected="unknown",
    samplingProtocol="unknown",
)

Create diagnosticID

In [36]:
# diagnosticID: one freshly generated UUID object per measurement row.
row_count = data.shape[0]
data = data.assign(diagnosticID=[uuid.uuid4() for _ in range(row_count)])

Write file to csv

In [37]:
# Persist the final mapped dataframe as a CSV file.
data.to_csv(path_or_buf='../Mapped_Data/FuTRES_Equus_HMachado_Americas_paleo.csv')