Data Wrangling Notebook for Aepyceros Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [77]:
import json
import re
import uuid

import numpy as np
import pandas as pd

Silencing warnings that are unnecessary

In [78]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import Aepyceros Data 

In [79]:
# Read the raw Aepyceros measurement table (relative path)
aepyceros = pd.read_csv(filepath_or_buffer="../Original_Data/Aepyceros.csv")

Set dynamic properties 

In [86]:
# Pack each specimen's "Location Code" and "Notes" into one JSON string per row.
# NOTE(review): 'dynamicProperties' is absent from the `cols` list used to subset the
# frame later in this notebook, so it does not reach the exported file - confirm intent.
aepyceros = aepyceros.assign(
    dynamicProperties=aepyceros[["Location Code", "Notes"]].apply(
        lambda row: row.to_json(), axis=1
    )
)

0      {"Location Code":{"0":"B1","1":"S3","2":"A1","...
1      {"Location Code":{"0":"B1","1":"S3","2":"A1","...
2      {"Location Code":{"0":"B1","1":"S3","2":"A1","...
3      {"Location Code":{"0":"B1","1":"S3","2":"A1","...
4      {"Location Code":{"0":"B1","1":"S3","2":"A1","...
                             ...                        
101    {"Location Code":{"0":"B1","1":"S3","2":"A1","...
102    {"Location Code":{"0":"B1","1":"S3","2":"A1","...
103    {"Location Code":{"0":"B1","1":"S3","2":"A1","...
104    {"Location Code":{"0":"B1","1":"S3","2":"A1","...
105    {"Location Code":{"0":"B1","1":"S3","2":"A1","...
Name: dynamicProperties, Length: 106, dtype: object

Create verbatimAgeValue and modify lifeStage column

In [53]:
# Source column holding the free-text age description
age_column = "Age (juv, prime adult, older adult, old)"

# Keep the untouched source text as verbatimAgeValue
aepyceros = aepyceros.assign(verbatimAgeValue=aepyceros[age_column])

# na=False turns missing ages into False, so both masks are strictly boolean
adult_filter = aepyceros[age_column].str.contains("Prime|Old|Young|Very|No", na=False)
juv_filter = aepyceros[age_column].str.contains("juvenile|Juvenile", na=False)

# Start from the verbatim text (missing -> "Not Collected"), then overwrite the
# rows matched by each mask. Juvenile is applied last, so it wins when both match.
# .loc writes to the frame itself; the chained form df[col][mask] = ... assigns to
# a temporary object under pandas copy-on-write and leaves the column unchanged.
aepyceros["lifeStage"] = aepyceros[age_column].fillna("Not Collected")
aepyceros.loc[adult_filter, "lifeStage"] = "adult"
aepyceros.loc[juv_filter, "lifeStage"] = "juvenile"

Modify sex column

In [54]:
# Normalise the sex labels to lower case
aepyceros = aepyceros.assign(SEX=aepyceros["SEX"].str.lower())

GEOME requires certain columns. They are created here and set to "Unknown" where the value is not known.

In [55]:
# Add the columns GEOME requires, each filled with one constant value.
# NOTE(review): basisOfRecord is "FossilSpecimen" while the export file name at the
# end of this notebook contains "Modern" - confirm which is intended.
aepyceros = aepyceros.assign(
    basisOfRecord="FossilSpecimen",
    locality="Unknown",
    samplingProtocol="Unknown",
    yearCollected="Unknown",
    measurementMethod="Unknown",
)

Select specified columns for final dataset

In [56]:
# Columns kept for the final dataset: specimen metadata first, then every
# measurement column. The names must match the frame's headers exactly
# (including irregular spacing and capitalisation), otherwise the subset
# below raises a KeyError.
cols = ['Museum','Specimen #','Species','SEX','Country/Continent','State/Province',
        'lifeStage','verbatimAgeValue','locality','basisOfRecord','samplingProtocol',
        'yearCollected','measurementMethod','Humerus Length','Weight','Humerus Width Shaft  AP',
        'Humerus Width Shaft ML','Humerus Width Distal 1 (capitulum)',
        'Humerus Width Distal 2 (epicondyles)','Humerus Width Proximal (humeral head)',
        'Femur Width Distal 1 (region above condyles and below patellar surface)',
        'Femur Length','Femur Width Shaft AP','Femur Width ML','Femur Width Distal 2 (condyles)',
        'Medapodial Length','Medapodial Width AP','Medapodial Width ML','Astragalus Length',
        'Astragalus Width', 'Occlusal length M3', 'Occlusal width M3',
        'Occlusal width M3_ remeasured 11_2016', 'Occlusal length M2',
        'Occlusal length M2_remeasured 11_2016', 'Occlusal width M2',
        'Occlusal width M2_remeasured 11_2016', 'Occlusal length M1',
        'Occlusal width M1', 'Occlusal length P4', 'Occlusal Width P4',
        'Occlusal length P2', 'Occlusal Width P2', 'Occlusal length M3_remeasured 11_2016',
        'Occlusal width M3_remeasured 11_2016', 'Occlusal length M2_remeasured 11_2016.1',
        'Occlusal width M2_remeasured 11_2016.1', 'Occlusal length M1_remeasured 11_2016',
        'Occlusal width M1_remeasured 11_2016', 'Occlusal width P4',
       ]

# Keep only the listed columns, in the listed order
aepyceros = aepyceros[cols]

Matching column names to template 

In [57]:
# Rename the source headers to the corresponding template terms
aepyceros = aepyceros.rename(
    columns={
        'Museum': 'institutionCode',
        'Specimen #': 'individualID',
        'Species': 'scientificName',
        'SEX': 'sex',
        'Country/Continent': 'country',
        'State/Province': 'stateProvince',
    }
)

Creating necessary measurementUnit column

In [58]:
# Add measurementUnit as an empty-string placeholder column
aepyceros["measurementUnit"] = ""

Creating verbatimScientificName and modifying scientificName

In [59]:
# Create verbatim scientificName [not accepted by GEOME yet]
#aepyceros=aepyceros.assign(verbatimScientificName = aepyceros["scientificName"]) 

def clean_name(name):
    """Reduce a scientific name to its binomial (genus + species) form.

    Parameters
    ----------
    name : object
        Verbatim scientific name; converted with ``str`` before processing.

    Returns
    -------
    str
        The first two whitespace-separated words joined by a single space.
        A one-word name is returned unchanged, and a missing value (``None``
        or NaN) gives an empty string.
    """
    if pd.isna(name):
        return ""
    # Keep genus and species only. Slicing [:2] leaves shorter names intact,
    # whereas dropping the last word would cut a plain binomial down to the genus.
    return " ".join(str(name).split()[:2])

# Run every scientificName entry through clean_name
aepyceros["scientificName"] = aepyceros["scientificName"].map(clean_name)

Create a long version of the data frame

In [60]:
# Reshape to long form. The identifying columns listed in id_vars are repeated on
# every row; each remaining column becomes one measurementType / measurementValue pair.
# ('verbatimScientificName' is not an id column because it is not created above.)
long_data = aepyceros.melt(
    id_vars=[
        'institutionCode', 'individualID', 'scientificName', 'sex', 'country',
        'stateProvince', 'lifeStage', 'verbatimAgeValue', 'basisOfRecord',
        'locality', 'samplingProtocol', 'yearCollected', 'measurementMethod',
        'measurementUnit',
    ],
    var_name='measurementType',
    value_name='measurementValue',
)

Matching trait names to ontology terms

In [61]:
# Load the code book that links source column labels to ontology terms
mapping_file = pd.read_csv("./../Mapping Files/ontology_codeBook.csv")

# Keep only entries whose Status is "in FOVT" or "in OBA"
map_subset = mapping_file[mapping_file["Status"].isin(["in FOVT", "in OBA"])]

# Keep only the entries that belong to the aepyceros data set
aep_subset = map_subset[map_subset["name"] == "aepyceros"]

# Isolating necessary columns
aep_subset = aep_subset[["bone", "label", "term"]]

# Lookup from source column label to ontology term
map_dict = dict(zip(aep_subset["label"], aep_subset["term"]))

# Replace each column label with its term; labels missing from the lookup become NaN
long_data["measurementType"] = long_data["measurementType"].map(map_dict)

# Drop rows that have no mapped term or no recorded value
long_data = long_data.dropna(subset=['measurementType', 'measurementValue'])

Assigning materialSampleID for each bone

In [62]:
# Lookup from ontology term to the bone value recorded for it in the code book
map_dict = dict(zip(aep_subset["term"], aep_subset["bone"]))

# Number the distinct bone values and store that number as materialSampleID.
# NOTE(review): the grouping key is the bone value alone, so rows from different
# specimens that share a bone value get the same ID - confirm this is intended.
long_data["materialSampleID"] = long_data.groupby(
    long_data["measurementType"].map(map_dict)
).ngroup()

Populating measurementUnit column with appropriate measurement units in long version

In [63]:
# Label each row with its target unit: "g" for body mass, "mm" for everything else.
# .loc writes to the frame itself; the chained form
# long_data['measurementUnit'][mask] = ... assigns to a temporary object under
# pandas copy-on-write and leaves the column unchanged.
long_body_mass_filter = long_data['measurementType'] == "body mass"
long_no_body_filter = ~long_body_mass_filter
long_data.loc[long_body_mass_filter, 'measurementUnit'] = "g"
long_data.loc[long_no_body_filter, 'measurementUnit'] = "mm"

Remove measurementValue rows that do not have an assigned value

In [76]:
# Discard rows whose measurementValue is missing
long_data = long_data.dropna(subset=["measurementValue"])

# Then remove the first remaining row.
# NOTE(review): the reason for dropping this row is not stated here, and every
# re-run of this cell removes one more row - confirm it is still wanted.
long_data = long_data.drop(index=long_data.index[0])

Converting and cleaning measurementValue column

In [71]:
# verbatimMeasurementValue is not created here [currently not accepted by GEOME]

# Drop values that contain a "-" or a "~".
# The test runs on the text form of every value. On the raw column,
# .str.contains yields NaN for non-string cells, and `NaN == False` is False,
# so plain numeric values would be filtered out together with the flagged ones.
has_marker = long_data["measurementValue"].astype(str).str.contains("[-~]")
long_data = long_data[~has_marker]

array(['78 lb', '112 lb', '2400 oz/ 150lbs', '70lbs', '48.2', '10.29',
       '7.71', '8.96', '7.93', '10.01', '8.73', '10.5', '9.3', '8.55',
       '9.45', '8.72', '10.03', '9.87', '6.45', '9.82', '8.93', '10.15',
       '8.42', '9.01', '10.82', '9.13', '9.08', '9.23', '9.06', '8.23',
       '8.06', '8.69', '5.6', '9.07', '10.4', '8.07', '9.57', '8.51',
       '10.04', '11.23', '7.19', '10.06', '10.19', '12.77', '10.21',
       '16.28', '8.45', '11.33', '8.91', '10.08', '8.82', '10.59', '10.2',
       '6.64', '6.59', '10.62', '9.84', '9.16', '8.35', '9.51', '8.94',
       '8.28', '8.84', '7.78', '11.04', '10.95', '10.23', '13.2', '11.5',
       '8.9', '9.38', '12.67', '9.74', '12.95', '12.91', '13.39', '10.14',
       '14.24', '11.69', '8.87', '10.64', '8.75', '9.63', '11.03',
       '10.09', '10.3', '10.25', '11.63', '9.7', '11.11', '11.16',
       '10.46', '11.35', '9.96', '10.84', '10.74', '10.38', '10.53',
       '11.58', '8.92', '8.89', '9.33', '10.63', '9.77', '11.46', '11.93',


In [74]:
# Creating verbatimMeasurementUnit [currently not accepted by GEOME]
#longVers=longVers.assign(verbatimMeasurementValue = longVers["measurementValue"])

_GRAMS_PER_POUND = 453.59237
_GRAMS_PER_OUNCE = 28.349523125
_MM_PER_INCH = 25.4

# One mass reading: a non-negative number optionally followed by lb / lbs / oz
_MASS_READING = re.compile(r"\s*(\d+(?:\.\d+)?)\s*(lbs?|oz)?\.?\s*", re.IGNORECASE)


def _mass_to_grams(value):
    """Convert one body-mass entry (e.g. '78 lb', '70lbs', '2400 oz/ 150lbs') to grams."""
    readings = []
    # "/" separates alternative readings of the same mass
    for part in str(value).split("/"):
        match = _MASS_READING.fullmatch(part)
        if match is None:
            raise ValueError(f"Cannot parse body mass value {value!r}")
        readings.append((float(match.group(1)), (match.group(2) or "").lower()))

    # When several readings are recorded, the pound reading is used
    pounds = [amount for amount, label in readings if label in ("lb", "lbs")]
    if pounds:
        return pounds[0] * _GRAMS_PER_POUND

    amount, label = readings[0]
    if label == "oz":
        return amount * _GRAMS_PER_OUNCE
    # A bare number carries no unit and is read as pounds
    return amount * _GRAMS_PER_POUND


def unit_clean(value, unit):
    """Clean a raw measurementValue and convert it to the target unit.

    Parameters
    ----------
    value : object
        Raw measurement. For ``unit == "g"`` it is parsed as text such as
        ``"78 lb"``, ``"70lbs"``, ``"48.2"`` or ``"2400 oz/ 150lbs"``; for
        ``unit == "mm"`` it must be accepted by ``float``.
    unit : str
        Target unit already assigned to the row: ``"g"`` or ``"mm"``.

    Returns
    -------
    float or None
        ``"g"``: the mass in grams (pounds x 453.59237, or ounces x 28.349523125
        when only an ounce reading is given). ``"mm"``: ``float(value) * 25.4``.
        Any other unit returns ``None``.

    Raises
    ------
    ValueError
        If the value cannot be parsed as a number for the requested unit.
    """
    if unit == "g":
        return _mass_to_grams(value)
    if unit == "mm":
        # NOTE(review): the 25.4 factor treats the stored length as inches -
        # confirm the source measurements are not already in millimetres.
        return float(value) * _MM_PER_INCH
    return None
    
# Replace every measurementValue with its cleaned, converted number, row by row.
# The column is overwritten in place, so this cell is meant to run once per load.
long_data['measurementValue'] = long_data.apply(
    lambda row: unit_clean(row["measurementValue"], row["measurementUnit"]),
    axis=1,
)

Assign diagnosticID with unique number

In [42]:
# Give every measurement row its own diagnosticID: a freshly generated random UUID in hex form
long_data['diagnosticID'] = [uuid.uuid4().hex for _ in long_data.index]

Creating eventID with a unique UUID

In [None]:
# Give every row its own eventID: a freshly generated random UUID in hex form
long_data['eventID'] = [uuid.uuid4().hex for _ in long_data.index]

Write file as csv for GEOME upload

In [43]:
# Save the long-form table as CSV, without the index column, for the GEOME upload
long_data.to_csv(
    path_or_buf='../Mapped_Data/FuTRES_Aepyceros_Africa_Modern_May_22_Update.csv',
    index=False,
)