Data Wrangling Notebook for Bernor Equid dataset
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [1]:
import pandas as pd
import numpy as np
import uuid

Silencing warnings that are unnecessary

In [2]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Read original data 

In [3]:
ray_data = pd.read_csv("../Original_Data/ToFuTRESVER_14_1_26_2021_REV_23.csv")
locality_data = pd.read_csv("../Original_Data/LOCAL_1_26_2021FuTRESPROTECTED2.csv")

Standardize 'SEX' column 

In [4]:
female = ray_data['SEX']=="female"
male = ray_data['SEX'] == "male"
ray_data['SEX'][(female == False)&(male==False)]="not collected"

Standardize "SIDE" column

In [5]:
# Clean up Side column 
right = ray_data['SIDE']=="right"
left = ray_data['SIDE'] == "left"
ray_data['SIDE'][(female == False)&(male==False)]="not collected"

Combine genus and species column to create verbatimScientificName column (not currently accepted in GEOME)

In [6]:
# Leave out of final dataframe for now. Use to standardize scientificName
ray_data = ray_data.assign(verbatimScientificName = ray_data['GENUS'] + " " + ray_data['SPECIES'])

Create standardized species column and dynamic properties column

In [11]:
# Create Scientific Name column 
ray_data = ray_data.assign(scientificName = "")

# Create Dynamic Properties Column
ray_data = ray_data.assign(dynamicProperties = "")


def clean_name(name):
    """Converts scientific name to binomial nomenclature format"""
    str_name = str(name).split()
    
    if len(str_name) == 3:        
        if str_name[2] == "L" or str_name[2] == "T" : 
            return " ".join(str_name[:-1])            
            

def type_name(name):
    """Transfer type specimen information to dynamicProperties"""
    name = str(name).split()
    if len(name) == 3:
        if name[2] == "T":
            return "Type Specimen"

# Begin cleaning scientificName
ray_data["scientificName"]  = ray_data["verbatimScientificName"].apply(clean_name) 

# Update dynamicProperties
ray_data["dynamicProperties"]  = ray_data["verbatimScientificName"].apply(type_name)


Create verbatimEventDate column and populate with data from DATE COLLECTED

In [12]:
ray_data = ray_data.assign(verbatimEventDate = ray_data['DATE COLLECTED'])

Create yearCollected column and populate it with isolated year from DATE COLLECTED

In [13]:
ray_data = ray_data.assign(yearCollected = "")

# Filtering out non-date entries
date_filter=ray_data["verbatimEventDate"].str.contains("Lower|Upper|Uppermost|Loc")
ray_data["yearCollected"]=ray_data["verbatimEventDate"][date_filter==False]

# Isolating year
ray_data['yearCollected'] = ray_data.yearCollected.str[-4:]
ray_data['yearCollected'] = ray_data['yearCollected'].fillna("unknown")

Create individualID column and populate it with SPEC_ID

In [14]:
ray_data = ray_data.assign(individualID = ray_data['SPEC_ID'])

Create verbatimAgeValue columns and populate with AGE

In [15]:
ray_data = ray_data.assign(verbatimAgeValue = ray_data['AGE'])

Add Country and Locality by matching to Locality sheet

In [None]:
ray_data = ray_data.assign(country = "")
ray_data = ray_data.assign(verbatimLocality = "")
ray_data = ray_data.assign(ray_temp_locality = "")

ray_data["COUNTRY"] = ray_data["COUNTRY"].apply(str)

ray_data['ray_temp_locality'] = ray_data['COUNTRY']+" "+ ray_data['LOCALITY']
locality_data['local_temp_locality'] = locality_data['COUNTRY No'] + " " + locality_data["LOCALITY No."]

for i in ray_data.index:
    for j in locality_data.index:
        if ray_data['ray_temp_locality'][i] == locality_data['local_temp_locality'][j]:
            ray_data["country"][i] = locality_data["COUNTRYName"][j]
            ray_data["verbatimLocality"][i] = locality_data["LOCALITYName"][j]
        else:
            ray_data["country"][i] = "Unknown"

            
ray_data['country']=ray_data['country'].replace({'Tibetan Plateau, Nepal':'Nepal'}) 

ray_data=ray_data.assign(locality= ray_data["verbatimLocality"]).fillna("Unknown")


Select specified columns for final dataset

In [95]:
# Create column list
cols = ray_data.columns.tolist()

# Specify desired columns
cols = ['individualID',
        'SEX',
        'SIDE',
        'scientificName',
        'dynamicProperties',
        'verbatimEventDate',
        'yearCollected',
        'verbatimAgeValue',
        'minimumChronometricAge',
        'maximumChronometricAge',
        'minimumChronometricAgeReferenceSystem',
        'maximumChronometricAgeReferenceSystem',
        'verbatimLocality',
        'locality',
        'country',
        'BONE',
        'M1',
        'M2',
        'M3',
        'M4',
        'M5',
        'M6',
        'M7',
        'M8',
        'M9',
        'M10',
        'M11',
        'M12',
        'M13',
        'M14',
        'M15',
        'M16',
        ' M17',
        'M18',
        'M19',
        'M20',
        'M21',
        'M22',
        'M23',
        'M24',
        'M25',
        'M26',
        'M27',
        'M28',
        'M29',
        'M30',
        'M31',
        'M32',
        'M33',
        'M34',
        'M35',
        'M36',
        'M37',
        'M38']

# Subset dataframe
ray_data = ray_data[cols]

Matching template and column terms

In [96]:
# Renaming columns 
ray_data = ray_data.rename(columns = {'SEX':'sex',
                                      'SIDE':'side'})

Create measurementUnit column (switched to mm, conversion will occur later)

In [97]:
ray_data = ray_data.assign(measurementUnit = "mm")

Create basisofRecord column and populate

In [98]:
ray_data  =ray_data.assign(basisOfRecord = "FossilSpecimen")

Fill in blanks for required columns 

In [99]:
ray_data=ray_data.assign(samplingProtocol="Unknown")
ray_data=ray_data.assign(measurementMethod="Unknown")

Create necessary materialSampleID column and populate with UUID (use hex to remove dashes). Create necessary eventID column and populate with materialSampleID column

In [100]:
ray_data = ray_data.assign(materialSampleID = '')
ray_data['materialSampleID'] = [uuid.uuid4().hex for _ in range(len(ray_data.index))]

ray_data = ray_data.assign(eventID = ray_data["materialSampleID"])

Create a long version of the data frame

In [101]:
longVers=pd.melt(ray_data, 
                id_vars=['individualID',
                         'sex',
                         'side',
                         'scientificName',
                         'dynamicProperties',
                         'verbatimEventDate',
                         'yearCollected',
                         'verbatimAgeValue',
                         'minimumChronometricAge',
                         'maximumChronometricAge',
                         'minimumChronometricAgeReferenceSystem',
                         'maximumChronometricAgeReferenceSystem',
                         'verbatimLocality',
                         'locality',
                         'country',
                         'measurementUnit',
                         'basisOfRecord',
                         'samplingProtocol',
                         'measurementMethod',
                         'materialSampleID',
                         'eventID',
                         'BONE'], 
                var_name = 'measurementType', 
                value_name = 'measurementValue')

Unnamed: 0,individualID,sex,side,scientificName,verbatimEventDate,yearCollected,verbatimAgeValue,minimumChronometricAge,maximumChronometricAge,minimumChronometricAgeReferenceSystem,...,country,measurementUnit,basisOfRecord,samplingProtocol,measurementMethod,materialSampleID,eventID,BONE,measurementType,measurementValue
0,SMNS1508,not collected,not collected,Equus quagga,Unknown,unknown,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,e8268688_7317_4c38_a561_414e03d5fd8d,e8268688_7317_4c38_a561_414e03d5fd8d,a1ph3,M1,77.2
1,SMNS7335,male,unknown,Equus quagga,Unknown,unknown,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,8cee7142_036d_4b52_80cc_5be52c76697a,8cee7142_036d_4b52_80cc_5be52c76697a,a2ph3,M1,39.4
2,SMNS1508,not collected,not collected,Equus quagga,Unknown,unknown,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,59ed168b_2be5_44ed_b607_7f34b2b362d0,59ed168b_2be5_44ed_b607_7f34b2b362d0,astragalus,M1,55.7
3,SMNS1508,not collected,not collected,Equus quagga,Unknown,unknown,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,6608088a_645d_4319_bbef_6b176ffcb720,6608088a_645d_4319_bbef_6b176ffcb720,calcaneum,M1,101.9
4,SMNS7335,male,unknown,Equus quagga,Unknown,unknown,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,953f4264_be2e_4d77_8cc0_5c25781d4dda,953f4264_be2e_4d77_8cc0_5c25781d4dda,femur,M1,357.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807267,USNM541427,female,left,Equus hemionus,10-Jul-1981,1981,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,fed0c549_4bca_46a1_9123_3378e8cf4206,fed0c549_4bca_46a1_9123_3378e8cf4206,txP4,M38,Unknown
807268,MSNAF2842,not collected,not collected,"""Hipparion"" sp.",Unknown,unknown,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,7ea0896f_7dfb_42f8_88eb_4b9fd31d9062,7ea0896f_7dfb_42f8_88eb_4b9fd31d9062,txP3,M38,Unknown
807269,MSNAF2842,not collected,not collected,"""Hipparion"" sp.",Unknown,unknown,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,71443f16_90d1_49b0_af1f_c5eec651a72b,71443f16_90d1_49b0_af1f_c5eec651a72b,txP4,M38,Unknown
807270,USNM241009,male,left,Equus grevyi,9-Oct-1925,1925,0.0,,,mya,...,Unknown,in,FossilSpecimen,Unknown,Unknown,dba7c312_5b8a_4762_bac3_27f7b64cd4e3,dba7c312_5b8a_4762_bac3_27f7b64cd4e3,tml1,M38,Unknown


Create new measurementType column by combining BONE and measurementType column

In [102]:
longVers['measurementType'] = longVers['BONE'] + longVers['measurementType']

0              a1ph3M1
1              a2ph3M1
2         astragalusM1
3          calcaneumM1
4              femurM1
              ...     
807267         txP4M38
807268         txP3M38
807269         txP4M38
807270         tml1M38
807271      UnknownM38
Name: measurementType, Length: 807272, dtype: object

Remove unnecessary BONE columnn

In [103]:
del longVers['BONE']

Filter out the GEOME measurements

In [104]:
correct_element_filter = longVers["measurementType"].str.match("femurM1|femurM2|humerusM1|humerusM2|astragalusM3|astraglausM2|astragalusM7|astragalusM1|astragalusM4|calcaneumM1|calcaneumM6")
incorrect_filter = longVers["measurementType"].str.contains("M10|M11|M12|M13|M14|M15|M16|M17|M18|M19|M20|M21|M22|M23|M24|M25|M26|M27|M28|M29|M30|M31|M32|M33|M34|M35|M36|M37|M38")
longVers=longVers[correct_element_filter==True][incorrect_filter==False]

  longVers=longVers[correct_element_filter==True][incorrect_filter==False]


Renaming measurementType values

In [105]:
longVers['measurementType']=longVers['measurementType'].replace({'femurM1': 'femur length to greater trochanter', 
                                                                 'femurM2': 'femur length to head of femur', 
                                                                 'humerusM1': 'humerus length to ventral tubercle', 
                                                                 'humerusM2': 'humerus length to caput of humerus',
                                                                 'astragalusM3':'breadth of the trochlea of talus',
                                                                 'astragalusM4':'talus breadth',
                                                                 'astraglausM2':'length of medial trochlea of talus',
                                                                 'astragalusM7':'depth of the medial side of talus',
                                                                 'astragalusM1':'astragalus length',
                                                                 'calcaneumM1' : 'calcaneus length',
                                                                 'calcaneumM6' : 'calcaneus breadth'
                                                                 #'astragalusM1':'length of the medial side of talus'
                                                                 #'astragalusM5':'distal articular breadth of talus',
                                                                 #'astragalusM6':'distal articular depth of talus'
                                                                 })


Create diagnosticID which is a unique number for each measurement

In [106]:
longVers=longVers.assign(diagnosticID = '')
longVers['diagnosticID'] = np.arange(len(longVers))

Fix up measurementValue entrie

In [107]:
zero_values=longVers["measurementValue"]=="0"
blanks = longVers["measurementValue"] ==""
unknown = longVers["measurementValue"] == "Unknown"
locality_blanks = longVers["locality"]==""

longVers["measurementValue"]=longVers["measurementValue"][zero_values==False][blanks==False][unknown == False]
longVers["locality"][locality_blanks]="unknown"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  longVers["locality"][locality_blanks]="unknown"


Convert measurementValue from in to mm

In [None]:
def convert(value)
    """Converts value from in to mm"""
    return value * 25.4

longVers["measurementValue"]  = longVers["measurementValue"].apply(convert)

If measurement value equals N/a, delete entire row

In [108]:
longVers = longVers.dropna(subset=['measurementValue'])

Writing long data csv file

In [109]:
longVers.to_csv('../Mapped_Data/FuTRES_Equid_Bernor_Global_Cenozoic.csv', index = False);