Data Wrangling Notebook for Bernor Equid dataset
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [1]:
import pandas as pd
import numpy as np
import uuid

Silencing warnings that are unnecessary

In [2]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Read original data 

In [3]:
# Measurement sheet for the equid dataset.
ray_data = pd.read_csv(
    "../Original_Data/ToFuTRESVER_14_1_26_2021_REV_23.csv"
)

# Locality sheet; used further down to look up country and locality names.
locality_data = pd.read_csv(
    "../Original_Data/LOCAL_1_26_2021FuTRESPROTECTED2.csv"
)

Standardize 'SEX' column 

In [4]:
# Rows whose SEX value is already one of the two accepted terms.
female = ray_data['SEX'] == "female"
male = ray_data['SEX'] == "male"

# Every other value (including missing ones) becomes "not collected".
# A single .loc call writes into ray_data itself; the chained form
# ray_data['SEX'][mask] = ... may assign to a temporary copy instead.
ray_data.loc[~(female | male), 'SEX'] = "not collected"

Standardize "SIDE" column

In [5]:
# Clean up Side column
right = ray_data['SIDE'] == "right"
left = ray_data['SIDE'] == "left"
norecord = ray_data['SIDE'] == "norecord"

# Blank out the "norecord" placeholder.
# The previous line was missing an `&` (so it tried to call a Series and
# raised TypeError) and tested the SEX masks instead of the SIDE masks.
# With the three SIDE masks combined as written, only "norecord" rows are
# selected; "right", "left" and any other value are left untouched.
ray_data.loc[~right & ~left & norecord, 'SIDE'] = ""

Combine genus and species column to create verbatimScientificName column (not currently accepted in GEOME)

In [6]:
# verbatimScientificName = "<GENUS> <SPECIES>".
# It is not selected for the final dataframe for now; it only serves as the
# input from which scientificName is standardized.
ray_data = ray_data.assign(
    verbatimScientificName=lambda frame: frame['GENUS'] + " " + frame['SPECIES']
)

Create standardized species column and dynamic properties column

In [7]:
# Add the scientificName and dynamicProperties columns as empty-string
# placeholders; both are populated from verbatimScientificName afterwards.
ray_data = ray_data.assign(scientificName="", dynamicProperties="")

def type_name(name):
    """Return the dynamicProperties text for a verbatim scientific name.

    The name is converted to text and split on whitespace. When it has
    exactly three tokens and the third one is "T", the specimen is flagged
    as a type specimen.

    Parameters
    ----------
    name : object
        Verbatim scientific name; any value is accepted because it is passed
        through ``str()`` first (a missing value becomes the text "nan").

    Returns
    -------
    str
        "Type Specimen" for a three-token name ending in "T", otherwise "".
    """
    tokens = str(name).split()
    if len(tokens) == 3 and tokens[2] == "T":
        return "Type Specimen"
    # Always return a string: falling off the end used to yield None for
    # names that did not have exactly three tokens.
    return ""

verbatim_names = ray_data["verbatimScientificName"]

# scientificName is currently a straight copy of the verbatim name; the
# clean_name step below is still disabled.
#ray_data["scientificName"]  = ray_data["verbatimScientificName"].apply(clean_name)
ray_data["scientificName"] = verbatim_names

# dynamicProperties carries the type-specimen flag derived from the name.
ray_data["dynamicProperties"] = verbatim_names.map(type_name)


Create verbatimEventDate column and populate with data from DATE COLLECTED

In [8]:
# Keep the collection date exactly as recorded in DATE COLLECTED.
ray_data["verbatimEventDate"] = ray_data["DATE COLLECTED"]

Create yearCollected column and populate it with isolated year from DATE COLLECTED

In [9]:
# Non-date entries: text containing "Lower", "Upper", "Uppermost" or "Loc".
non_date = ray_data["verbatimEventDate"].str.contains("Lower|Upper|Uppermost|Loc")

# Keep the text only where the filter is explicitly False, take its last
# four characters as the year, and label everything else "unknown".
ray_data["yearCollected"] = (
    ray_data["verbatimEventDate"]
    .where(non_date.eq(False))
    .str[-4:]
    .fillna("unknown")
)

Create individualID column and populate it with SPEC_ID

In [10]:
# individualID is a copy of the SPEC_ID column.
ray_data = ray_data.assign(individualID=lambda frame: frame['SPEC_ID'])

Create verbatimAgeValue column and populate it with AGE

In [11]:
# Keep the age exactly as recorded in AGE.
ray_data["verbatimAgeValue"] = ray_data["AGE"]

Create minimumChronometricAge and maximumChronometricAge to handle age ranges

In [None]:
# Empty placeholders for the two bounds of the age range.
ray_data = ray_data.assign(
    minimumChronometricAge="",
    maximumChronometricAge="",
)

def define_min(age):
    """Return the smaller bound of an age value such as "5.3-11.6".

    The value is converted to text and split on "-". Each piece that parses
    as a number is a candidate, and the numerically smallest one is returned
    as text (whitespace-stripped, otherwise as written).

    Parameters
    ----------
    age : object
        A single age ("7"), a range ("5.3-11.6"), or a missing value.

    Returns
    -------
    str
        The smallest numeric piece, or "" when no piece is numeric (for
        example a missing value or free text).
    """
    candidates = []
    for token in str(age).split("-"):
        token = token.strip()
        try:
            value = float(token)
        except ValueError:
            continue
        # str(NaN) is "nan", which float() accepts; NaN != NaN filters it out.
        if value == value:
            candidates.append((value, token))
    if not candidates:
        return ""
    # Compare as numbers, not text: as strings "9" sorts after "10".
    return min(candidates, key=lambda pair: pair[0])[1]

def define_max(age):
    """Return the larger bound of an age value such as "5.3-11.6".

    The value is converted to text and split on "-". Each piece that parses
    as a number is a candidate, and the numerically largest one is returned
    as text (whitespace-stripped, otherwise as written).

    Parameters
    ----------
    age : object
        A single age ("7"), a range ("5.3-11.6"), or a missing value.

    Returns
    -------
    str
        The largest numeric piece, or "" when no piece is numeric (for
        example a missing value or free text).
    """
    candidates = []
    for token in str(age).split("-"):
        token = token.strip()
        try:
            value = float(token)
        except ValueError:
            continue
        # str(NaN) is "nan", which float() accepts; NaN != NaN filters it out.
        if value == value:
            candidates.append((value, token))
    if not candidates:
        return ""
    # Compare as numbers, not text: as strings "9" sorts after "10".
    return max(candidates, key=lambda pair: pair[0])[1]
    

# Derive both bounds of the age range from the verbatim age text.
verbatim_ages = ray_data["verbatimAgeValue"]
ray_data['minimumChronometricAge'] = verbatim_ages.map(define_min)
ray_data['maximumChronometricAge'] = verbatim_ages.map(define_max)

Create minimumChronometricAgeReferenceSystem and maximumChronometricAgeReferenceSystem and populate with mya

In [30]:
# Both chronometric age bounds use the same reference system, "mya".
ray_data = ray_data.assign(
    minimumChronometricAgeReferenceSystem="mya",
    maximumChronometricAgeReferenceSystem="mya",
)

Add Country and Locality by matching to Locality sheet

In [12]:
# Placeholder columns; country and verbatimLocality are filled from the
# locality sheet below.
ray_data = ray_data.assign(country = "")
ray_data = ray_data.assign(verbatimLocality = "")
ray_data = ray_data.assign(ray_temp_locality = "")

# COUNTRY is cast to text so it can be concatenated into the join key.
ray_data["COUNTRY"] = ray_data["COUNTRY"].apply(str)

# Join key on both sides: "<country number> <locality number>".
ray_data['ray_temp_locality'] = ray_data['COUNTRY']+" "+ ray_data['LOCALITY']
locality_data['local_temp_locality'] = locality_data['COUNTRY No'] + " " + locality_data["LOCALITY No."]

# One locality row per key. When a key occurs more than once the last row
# wins, which is what the row-by-row loop did for verbatimLocality. Rows
# with a missing key can never match and are dropped.
locality_lookup = (
    locality_data
    .dropna(subset=['local_temp_locality'])
    .drop_duplicates(subset='local_temp_locality', keep='last')
    .set_index('local_temp_locality')
)
has_match = ray_data['ray_temp_locality'].isin(locality_lookup.index)

# The previous nested loop reset country to "Unknown" on every locality row
# that did not match, so a match survived only if it was the last row of the
# sheet. Here each specimen is looked up once: matched rows take the name
# from the sheet, unmatched rows get "Unknown" (country) or "" (locality).
ray_data['country'] = (
    ray_data['ray_temp_locality']
    .map(locality_lookup['COUNTRYName'])
    .where(has_match, "Unknown")
)
ray_data['verbatimLocality'] = (
    ray_data['ray_temp_locality']
    .map(locality_lookup['LOCALITYName'])
    .where(has_match, "")
)

ray_data['country']=ray_data['country'].replace({'Tibetan Plateau, Nepal':'Nepal'})

# locality starts as a copy of verbatimLocality. Note that fillna runs on
# the whole frame, so missing values in every column become "Unknown".
ray_data=ray_data.assign(locality= ray_data["verbatimLocality"]).fillna("Unknown")


Select specified columns for final dataset

In [14]:
# Standardized descriptor columns kept in the final dataset, in output order.
descriptor_cols = ['individualID',
                   'SEX',
                   'SIDE',
                   'scientificName',
                   'dynamicProperties',
                   'verbatimEventDate',
                   'yearCollected',
                   'verbatimAgeValue',
                   'verbatimLocality',
                   'locality',
                   'country']

# Measurement columns M1..M38. The 17th name is ' M17' with a leading space
# and is reproduced exactly (presumably the header as it appears in the
# source CSV; confirm before normalizing it).
measurement_cols = [' M17' if n == 17 else 'M{}'.format(n) for n in range(1, 39)]

# Specify desired columns (the earlier `cols = ray_data.columns.tolist()`
# was overwritten immediately and has been removed).
cols = descriptor_cols + ['BONE'] + measurement_cols

# Subset dataframe
ray_data = ray_data[cols]

Matching template and column terms

In [15]:
# The template uses lower-case names for these two columns.
lowercase_names = {name: name.lower() for name in ('SEX', 'SIDE')}
ray_data = ray_data.rename(columns=lowercase_names)

Create measurementUnit column (switched to mm, conversion will occur later)

In [16]:
# Every measurement is labelled with the same unit, "mm".
ray_data["measurementUnit"] = "mm"

Create basisOfRecord column and populate it with "FossilSpecimen"

In [17]:
# Every row gets the same basisOfRecord value.
ray_data["basisOfRecord"] = "FossilSpecimen"

Fill in blanks for required columns 

In [18]:
# Required columns with no source data are filled with "Unknown".
ray_data = ray_data.assign(
    samplingProtocol="Unknown",
    measurementMethod="Unknown",
)

Create necessary materialSampleID column and populate with UUID (use hex to remove dashes). Create necessary eventID column and populate with materialSampleID column

In [19]:
# One random UUID per row; .hex gives the 32-character form without dashes.
sample_ids = [uuid.uuid4().hex for _ in ray_data.index]

# eventID reuses the materialSampleID value of the same row.
ray_data = ray_data.assign(materialSampleID=sample_ids, eventID=sample_ids)

Create a long version of the data frame

In [21]:
# Columns repeated on every output row; every other column is unpivoted into
# a (measurementType, measurementValue) pair.
identifier_cols = [
    'individualID', 'sex', 'side',
    'scientificName', 'dynamicProperties',
    'verbatimEventDate', 'yearCollected', 'verbatimAgeValue',
    'verbatimLocality', 'locality', 'country',
    'measurementUnit', 'basisOfRecord',
    'samplingProtocol', 'measurementMethod',
    'materialSampleID', 'eventID',
    'BONE',
]

longVers = ray_data.melt(
    id_vars=identifier_cols,
    var_name='measurementType',
    value_name='measurementValue',
)

Create new measurementType column by combining BONE and measurementType column

In [22]:
# Prefix the measurement code with the bone name, e.g. "femur" + "M1".
longVers = longVers.assign(
    measurementType=lambda frame: frame['BONE'] + frame['measurementType']
)

Remove unnecessary BONE column

In [23]:
# BONE is now part of measurementType, so the separate column is dropped.
longVers = longVers.drop(columns=['BONE'])

Keep only the GEOME measurements (selected femur, humerus, calcaneum and astragalus measurements) and discard all others

In [24]:
# Bone/measurement combinations that are kept.
geome_measurements = [
    "femurM1", "femurM2",
    "humerusM1", "humerusM2",
    "calcaneumM1", "calcaneumM6",
    "astragalusM3", "astragalusM7",
]

# Exact membership test. The previous prefix regex also matched e.g.
# "femurM10", which a second regex then had to exclude, and the two masks
# were applied through chained boolean indexing on frames of different
# length. For the M1..M38 measurement columns the selected rows are the same.
# .copy() makes the result an independent frame for the column assignments
# that follow.
longVers = longVers[longVers["measurementType"].isin(geome_measurements)].copy()

Renaming measurementType values

In [25]:
# Bone + measurement code -> descriptive measurement name.
# Entries that are commented out are not mapped at present.
measurement_names = {
    'femurM1': 'femur length to greater trochanter',
    'femurM2': 'femur length to head of femur',
    'humerusM1': 'humerus length to ventral tubercle',
    'humerusM2': 'humerus length to caput of humerus',
    'astragalusM3': 'breadth of the trochlea of talus',
    #'astragalusM4':'talus breadth',
    #'astraglausM2':'length of medial trochlea of talus',
    'astragalusM7': 'depth of the medial side of talus',
    #'astragalusM1':'astragalus length',
    'calcaneumM1': 'calcaneus length',
    'calcaneumM6': 'calcaneus breadth',
    #'astragalusM1':'length of the medial side of talus'
    #'astragalusM5':'distal articular breadth of talus',
    #'astragalusM6':'distal articular depth of talus'
}

longVers['measurementType'] = longVers['measurementType'].replace(measurement_names)


Create diagnosticID which is a unique number for each measurement

In [26]:
# Number the measurement rows consecutively, starting at 0.
longVers = longVers.assign(diagnosticID=np.arange(len(longVers)))

Fix up measurementValue entries

In [27]:
raw_values = longVers["measurementValue"]

# Numeric view of the column, used only to detect zeros. Comparing against
# the string "0" alone misses zeros stored as numbers (0, 0.0) or written
# differently ("0.0").
numeric_values = pd.to_numeric(raw_values, errors="coerce")

# Zero, blank and "Unknown" entries are not usable measurements.
unusable = numeric_values.eq(0) | raw_values.isin(["", "Unknown"])

# Unusable entries become NaN; every other entry keeps its original value.
longVers = longVers.assign(measurementValue=raw_values.mask(unusable))

# A single .loc call writes into longVers itself; the chained form
# longVers["locality"][mask] = ... may assign to a temporary copy instead.
longVers.loc[longVers["locality"] == "", "locality"] = "unknown"

If measurementValue is missing (NaN), delete the entire row

In [28]:
# Keep only the rows that have a measurementValue.
longVers = longVers[longVers['measurementValue'].notna()]

Writing long data csv file

In [29]:
# Write the long-format table; the dataframe index is left out of the file.
longVers.to_csv(
    '../Mapped_Data/FuTRES_Equid_Bernor_Global_Cenozoic.csv',
    index=False,
)