Data Wrangling Notebook for Blois Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [126]:
import pandas as pd
import numpy as np
import uuid
import re

Silencing warnings that are unnecessary

In [127]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import data

In [128]:
# Read the original source CSV into the working DataFrame.
raw_csv_path = "../Original_Data/J.Biogeo.2008.AllData.Final.csv"
biogeo = pd.read_csv(raw_csv_path)

Add individualID and populate with UUID

In [129]:
# One freshly generated UUID4 (as a hex string) per row of the source table.
row_count = len(biogeo.index)
biogeo['individualID'] = [uuid.uuid4().hex for _ in range(row_count)]

Convert elevation.ft values from feet to meters

In [130]:
# Scale feet to meters (1 ft = 0.3048 m). The column keeps its 'elevation.ft'
# name here; it is renamed to 'pointElevationInMeters' in a later cell.
# NOTE: re-running this cell on the same frame scales the values again.
feet_to_meters = 0.3048
biogeo['elevation.ft'] = biogeo['elevation.ft'] * feet_to_meters

Create a new column called institutionCode by moving the first whitespace-separated token of Specimen.Number into it; the second token is kept as Specimen.Number.

In [131]:
# Split 'Specimen.Number' on whitespace once and reuse the pieces:
# token 0 becomes the institution code, token 1 stays as the specimen number.
specimen_tokens = biogeo['Specimen.Number'].str.split()
biogeo["institutionCode"] = specimen_tokens.str[0]
biogeo["Specimen.Number"] = specimen_tokens.str[1]

Add required GEOME columns

In [132]:
# Add the constant-valued columns in a single assign call; every row receives
# the same literal value for each of them.
biogeo = biogeo.assign(
    basisOfRecord="PreservedSpecimen",
    scientificName="Spermophilus beecheyi",
    # verbatimscientificName=biogeo["ScientificName"],  # currently disabled
    country="Unknown",
    locality="Not Collected",
    yearCollected="Unknown",
    samplingProtocol="Not Collected",
    measurementMethod="Unknown",
)

Create measurementUnit column

In [133]:
# Placeholder column; the real units are filled in after the table is melted.
empty_unit = ""
biogeo = biogeo.assign(measurementUnit=empty_unit)

Add otherCatalogNumbers by combining Proxy.Specimen.Number, Annual.Specimen.Number, and YOC.Specimen.Number 

In [134]:
# Build otherCatalogNumbers by concatenating the three alternate specimen
# numbers, treating missing entries as empty strings.
# NOTE(review): the values are joined with no separator, so a row with more
# than one alternate number yields a run-together string - confirm intended.
alt_number_cols = ['Proxy.Specimen.Number',
                   'Annual.Specimen.Number',
                   'YOC.Specimen.Number']
proxy_num, annual_num, yoc_num = (biogeo[name].fillna('') for name in alt_number_cols)
biogeo = biogeo.assign(otherCatalogNumbers=proxy_num + annual_num + yoc_num)

Select specified columns for final dataset

In [135]:
# Columns kept for the final dataset, in output order: identifiers first,
# then location fields, then the trait measurements, then the constant
# GEOME columns. (The previous `cols = biogeo.columns.tolist()` line was
# dead code - its value was overwritten immediately - and has been removed.)
cols = ['Specimen.Number',
        'institutionCode',
        'otherCatalogNumbers',
        'individualID',
        'dec.lat',
        'dec.long',
        'max.error',
        'elevation.ft',
        'hind.foot.length.mm',
        'tail.length.mm',
        'total.length.mm',
        'body.mass.g',
        'ear.length.mm',
        'c.toothrow.1.mm',
        'c.toothrow.2.mm',
        'basisOfRecord',
        'scientificName',
        #verbatimscientificName
        'country',
        'locality',
        'yearCollected',
        'samplingProtocol',
        'measurementMethod',
        'measurementUnit']

# Selecting with a list raises KeyError if any listed column is missing.
biogeo = biogeo[cols]

Matching template and column terms

In [136]:
# Source column name -> template term. Columns not listed keep their names.
template_names = {
    'Specimen.Number': 'catalogNumber',
    'dec.lat': 'decimalLatitude',
    'dec.long': 'decimalLongitude',
    'max.error': 'coordinateUncertaintyInMeters',
    'elevation.ft': 'pointElevationInMeters',
}
biogeo = biogeo.rename(columns=template_names)

Create long version of final data

In [137]:
# Columns that identify a record and are repeated on every melted row.
# Every column NOT listed here is unpivoted into a
# (measurementType, measurementValue) pair.
id_columns = [
    'catalogNumber',
    'institutionCode',
    'otherCatalogNumbers',
    'individualID',
    'decimalLatitude',
    'decimalLongitude',
    'coordinateUncertaintyInMeters',
    'pointElevationInMeters',
    'basisOfRecord',
    'scientificName',
    # 'verbatimScientificName',
    'country',
    'locality',
    'yearCollected',
    'samplingProtocol',
    'measurementMethod',
    'measurementUnit',
]

long_data = biogeo.melt(
    id_vars=id_columns,
    var_name='measurementType',
    value_name='measurementValue',
)

Matching trait names to ontology terms

In [138]:
# Load the ontology codebook used to translate column labels into terms.
mapping_file = pd.read_csv("./../Mapping Files/ontology_codeBook.csv")

# Keep only codebook rows whose Status is "in FOVT" or "in OBA"
map_subset = mapping_file[mapping_file["Status"].isin(["in FOVT", "in OBA"])]

# Restrict to the rows belonging to the Blois dataset
blois_subset = map_subset[map_subset["name"] == "blois"]

# Isolating necessary columns
blois_subset = blois_subset[["bone", "label", "term"]]

# Dictionary of original column label -> ontology term
# (the original line had a duplicated `map_dict = map_dict = ...` assignment)
map_dict = dict(zip(blois_subset.label, blois_subset.term))

# Replace each measurementType label with its term; labels missing from the
# dictionary become NaN and are removed below together with rows that have no
# measurement value.
long_data["measurementType"] = long_data["measurementType"].map(map_dict)
long_data = long_data.dropna(subset=['measurementType', 'measurementValue'])

Adding materialSampleID

In [139]:
# Record-level columns: two rows are candidates for the same materialSampleID
# only when all of these agree.
sample_cols = [
    'catalogNumber', 'institutionCode', 'otherCatalogNumbers', 'individualID',
    'decimalLatitude', 'decimalLongitude',
    'coordinateUncertaintyInMeters', 'pointElevationInMeters',
    'basisOfRecord', 'scientificName', 'country', 'locality',
    'yearCollected', 'samplingProtocol',
]

# Ontology term -> bone name, taken from the codebook subset
map_dict = dict(zip(blois_subset.term, blois_subset.bone))

# Temporary helper columns: the bone for each measurement, and a JSON string
# that serialises all of the record-level columns for the row.
long_data["temp_bone"] = long_data["measurementType"].map(map_dict)
record_json = long_data[sample_cols].apply(lambda row: row.to_json(), axis=1)
long_data['temp_json'] = record_json

# Rows sharing both the bone and the serialised record get the same group number.
# NOTE(review): groupby skips NaN keys by default, so rows whose bone is
# missing would not get a regular group number - confirm the codebook has a
# bone for every mapped term.
sample_groups = long_data.groupby(["temp_bone", "temp_json"])
long_data["materialSampleID"] = sample_groups.ngroup()

# The helper columns are no longer needed
long_data = long_data.drop(columns=["temp_bone", "temp_json"])

# eventID is a copy of materialSampleID
long_data["eventID"] = long_data["materialSampleID"]

Populating measurementUnit column with appropriate measurement units in long version

In [140]:
# Boolean masks: rows measuring body mass vs. every other measurement type.
long_body_mass_filter = long_data['measurementType'] == "body mass"
long_no_body_filter = ~long_body_mass_filter

# Assign through .loc so the write lands on long_data itself. The previous
# chained form (long_data['measurementUnit'][mask] = ...) writes to an
# intermediate object, triggers SettingWithCopyWarning, and has no effect
# when pandas Copy-on-Write is enabled.
long_data.loc[long_body_mass_filter, 'measurementUnit'] = "g"
long_data.loc[long_no_body_filter, 'measurementUnit'] = "mm"

Create diagnosticID, a unique identifier (UUID hex string) for each measurement

In [141]:
# One freshly generated UUID4 hex string per measurement row.
measurement_count = len(long_data.index)
long_data['diagnosticID'] = [uuid.uuid4().hex for _ in range(measurement_count)]

Delete rows whose measurementValue is N/A

In [142]:
# Keep only the rows that actually carry a measurement value.
# (Rows with a missing measurementValue were already dropped in the
# ontology-mapping cell, so this is a safety net.)
has_value = long_data['measurementValue'].notna()
long_data = long_data[has_value]

Round coordinateUncertaintyInMeters column to integer value

In [143]:
# Round to whole meters using the vectorised Series.round, then store as the
# nullable integer dtype. Unlike applying Python's round() element by element,
# this does not raise when a value is missing: missing entries stay <NA> and
# are written as empty cells, while present values are written as integers.
long_data["coordinateUncertaintyInMeters"] = (
    long_data["coordinateUncertaintyInMeters"].round().astype("Int64")
)

Create verbatimMeasurementUnit column (currently not accepted by GEOME)

In [144]:
# Disabled on purpose: GEOME does not currently accept verbatimMeasurementUnit.
# NOTE(review): the line below copies measurementValue into a *Unit* column -
# confirm the intended source column before re-enabling.
#long_data = long_data.assign(verbatimMeasurementUnit = long_data["measurementValue"])

Write file as csv for GEOME upload

In [145]:
# Save the long-format table for GEOME upload, without the DataFrame index.
output_csv_path = '../Mapped_Data/FuTRES_Spermophilus.beecheyi_Blois_NorthAmerica_Modern.csv'
long_data.to_csv(output_csv_path, index=False)