Data Wrangling Notebook for VertNet Mammal Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [87]:
import pandas as pd
import numpy as np
import multiprocessing
import re
import uuid 

Silencing warnings that are unnecessary

In [88]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import VertNet Mammal Data

In [89]:
# Load the raw VertNet mammal export (path is relative to this notebook)
vertnet_csv = "./../Original_Data/all_mammals_2021-11-09a/all_mammals_2021-11-09a.csv"
data = pd.read_csv(vertnet_csv)

# Number of rows loaded, shown as the cell output
data.shape[0]

645609

Add individualID and populate with UUID

In [90]:
# One freshly generated random UUID (hex string) per row of the table
data = data.assign(individualID=[uuid.uuid4().hex for _ in range(data.shape[0])])

Clean yearCollected column

In [91]:
# Filling N/As with "Unknown"
data["eventdate"] = data["eventdate"].fillna("Unknown")

# Create yearCollected Column
data = data.assign(yearCollected = '')

# Creating event date variable
verbatim_date = data['eventdate']

# Establishing vertnet filter
# Every alternative is its own raw string and the list is joined with "|".
# Without re.VERBOSE the regex engine treats whitespace literally, so writing
# the pattern as one multi-line string would glue the line break and the
# indentation onto the first alternative of each continuation line
# ("and", "#", "\d{4}'[S]", "1260") and those would never match.
# "MCZ"/"MSU" are written without parentheses: as a regex "(MCZ)" is a capture
# group that matches the same text, and pandas warns about match groups.
# NOTE(review): "/n /d" is kept literally; it may have been meant as "\n \d" -- confirm.
vertnet_date_patterns = [
    r"IV", r"0000", r"September", r"<", r"NW", r"latter", r"unknown",
    r"MCZ", r"MSU", r"present",
    r"and", r";", r"&", r"mainly", r"between", r"Between", r"BETWEEN",
    r"OR", r"Unknown", r"UNKNOWN",
    r"#", r"TO", r"\?", r"'", r"----", r"19--", r"No Date", r",",
    r"\d{4}-\d{4}", r"/n /d", r"\d{4}[s]",
    r"\d{4}'[S]", r"1075-07-29", r"975-07-17", r"2088", r"9999", r"0201", r"1197",
    r"1260", r"4560", r"1024", r"1119", r"1192", r"1072", r"1186", r"2364",
]
vertnet_date_filter = verbatim_date.str.contains("|".join(vertnet_date_patterns))

# Grabbing clean data (rows whose date matched none of the patterns above)
verbatim_date_clean = verbatim_date[~vertnet_date_filter]


# Captures year within string
def year_search(year):
    """Return the 4-digit run found at the end or the start of `year`.

    A trailing run of four digits is checked first and handed to
    year_cleaner_front; otherwise a leading run of four digits is handed to
    year_cleaner_back. Returns None when neither end of the string is four
    digits.
    """
    ends_with_digits = re.search(r'\d{4}$', year)
    if ends_with_digits:
        return year_cleaner_front(year)

    starts_with_digits = re.search(r'^\d{4}', year)
    if starts_with_digits:
        return year_cleaner_back(year)

    return None

def year_cleaner_front(year):
    """Return the last four characters of `year` (the year at the END of the string)."""
    # Slice start is computed from the length, exactly like the explicit
    # [len-4:len] form, so strings shorter than four characters behave the same.
    start = len(year) - 4
    return year[start:]

def year_cleaner_back(year):
    """Return the first four characters of `year` (the year at the START of the string)."""
    return year[:4]

# Years are extracted only for the dates that passed the filter; the
# assignment aligns on the index, so every other row is left as NaN ...
data["yearCollected"] = verbatim_date_clean.map(year_search)
# ... and NaN (filtered-out rows or no year found) becomes "Unknown"
data["yearCollected"] = data["yearCollected"].fillna(value="Unknown")

Clean up lifeStage

In [92]:
# Fill in NA
data["lifestage_cor"] = data['lifestage_cor'].fillna("Not Collected")

# Create Filters
adult = data['lifestage_cor'] == "Adult"
juvenile = data['lifestage_cor'] == "Juvenile"
ns = data['lifestage_cor'] == "NS"

# Assign correct terms using filters.
# .loc writes into `data` itself; the chained form data[col][mask] = value
# assigns to an intermediate Series and is not guaranteed to reach the frame
# (it is never applied when pandas Copy-on-Write is enabled).
data.loc[adult, 'lifestage_cor'] = "adult"
data.loc[juvenile, 'lifestage_cor'] = "juvenile"
data.loc[ns, 'lifestage_cor'] = "Not Collected"

Clean up sex column

In [93]:
# Clean up sex column: keep only the exact values "female" and "male",
# blank out everything else (including missing values, which compare False)
female = data['sex'] == "female"
male = data['sex'] == "male"
# .loc writes into `data` itself instead of a possibly temporary Series
# (chained data[col][mask] = value is not applied under pandas Copy-on-Write)
data.loc[~(female | male), 'sex'] = ""

Fill Scientific Names with unknown 

In [94]:
# Missing scientific names are replaced by the placeholder "Unknown"
data["binomial"] = data["binomial"].mask(data["binomial"].isna(), "Unknown")

Adding additional required GEOME columns

In [95]:
# Constant-valued columns, added in this order at the end of the table
data = data.assign(samplingProtocol="Unknown", basisOfRecord="PreservedSpecimen")

Create verbatimEventDate column

In [96]:
# Copy the source column 'verbatimeventdate' into the template-cased 'verbatimEventDate'
data = data.assign(verbatimEventDate=data['verbatimeventdate'])

Clean up country column [obsolete in new data version]

In [97]:
# Append countries to verbatim locality column as "<locality>,<country>"
# (the result is NaN wherever either part is missing)
data = data.assign(verbatimLocality=data["locality"] + "," + data["country"])

# Read GEOME country list
geome_country_file = "./../Mapping Files/geome_country_list.csv"
geome_countries = pd.read_csv(geome_country_file)

# Maps country spellings found in the source data to the name returned for
# them by country_correction. Keys are matched exactly (case, punctuation and
# misspellings included), so they must stay as written.
# NOTE(review): bare "Congo" is mapped to "Democratic Republic of the Congo";
# confirm this is intended rather than the Republic of the Congo.
country_dictionary = {
    "United States": "USA",
    "U S A": "USA",
    "Philippine Islands": "Philippines",
    "Indonesia; Borneo": "Indonesia",
    "Malaysia; Malaya": "Malaysia",
    "U.S. Virgin Islands": "Virgin Islands",
    "Republic of South Africa": "South Africa",
    "Ivory Coast": "Cote d'Ivoire",
    "Federated States of Micronesia": "Micronesia",
    "Lesser Antilles; Grenada": "Grenada",
    "Indonesia; Java": "Indonesia",
    "Lesser Antilles; Saint Vincent": "Saint Vincent and the Grenadines",
    "Lesser Antilles; Barbados": "Barbados",
    "ST VINCENT": "Saint Vincent and the Grenadines",
    "Lesser Antilles; Montserrat": "Montserrat",
    "Indonesia; Sumatra": "Indonesia",
    "Virgin Islands, US": "Virgin Islands",
    "Lesser Antilles; Antigua": "Antigua and Barbuda",
    "England": "United Kingdom",
    "Republic of Trinidad and Tobago": "Trinidad and Tobago",
    "Trinidad And Tobago; Trinidad": "Trinidad and Tobago",
    "COMMONWEALTH OF THE NORTHERN MARIANA ISLANDS": "Northern Mariana Islands",
    "Congo": "Democratic Republic of the Congo",
    "Malaysia; Sabah": "Malaysia",
    "Lesser Antilles; Martinique": "Martinique",
    "Republic of the Marshall Islands": "Marshall Islands",
    "Commonwealth of the Bahamas": "Bahamas",
    "Trinidad & Tabago": "Trinidad and Tobago",
    "United Kingdom; England": "United Kingdom",
    "United Kingdom; Scotland": "United Kingdom",
    "United Kingdom; Wales": "United Kingdom",
    "Lesser Antilles; Dominica": "Dominica",
    "Papua, New Guinea": "Papua New Guinea",
    "People's Republic of China": "China",
    "SCOTLAND": "United Kingdom",
}

def country_correction(country):
    """Map a raw country value onto the GEOME country list.

    Returns `country` unchanged when it occurs anywhere in the
    `geome_countries` table, otherwise its replacement from
    `country_dictionary`, otherwise the string "Unknown".
    """
    # Membership is tested against every cell of the GEOME table
    if country in geome_countries.values:
        return country
    return country_dictionary.get(country, "Unknown")

# Normalise every country value, one element at a time
data['country'] = data['country'].map(country_correction)

Create verbatimElevation Column

In [98]:
# String forms of the two elevation bounds
string_max, string_min = (
    data[bound].astype(str)
    for bound in ("maximumelevationinmeters", "minimumelevationinmeters")
)
# Joined as "<max>,<min>"
data['verbatimElevation'] = string_max.str.cat(string_min, sep=",")

Rearrange columns so that template columns are first, followed by measurement values

In [99]:
# Template (non-measurement) columns, in the order they should appear
template_cols = [
    'catalognumber', 'collectioncode', 'decimallatitude', 'individualID',
    'decimallongitude', 'verbatimElevation', 'maximumelevationinmeters',
    'minimumelevationinmeters', 'institutioncode', 'verbatimEventDate',
    'occurrenceremarks', 'occurrenceid', 'verbatimlongitude', 'verbatimlatitude',
    'verbatimLocality', 'samplingProtocol', 'sex', 'country', 'lifestage_cor',
    'binomial', 'basisOfRecord', 'yearCollected',
]

# Measurement columns follow: value/units pairs per trait, then the
# units_inferred flags, then the estimated_value flags
trait_prefixes = ('body_mass', 'ear_length', 'hind_foot_length',
                  'tail_length', 'total_length')
value_unit_cols = [prefix + suffix
                   for prefix in trait_prefixes
                   for suffix in ('.value', '.units')]
inferred_cols = [prefix + '.units_inferred' for prefix in trait_prefixes]
estimated_cols = [prefix + '.estimated_value' for prefix in trait_prefixes]

# Specify desired columns
cols = template_cols + value_unit_cols + inferred_cols + estimated_cols

# Subset dataframe (also fixes the column order)
data = data[cols]

Matching template and column terms

In [100]:
# Source column name -> template (camelCase) column name
template_names = {
    'catalognumber': 'catalogNumber',
    'collectioncode': 'collectionCode',
    'decimallatitude': 'decimalLatitude',
    'decimallongitude': 'decimalLongitude',
    'institutioncode': 'institutionCode',
    'occurrenceremarks': 'occurrenceRemarks',
    'maximumelevationinmeters': 'maximumElevationInMeters',
    'minimumelevationinmeters': 'minimumElevationInMeters',
    'occurrenceid': 'occurrenceID',
    'verbatimlongitude': 'verbatimLongitude',
    'verbatimlatitude': 'verbatimLatitude',
    'lifestage_cor': 'lifeStage',
    'binomial': 'scientificName',
}

# Renaming columns
data = data.rename(columns=template_names)

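Create materialSampleID, a UUID generated for each row of the table (this runs before the table is reshaped to long format, so all measurements from the same row share it)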

In [101]:
# One freshly generated random UUID (hex string) per row of the table
data = data.assign(materialSampleID=[uuid.uuid4().hex for _ in range(data.shape[0])])

Create eventID and populate it with materialSampleID

In [102]:
# eventID is a copy of each row's materialSampleID
data = data.assign(eventID=lambda frame: frame["materialSampleID"])

Add required GEOME column locality after reassigning locality to verbatimLocality

In [103]:
# Constant placeholder for the required locality column
data["locality"] = "Unknown"

Creates a unique measurementMethod column for each desired trait

In [104]:
# List of desired traits
trait_name_list = ["body_mass", "ear_length", "hind_foot_length", "tail_length", "total_length"]

# One measurementMethod_<trait> column per trait, added empty to the table
method_list = ["measurementMethod_" + trait for trait in trait_name_list]
empty_method_columns = pd.DataFrame(index=data.index, columns=method_list)
data = data.join(empty_method_columns)

def trait_method(trait):
    """
    Fill the measurementMethod_<trait> column of the global `data` frame.

    Rows whose "<trait>.units_inferred" or "<trait>.estimated_value" cell,
    converted to text, contains TRUE/True/true get the matching method
    description; rows where both are set get the combined description.
    Other rows are left untouched. Returns None (modifies `data` in place).
    """

    column = "measurementMethod_" + trait

    inferred_column = trait + ".units_inferred"
    estimated_column = trait + ".estimated_value"

    inferred_filter = data[inferred_column].astype(str).str.contains("TRUE|True|true")
    estimated_filter = data[estimated_column].astype(str).str.contains("TRUE|True|true")

    # .loc writes into `data` itself; the chained form data[column][mask] = value
    # assigns to an intermediate Series and is not applied under pandas Copy-on-Write.
    # Order matters: the combined label is written last so it overrides the two single labels.
    data.loc[inferred_filter, column] = "Extracted with Traiter ; inferred value"
    data.loc[estimated_filter, column] = "Extracted with Traiter ; estimated value"
    data.loc[estimated_filter & inferred_filter, column] = "Extracted with Traiter ; estimated value; inferred value"

# Fill the measurementMethod_<trait> column for every trait
for trait_name in trait_name_list:
    trait_method(trait_name)

# The inferred/estimated flag columns are no longer needed once the
# method columns have been filled
flag_columns = [
    'body_mass.units_inferred',
    'ear_length.units_inferred',
    'hind_foot_length.units_inferred',
    'tail_length.units_inferred',
    'total_length.units_inferred',
    'body_mass.estimated_value',
    'ear_length.estimated_value',
    'hind_foot_length.estimated_value',
    'tail_length.estimated_value',
    'total_length.estimated_value',
]
data = data.drop(columns=flag_columns)


Add filler to units and value columns

In [105]:
# Replace missing units and missing values with the text "unknown"
# for each of the five trait column pairs
for trait in ("body_mass", "ear_length", "hind_foot_length", "tail_length", "total_length"):
    for suffix in (".units", ".value"):
        column = trait + suffix
        data[column] = data[column].fillna("unknown")

In [106]:
measured_traits = ("body_mass", "ear_length", "hind_foot_length", "tail_length", "total_length")

# Pack each trait's value and unit into one "<value> ; <unit>" text column
# named <trait>_temp (columns are created in the order listed above)
for trait in measured_traits:
    value_text = data[trait + ".value"].astype(str)
    data[trait + "_temp"] = value_text + " ; " + data[trait + ".units"]

# The separate value and unit columns are now redundant
data = data.drop(columns=[trait + suffix
                          for suffix in (".value", ".units")
                          for trait in measured_traits])

Creating long version, first specifying keep variables, then naming type and value

In [107]:
# Columns kept as identifiers on every long-format row
record_columns = [
    'catalogNumber', 'collectionCode', 'decimalLatitude', 'decimalLongitude',
    'verbatimElevation', 'yearCollected', 'basisOfRecord', 'verbatimEventDate',
    'institutionCode', 'lifeStage', 'verbatimLocality', 'locality', 'individualID',
    'samplingProtocol', 'sex', 'scientificName', 'occurrenceRemarks', 'country',
    'occurrenceID', 'verbatimLongitude', 'verbatimLatitude', 'materialSampleID', 'eventID',
    'maximumElevationInMeters', 'minimumElevationInMeters',
]

# The per-trait method columns are carried along as identifiers too
melt_cols = [*record_columns, *method_list]

# Every remaining column becomes one (measurementType, measurementValue) row
longVers = data.melt(id_vars=melt_cols,
                     var_name='measurementType',
                     value_name='measurementValue')

Pull the value from the matching measurementMethod_&lt;trait&gt; column and store it in the official measurementMethod column

In [108]:
# Placeholder column, filled in further down in this cell
longVers["measurementMethod"] = ""

def method_add(trait,ind):
    """Look up the measurement method for one long-format row.

    `trait` is the row's measurementType (one of the "<trait>_temp" names) and
    `ind` the index label of the row in the global `longVers` frame. Returns
    the value stored in the matching measurementMethod_<trait> column, or None
    when `trait` is not one of the five known names.
    """
    source_columns = {
        "body_mass_temp": "measurementMethod_body_mass",
        "ear_length_temp": "measurementMethod_ear_length",
        "hind_foot_length_temp": "measurementMethod_hind_foot_length",
        "tail_length_temp": "measurementMethod_tail_length",
        "total_length_temp": "measurementMethod_total_length",
    }
    source_column = source_columns.get(trait)
    if source_column is None:
        return None
    return longVers[source_column][ind]

# Running row number, handed to method_add as the lookup label
longVers['ind'] = np.arange(longVers.shape[0])

# Row by row, fetch the method text that belongs to the row's trait
# NOTE(review): a row-wise apply is slow on a table this size; a per-trait
# vectorised assignment would give the same result.
resolved_methods = longVers.apply(
    lambda row: method_add(row["measurementType"], row["ind"]),
    axis=1,
)

# Rows without a specific method get the generic description
longVers['measurementMethod'] = resolved_methods.fillna("Extracted with Traiter")

Drop unnecessary columns

In [110]:
# Remove the per-trait method columns and the helper row counter in one pass
longVers = longVers.drop(columns=list(method_list) + ['ind'])

Matching trait and ontology terms

In [111]:
# Create trait dictionary: temporary column name -> ontology trait term
trait_dict = dict([
    ('body_mass_temp', 'body mass'),
    ('ear_length_temp', 'ear length to notch'),
    ('hind_foot_length_temp', 'pes length'),
    ('tail_length_temp', 'tail length'),
    ('total_length_temp', 'body length'),
])

def trait_rename(trait):
    """
    Return the entry of `trait_dict` for `trait`, or None when there is none.
    """
    return trait_dict.get(trait)

# Swap each temporary column name for its ontology term
longVers['measurementType'] = longVers['measurementType'].map(trait_rename)

Creating verbatimMeasurementUnit column

In [114]:
longVers = longVers.assign(verbatimMeasurementUnit = "")

# measurementValue holds "<value> ; <unit>" text. Split on the first ';' only
# (n=1) so the result always has exactly two parts, then strip the blanks that
# surrounded the separator -- otherwise the value keeps a trailing space and
# the unit a leading one, and later exact comparisons such as == "unknown"
# can never match.
value_and_unit = longVers['measurementValue'].str.split(';', n=1, expand=True)
longVers['measurementValue'] = value_and_unit[0].str.strip()
longVers['verbatimMeasurementUnit'] = value_and_unit[1].str.strip()

Populating measurementUnit column with appropriate measurement units in long version

In [116]:
# Create measurementUnit column
longVers=longVers.assign(measurementUnit="")

#Create filters
long_body_mass_filter=longVers['measurementType']=="body mass"
long_no_body_filter=longVers['measurementType']!="body mass"

#Assign units using filters: "g" for body mass rows, "mm" for every other row.
# .loc writes into longVers itself; the chained form longVers[col][mask] = value
# assigns to an intermediate Series and is not applied under pandas Copy-on-Write.
longVers.loc[long_body_mass_filter, 'measurementUnit'] = "g"
longVers.loc[long_no_body_filter, 'measurementUnit'] = "mm"


Create diagnosticID which is a unique number for each measurement

In [117]:
# Running number 0..n-1, one per long-format row
longVers = longVers.assign(diagnosticID=np.arange(longVers.shape[0]))

In [118]:
# Snapshot of the long table before rows with unusable values are dropped
# NOTE(review): the file name mentions Peromyscus maniculatus although the
# table was built from the all-mammals export -- confirm the intended name.
longVers.to_csv(path_or_buf='Peromyscus_maniculatus_VertNet_subset_pre_NA_drop.csv', index=False)

If measurement value equals N/A, delete entire row. Drop range values. 

In [119]:
#Blank out the "unknown" unit filler.
# The unit text comes from splitting "<value> ; <unit>" on ';' and can still
# carry the blank that followed the separator, so strip before the exact-match
# replace -- without it " unknown" is never replaced.
longVers["verbatimMeasurementUnit"] = longVers["verbatimMeasurementUnit"].str.strip().replace({"unknown":""})

#Drop Range Values and unknowns: any value containing ",", "one" or "unknown"
range_value_filter=longVers['measurementValue'].str.contains(",|one|unknown", na=False)
# .loc writes into longVers itself (chained longVers[col][mask] = value is not
# applied under pandas Copy-on-Write)
longVers.loc[range_value_filter, 'measurementValue'] = float("nan")
# Remove the rows just marked, together with any value that was already missing
longVers = longVers.dropna(subset=['measurementValue'])

Breaking up the data into more manageable sizes for validation and DE storage

In [123]:
# Separating the table into 13 consecutive row chunks.
# Sizes follow the np.array_split convention: the first `remainder` chunks get
# one extra row. Plain .iloc slicing is used instead of np.array_split because
# the latter, given a DataFrame, goes through DataFrame.swapaxes, which newer
# pandas releases deprecate.
n_chunks = 13
base_size, remainder = divmod(len(longVers), n_chunks)

# Create chunks list
chunks = []
chunk_start = 0
for chunk_number in range(n_chunks):
    chunk_stop = chunk_start + base_size + (1 if chunk_number < remainder else 0)
    chunks.append(longVers.iloc[chunk_start:chunk_stop])
    chunk_start = chunk_stop

Creating data chunks

In [124]:
# Write every chunk to its own CSV; file names are numbered from 1,
# the progress message uses the 0-based position
for i, chunk in enumerate(chunks):
    output_path = '../Mapped_Data/FuTRES_Mammals_VertNet_Global_Modern_'+ str(i + 1) +'.csv'
    chunk.to_csv(output_path, index=False)
    print("mapped_data",i, " done")

mapped_data 0  done
mapped_data 1  done
mapped_data 2  done
mapped_data 3  done
mapped_data 4  done
mapped_data 5  done
mapped_data 6  done
mapped_data 7  done
mapped_data 8  done
mapped_data 9  done
mapped_data 10  done
mapped_data 11  done
mapped_data 12  done
