Data Wrangling Notebook for VertNet Bats Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

In [2]:
import pandas as pd
import numpy as np
import multiprocessing
import re
import uuid 

Silencing warnings that are unnecessary

In [3]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

Import VertNet Bats Data

In [5]:
# Load the raw VertNet bats export; the cells below add to and clean this frame.
bats_df = pd.read_csv(filepath_or_buffer="./../Original_Data/bats_2020-08-11b.csv")

Assign binomial to scientificName and fill N/As with "Unknown"

In [7]:
# Replace missing binomial names with the "Unknown" placeholder.
# NOTE(review): the heading for this cell mentions scientificName, but only the
# existing `binomial` column is touched here — confirm whether a copy is wanted.
bats_df = bats_df.assign(binomial=bats_df["binomial"].fillna("Unknown"))

Clean up country column 

In [11]:
# Build verbatimLocality as "<locality>,<country>". Concatenating a string with a
# missing value yields NaN, so fall back to whichever part is present instead of
# discarding a known locality (or country) just because the other one is missing.
# Rows where both parts are missing stay NaN.
bats_df["verbatimLocality"] = (
    (bats_df["locality"] + "," + bats_df["country"])
    .fillna(bats_df["locality"])
    .fillna(bats_df["country"])
)

# Read GEOME country list
geome_countries = pd.read_csv("./../Mapping Files/geome_country_list.csv")

# TODO: build `country_dictionary` (verbatim country -> GEOME country) after the
# initial GEOME run; create it as a CSV file first, then load it as a dictionary.
# bats_df["country"].unique() lists the verbatim values that need a mapping.

def country_correction(country, valid_countries=None, corrections=None):
    """Map a verbatim country value onto the GEOME country list.

    Parameters
    ----------
    country : object
        Value taken from the ``country`` column (may be NaN).
    valid_countries : container, optional
        Anything supporting ``in`` that holds the accepted GEOME names.
        Defaults to ``geome_countries.values`` (every cell of the GEOME
        country table loaded by this notebook).
    corrections : mapping, optional
        Verbatim name -> GEOME name. Defaults to the notebook-level
        ``country_dictionary`` when it has been defined; until that mapping
        exists, no corrections are applied instead of raising NameError.

    Returns
    -------
    object
        ``country`` unchanged when it is already valid, its mapped value when
        a correction exists, otherwise the string ``"Unknown"``.
    """
    if valid_countries is None:
        valid_countries = geome_countries.values
    if corrections is None:
        # The correction dictionary is still a TODO in this notebook, so look
        # it up lazily and tolerate its absence.
        corrections = globals().get("country_dictionary", {})

    if country in valid_countries:
        return country
    return corrections.get(country, "Unknown")

#bats_df['country'] = bats_df['country'].apply(country_correction)

Clean yearCollected column (TODO: needs to be standardized first)

In [None]:
# Filling N/As with "Unknown"
bats_df["eventdate"] = bats_df["eventdate"].fillna("Unknown")

# Creating event date variable
verbatim_date = bats_df['eventdate']

# Establishing vertnet filter TODO:see how relevant this filter is to this set
# One regex alternative per entry, joined with "|" below. Keeping them in a list
# of raw strings (rather than one multi-line string literal) matters: a line break
# inside the pattern becomes part of the alternative that follows it, so the
# first token on every continuation line ("and", "#", "\d{4}'[S]", "1260") could
# never match.
# NOTE(review): "MCZ"/"MSU" were written as capture groups "(MCZ)"/"(MSU)", which
# match the bare acronym; escape the parentheses if the literal "(MCZ)" was meant.
# NOTE(review): "/n /d" is kept as written (literal slashes); it may have been
# intended as "\n \d" — confirm.
# NOTE(review): these are substring matches, so e.g. "975-07-17" also flags
# "1975-07-17" — confirm that is intended.
date_reject_patterns = [
    r"IV", r"0000", r"September", r"<", r"NW", r"latter", r"unknown",
    r"MCZ", r"MSU", r"present",
    r"and", r";", r"&", r"mainly", r"between", r"Between", r"BETWEEN",
    r"OR", r"Unknown", r"UNKNOWN",
    r"#", r"TO", r"\?", r"'", r"----", r"19--", r"No Date", r",",
    r"\d{4}-\d{4}", r"/n /d", r"\d{4}[s]",
    r"\d{4}'[S]", r"1075-07-29", r"975-07-17", r"2088", r"9999", r"0201", r"1197",
    r"1260", r"4560", r"1024", r"1119", r"1192", r"1072", r"1186",
]

# na=True: any value that is not a string cannot be parsed below, so flag it too.
vertnet_date_filter = verbatim_date.str.contains(
    "|".join(date_reject_patterns), regex=True, na=True
)

# Grabbing clean data
verbatim_date_clean = verbatim_date[~vertnet_date_filter]


# Captures year within string
def year_search(year):
    """Dispatch a date string to the helper that extracts its four-digit year.

    A string that ends in four digits is handed to ``year_cleaner_front``;
    otherwise a string that starts with four digits is handed to
    ``year_cleaner_back``. The trailing check takes priority when both apply.
    Returns None when neither end of the string is four digits.
    """
    ends_with_year = re.search(r'\d{4}$', year) is not None
    if ends_with_year:
        return year_cleaner_front(year)

    starts_with_year = re.search(r'^\d{4}', year) is not None
    if starts_with_year:
        return year_cleaner_back(year)

    return None

def year_cleaner_front(year):
    """Return the last four characters of ``year`` (the trailing year).

    Despite the name, this handles strings whose year sits at the END,
    e.g. ``"12/05/1998"`` -> ``"1998"``. Strings shorter than four characters
    are returned whole.
    """
    # A plain negative slice is used on purpose: computing the start as
    # len(year) - 4 goes negative for short strings and wraps around
    # (e.g. "998" would yield "8").
    return year[-4:]

def year_cleaner_back(year):
    """Return the first four characters of ``year`` (the leading year).

    Despite the name, this handles strings whose year sits at the START,
    e.g. ``"1998-05-12"`` -> ``"1998"``.
    """
    return year[:4]

# Extract a year from every date that passed the filter. `assign` aligns on the
# index, so rows whose date was filtered out (or yielded no year) come back
# missing and are then labelled "Unknown".
bats_df = (
    bats_df
    .assign(yearCollected=verbatim_date_clean.apply(year_search))
    .fillna({"yearCollected": "Unknown"})
)

Clean up lifestage column

In [12]:
# List the distinct raw lifestage labels to see what needs standardizing.
bats_df.loc[:, "lifestage"].unique()

array(['adult', nan, 'young', 'juvenile', 'subadult', 'unknown', 'U',
       'Adult', 'YOUNG', 'ADULT', 'undetermined', 'SUBADULT', 'Juvenile',
       'embryo', 'AD - T.', 'neonate', 'fr', 'Young', 'Immature',
       'mature', 'adullt', 'immature', 'JUVENILE', 'Juv', 'subadult?',
       'Unknown', 'Subadult', 'young adult', '87', 'juvenile; 23 day',
       'n', 'juv', '52 hours old', 'juvenile ?', 'not recorded',
       'IMMATURE', 'adult; embryo', 'Fr', 'juv.', 'Sub-adult', 'YG', 'Ad',
       'fetus', 'Adult.', 'adult.', '3 adult', 'AD - T.A', 'JUVELINE',
       'NEONATE', 'adulture', 'other', 'subadult/adult', 'juveniles',
       'young of year', 'N.P.', 'adult; foetus', 'Imm', 'larvae',
       'yearlings'], dtype=object)