Data Wrangling Notebook for Oregon Cougar Data
<br />
Neeka Sewnath
<br />
nsewnath@ufl.edu

Silence unnecessary warnings

In [1]:
try:
    import warnings
    warnings.filterwarnings('ignore')
except:
    pass

In [2]:
import pandas as pd
import numpy as np 
import uuid
import re

Import cougar data

In [3]:
# Read the raw Oregon FWS cougar export exactly as stored on disk
cougar_raw = pd.read_csv("../Original_Data/cougar_data.csv")

# The first four parsed rows are discarded; the row after them carries the
# real column names and every later row is a data record
new_header = cougar_raw.iloc[4]
cougar_data = cougar_raw.iloc[5:]
cougar_data.columns = new_header

# Read the locality table used later for the coordinate lookup
cougar_locality = pd.read_csv("../Original_Data/cougar_locality.csv")

Create verbatimLocality column by concatenating Management Unit and County

In [4]:
# Build verbatimLocality as "<Management Unit>, <County>"
cougar_data = cougar_data.assign(
    verbatimLocality=lambda frame: frame['Management Unit'] + ", " + frame['County']
)

Adjust locality information 

In [5]:
# Management-unit names on both sides of the lookup
unit_name = cougar_locality["Unit Name"]
management_name = cougar_data["Management Unit"]

# Build unit name -> coordinate lookups. Locality rows without a unit name are
# skipped (a missing name never equals a management unit); when a unit name is
# repeated, the last row wins.
named_units = cougar_locality[unit_name.notna()]
latitude_by_unit = dict(zip(named_units["Unit Name"], named_units["latitude"]))
longitude_by_unit = dict(zip(named_units["Unit Name"], named_units["longitude"]))

# Look up each record's management unit; units with no match keep a blank ("")
# coordinate. A single dictionary lookup per record replaces the nested loop.
decimal_latitude = management_name.map(
    lambda unit: latitude_by_unit.get(unit, "")
).astype(object)
decimal_longitude = management_name.map(
    lambda unit: longitude_by_unit.get(unit, "")
).astype(object)

# McKenzie records use fixed coordinates, kept as strings exactly as written.
# They take precedence over any value found in the locality table.
is_mckenzie = management_name == "McKenzie"
decimal_latitude = decimal_latitude.where(~is_mckenzie, "44.1083926996967")
decimal_longitude = decimal_longitude.where(~is_mckenzie, "-122.417312310006")

# Attach the columns through assign() rather than chained indexing
# (frame[col][row] = value), which may write to a temporary copy and leave the
# frame unchanged. coordinateUncertaintyInMeters is 50000 for every record.
cougar_data = cougar_data.assign(
    decimalLatitude=decimal_latitude,
    decimalLongitude=decimal_longitude,
    coordinateUncertaintyInMeters=50000,
)
   

Add a yearCollected column to the cougar data

In [6]:
# yearCollected is the last four characters of the Date string
cougar_data = cougar_data.assign(yearCollected=cougar_data["Date"].str[-4:])

Correct sex column

In [7]:
# Flag the single-letter codes before any value is rewritten
female = cougar_data['Sex'] == "F"
male = cougar_data['Sex'] == "M"

# Write through .loc so the frame itself is updated; chained indexing
# (frame[col][mask] = value) may assign into a temporary copy instead.
# Anything that is neither "F" nor "M" (including missing values) becomes
# "not collected".
cougar_data.loc[~(female | male), 'Sex'] = "not collected"
cougar_data.loc[female, 'Sex'] = "female"
cougar_data.loc[male, 'Sex'] = "male"

Create ageUnit Column and assign it to "year"

In [8]:
# Every record gets the same ageUnit value
cougar_data = cougar_data.assign(ageUnit="year")

Fix status column to use GEOME terms 

In [9]:
# Flag each status code before any value is rewritten
whole = cougar_data['Status'] == "A"
gutted = cougar_data['Status'] == "B"
# Accept both spellings of the code; a second assignment to `skinned`
# previously replaced the "C" mask, so only lowercase "c" was ever converted.
skinned = cougar_data['Status'].isin(["C", "c"])

# Write through .loc so the frame itself is updated; chained indexing
# (frame[col][mask] = value) may assign into a temporary copy instead.
cougar_data.loc[whole, 'Status'] = "whole organism"
cougar_data.loc[gutted, 'Status'] = "part organism"
cougar_data.loc[skinned, 'Status'] = "part organism"

Select specified columns for final dataset

In [10]:
# Columns to keep, listed in the order they appear in the subset
cols = [
    'verbatimLocality',
    'yearCollected',
    'decimalLatitude',
    'decimalLongitude',
    'coordinateUncertaintyInMeters',
    'Date',
    'Sex',
    'ageUnit',
    'Status',
    'Age',
    'Weight',
    'Length',
]

# Keep only those columns
cougar_data = cougar_data.loc[:, cols]

Matching column names to template 

In [11]:
# Source column name -> template column term
template_terms = {
    'Sex': 'sex',
    'Date': 'verbatimEventDate',
    'Status': 'materialSampleType',
    'Age': 'verbatimAgeValue',
}

# Apply the template names
cougar_data = cougar_data.rename(columns=template_terms)

In [12]:
# Source measurement column -> trait/ontology term
trait_terms = {
    'Weight': 'body mass',
    'Length': 'body length with tail',
}

# Apply the trait names
cougar_data = cougar_data.rename(columns=trait_terms)

Create measurementUnit column

In [13]:
# Placeholder column: an empty string for every record
cougar_data = cougar_data.assign(measurementUnit="")

In [14]:
# Required columns whose value is the same for every record
cougar_data = cougar_data.assign(
    country="USA",
    stateProvince="Oregon",
    basisOfRecord="PreservedSpecimen",
    scientificName="Puma concolor",
    locality="Unknown",
    samplingProtocol="Unknown",
    measurementMethod="Unknown",
)

Adding an additional column for ageValue

In [15]:
# ageValue starts out as a copy of verbatimAgeValue
cougar_data = cougar_data.assign(ageValue=cougar_data["verbatimAgeValue"])

Create necessary materialSampleID column and populate with UUID (use hex to remove dashes). Create necessary eventID column and populate with materialSampleID column

In [16]:
# One random UUID per record, in hex form (no dashes)
sample_ids = [uuid.uuid4().hex for _ in range(len(cougar_data))]

# eventID carries the same identifier as materialSampleID
cougar_data = cougar_data.assign(materialSampleID=sample_ids, eventID=sample_ids)

Create a long version of the data frame

In [17]:
# Columns repeated unchanged on every long-format row
keep_columns = [
    'verbatimLocality',
    'yearCollected',
    'sex',
    'ageUnit',
    'materialSampleType',
    'verbatimAgeValue',
    'ageValue',
    'verbatimEventDate',
    'country',
    'stateProvince',
    'eventID',
    'locality',
    'decimalLatitude',
    'decimalLongitude',
    'coordinateUncertaintyInMeters',
    'measurementMethod',
    'samplingProtocol',
    'basisOfRecord',
    'scientificName',
    'materialSampleID',
    'measurementUnit',
]

# Every remaining column becomes one row per record: its name goes to
# measurementType and its value to measurementValue
longVers = cougar_data.melt(
    id_vars=keep_columns,
    var_name='measurementType',
    value_name='measurementValue',
)


Remove rows containing NA in measurementValue column

In [18]:
# Keep only the rows whose measurementValue is not missing
has_measurement = longVers['measurementValue'].notna()
longVers = longVers[has_measurement]

Populating measurementUnit column with appropriate measurement units in long version

In [19]:
# Rows holding a body mass versus rows holding any other measurement
long_body_mass_filter = longVers['measurementType'] == "body mass"
long_no_body_filter = longVers['measurementType'] != "body mass"

# Label the target units: "g" for body mass, "mm" for everything else.
# Only the label is set here; the numeric conversion happens in a later step.
# Write through .loc so the frame itself is updated; chained indexing
# (frame[col][mask] = value) may assign into a temporary copy instead.
longVers.loc[long_body_mass_filter, 'measurementUnit'] = "g"
longVers.loc[long_no_body_filter, 'measurementUnit'] = "mm"

Convert measurement values from pounds ("lb") to grams ("g") and from inches ("in") to millimeters ("mm")

In [20]:
def unit_clean(value, unit):
    """Convert a raw measurement to the metric unit named by ``unit``.

    Args:
        value: Measurement as a number or numeric string. It is treated as
            pounds when ``unit`` is "g" and as inches otherwise.
        unit: Target unit label. "g" selects the pounds-to-grams factor;
            any other value selects the inches-to-millimeters factor.

    Returns:
        float: The converted measurement.

    Raises:
        ValueError: If ``value`` is a string that cannot be parsed as a number.
    """
    # Parse with float() rather than int(): int() raises on decimal strings
    # such as "84.5" and silently truncates fractional numbers.
    amount = float(value)
    if unit == "g":
        # Convert from pounds to grams
        return amount * 453.59237
    # Convert from inches to millimeters
    return amount * 25.4
    
# Convert each row's measurementValue according to that row's measurementUnit
def _convert_row(row):
    """Return the row's measurementValue passed through unit_clean."""
    return unit_clean(row.measurementValue, row.measurementUnit)

longVers['measurementValue'] = longVers.apply(_convert_row, axis=1)

Create diagnosticID which is a unique number for each measurement

In [21]:
# Number the measurement rows 0, 1, 2, ... in their current order
longVers = longVers.assign(diagnosticID=np.arange(len(longVers)))

Writing long data csv file

In [22]:
# Write the long-format table to disk without the index column
output_csv = '../Mapped_Data/FuTRES_Puma_concolor_ODFW_OR_USA_Modern.csv'
longVers.to_csv(output_csv, index=False)