In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from scipy.stats import gmean

In [2]:
# Load life history traits for mammals from PanTHERIA DB. 
pantheria_df = pd.read_csv('../data/PanTHERIA_1-0_WR05_Aug2008.txt', sep='\t', na_values=[-999])

pantheria_df.columns.tolist()

['MSW05_Order',
 'MSW05_Family',
 'MSW05_Genus',
 'MSW05_Species',
 'MSW05_Binomial',
 '1-1_ActivityCycle',
 '5-1_AdultBodyMass_g',
 '8-1_AdultForearmLen_mm',
 '13-1_AdultHeadBodyLen_mm',
 '2-1_AgeatEyeOpening_d',
 '3-1_AgeatFirstBirth_d',
 '18-1_BasalMetRate_mLO2hr',
 '5-2_BasalMetRateMass_g',
 '6-1_DietBreadth',
 '7-1_DispersalAge_d',
 '9-1_GestationLen_d',
 '12-1_HabitatBreadth',
 '22-1_HomeRange_km2',
 '22-2_HomeRange_Indiv_km2',
 '14-1_InterbirthInterval_d',
 '15-1_LitterSize',
 '16-1_LittersPerYear',
 '17-1_MaxLongevity_m',
 '5-3_NeonateBodyMass_g',
 '13-2_NeonateHeadBodyLen_mm',
 '21-1_PopulationDensity_n/km2',
 '10-1_PopulationGrpSize',
 '23-1_SexualMaturityAge_d',
 '10-2_SocialGrpSize',
 '24-1_TeatNumber',
 '12-2_Terrestriality',
 '6-2_TrophicLevel',
 '25-1_WeaningAge_d',
 '5-4_WeaningBodyMass_g',
 '13-3_WeaningHeadBodyLen_mm',
 'References',
 '5-5_AdultBodyMass_g_EXT',
 '16-2_LittersPerYear_EXT',
 '5-6_NeonateBodyMass_g_EXT',
 '5-7_WeaningBodyMass_g_EXT',
 '26-1_GR_Area_km2',

In [3]:
cols_of_interest = [
    'MSW05_Order',
    'MSW05_Family',
    'MSW05_Genus',
    'MSW05_Species',
    'MSW05_Binomial',
    '5-1_AdultBodyMass_g',
    '8-1_AdultForearmLen_mm',
    '13-1_AdultHeadBodyLen_mm',
    '2-1_AgeatEyeOpening_d',
    '3-1_AgeatFirstBirth_d',
    '18-1_BasalMetRate_mLO2hr',
    '5-2_BasalMetRateMass_g',
    '7-1_DispersalAge_d',
    '9-1_GestationLen_d',
    '14-1_InterbirthInterval_d',
    '15-1_LitterSize',
    '16-1_LittersPerYear',
    '17-1_MaxLongevity_m',
    '5-3_NeonateBodyMass_g',
    '13-2_NeonateHeadBodyLen_mm',
    '23-1_SexualMaturityAge_d',
    '24-1_TeatNumber',
    '25-1_WeaningAge_d',
    '5-4_WeaningBodyMass_g',
    '13-3_WeaningHeadBodyLen_mm',
    'References']
new_colnames = [
    'Order',
    'Family',
    'Genus',
    'Species',
    'Binomial Name',
    'AdultBodyMass (g)',
    'AdultForearmLen (mm)',
    'AdultHeadBodyLen (mm)',
    'AgeatEyeOpening (days)',
    'AgeatFirstBirth (days)',
    'BasalMetRate (mLO2hr)',
    'BasalMetRateMass (g)',
    'DispersalAge (days)',
    'GestationLen (days)',
    'InterbirthInterval (d)',
    'LitterSize (number)',
    'LittersPerYear (number)',
    'MaxLongevity (months)',
    'NeonateBodyMass (g)',
    'NeonateHeadBodyLen (mm)',
    'SexualMaturityAge (days)',
    'TeatNumber (number)',
    'WeaningAge (days)',
    'WeaningBodyMass (g)',
    'WeaningHeadBodyLen (mm)',
    'References']
col_replacement_dict = dict((c, nc) for c, nc in zip(cols_of_interest, new_colnames))


In [4]:
# For the remaining columns, zero values are non-sensical - should be NaN for log-scale plotting
for c in cols_of_interest:
    mask = pantheria_df[c] == 0
    pantheria_df.loc[mask, c] = np.nan

# Drop everything that is missing data in all columns of interest. 
pantheria_df = pantheria_df.dropna(axis=0, how='all', subset=cols_of_interest)


In [6]:
# Start making a "clean" dataframe.
pantheria_clean_df = pantheria_df[cols_of_interest].rename(columns=col_replacement_dict)

# Calculate the number of young per year from the litter size and litters per year.
litter_size = pantheria_clean_df['LitterSize (number)']

# PanTHERIA gives two values that can be used to infer litters/year.
# 1/ LittersPerYear and 2/ InterbirthInterval
# As can be seen in the plot below, these values are correlated but not the same.
# Moreover, in some cases there is data for one and not the other.
litters_per_year = pantheria_clean_df['LittersPerYear (number)']
interbirth_interval_d = pantheria_clean_df['InterbirthInterval (d)']
litters_per_year_inferred = 365.0/interbirth_interval_d
young_per_year_litters = litter_size / litters_per_year
young_per_year_interval = litter_size / litters_per_year_inferred

pantheria_clean_df['YoungPerYear_Litters (number)'] = young_per_year_litters
pantheria_clean_df['YoungPerYear_Interval (number)'] = young_per_year_interval

# Estimate the number of young per year as the geometric mean of the two values.
def nan_gmean(a):
    """geometric mean that handles NaNs."""
    if not a.any():
        return np.NaN
    my_a = a[a.notnull()]
    return gmean(a[a.notnull()])

gmeans = pd.concat([young_per_year_litters, young_per_year_interval], axis=1).apply(nan_gmean, axis=1)
pantheria_clean_df['YoungPerYear_Estimated (number)'] = gmeans

neonate_mass_g = pantheria_clean_df['NeonateBodyMass (g)']
pantheria_clean_df['YoungMassPerYear_Estimated (g)'] = neonate_mass_g*gmeans

# Save a new cleaner file with units for the relevant columns.
pantheria_clean_df.to_csv('../data/PanTHERIA_1-0_WR05_Aug2008_cleaned.csv')
pantheria_clean_df.head()

Unnamed: 0,Order,Family,Genus,Species,Binomial Name,AdultBodyMass (g),AdultForearmLen (mm),AdultHeadBodyLen (mm),AgeatEyeOpening (days),AgeatFirstBirth (days),...,SexualMaturityAge (days),TeatNumber (number),WeaningAge (days),WeaningBodyMass (g),WeaningHeadBodyLen (mm),References,YoungPerYear_Litters (number),YoungPerYear_Interval (number),YoungPerYear_Estimated (number),YoungMassPerYear_Estimated (g)
0,Artiodactyla,Camelidae,Camelus,dromedarius,Camelus dromedarius,492714.47,,,,1651.62,...,1947.94,,389.38,,,511;543;719;1274;1297;1594;1654;1822;1848;2655...,0.98,1.649649,1.271478,46728.322149
1,Carnivora,Canidae,Canis,adustus,Canis adustus,10392.49,,745.32,,,...,249.88,8.0,52.89,,,542;543;730;1113;1297;1573;2655,,,,
2,Carnivora,Canidae,Canis,aureus,Canis aureus,9658.7,,827.53,7.5,,...,371.23,8.0,61.3,,,543;679;730;1113;1297;1573;2655,,3.74,3.74,792.2068
3,Carnivora,Canidae,Canis,latrans,Canis latrans,11989.1,,872.39,11.94,365.0,...,372.9,8.0,43.71,,,367;542;543;730;1113;1297;1573;1822;2655,,5.72,5.72,1144.0572
4,Carnivora,Canidae,Canis,lupus,Canis lupus,31756.51,,1055.0,14.01,547.5,...,679.37,9.0,44.82,,,367;542;543;730;1015;1052;1113;1297;1573;1594;...,2.49,4.98,3.521392,1451.905041
