In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw export of the UN FAO Domestic Animal Diversity Information
# System (DAD-IS) database.
dad_is_df = pd.read_csv('../data/102821_FAO_DAD-IS_export.csv')

# Shorten the verbose labels used in the export. The mapping is applied to
# the whole frame: any cell whose value equals one of the keys is replaced.
label_replacements = {
    'Duck (domestic)': 'Duck',
    'Duck(domestic) Muscovy duck': 'Muscovy Duck',
    'Goose (domestic)': 'Goose',
    'Dromedary Bactrian Camel': 'Bactrian Camel',
    'Vicuña': 'Vicuna',
    'Yak (domestic)': 'Yak',
}
dad_is_df = dad_is_df.replace(to_replace=label_replacements)

# Display the available column names.
dad_is_df.columns.tolist()

['Country',
 'ISO3',
 'Specie',
 'Breed/Most common name',
 'Description Of Specific Uses',
 'Lang',
 'Description',
 'Transboundary name',
 'Other name',
 'Uses',
 'Additional information',
 'Additional information comments',
 'Efabis cultural role comment',
 'Efabis cultural value',
 'Adaptability to specific environment',
 'Specific resistance or tolerance',
 'Specific reproductive characteristic',
 'Special characteristic of product',
 'Other special qualities',
 'Reference for special qualities',
 'Efabis genetic features',
 'Efabis environmental role',
 'Efabis adaptability to marginal land',
 'Body conformation',
 'Coat description',
 'Coat quality',
 'Comb type',
 'Skin colour',
 'Shank and foot colour',
 'Plumage colour',
 'Pattern within feather',
 'Avian classification',
 'Color comments',
 'Efabis main colour',
 'Efabis skin colour',
 'Number of horns males',
 'Number of horns females',
 'Horn shape size and comments',
 'Wither height males',
 'Wither height females',
 'Wei

In [3]:
# Trait columns that are cleaned and summarised below.
cols_of_interest = [
    'Weight males', 'Weight females',
    'Birth weight males', 'Birth weight females',
    'Age maturity males', 'Age maturity females',
    'Age breeding males', 'Age breeding females',
    'Age first parturition AVG', 'Age first parturition MIN', 'Age first parturition MAX',
    'Parturition interval AVG', 'Parturition interval MIN', 'Parturition interval MAX',
    'Length productive life', 'Daily gain', 'Carcass weight', 'Dressing percentage',
    'Litter size AVG', 'Litter size MIN', 'Litter size MAX',
]
# One unit per entry of cols_of_interest (same order), taken from the DAD-IS
# web interface: https://www.fao.org/dad-is/browse-by-country-and-species/en/
units = (['kg'] * 4 + ['months'] * 7 + ['day'] * 3
         + ['year', 'g', 'kg', '%'] + ['number'] * 3)
new_colnames = ['{} ({})'.format(name, unit)
                for name, unit in zip(cols_of_interest, units)]

# A value of zero makes no sense for any of these traits; store NaN instead so
# the values can be plotted on a log scale.
for col in cols_of_interest:
    dad_is_df.loc[dad_is_df[col] == 0, col] = np.nan

# Discard rows that have no data in any of the columns of interest.
dad_is_df = dad_is_df.dropna(subset=cols_of_interest, how='all', axis=0)

# Number of young per year = average litter size / interval between litters
# expressed in years.
# NOTE: not 100% sure this is what the "Parturition interval" means
litter_size = dad_is_df['Litter size AVG']
# Zero intervals were already turned into NaN by the loop above, so this
# division cannot yield an infinite number of young per year.
litter_interval = dad_is_df['Parturition interval AVG']
litter_interval_years = litter_interval / 365
young_per_year = litter_size / litter_interval_years
dad_is_df['Young per year AVG (number)'] = young_per_year

# Adult weight: row-wise mean of the male and female weights, in kg and g.
mean_adult_weight_kg = dad_is_df[['Weight males', 'Weight females']].mean(axis='columns')
dad_is_df['Adult weight AVG (kg)'] = mean_adult_weight_kg
dad_is_df['Adult weight AVG (g)'] = mean_adult_weight_kg * 1000

# Birth weight: row-wise mean of the male and female birth weights, in kg and g.
mean_birth_weight_kg = dad_is_df[['Birth weight males', 'Birth weight females']].mean(axis='columns')
dad_is_df['Birth weight AVG (kg)'] = mean_birth_weight_kg
dad_is_df['Birth weight AVG (g)'] = mean_birth_weight_kg * 1000

# Mass of young produced per year = mean birth weight * young per year.
neonate_mass_kg = dad_is_df['Birth weight AVG (kg)']
young_per_year = dad_is_df['Young per year AVG (number)']
young_mass_per_year_kg = neonate_mass_kg * young_per_year
dad_is_df['Young mass per year AVG (kg)'] = young_mass_per_year_kg
dad_is_df['Young mass per year AVG (g)'] = young_mass_per_year_kg * 1000

# Append the unit to the name of each column of interest.
col_replacement_dict = dict(zip(cols_of_interest, new_colnames))
dad_is_clean_df = dad_is_df.rename(columns=col_replacement_dict)

# Write the cleaned table, with units in the column names, to a new file.
dad_is_clean_df.to_csv('../data/FAO_DAD-IS_cleaned.csv')
dad_is_clean_df.shape

(5544, 118)

In [4]:
# Count the non-missing breed names per species and write the counts to disk
# under the column name 'breed_count'.
breed_names_by_species = dad_is_clean_df.groupby('Specie')['Breed/Most common name']
counts_by_species = breed_names_by_species.count().rename('breed_count')
counts_by_species.to_csv('../data/FAO_DAD-IS_breed_counts_by_species.csv')

In [5]:
# Collect the names of the species that have more than 100 breed entries.
# (The list is only stored here; nothing is displayed by this cell.)
mask = counts_by_species.gt(100)
species_enough_data = counts_by_species.loc[mask].index.tolist()

In [6]:
# Save quantiles for the relevant columns for the species that have enough data.

# The two derived columns are added to the shared `new_colnames` list in place,
# as before, so code that reads the list afterwards still sees them. Only
# missing names are appended: a plain `+=` would add them again every time this
# cell is re-run, producing duplicated columns in the table and the CSV.
for derived_col in ('Young per year AVG (number)', 'Adult weight AVG (kg)'):
    if derived_col not in new_colnames:
        new_colnames.append(derived_col)

quantile_levels = [0.005, 0.05, 0.25, 0.5, 0.75, 0.95, 0.995]

mask = dad_is_clean_df.Specie.isin(species_enough_data)
# Select the columns *before* computing the quantiles. The result is the same
# table as selecting afterwards, but quantiles are no longer computed for every
# other column of the frame. Those include free-text columns, which recent
# pandas versions refuse with a TypeError instead of silently dropping.
quantiles_by_species = (
    dad_is_clean_df.loc[mask]
    .groupby('Specie')[new_colnames]
    .quantile(quantile_levels)
)
# The second index level holds the quantile level and is unnamed by default.
quantiles_by_species.index = quantiles_by_species.index.set_names(['Specie', 'quantile'])
quantiles_by_species.to_csv('../data/FAO_DAD-IS_quantiles_by_species.csv')
quantiles_by_species.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Weight males (kg),Weight females (kg),Birth weight males (kg),Birth weight females (kg),Age maturity males (months),Age maturity females (months),Age breeding males (months),Age breeding females (months),Age first parturition AVG (months),Age first parturition MIN (months),...,Parturition interval MAX (day),Length productive life (year),Daily gain (g),Carcass weight (kg),Dressing percentage (%),Litter size AVG (number),Litter size MIN (number),Litter size MAX (number),Young per year AVG (number),Adult weight AVG (kg)
Specie,quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Buffalo,0.005,307.78,300.0,16.32,15.305,18.22,20.145,24.0,26.66,24.0,22.0,...,83.245,8.0475,228.125,137.625,23.737,1.0,1.0,1.0,0.582861,306.475
Buffalo,0.05,350.0,320.1,18.012,16.06,20.0,21.0,24.0,30.0,29.9,22.8,...,427.45,8.475,262.5,161.25,41.73,1.0,1.0,1.0,0.614327,337.0
Buffalo,0.25,429.275,382.25,25.0,24.2,24.0,25.25,33.8,32.0,36.0,34.0,...,501.5,10.75,361.75,187.5,44.0,1.0,1.0,1.0,0.74173,400.0
Buffalo,0.5,520.5,448.25,30.0,27.95,30.0,30.0,36.0,36.0,42.0,36.0,...,574.0,13.5,542.5,240.5,48.5,1.0,1.0,2.0,0.869048,475.125
Buffalo,0.75,600.0,494.5,33.0,30.75,33.0,33.405,45.0,45.175,47.5,45.0,...,645.0,15.0,680.0,253.25,50.0,1.0,1.0,2.0,0.93955,541.25
