In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the data from the UN FAO DAD-IS DB
# Domestic Animal Diversity Information System (DAD-IS)
dad_is_df = pd.read_csv('../data/102821_FAO_DAD-IS_export.csv')
dad_is_df = dad_is_df.replace({'Duck (domestic)': 'Duck', 'Duck(domestic) Muscovy duck': 'Muscovy Duck',
                               'Goose (domestic)': 'Goose', 'Dromedary Bactrian Camel': 'Bactrian Camel',
                               'Vicuña': 'Vicuna', 'Yak (domestic)': 'Yak'})


dad_is_df.columns.tolist()

['Country',
 'ISO3',
 'Specie',
 'Breed/Most common name',
 'Description Of Specific Uses',
 'Lang',
 'Description',
 'Transboundary name',
 'Other name',
 'Uses',
 'Additional information',
 'Additional information comments',
 'Efabis cultural role comment',
 'Efabis cultural value',
 'Adaptability to specific environment',
 'Specific resistance or tolerance',
 'Specific reproductive characteristic',
 'Special characteristic of product',
 'Other special qualities',
 'Reference for special qualities',
 'Efabis genetic features',
 'Efabis environmental role',
 'Efabis adaptability to marginal land',
 'Body conformation',
 'Coat description',
 'Coat quality',
 'Comb type',
 'Skin colour',
 'Shank and foot colour',
 'Plumage colour',
 'Pattern within feather',
 'Avian classification',
 'Color comments',
 'Efabis main colour',
 'Efabis skin colour',
 'Number of horns males',
 'Number of horns females',
 'Horn shape size and comments',
 'Wither height males',
 'Wither height females',
 'Wei

In [3]:
dad_is_df.Specie.unique()

array(['Ass', 'Cattle', 'Chicken', 'Dromedary', 'Goat', 'Horse', 'Sheep',
       'Yak', 'Pig', 'Buffalo', 'Turkey', 'Guanaco', 'Llama', 'Vicuna',
       'Rabbit', 'Deer', 'Duck', 'Goose', 'Guinea fowl', 'Muscovy duck',
       'Peacock', 'Pigeon', 'Alpaca', 'Cassowary', 'Dog',
       'Bactrian Camel', 'Emu', 'Ostrich', 'Bactrian camel', 'Nandu',
       'Partridge', 'Pheasant', 'Quail', 'Muscovy Duck', 'Guinea pig',
       'American Bison', 'Swallow'], dtype=object)

In [4]:
# Drop everything that is missing data in all columns of interest. 
cols_of_interest = ['Weight males', 'Weight females', 'Birth weight males', 'Birth weight females',
                    'Age maturity males', 'Age maturity females', 'Age breeding males', 'Age breeding females',
                    'Age first parturition AVG', 'Age first parturition MIN', 'Age first parturition MAX',
                    'Parturition interval AVG', 'Parturition interval MIN', 'Parturition interval MAX',
                    'Length productive life', 'Daily gain', 'Carcass weight', 'Dressing percentage',
                    'Litter size AVG', 'Litter size MIN', 'Litter size MAX']
# List of units for columns above, gleaned from GUI interface here:
# https://www.fao.org/dad-is/browse-by-country-and-species/en/
units = ['kg']*4 + ['months']*7 + ['day']*3 + ['year', 'g', 'kg', '%'] + ['number']*3
new_colnames = ['{0} ({1})'.format(c, u) for c, u in zip(cols_of_interest, units)]

# For the remaining columns, zero values are non-sensical - should be NaN for log-scale plotting
for c in cols_of_interest:
    mask = dad_is_df[c] == 0
    dad_is_df.loc[mask, c] = np.nan
    
dad_is_df = dad_is_df.dropna(axis=0, how='all', subset=cols_of_interest)
    
# Calculate the number of young per year from the litter size and interval between litters.
# NOTE: not 100% sure this is what the "Parturition interval" means
litter_size = dad_is_df['Litter size AVG']

# Have to convert 0 interval into NaN otherwise we get infinite young per year
litter_interval = dad_is_df['Parturition interval AVG']

young_per_year = litter_size / (litter_interval/365)
dad_is_df['Young per year AVG (number)'] = young_per_year

col_replacement_dict = dict((c, nc) for c, nc in zip(cols_of_interest, new_colnames))
dad_is_df = dad_is_df.rename(columns=col_replacement_dict)

# Save a new cleaner file with units for the relevant columns.
dad_is_df.to_csv('../data/FAO_DAD-IS_cleaned.csv')
dad_is_df.shape

(5544, 112)

In [5]:
# Save number of individual breeds for the relevant columns
counts_by_species = dad_is_df.groupby('Specie')['Breed/Most common name'].count()
counts_by_species.name = 'breed_count'
counts_by_species.to_csv('../data/FAO_DAD-IS_breed_counts_by_species.csv')

In [6]:
# Print the species for which there are > 100 breed representatives
mask = counts_by_species > 100
species_enough_data = counts_by_species[mask].index.tolist()

In [7]:
# Save quantiles for the relevant columns for the species that have enough data.
new_colnames += ['Young per year AVG (number)']
mask = dad_is_df.Specie.isin(species_enough_data)
quantiles_by_species = dad_is_df[mask].groupby('Specie').quantile([0.005, 0.05, 0.25, 0.5, 0.75, 0.95, 0.995])[new_colnames]
quantiles_by_species.index = quantiles_by_species.index.set_names(['Specie', 'quantile'])
quantiles_by_species.to_csv('../data/FAO_DAD-IS_quantiles_by_species.csv')

In [8]:
quantiles_by_species

Unnamed: 0_level_0,Unnamed: 1_level_0,Weight males (kg),Weight females (kg),Birth weight males (kg),Birth weight females (kg),Age maturity males (months),Age maturity females (months),Age breeding males (months),Age breeding females (months),Age first parturition AVG (months),Age first parturition MIN (months),...,Parturition interval MIN (day),Parturition interval MAX (day),Length productive life (year),Daily gain (g),Carcass weight (kg),Dressing percentage (%),Litter size AVG (number),Litter size MIN (number),Litter size MAX (number),Young per year AVG (number)
Specie,quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Buffalo,0.005,307.780,300.000,16.320,15.305,18.22,20.145,24.00,26.660,24.00,22.00,...,67.2,83.245,8.0475,228.125,137.625,23.7370,1.000,1.000,1.00,0.582861
Buffalo,0.050,350.000,320.100,18.012,16.060,20.00,21.000,24.00,30.000,29.90,22.80,...,320.4,427.450,8.4750,262.500,161.250,41.7300,1.000,1.000,1.00,0.614327
Buffalo,0.250,429.275,382.250,25.000,24.200,24.00,25.250,33.80,32.000,36.00,34.00,...,377.0,501.500,10.7500,361.750,187.500,44.0000,1.000,1.000,1.00,0.741730
Buffalo,0.500,520.500,448.250,30.000,27.950,30.00,30.000,36.00,36.000,42.00,36.00,...,400.0,574.000,13.5000,542.500,240.500,48.5000,1.000,1.000,2.00,0.869048
Buffalo,0.750,600.000,494.500,33.000,30.750,33.00,33.405,45.00,45.175,47.50,45.00,...,455.0,645.000,15.0000,680.000,253.250,50.0000,1.000,1.000,2.00,0.939550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sheep,0.250,50.125,40.000,2.840,2.525,7.00,8.000,12.00,12.000,14.00,12.00,...,145.0,155.000,6.0000,141.250,13.000,45.6000,1.100,1.000,1.48,1.300000
Sheep,0.500,72.000,51.000,3.500,3.200,10.00,10.000,18.00,18.000,18.00,15.00,...,220.0,313.500,7.0000,200.000,19.000,48.4500,1.270,1.000,2.00,1.701667
Sheep,0.750,95.000,66.000,4.100,3.750,12.00,12.000,36.00,48.000,20.00,18.00,...,297.5,381.250,8.0000,254.250,23.250,52.0000,1.540,1.050,3.00,2.500250
Sheep,0.950,120.000,82.910,5.000,4.622,18.00,18.000,48.00,60.000,24.00,23.65,...,335.0,417.000,9.0000,350.000,40.000,59.7400,2.024,1.400,4.00,3.041667
