In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the export of the UN FAO Domestic Animal Diversity Information System
# (DAD-IS) and replace a handful of verbose labels with shorter ones wherever
# they appear in the table.
label_fixes = {
    'Duck (domestic)': 'Duck',
    'Duck(domestic) Muscovy duck': 'Muscovy Duck',
    'Goose (domestic)': 'Goose',
    'Dromedary Bactrian Camel': 'Bactrian Camel',
    'Vicuña': 'Vicuna',
    'Yak (domestic)': 'Yak',
}
dad_is_df = pd.read_csv('../data/102821_FAO_DAD-IS_export.csv').replace(label_fixes)

# Quantitative trait columns used throughout the rest of the notebook.
cols_of_interest = [
    'Weight males', 'Weight females', 'Birth weight males', 'Birth weight females',
    'Age maturity males', 'Age maturity females', 'Age breeding males', 'Age breeding females',
    'Age first parturition AVG', 'Age first parturition MIN', 'Age first parturition MAX',
    'Parturition interval AVG', 'Parturition interval MIN', 'Parturition interval MAX',
    'Length productive life', 'Daily gain', 'Carcass weight', 'Dressing percentage',
    'Litter size AVG', 'Litter size MIN', 'Litter size MAX',
]

# Units for the columns above, in the same order. They were read off the
# DAD-IS web interface:
# https://www.fao.org/dad-is/browse-by-country-and-species/en/
units = ['kg']*4 + ['months']*7 + ['d']*3 + ['year', 'g', 'kg', '%'] + ['number']*3
new_colnames = [f'{name} ({unit})' for name, unit in zip(cols_of_interest, units)]

# A value of zero is meaningless for every one of these traits, so treat it as
# missing. NaN (unlike 0) is also safe for log-scale plotting.
traits = dad_is_df[cols_of_interest]
dad_is_df[cols_of_interest] = traits.mask(traits == 0)

# Discard rows that have no data at all in the columns of interest.
dad_is_df = dad_is_df.dropna(how='all', subset=cols_of_interest)

The next few cells print some outlier records to document clear errors in DAD-IS.

In [3]:
# Show records with implausibly large average litter sizes (several exceed 100).
# Pigs are excluded because commercial pigs really can have litters above 10.
is_large_litter = dad_is_df['Litter size AVG'] > 10
is_not_pig = dad_is_df.Specie != 'Pig'
mask = is_large_litter & is_not_pig

# Columns shown in this and the following outlier tables.
cols = [
    'Specie', 'Breed/Most common name',
    'Parturition interval MIN', 'Parturition interval AVG', 'Parturition interval MAX',
    'Litter size AVG', 'Litter size MIN', 'Litter size MAX',
    'Weight males', 'Birth weight males',
    'Weight females', 'Birth weight females',
]
dad_is_df.loc[mask, cols]

Unnamed: 0,Specie,Breed/Most common name,Parturition interval MIN,Parturition interval AVG,Parturition interval MAX,Litter size AVG,Litter size MIN,Litter size MAX,Weight males,Birth weight males,Weight females,Birth weight females
6803,Goat,Boer goat,,,,35.0,,,,,,
9660,Goat,Macedonian Goat,,,,15.0,,,35.0,2.5,45.0,2.2
10909,Goat,Aljabal Alakhdar,,149.0,,128.0,125.0,130.0,36.0,3.34,36.0,3.05
10913,Goat,Jabbali,148.0,152.0,156.0,126.0,120.0,133.0,36.0,3.3,31.0,2.9
13557,Sheep,Barbarine,,,,112.0,,,,3.5,,3.2
13559,Sheep,Noire de Thibar,,,,130.0,130.0,160.0,,4.32,,
13561,Sheep,Sicilo-Sarde,,,,144.0,,,70.0,3.4,45.0,3.2


In [4]:
# Show records whose listed birth weight is more than 90% of the listed adult
# weight, for either sex. Several even exceed the adult weight outright.
male_suspect = dad_is_df['Birth weight males'] > 0.9*dad_is_df['Weight males']
female_suspect = dad_is_df['Birth weight females'] > 0.9*dad_is_df['Weight females']
mask = male_suspect | female_suspect

dad_is_df.loc[mask, cols]

Unnamed: 0,Specie,Breed/Most common name,Parturition interval MIN,Parturition interval AVG,Parturition interval MAX,Litter size AVG,Litter size MIN,Litter size MAX,Weight males,Birth weight males,Weight females,Birth weight females
1398,Sheep,Privorska,,,,1.0,,,52.0,46.0,40.0,44.0
2397,Cattle,Tibetan,,,,,,,215.0,215.8,197.0,197.7
7320,Rabbit,Rexsi Agrinak,14.0,14.0,16.0,,7.0,8.0,2.9,49.94,2.7,55.27
8892,Chicken,ALARABY CHICKENS,,,,,,,1.9,1.9,1.5,1.5
11276,Chicken,Banaba,,,,,,,1.6,29.86,1.5,29.86
13391,Chicken,Khiao Hu0ai Sai chicken,,,,,,,3.5,3.5,2.5,2.5
14028,Rabbit,Californian,,,,,8.0,10.0,4.4,450.0,4.4,


In [5]:
# Show records whose average parturition interval (time between births) is
# too short to be plausible. According to the DAD-IS website the interval is
# reported in days, and fewer than 20 days is too short even for a rabbit.
mask = dad_is_df['Parturition interval AVG'].lt(20)
dad_is_df.loc[mask, cols]

Unnamed: 0,Specie,Breed/Most common name,Parturition interval MIN,Parturition interval AVG,Parturition interval MAX,Litter size AVG,Litter size MIN,Litter size MAX,Weight males,Birth weight males,Weight females,Birth weight females
205,Goat,Criolla del Sur de Mendoza,14.0,12.0,10.0,1.3,1.0,3.0,79.6,2.78,46.79,2.54
892,Cattle,Blanc-Bleu type mixte,10.0,13.0,18.0,,,,1100.0,46.0,700.0,42.0
5086,Sheep,Black Head Somali,,10.46,,,,,29.5,,25.8,
6883,Goat,Creole,,13.0,,1.75,,,,,,
7147,Cattle,Bali,13.0,14.5,17.0,,1.0,3.0,475.0,18.0,250.0,20.0
7151,Cattle,Donggala,15.0,18.0,24.0,,1.0,1.0,266.0,,266.0,
7162,Cattle,Pesisir (sum ),330.0,12.0,390.0,,,,200.0,13.0,150.0,11.0
7168,Cattle,Sumbawa,12.0,14.0,17.0,,1.0,,400.0,26.0,300.0,26.0
7320,Rabbit,Rexsi Agrinak,14.0,14.0,16.0,,7.0,8.0,2.9,49.94,2.7,55.27
8312,Cattle,Puerto Rican,,15.0,,,,,,,,


As documented above, the DAD-IS dataset contains some outliers that are clearly errors. The cell below removes them.

In [6]:
# NOTE: np.nan is used throughout rather than np.NaN; the capitalised alias was
# removed in NumPy 2.0 and raises AttributeError there.

# Birth weights should not be remotely similar to adult weights. Where the
# birth weight exceeds 90% of the adult weight we cannot tell which of the two
# values is wrong, so both are blanked, separately for each sex.
for sex in ('males', 'females'):
    birth_col = 'Birth weight {0}'.format(sex)
    adult_col = 'Weight {0}'.format(sex)
    mask = dad_is_df[birth_col] > 0.9*dad_is_df[adult_col]
    dad_is_df.loc[mask, [birth_col, adult_col]] = np.nan

# None of these animals has a litter size > 30 (being very conservative)
litter_cols = ['Litter size AVG', 'Litter size MIN', 'Litter size MAX']

# Species where litters are surely <= 10 (being very conservative, see below).
species_l10 = 'Goat,Cattle,Sheep'.split(',')
small_litter_species = dad_is_df.Specie.isin(species_l10)
for lc in litter_cols:
    # Blank values above 30 for every species, and values above 10 for the
    # small-litter species. A value above 30 is blanked either way, so a
    # single combined mask gives the same result as two sequential passes.
    mask = (dad_is_df[lc] > 30) | ((dad_is_df[lc] > 10) & small_litter_species)
    dad_is_df.loc[mask, lc] = np.nan

# There are a few goat breeds marked as having litter sizes > 10, even > 100.
# These numbers are absurd and must be in error.
# Data show that even quadruplets are rare, <2% of pregnancies and <3% of live young.
# These numbers do depend on the genotype, however, meaning that larger litters are selectable.
# See references on goat litter sizes:
# 1. M. Peaker, Gestation period and litter size in the goat. Br. Vet. J. 134, 379–383 (1978).
# 2. M. Mellado, et al., Relationship between litter birthweight and litter size in five goat genotypes. Anim. Produc. Sci. 51, 490–490 (2011).

# None of these animals has an interbirth interval of 20 days or less (being very conservative).
# Even rabbits have a gestation period > 20 days (≈30 days)
# See W. K. Wilson, F. J. Dudley, The duration of gestation in rabbit breeds and crosses. J. Genet. 50, 384–391 (1952).
interval_cols = ['Parturition interval AVG', 'Parturition interval MIN', 'Parturition interval MAX']
for ic in interval_cols:
    mask = dad_is_df[ic] <= 20
    dad_is_df.loc[mask, ic] = np.nan


The cell below calculates derived data for the remaining species and puts units in the column names for clarity. 

In [7]:
def _store_kg_and_g(frame, label, values_kg):
    """Write `values_kg` to frame['<label> (kg)'] and 1000x that to frame['<label> (g)']."""
    frame[f'{label} (kg)'] = values_kg
    frame[f'{label} (g)'] = values_kg*1000


# Young per year: average litter size divided by the average interval between
# births, with the interval converted from days to years.
# Zero intervals were already turned into NaN when the data was loaded, so
# they cannot produce infinite values here.
litter_size = dad_is_df['Litter size AVG']
litter_interval = dad_is_df['Parturition interval AVG']
young_per_year = litter_size / (litter_interval/365)
dad_is_df['Young per year AVG (number)'] = young_per_year

# Adult and birth weights: row-wise mean of the male and female columns
# (DataFrame.mean skips missing values by default).
mean_adult_weight_kg = dad_is_df[['Weight males', 'Weight females']].mean(axis=1)
_store_kg_and_g(dad_is_df, 'Adult weight AVG', mean_adult_weight_kg)

mean_birth_weight_kg = dad_is_df[['Birth weight males', 'Birth weight females']].mean(axis=1)
_store_kg_and_g(dad_is_df, 'Birth weight AVG', mean_birth_weight_kg)

# Mass of young produced per year: mean birth weight times young per year.
neonate_mass_kg = dad_is_df['Birth weight AVG (kg)']
_store_kg_and_g(dad_is_df, 'Young mass per year AVG', neonate_mass_kg*young_per_year)

# Append the units to the names of the columns of interest.
col_replacement_dict = dict(zip(cols_of_interest, new_colnames))
dad_is_clean_df = dad_is_df.rename(columns=col_replacement_dict)

# Write the cleaned table, with units in the column names, to a new file.
dad_is_clean_df.to_csv('../data/FAO_DAD-IS_cleaned.csv')
dad_is_clean_df.shape

(5544, 118)

In [8]:
# Count the rows with a non-null breed name for each species and save the
# result as a single-column table named 'breed_count'.
counts_by_species = (
    dad_is_clean_df
    .groupby('Specie')['Breed/Most common name']
    .count()
    .rename('breed_count')
)
counts_by_species.to_csv('../data/FAO_DAD-IS_breed_counts_by_species.csv')

In [9]:
# List the species represented by more than 100 breeds.
mask = counts_by_species.gt(100)
species_enough_data = counts_by_species.loc[mask].index.tolist()
species_enough_data

['Buffalo',
 'Cattle',
 'Chicken',
 'Duck',
 'Goat',
 'Horse',
 'Pig',
 'Rabbit',
 'Sheep']

In [10]:
# Save quantiles of the columns of interest, plus two derived columns, for the
# species that have enough data.

# Extend new_colnames in place, but only with names it does not already
# contain. A plain `+=` would append duplicates each time this cell is re-run
# and so duplicate columns in the output.
derived_colnames = ['Young per year AVG (number)', 'Adult weight AVG (kg)']
new_colnames += [c for c in derived_colnames if c not in new_colnames]

quantile_levels = [0.005, 0.05, 0.25, 0.5, 0.75, 0.95, 0.995]
mask = dad_is_clean_df.Specie.isin(species_enough_data)

# Select the columns BEFORE taking quantiles. Computing quantiles over the
# whole frame is wasteful, and it fails on recent pandas versions, which no
# longer silently skip non-numeric columns such as the breed name.
quantiles_by_species = (
    dad_is_clean_df[mask]
    .groupby('Specie')[new_colnames]
    .quantile(quantile_levels)
)

# The second index level (the quantile value) is unnamed; label both levels so
# the CSV gets meaningful header cells.
quantiles_by_species.index = quantiles_by_species.index.set_names(['Specie', 'quantile'])
quantiles_by_species.to_csv('../data/FAO_DAD-IS_quantiles_by_species.csv')
quantiles_by_species.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Weight males (kg),Weight females (kg),Birth weight males (kg),Birth weight females (kg),Age maturity males (months),Age maturity females (months),Age breeding males (months),Age breeding females (months),Age first parturition AVG (months),Age first parturition MIN (months),...,Parturition interval MAX (d),Length productive life (year),Daily gain (g),Carcass weight (kg),Dressing percentage (%),Litter size AVG (number),Litter size MIN (number),Litter size MAX (number),Young per year AVG (number),Adult weight AVG (kg)
Specie,quantile,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Buffalo,0.005,307.78,300.0,16.32,15.305,18.22,20.145,24.0,26.66,24.0,22.0,...,83.245,8.0475,228.125,137.625,23.737,1.0,1.0,1.0,0.582861,306.475
Buffalo,0.05,350.0,320.1,18.012,16.06,20.0,21.0,24.0,30.0,29.9,22.8,...,427.45,8.475,262.5,161.25,41.73,1.0,1.0,1.0,0.614327,337.0
Buffalo,0.25,429.275,382.25,25.0,24.2,24.0,25.25,33.8,32.0,36.0,34.0,...,501.5,10.75,361.75,187.5,44.0,1.0,1.0,1.0,0.74173,400.0
Buffalo,0.5,520.5,448.25,30.0,27.95,30.0,30.0,36.0,36.0,42.0,36.0,...,574.0,13.5,542.5,240.5,48.5,1.0,1.0,2.0,0.869048,475.125
Buffalo,0.75,600.0,494.5,33.0,30.75,33.0,33.405,45.0,45.175,47.5,45.0,...,645.0,15.0,680.0,253.25,50.0,1.0,1.0,2.0,0.93955,541.25


In [11]:
# Display the list of species that have more than 100 breeds (computed in an earlier cell).
species_enough_data

['Buffalo',
 'Cattle',
 'Chicken',
 'Duck',
 'Goat',
 'Horse',
 'Pig',
 'Rabbit',
 'Sheep']