# Burren Swiss goat
Here are some goat samples coming from [Burren et al. 2016](https://doi.org/10.1111/age.12476)

In [1]:
import csv
import logging
import pathlib

import pandas as pd

from tqdm.notebook import tqdm

from src.features.plinkio import TextPlinkIO, CodingException
from src.features.smarterdb import global_connection, SampleGoat, Dataset, Breed
from src.data.common import WORKING_ASSEMBLIES

In [2]:
# NOTE(review): global_connection presumably opens the SMARTER database
# connection required by the Dataset/Breed queries below — confirm
conn = global_connection()
# Assembly entry passed later as `src_assembly` when fetching SNP coordinates
ARS1 = WORKING_ASSEMBLIES["ARS1"]

In [3]:
# Module-level logger used to report coding errors from the checks below
logger = logging.getLogger(__name__)


class CustomMixin():
    """Add progress-bar aware coding checks to a PLINK reader class.

    The host class must provide ``read_pedfile()`` (an iterable of ped lines)
    and ``_process_genotypes(line, coding)``.
    NOTE(review): ``_process_genotypes`` is presumably the method raising
    ``CodingException`` when a genotype doesn't match the coding — confirm.
    """

    # Number of samples in the ped file: only used as the progress bar total
    n_of_individuals = None

    def process_pedfile(self, coding="top"):
        """Process every ped line with the requested coding.

        Args:
            coding: coding name forwarded to ``_process_genotypes``.

        Returns:
            True when all lines were processed; any exception raised while
            processing a line propagates to the caller.
        """
        for line in tqdm(self.read_pedfile(), total=self.n_of_individuals):
            self._process_genotypes(line, coding)

        return True

    def _check_coding(self, coding):
        """Return True if the whole ped file can be processed with `coding`.

        A ``CodingException`` is logged and turned into False; any other
        exception is not handled here.
        """
        try:
            return self.process_pedfile(coding=coding)

        except CodingException as exc:
            logger.error(exc)
            return False

    def is_top(self):
        """Check the ped file against the 'top' coding."""
        return self._check_coding('top')

    def is_forward(self):
        """Check the ped file against the 'forward' coding."""
        return self._check_coding('forward')

    def is_affymetrix(self):
        """Check the ped file against the 'affymetrix' coding."""
        return self._check_coding('affymetrix')

    def is_illumina(self):
        """Check the ped file against the 'illumina' coding."""
        return self._check_coding('illumina')
        
class CustomTextPlinkIO(CustomMixin, TextPlinkIO):
    """``TextPlinkIO`` reader extended with the ``CustomMixin`` coding checks."""

In [4]:
# Fetch the dataset document registered for the Burren et al. archive
burren_dataset = Dataset.objects.get(file="burren_et_al_2016.zip")
# Display the list of files contained in the archive
burren_dataset.contents

['doi_10.5061_dryad.q1cv6__v1/',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0006-figs6.pdf',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0003-figs3.pdf',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0005-figs5.pdf',
 'doi_10.5061_dryad.q1cv6__v1/goat_data2_dryad.log',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0010-tables2.pdf',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0011-tables3.pdf',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0009-tables1.pdf',
 'doi_10.5061_dryad.q1cv6__v1/goat_data2_dryad.tfam',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0001-figs1.pdf',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0012-tables4.xlsx',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0002-figs2.pdf',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0008-figs8.pdf',
 'doi_10.5061_dryad.q1cv6__v1/goat_data2_dryad_recoding.xlsx',
 'doi_10.5061_dryad.q1cv6__v1/goat_data2_dryad.nosex',
 'doi_10.5061_dryad.q1cv6__v1/goat_data2_dryad.map',
 'doi_10.5061_dryad.q1cv6__v1/age12476-sup-0004-figs4.pdf',
 'doi_10.5061_dryad.

This dataset was originally a transposed PLINK file, so it was converted into a standard PLINK text file, which was then added to the dataset.

In [5]:
# Path prefix (no extension) of the PLINK text files inside the dataset working directory
prefix = str(burren_dataset.working_dir / "doi_10.5061_dryad.q1cv6__v1/goat_data2_dryad")

plinkio = CustomTextPlinkIO(
    prefix=prefix,
    species=burren_dataset.species,
    chip_name=burren_dataset.chip_name,
)

# Gives the progress bar in `process_pedfile` its total
plinkio.n_of_individuals = burren_dataset.n_of_individuals

In [6]:
# NOTE(review): presumably loads the SNP list into `plinkio.mapdata` — confirm
plinkio.read_mapfile()
# NOTE(review): presumably looks up each SNP on the ARS1 assembly and records
# the ones it cannot find in `plinkio.filtered` — confirm
plinkio.fetch_coordinates(src_assembly=ARS1)

In [7]:
# SNPs listed in the map file minus the ones that were filtered out
total_snps = len(plinkio.mapdata)
snps_found = total_snps - len(plinkio.filtered)

# percentage of map-file SNPs that could not be retrieved, two decimals
perc_missing = round(100 - (snps_found / total_snps * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs using 'name' ({perc_missing}% missing)")

I can retrieve 48019 of 48019 SNPs using 'name' (0.0% missing)


Is this dataset in top coding?

In [8]:
# Run every ped line through the 'top' coding check
plinkio.is_top()

  0%|          | 0/473 [00:00<?, ?it/s]

True

In [9]:
# Collect the distinct values found in the first ped column (the FID)
breeds = {line[0] for line in plinkio.read_pedfile()}

print(f"Got {breeds} breeds")

Got {'1'} breeds


Well, here's the first problem: all the animals have the same breed (which is *1*). There is an additional metadata file reporting the proper sample-to-breed correspondence; it has no header and holds several tables on the same sheet.

In [10]:
# Recoding sheet shipped with the dataset: it has no header row, so columns get integer labels
recoding_path = burren_dataset.working_dir / "doi_10.5061_dryad.q1cv6__v1/goat_data2_dryad_recoding.xlsx"
metadata = pd.read_excel(recoding_path, header=None)
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       473 non-null    int64  
 1   1       473 non-null    object 
 2   2       473 non-null    int64  
 3   3       473 non-null    int64  
 4   4       473 non-null    int64  
 5   5       473 non-null    int64  
 6   6       0 non-null      float64
 7   7       473 non-null    int64  
 8   8       473 non-null    object 
 9   9       473 non-null    int64  
 10  10      473 non-null    int64  
 11  11      473 non-null    int64  
 12  12      473 non-null    int64  
 13  13      0 non-null      float64
 14  14      0 non-null      float64
 15  15      11 non-null     object 
 16  16      11 non-null     object 
dtypes: float64(3), int64(10), object(4)
memory usage: 62.9+ KB


In [11]:
# Preview the first rows: column labels are integers because the sheet was read with header=None
metadata.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,2,BST01,0,0,0,-9,,1,goat1,0,0,0,-9,,,Breed Number,Breed abbreviation publication Animal Genetics
1,2,BST02,0,0,0,-9,,1,goat2,0,0,0,-9,,,1,APP
2,2,BST06,0,0,0,-9,,1,goat3,0,0,0,-9,,,2,GST
3,2,BST07,0,0,0,-9,,1,goat4,0,0,0,-9,,,3,TGR
4,2,BST13,0,0,0,-9,,1,goat5,0,0,0,-9,,,4,CHA


Ok get rid of the columns I don't need:

In [12]:
# Keep only columns 0, 1, 8, 15 and 16
unused_columns = [2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14]
metadata = metadata.drop(columns=unused_columns)
metadata.head()

Unnamed: 0,0,1,8,15,16
0,2,BST01,goat1,Breed Number,Breed abbreviation publication Animal Genetics
1,2,BST02,goat2,1,APP
2,2,BST06,goat3,2,GST
3,2,BST07,goat4,3,TGR
4,2,BST13,goat5,4,CHA


Try to extract breed abbreviation with their numbers:

In [13]:
# The breed legend lives in columns 15-16; row 0 carries its header text, so it is discarded
breeds = (
    metadata[[15, 16]]
    .dropna()
    .drop(index=0)
    .rename(columns={15: "Number", 16: "Code"})
)
breeds

Unnamed: 0,Number,Code
1,1,APP
2,2,GST
3,3,TGR
4,4,CHA
5,5,VAG
6,6,NVE
7,7,PEA
8,8,SAA
9,9,SGB
10,10,TOG


Create a dictionary with number to code:

In [14]:
# breed number -> breed abbreviation, paired row by row
idx2code = dict(zip(breeds["Number"], breeds["Code"]))
idx2code

{1: 'APP',
 2: 'GST',
 3: 'TGR',
 4: 'CHA',
 5: 'VAG',
 6: 'NVE',
 7: 'PEA',
 8: 'SAA',
 9: 'SGB',
 10: 'TOG'}

Now return to metadata and try to assign codes:

In [15]:
# Columns 0, 1 and 8 hold breed number, original sample id and alias;
# the breed code is looked up from the number (an unknown number raises KeyError)
samples = (
    metadata[[0, 1, 8]]
    .rename(columns={0: "code_number", 1: "original_id", 8: "alias"})
    .assign(code=lambda frame: frame["code_number"].apply(lambda number: idx2code[number]))
)
samples.head()

Unnamed: 0,code_number,original_id,alias,code
0,2,BST01,goat1,GST
1,2,BST02,goat2,GST
2,2,BST06,goat3,GST
3,2,BST07,goat4,GST
4,2,BST13,goat5,GST


Try to define full breed name, relying on SMARTER database:

In [16]:
# Breed abbreviation (as found in the recoding sheet) -> full breed name
code2breed = {
    'APP': 'Appenzell',
    'GST': 'Grisons striped',
    'TGR': 'Tessin grey',
    'CHA': 'Chamois colored',
    'VAG': 'Valais',
    'NVE': 'Nera Verzasca',
    'PEA': 'Peacock',
    'SAA': 'Saanen',
    'SGB': 'Booted',
    'TOG': 'Toggenburg'
}

Ensure that each code is either new or already associated with the same breed within the database:

In [17]:
# Report every code that already exists for goats in the database,
# side by side with the name used in this dataset
for code, name in code2breed.items():
    matches = Breed.objects.filter(code=code, species="Goat")

    if matches.count() == 0:
        continue

    breed = matches.get()
    print(f"Found code '{code}': '{breed}' for Burren '{name}'")

Found code 'APP': 'Appenzell (APP) Goat' for Burren 'Appenzell'
Found code 'GST': 'Grisons striped (GST) Goat' for Burren 'Grisons striped'
Found code 'TGR': 'Tessin grey (TGR) Goat' for Burren 'Tessin grey'
Found code 'CHA': 'Chappar (CHA) Goat' for Burren 'Chamois colored'
Found code 'VAG': 'Valais (VAG) Goat' for Burren 'Valais'
Found code 'NVE': 'Nera Verzasca (NVE) Goat' for Burren 'Nera Verzasca'
Found code 'PEA': 'Peacock (PEA) Goat' for Burren 'Peacock'
Found code 'SAA': 'Saanen (SAA) Goat' for Burren 'Saanen'
Found code 'SGB': 'Booted (SGB) Goat' for Burren 'Booted'
Found code 'TOG': 'Toggenburg (TOG) Goat' for Burren 'Toggenburg'


Ok, the 'CHA' code is already used in the database by *Chappar*, while in this dataset it stands for *Chamois colored*, which is a breed I know as *Alpine (Camosciata delle Alpi)*. Recode these samples as 'ALP':

In [18]:
# Assign the result back: an in-place replace on `samples["code"]` acts on a
# temporary object and is not guaranteed to update `samples` (it doesn't under
# pandas Copy-on-Write)
samples["code"] = samples["code"].replace({"CHA": "ALP"})
samples[samples["code"] == 'ALP'].head()

Unnamed: 0,code_number,original_id,alias,code
26,4,GFG01a,goat27,ALP
27,4,GFG03,goat28,ALP
28,4,GFG05a,goat29,ALP
29,4,GFG25a,goat30,ALP
30,4,GFG26,goat31,ALP


Ok, fix `code2breed` and test again:

In [19]:
# Swap the 'CHA' entry for 'ALP' (safe to re-run: a missing 'CHA' is ignored)
code2breed.pop('CHA', None)
code2breed['ALP'] = 'Alpine (Camosciata delle Alpi)'

# Same check as before, on the updated codes
for code, name in code2breed.items():
    matches = Breed.objects.filter(code=code, species="Goat")

    if matches.count() == 0:
        continue

    breed = matches.get()
    print(f"Found code '{code}': '{breed}' for Burren '{name}'")

Found code 'APP': 'Appenzell (APP) Goat' for Burren 'Appenzell'
Found code 'GST': 'Grisons striped (GST) Goat' for Burren 'Grisons striped'
Found code 'TGR': 'Tessin grey (TGR) Goat' for Burren 'Tessin grey'
Found code 'VAG': 'Valais (VAG) Goat' for Burren 'Valais'
Found code 'NVE': 'Nera Verzasca (NVE) Goat' for Burren 'Nera Verzasca'
Found code 'PEA': 'Peacock (PEA) Goat' for Burren 'Peacock'
Found code 'SAA': 'Saanen (SAA) Goat' for Burren 'Saanen'
Found code 'SGB': 'Booted (SGB) Goat' for Burren 'Booted'
Found code 'TOG': 'Toggenburg (TOG) Goat' for Burren 'Toggenburg'
Found code 'ALP': 'Alpine (Camosciata delle Alpi) (ALP) Goat' for Burren 'Alpine (Camosciata delle Alpi)'


Ok, try to assign a breed name to every sample:

In [20]:
# Full breed name for each sample; a code missing from `code2breed` raises KeyError
samples["breed"] = [code2breed[code] for code in samples["code"]]
samples.head()

Unnamed: 0,code_number,original_id,alias,code,breed
0,2,BST01,goat1,GST,Grisons striped
1,2,BST02,goat2,GST,Grisons striped
2,2,BST06,goat3,GST,Grisons striped
3,2,BST07,goat4,GST,Grisons striped
4,2,BST13,goat5,GST,Grisons striped


Need to add a country column (to add breed with the script `import_breeds.py`)

In [21]:
# Every sample in this dataset gets the same country value
samples["country"] = "Switzerland"
samples.head()

Unnamed: 0,code_number,original_id,alias,code,breed,country
0,2,BST01,goat1,GST,Grisons striped,Switzerland
1,2,BST02,goat2,GST,Grisons striped,Switzerland
2,2,BST06,goat3,GST,Grisons striped,Switzerland
3,2,BST07,goat4,GST,Grisons striped,Switzerland
4,2,BST13,goat5,GST,Grisons striped,Switzerland


This dataset comes with a phenotype table:

In [22]:
phenotypes = pd.read_excel(burren_dataset.working_dir / "doi_10.5061_dryad.q1cv6__v1/burren_phenotypes.xlsx")
# Strip surrounding whitespace from every string cell, leaving other values untouched.
# Done column by column with Series.map: DataFrame.applymap is deprecated in recent
# pandas, while this form works on both old and new releases.
phenotypes = phenotypes.apply(
    lambda column: column.map(lambda x: x.strip() if isinstance(x, str) else x)
)
phenotypes.head()

Unnamed: 0,Breed,Code,Coat color,Hair,Horns?,Size (♂/♀),Performance,Main / Rare?,Note
0,Appenzell,APP,Solid white,medium-length hair,no,85cm / 75cm,"Milk production (592kg, 2.88% fat, 2.63% pr...",Rare,"Average milk yield (kg), fat content (%) and p..."
1,Grisons striped,GST,"Black with white limbs, tail and stripes on th...",short smooth hair,yes,85cm / 75cm,"Robustness, Milk production (445kg, 3.44% f...",Rare,"Average milk yield (kg), fat content (%) and p..."
2,Tessin grey,TGR,Silver to slate grey,,yes,75-85cm / 70-80cm,"Robustness, Milk and meat production",Rare,
3,Alpine (Camosciata delle Alpi),ALP,"Brown with black markings on the head, legs a...",short hair,horned or polled,85cm / 75cm,"Milk production (621kg, 3.44% fat, 3.04% pr...",Main,"Average milk yield (kg), fat content (%) and p..."
4,Valais,VAG,"Frontquarters black or brown, hindquarters white",long fine hair,yes,85cm / 75cm,"Robustness, Meat production",Rare,


Fix column names:

In [23]:
# Replace the spreadsheet headers with short snake_case names (same column order)
phenotypes.columns = ["breed", "code", "coat_color", "hair", "horns", "size", "performance", "rare", "note"]
phenotypes.head()

Unnamed: 0,breed,code,coat_color,hair,horns,size,performance,rare,note
0,Appenzell,APP,Solid white,medium-length hair,no,85cm / 75cm,"Milk production (592kg, 2.88% fat, 2.63% pr...",Rare,"Average milk yield (kg), fat content (%) and p..."
1,Grisons striped,GST,"Black with white limbs, tail and stripes on th...",short smooth hair,yes,85cm / 75cm,"Robustness, Milk production (445kg, 3.44% f...",Rare,"Average milk yield (kg), fat content (%) and p..."
2,Tessin grey,TGR,Silver to slate grey,,yes,75-85cm / 70-80cm,"Robustness, Milk and meat production",Rare,
3,Alpine (Camosciata delle Alpi),ALP,"Brown with black markings on the head, legs a...",short hair,horned or polled,85cm / 75cm,"Milk production (621kg, 3.44% fat, 3.04% pr...",Main,"Average milk yield (kg), fat content (%) and p..."
4,Valais,VAG,"Frontquarters black or brown, hindquarters white",long fine hair,yes,85cm / 75cm,"Robustness, Meat production",Rare,


Try to simplify columns:

In [24]:
# "size" is formatted as "<male> / <female>". The vectorized .str accessor
# propagates missing or malformed values as NaN instead of raising.
size_parts = phenotypes["size"].str.split("/")
phenotypes["size_male"] = size_parts.str[0].str.strip()
phenotypes["size_female"] = size_parts.str[1].str.strip()

# Explicit label -> boolean mapping: avoids relying on the deprecated implicit
# downcasting done by replace(); labels other than Rare/Main become missing values
phenotypes["rare"] = phenotypes["rare"].str.strip().map({"Rare": True, "Main": False})
phenotypes.head()

Unnamed: 0,breed,code,coat_color,hair,horns,size,performance,rare,note,size_male,size_female
0,Appenzell,APP,Solid white,medium-length hair,no,85cm / 75cm,"Milk production (592kg, 2.88% fat, 2.63% pr...",True,"Average milk yield (kg), fat content (%) and p...",85cm,75cm
1,Grisons striped,GST,"Black with white limbs, tail and stripes on th...",short smooth hair,yes,85cm / 75cm,"Robustness, Milk production (445kg, 3.44% f...",True,"Average milk yield (kg), fat content (%) and p...",85cm,75cm
2,Tessin grey,TGR,Silver to slate grey,,yes,75-85cm / 70-80cm,"Robustness, Milk and meat production",True,,75-85cm,70-80cm
3,Alpine (Camosciata delle Alpi),ALP,"Brown with black markings on the head, legs a...",short hair,horned or polled,85cm / 75cm,"Milk production (621kg, 3.44% fat, 3.04% pr...",False,"Average milk yield (kg), fat content (%) and p...",85cm,75cm
4,Valais,VAG,"Frontquarters black or brown, hindquarters white",long fine hair,yes,85cm / 75cm,"Robustness, Meat production",True,,85cm,75cm


Save phenotypes and samples to new files:

In [25]:
# Both files are written with relative paths, i.e. in the current working directory
samples.to_excel("burren_samples_fix.xlsx")
phenotypes.to_excel("burren_phenotypes_fix.xlsx")

I need to update the FID in the PLINK files, because otherwise I will not be able to resolve the breed-to-sample correspondence. Take the samples dataframe and set the aliases as its index:

In [26]:
# Index samples by alias so that rows can be looked up with the individual id
# read from the ped file. Not re-runnable as is: once "alias" is the index it
# is no longer a column.
samples = samples.set_index("alias")
samples.head()

Unnamed: 0_level_0,code_number,original_id,code,breed,country
alias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
goat1,2,BST01,GST,Grisons striped,Switzerland
goat2,2,BST02,GST,Grisons striped,Switzerland
goat3,2,BST06,GST,Grisons striped,Switzerland
goat4,2,BST07,GST,Grisons striped,Switzerland
goat5,2,BST13,GST,Grisons striped,Switzerland


Now iterate over the ped file replacing fid:

In [27]:
ped_file = pathlib.Path(plinkio.pedfile)
# Same file name with a "_fix" suffix, written in the current working directory
ped_fix = ped_file.stem + "_fix" + ped_file.suffix

# alias -> breed code, built once instead of a DataFrame lookup for every ped line
alias2code = samples["code"].to_dict()

# newline="" as required by the csv module, so the writer alone controls line endings
with open(ped_fix, "w", newline="") as handle:
    writer = csv.writer(handle, delimiter=' ', lineterminator="\n")

    for line in plinkio.read_pedfile():
        # the second ped column is the individual id, matching the aliases;
        # an unknown id raises KeyError rather than writing a wrong FID
        line[0] = alias2code[line[1]]
        writer.writerow(line)

Data were written to `goat_data2_dryad_fix.ped`. This file needs to be placed in the dataset.