# New sheep background data
These are new sheep background datasets that supplement the SMARTER-database

* [Welsh sheep breeds](#welsh_breeds)
* [European mouflon and domestic sheep](#barbato_2017)

In [1]:
import re
import logging
from collections import defaultdict

import pandas as pd
from tqdm.notebook import tqdm

from src.features.smarterdb import global_connection, Dataset, SampleSheep
from src.features.plinkio import TextPlinkIO, BinaryPlinkIO, CodingException
from src.data.common import WORKING_ASSEMBLIES

In [2]:
# only CRITICAL records from the plinkio module logger are emitted in this notebook
logger = logging.getLogger('src.features.plinkio')
logger.setLevel(logging.CRITICAL)

# presumably opens the database connection used by the Dataset / SampleSheep
# queries below — TODO confirm in src.features.smarterdb
_ = global_connection()
# assembly passed as `src_assembly` when fetching SNP coordinates
OAR3 = WORKING_ASSEMBLIES["OAR3"]

In [3]:
class CustomMixin():
    """Mixin adding genotype-coding checks to a plink reader class.

    The class it is mixed into must provide ``read_pedfile()`` and
    ``_process_genotypes(line, coding)``.
    """

    # expected number of samples: only used as the progress bar total
    n_of_individuals = None

    def process_pedfile(self, coding="top"):
        """Call ``_process_genotypes`` on every line of the ped file.

        Args:
            coding: coding name forwarded to ``_process_genotypes``.

        Returns:
            True when every line has been processed. Any exception raised
            while processing a line is propagated to the caller.
        """
        for line in tqdm(self.read_pedfile(), total=self.n_of_individuals):
            self._process_genotypes(line, coding)

        return True

    def _pedfile_has_coding(self, coding):
        """Return True if the whole ped file can be processed with ``coding``,
        False if a CodingException is raised while processing it."""
        try:
            # call through self so subclasses overriding process_pedfile are honoured
            return self.process_pedfile(coding=coding)

        except CodingException:
            return False

    def is_top(self):
        """Return True if the ped file passes the 'top' coding check."""
        return self._pedfile_has_coding('top')

    def is_forward(self):
        """Return True if the ped file passes the 'forward' coding check."""
        return self._pedfile_has_coding('forward')

    def is_affymetrix(self):
        """Return True if the ped file passes the 'affymetrix' coding check."""
        return self._pedfile_has_coding('affymetrix')
        
class CustomTextPlinkIO(CustomMixin, TextPlinkIO):
    """TextPlinkIO extended with the coding checks defined in CustomMixin."""

class CustomBinaryPlinkIO(CustomMixin, BinaryPlinkIO):
    """BinaryPlinkIO extended with the coding checks defined in CustomMixin."""

    def process_pedfile(self, coding="top"):
        """Call ``_process_genotypes`` on every record of the binary file.

        The progress bar total is the number of samples reported by the
        underlying plink file, not ``n_of_individuals``.
        Returns True when every record has been processed.
        """
        records = self.read_pedfile()
        n_samples = len(self.plink_file.get_samples())

        for record in tqdm(records, total=n_samples):
            self._process_genotypes(record, coding)

        return True

<a id='welsh_breeds'></a>
## Welsh sheep breeds
This dataset comes from [Beynon, Sarah E. et al. (2016)](https://bmcgenomdata.biomedcentral.com/articles/10.1186/s12863-015-0216-x), in which they genotyped 353 individuals from 18 native Welsh sheep breeds using the Illumina OvineSNP50 array:

In [4]:
# fetch the dataset record by its archive file name, then show the archive contents
welsh_dataset = Dataset.objects.get(file="Welsh_sheep_genotyping.zip")
welsh_dataset.contents

['genotyping data/',
 'genotyping data/WelshSheepBreeds2015.map',
 'genotyping data/WelshSheepBreeds2015.ped',
 'welsh-metadata.openrefine.tar.gz',
 'welsh-metadata.xlsx']

OK, open the dataset and start exploring the data:

In [5]:
# common prefix of the .ped/.map pair inside the dataset working directory
prefix = str(welsh_dataset.working_dir / "genotyping data/WelshSheepBreeds2015")

plinkio = CustomTextPlinkIO(
    prefix=prefix,
    species=welsh_dataset.species,
    chip_name=welsh_dataset.chip_name,
)

# used as the progress bar total while the ped file is processed
plinkio.n_of_individuals = welsh_dataset.n_of_individuals

In [6]:
# presumably populates `plinkio.mapdata` (read in the next cell) — TODO confirm
plinkio.read_mapfile()
# look up SNP coordinates using OAR3 as the source assembly; presumably this is
# what fills `plinkio.filtered` — TODO confirm
plinkio.fetch_coordinates(src_assembly=OAR3)

In [7]:
# entries in `filtered` are counted as SNPs that could not be retrieved
n_filtered = len(plinkio.filtered)
snps_found = len(plinkio.mapdata) - n_filtered

perc_found = snps_found / len(plinkio.mapdata) * 100
perc_missing = round(100 - perc_found, 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs using 'name' ({perc_missing}% missing)")

I can retrieve 51135 of 51135 SNPs using 'name' (0.0% missing)


Is this dataset in top coordinates?

In [8]:
# True if the whole ped file passes the 'top' coding check, False if a
# CodingException is raised while processing it
plinkio.is_top()

  0%|          | 0/353 [00:00<?, ?it/s]

True

Good. This file is already in *top* coordinates. What about breeds?

In [9]:
# the first field of every ped line is used as the breed name
breeds = {line[0] for line in plinkio.read_pedfile()}

print(f"Got {breeds} breeds")

Got {'BadgerFaced', 'TregaronWelshMountain', 'TalybontWelshMountain', 'HardySpeckledFaced', 'SouthWalesWelshMountain', 'Beulah', 'BrecknockHillCheviot', 'Lleyn', 'Llanwenog', 'KerryHill', 'ClunForest', 'BlackWelshMountain', 'DolgellauWelshMountain', 'WelshMountainHillFlock', 'Balwen', 'ImprovedWelshMountain', 'LlandoveryWhiteFaced', 'Llawenog', 'HillRadnor'} breeds


Try to split breed names:

In [10]:
# https://stackoverflow.com/a/29920015
def camel_case_split(identifier):
    """Split a CamelCase identifier into the list of its words.

    A word ends where a lowercase letter is followed by an uppercase one,
    or where an uppercase letter is followed by an uppercase + lowercase
    pair (so runs of capitals stay together). An empty string gives an
    empty list.
    """
    word_pattern = re.compile(
        '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)')

    words = []
    for match in word_pattern.finditer(identifier):
        words.append(match.group(0))

    return words

In [11]:
# map every raw breed identifier to its space-separated, human readable form
breeds2dict = {breed: " ".join(camel_case_split(breed)) for breed in breeds}

print(breeds2dict)

{'BadgerFaced': 'Badger Faced', 'TregaronWelshMountain': 'Tregaron Welsh Mountain', 'TalybontWelshMountain': 'Talybont Welsh Mountain', 'HardySpeckledFaced': 'Hardy Speckled Faced', 'SouthWalesWelshMountain': 'South Wales Welsh Mountain', 'Beulah': 'Beulah', 'BrecknockHillCheviot': 'Brecknock Hill Cheviot', 'Lleyn': 'Lleyn', 'Llanwenog': 'Llanwenog', 'KerryHill': 'Kerry Hill', 'ClunForest': 'Clun Forest', 'BlackWelshMountain': 'Black Welsh Mountain', 'DolgellauWelshMountain': 'Dolgellau Welsh Mountain', 'WelshMountainHillFlock': 'Welsh Mountain Hill Flock', 'Balwen': 'Balwen', 'ImprovedWelshMountain': 'Improved Welsh Mountain', 'LlandoveryWhiteFaced': 'Llandovery White Faced', 'Llawenog': 'Llawenog', 'HillRadnor': 'Hill Radnor'}


Try to create sample metadata table:

In [12]:
# one record per ped line: readable breed name, family id and sample id
data = [
    {"breed": breeds2dict[fid], "fid": fid, "original_id": iid}
    for fid, iid, *_ in plinkio.read_pedfile()
]

welsh_metadata = pd.DataFrame(data=data)
# written to the current working directory, without the DataFrame index
welsh_metadata.to_excel("welsh_metadata.xlsx", index=False)

This file will be imported into OpenRefine in order to fix values and add a breed code for each breed

<a id='barbato_2017'></a>
## European mouflon and domestic sheep
This dataset comes from [Barbato M, Hailer F, Orozco-terWengel P, et al](https://www.nature.com/articles/s41598-017-07382-7) and contains data from *mouflon* and sheep

In [13]:
# fetch the dataset record by its archive file name, then show the archive contents
barbato_2017 = Dataset.objects.get(file="41598_2017_7382_MOESM2_ESM.zip")
barbato_2017.contents

['41598_2017_7382_MOESM2_ESM/',
 '41598_2017_7382_MOESM2_ESM/CIWI.R',
 '41598_2017_7382_MOESM2_ESM/Barbato_2016.bim',
 '41598_2017_7382_MOESM2_ESM/Barbato_2016.fam',
 '41598_2017_7382_MOESM2_ESM/PCAdmix_prettify.cpp',
 '41598_2017_7382_MOESM2_ESM/barbato_muflon.xlsx',
 '41598_2017_7382_MOESM2_ESM/barbato_sheep.xlsx',
 '41598_2017_7382_MOESM2_ESM/.Rhistory',
 '41598_2017_7382_MOESM2_ESM/ovis.sample_index',
 '41598_2017_7382_MOESM2_ESM/Barbato_2016.bed',
 '41598_2017_7382_MOESM2_ESM/barbato_muflon_fix.xlsx',
 '41598_2017_7382_MOESM2_ESM/barbato_muflon_metadata.xlsx']

OK, open the dataset and start exploring the data:

In [14]:
# common prefix of the .bed/.bim/.fam files inside the dataset working directory
prefix = str(barbato_2017.working_dir / "41598_2017_7382_MOESM2_ESM/Barbato_2016")

plinkio = CustomBinaryPlinkIO(
    prefix=prefix,
    species=barbato_2017.species,
    chip_name=barbato_2017.chip_name,
)

# NOTE(review): hard-coded sample count. CustomBinaryPlinkIO.process_pedfile
# takes its progress bar total from the plink file itself, so this value is
# not read by the class as defined in this notebook.
plinkio.n_of_individuals = 422

In [15]:
# presumably populates `plinkio.mapdata` (read in the next cell) — TODO confirm
plinkio.read_mapfile()
# look up SNP coordinates using OAR3 as the source assembly; presumably this is
# what fills `plinkio.filtered` — TODO confirm
plinkio.fetch_coordinates(src_assembly=OAR3)

In [16]:
# entries in `filtered` are counted as SNPs that could not be retrieved
n_filtered = len(plinkio.filtered)
snps_found = len(plinkio.mapdata) - n_filtered

perc_found = snps_found / len(plinkio.mapdata) * 100
perc_missing = round(100 - perc_found, 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs using 'name' ({perc_missing}% missing)")

I can retrieve 36961 of 36961 SNPs using 'name' (0.0% missing)


Is this dataset in top coordinates?

In [17]:
# True if the whole file passes the 'top' coding check, False if a
# CodingException is raised while processing it
plinkio.is_top()

  0%|          | 0/422 [00:00<?, ?it/s]

True

The entire dataset is in TOP coordinates. It is composed of mouflon and sheep samples. Start by considering the mouflon samples:

In [18]:
# per-population summary table of the mouflon samples shipped with the dataset
muflon_table = barbato_2017.working_dir / "41598_2017_7382_MOESM2_ESM/barbato_muflon.xlsx"
muflon = pd.read_excel(muflon_table)
muflon

Unnamed: 0,Breed/population,Acronym,Origin,Number,Ne,F,Source,Ho,Ho (SD)
0,Sardinian mouflon,MSar1,Sardinia,19,261.0,0.45,This study,0.22,0.19
1,Sardinian mouflon,MSar2,Sardinia,8,130.0,0.46,This study,0.22,0.24
2,Sardinian mouflon,MSar3,Sardinia,28,273.0,0.16,KJa,0.34,0.19
3,Spanish mouflon,MSpa,Spain,21,96.0,0.51,KJa,0.2,0.19
4,Hungarian mouflon,MHun,Hungary,8,282.0,0.42,This study,0.24,0.21
5,Corsican mouflon,MCor,Corsica,3,259.0,0.41,This study,0.24,0.27
6,Cypriot mouflon,MCyp,Cyprus,3,244.0,0.78,This study,0.09,0.2
7,Iranian mouflon,MIra,Iran,2,,0.35,NGb,0.25,0.31


Replace origin with the proper country:

In [19]:
# regions are replaced by the country they belong to; Iran gets its long-form name
origin_to_country = {
    "Sardinia": "Italy",
    "Corsica": "France",
    "Iran": "Iran, Islamic Republic of",
}
muflon = muflon.replace({"Origin": origin_to_country})
muflon

Unnamed: 0,Breed/population,Acronym,Origin,Number,Ne,F,Source,Ho,Ho (SD)
0,Sardinian mouflon,MSar1,Italy,19,261.0,0.45,This study,0.22,0.19
1,Sardinian mouflon,MSar2,Italy,8,130.0,0.46,This study,0.22,0.24
2,Sardinian mouflon,MSar3,Italy,28,273.0,0.16,KJa,0.34,0.19
3,Spanish mouflon,MSpa,Spain,21,96.0,0.51,KJa,0.2,0.19
4,Hungarian mouflon,MHun,Hungary,8,282.0,0.42,This study,0.24,0.21
5,Corsican mouflon,MCor,France,3,259.0,0.41,This study,0.24,0.27
6,Cypriot mouflon,MCyp,Cyprus,3,244.0,0.78,This study,0.09,0.2
7,Iranian mouflon,MIra,"Iran, Islamic Republic of",2,,0.35,NGb,0.25,0.31


Add species:

In [20]:
# Species is assigned by population acronym rather than by row label, so the
# result does not depend on the row order of the spreadsheet.
species_by_acronym = {
    "MCyp": "Ovis orientalis ophion",
    "MIra": "Ovis orientalis",
}
# every population not listed above is a European mouflon
muflon["Species"] = (
    muflon["Acronym"].map(species_by_acronym).fillna("Ovis aries musimon")
)
muflon

Unnamed: 0,Breed/population,Acronym,Origin,Number,Ne,F,Source,Ho,Ho (SD),Species
0,Sardinian mouflon,MSar1,Italy,19,261.0,0.45,This study,0.22,0.19,Ovis aries musimon
1,Sardinian mouflon,MSar2,Italy,8,130.0,0.46,This study,0.22,0.24,Ovis aries musimon
2,Sardinian mouflon,MSar3,Italy,28,273.0,0.16,KJa,0.34,0.19,Ovis aries musimon
3,Spanish mouflon,MSpa,Spain,21,96.0,0.51,KJa,0.2,0.19,Ovis aries musimon
4,Hungarian mouflon,MHun,Hungary,8,282.0,0.42,This study,0.24,0.21,Ovis aries musimon
5,Corsican mouflon,MCor,France,3,259.0,0.41,This study,0.24,0.27,Ovis aries musimon
6,Cypriot mouflon,MCyp,Cyprus,3,244.0,0.78,This study,0.09,0.2,Ovis orientalis ophion
7,Iranian mouflon,MIra,"Iran, Islamic Republic of",2,,0.35,NGb,0.25,0.31,Ovis orientalis


Try to define a breed code to be used in database:

In [21]:
# breed code: first three characters of the acronym, upper-cased
muflon["code"] = [acronym[:3].upper() for acronym in muflon["Acronym"]]

Rename columns for simplicity:

In [22]:
muflon = muflon.rename(columns={"Breed/population": "Breed"})

We need to define a metadata table specifying the samples to add:

In [23]:
# family id (stored as "code") and sample id of every genotyped sample
sample_rows = [
    {"code": family_id, "original_id": sample_id}
    for family_id, sample_id, *_ in plinkio.read_pedfile()
]

tmp = pd.DataFrame(data=sample_rows)
tmp.head()

Unnamed: 0,code,original_id
0,MCyp,92_MufloneCy
1,MCyp,101_MufloneCy
2,MCyp,113_MufloneCy
3,MIra,MIra-C3-0001
4,MIra,MIra-D6-0003


In [24]:
# inner join on the population acronym: samples whose family id is not a
# mouflon acronym are dropped. Both frames have a "code" column, so pandas
# suffixes them: code_x is the family id, code_y the three-letter breed code.
muflon_metadata = tmp.merge(muflon, left_on="code", right_on="Acronym").rename(
    columns={"Origin": "country", "code_x": "fid", "code_y": "code"}
)
muflon_metadata

Unnamed: 0,fid,original_id,Breed,Acronym,country,Number,Ne,F,Source,Ho,Ho (SD),Species,code
0,MCyp,92_MufloneCy,Cypriot mouflon,MCyp,Cyprus,3,244.0,0.78,This study,0.09,0.20,Ovis orientalis ophion,MCY
1,MCyp,101_MufloneCy,Cypriot mouflon,MCyp,Cyprus,3,244.0,0.78,This study,0.09,0.20,Ovis orientalis ophion,MCY
2,MCyp,113_MufloneCy,Cypriot mouflon,MCyp,Cyprus,3,244.0,0.78,This study,0.09,0.20,Ovis orientalis ophion,MCY
3,MIra,MIra-C3-0001,Iranian mouflon,MIra,"Iran, Islamic Republic of",2,,0.35,NGb,0.25,0.31,Ovis orientalis,MIR
4,MIra,MIra-D6-0003,Iranian mouflon,MIra,"Iran, Islamic Republic of",2,,0.35,NGb,0.25,0.31,Ovis orientalis,MIR
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,MSar3,SMF24,Sardinian mouflon,MSar3,Italy,28,273.0,0.16,KJa,0.34,0.19,Ovis aries musimon,MSA
88,MSar3,SMF25,Sardinian mouflon,MSar3,Italy,28,273.0,0.16,KJa,0.34,0.19,Ovis aries musimon,MSA
89,MSar3,SMF26,Sardinian mouflon,MSar3,Italy,28,273.0,0.16,KJa,0.34,0.19,Ovis aries musimon,MSA
90,MSar3,SMF27,Sardinian mouflon,MSar3,Italy,28,273.0,0.16,KJa,0.34,0.19,Ovis aries musimon,MSA


There are also additional metadata coming from NEXTGEN for the two Iranian samples, which are named differently in NEXTGEN and in this dataset:

In [25]:
nextgen_index = barbato_2017.working_dir / "41598_2017_7382_MOESM2_ESM/ovis.sample_index"

# the first 10 lines of the file are skipped (presumably a free-text header —
# TODO confirm); the sample name column is exposed as "alias"
nextgen = pd.read_table(nextgen_index, skiprows=10).rename(
    columns={"#sample_name": "alias"}
)

# keep only the rows whose alias contains one of the two sample identifiers
is_selected = nextgen["alias"].str.contains("C3-0001|D6-0003")
nextgen = nextgen.loc[is_selected].copy()
nextgen

Unnamed: 0,alias,sample_accession,biosamples_id,sample_provider,species,taxonomy_id,breed,country,closest_city,closest_locality,estimated_age_months,sex,longitude,latitude,sampling_date,photographs
20,IROO-C3-0001,ERS154526,SAMEA2012637,NEXTGEN,Ovis orientalis,469796,.,Iran,Marand,Marakan,60,male,45.385152,38.931678,2011-01-26,.
23,IROO-D6-0003,ERS154528,SAMEA2012639,NEXTGEN,Ovis orientalis,469796,.,Iran,Urmia,Kaboudan,.,male,45.599,37.49,2011-02-22,.


Linking ids between datasets:

In [26]:
# map the NEXTGEN alias (IROO prefix) to the sample id used in this dataset
# (MIra prefix) for the two selected samples
nextgen2barbato = {
    sample.replace('MIra', 'IROO'): sample
    for sample in plinkio.get_samples()
    if 'C3-0001' in sample or 'D6-0003' in sample
}

nextgen["original_id"] = nextgen["alias"].apply(lambda alias: nextgen2barbato[alias])
nextgen

Unnamed: 0,alias,sample_accession,biosamples_id,sample_provider,species,taxonomy_id,breed,country,closest_city,closest_locality,estimated_age_months,sex,longitude,latitude,sampling_date,photographs,original_id
20,IROO-C3-0001,ERS154526,SAMEA2012637,NEXTGEN,Ovis orientalis,469796,.,Iran,Marand,Marakan,60,male,45.385152,38.931678,2011-01-26,.,MIra-C3-0001
23,IROO-D6-0003,ERS154528,SAMEA2012639,NEXTGEN,Ovis orientalis,469796,.,Iran,Urmia,Kaboudan,.,male,45.599,37.49,2011-02-22,.,MIra-D6-0003


Join metadata table:

In [27]:
# left join: every mouflon sample is kept, only those listed in `nextgen`
# receive the additional columns
muflon_metadata = muflon_metadata.merge(nextgen, on="original_id", how="left")

Now write them into metadata files:

In [28]:
# written to the current working directory, without the DataFrame index
muflon_metadata.to_excel("barbato_muflon_metadata.xlsx", index=False)

Now it's time to take a look at the sheep samples:

In [29]:
# per-breed summary table of the sheep samples shipped with the dataset
sheep_table = barbato_2017.working_dir / "41598_2017_7382_MOESM2_ESM/barbato_sheep.xlsx"
sheep = pd.read_excel(sheep_table)
sheep

Unnamed: 0,Breed/population,Acronym,Origin,Number,Ho (SD),Ne,F,Source,Ho
0,Altamurana,ALT,Italy,24,0.16,628,0.06,KJa,0.37
1,Australian Merino,ASM,Spain,24,0.15,920,0.06,KJa,0.37
2,Castellana,CAS,Spain,23,0.16,813,0.02,KJa,0.38
3,Chios,CHI,Greece,23,0.17,391,0.15,KJa,0.33
4,Churra,CHU,Spain,24,0.16,617,0.05,KJa,0.37
5,Comisana,COM,Italy,24,0.16,1028,0.03,KJa,0.38
6,Cyprus Fat Tail,CFT,Cyprus,24,0.19,186,0.13,KJa,0.34
7,Iranian sheep,IRS,Iran,6,0.22,412,0.05,NGb,0.37
8,Milk Lacaune,LAC,France,24,0.16,607,0.06,KJa,0.37
9,Nera di Arbus sheep,SAB,Sardinia,20,0.18,366,0.08,KJa,0.36


Replace origin with the proper country:

In [30]:
# regions are replaced by the country they belong to; Iran gets its long-form name
origin_to_country = {
    "Sardinia": "Italy",
    "Corsica": "France",
    "Iran": "Iran, Islamic Republic of",
}
sheep = sheep.replace({"Origin": origin_to_country})
sheep

Unnamed: 0,Breed/population,Acronym,Origin,Number,Ho (SD),Ne,F,Source,Ho
0,Altamurana,ALT,Italy,24,0.16,628,0.06,KJa,0.37
1,Australian Merino,ASM,Spain,24,0.15,920,0.06,KJa,0.37
2,Castellana,CAS,Spain,23,0.16,813,0.02,KJa,0.38
3,Chios,CHI,Greece,23,0.17,391,0.15,KJa,0.33
4,Churra,CHU,Spain,24,0.16,617,0.05,KJa,0.37
5,Comisana,COM,Italy,24,0.16,1028,0.03,KJa,0.38
6,Cyprus Fat Tail,CFT,Cyprus,24,0.19,186,0.13,KJa,0.34
7,Iranian sheep,IRS,"Iran, Islamic Republic of",6,0.22,412,0.05,NGb,0.37
8,Milk Lacaune,LAC,France,24,0.16,607,0.06,KJa,0.37
9,Nera di Arbus sheep,SAB,Italy,20,0.18,366,0.08,KJa,0.36


Many of these samples seem to be already in the SMARTER database as *sheep hapmap data*. Try to filter out the samples I already have:

In [31]:
# dataset record of the sheep HapMap data, used below to find samples already stored
sheep_hapmap = Dataset.objects.get(file="ovine_SNP50HapMap_data.zip")

In [32]:
acronyms = sheep["Acronym"].values
new_samples = []

for family_id, sample_id, *_ in plinkio.read_pedfile():
    # ignore samples whose family id is not one of the sheep breed acronyms
    if family_id not in acronyms:
        continue

    # ignore samples already stored for the sheep HapMap dataset (one query per sample)
    n_stored = SampleSheep.objects.filter(
        dataset=sheep_hapmap, original_id=sample_id).count()
    if n_stored != 0:
        continue

    new_samples.append({"code": family_id, "original_id": sample_id})

tmp = pd.DataFrame(data=new_samples)
tmp.head()

Unnamed: 0,code,original_id
0,SAR,7_C5-1999-A
1,SAR,8_C5-2000-A
2,SAR,9_C5-2001-A
3,SAR,58_PecoraS
4,SAR,59_PecoraS


In [33]:
# attach breed name and country to every new sample, keeping only the
# columns needed for the metadata table
sheep_metadata = (
    tmp.merge(sheep, left_on="code", right_on="Acronym")
    .loc[:, ["original_id", "Breed/population", "code", "Origin"]]
    .rename(columns={"Breed/population": "breed", "Origin": "country"})
)
sheep_metadata

Unnamed: 0,original_id,breed,code,country
0,7_C5-1999-A,Sarda sheep,SAR,Italy
1,8_C5-2000-A,Sarda sheep,SAR,Italy
2,9_C5-2001-A,Sarda sheep,SAR,Italy
3,58_PecoraS,Sarda sheep,SAR,Italy
4,59_PecoraS,Sarda sheep,SAR,Italy
5,60_PecoraS,Sarda sheep,SAR,Italy
6,61_PecoraS,Sarda sheep,SAR,Italy
7,67_PecoraS,Sarda sheep,SAR,Italy
8,70_PecoraS,Sarda sheep,SAR,Italy
9,91_PecoraS,Sarda sheep,SAR,Italy


There are also additional metadata coming from NEXTGEN for some Iranian samples:

In [34]:
nextgen_index = barbato_2017.working_dir / "41598_2017_7382_MOESM2_ESM/ovis.sample_index"

# the first 10 lines of the file are skipped (presumably a free-text header — TODO confirm)
nextgen = pd.read_table(nextgen_index, skiprows=10).rename(
    columns={"#sample_name": "sample_name"}
)

# left join: every sheep sample is kept. "breed" and "country" exist in both
# frames, so the result carries breed_x/country_x (paper table) and
# breed_y/country_y (NEXTGEN) columns.
sheep_metadata = sheep_metadata.merge(
    nextgen, left_on="original_id", right_on="sample_name", how="left"
)
sheep_metadata.tail()

Unnamed: 0,original_id,breed_x,code,country_x,sample_name,sample_accession,biosamples_id,sample_provider,species,taxonomy_id,breed_y,country_y,closest_city,closest_locality,estimated_age_months,sex,longitude,latitude,sampling_date,photographs
19,IROA-B4-5190,Iranian sheep,IRS,"Iran, Islamic Republic of",IROA-B4-5190,ERS154865,SAMEA2012929,NEXTGEN,Ovis aries,9940.0,.,Iran,salmas,salmas,36,female,44.838767,38.153121,2011-10-27,"IROA-B4-5190c.JPG,IROA-B4-5190a.JPG,IROA-B4-51..."
20,IROA-B5-5295,Iranian sheep,IRS,"Iran, Islamic Republic of",IROA-B5-5295,ERS154863,SAMEA2012927,NEXTGEN,Ovis aries,9940.0,.,Iran,urumie,urumie,24,male,44.952849,37.972674,2011-10-26,"IROA-B5-5295a.JPG,IROA-B5-5295c.JPG,IROA-B5-52..."
21,IROA-D6-5152,Iranian sheep,IRS,"Iran, Islamic Republic of",IROA-D6-5152,ERS154866,SAMEA2012930,NEXTGEN,Ovis aries,9940.0,.,Iran,ajabshir,ajabshir,24,female,45.877259,37.470274,2011-10-25,"IROA-D6-5152a.JPG,IROA-D6-5152c.JPG,IROA-D6-51..."
22,IROA-F3-5142,Iranian sheep,IRS,"Iran, Islamic Republic of",IROA-F3-5142,ERS154867,SAMEA2012931,NEXTGEN,Ovis aries,9940.0,.,Iran,ahar,ahar,48,female,46.85095,38.52272,2011-11-15,"IROA-F3-5142c.JPG,IROA-F3-5142a.JPG,IROA-F3-51..."
23,IROA-G4-5205,Iranian sheep,IRS,"Iran, Islamic Republic of",IROA-G4-5205,ERS154862,SAMEA2012926,NEXTGEN,Ovis aries,9940.0,.,Iran,niaz,Meshkin shahr,84,female,47.430937,38.3934,2011-11-14,"IROA-G4-5205c.JPG,IROA-G4-5205a.JPG,IROA-G4-52..."


Write metadata to file:

In [35]:
# written to the current working directory, without the DataFrame index
sheep_metadata.to_excel("barbato_sheep_metadata.xlsx", index=False)