# New sheep background data
These are new sheep background data to be integrated into the SMARTER-database:

* [Welsh sheep breeds](#welsh_breeds)

In [1]:
import re
import logging
from collections import defaultdict

import pandas as pd
from tqdm.notebook import tqdm

from src.features.smarterdb import global_connection, Dataset
from src.features.plinkio import TextPlinkIO, CodingException
from src.data.common import WORKING_ASSEMBLIES

In [2]:
# Raise the threshold of the 'src.features.plinkio' logger so that only
# CRITICAL records get through while the PED file is processed
logger = logging.getLogger('src.features.plinkio')
logger.setLevel(logging.CRITICAL)

# presumably opens the SMARTER-database connection (return value unused) -- TODO confirm
_ = global_connection()
# assembly entry passed later as ``src_assembly`` to ``fetch_coordinates``
OAR3 = WORKING_ASSEMBLIES["OAR3"]

In [3]:
class CustomMixin():
    """Mixin adding genotype-coding checks to a PLINK text reader.

    The class this is mixed into has to provide ``read_pedfile()`` and
    ``_process_genotypes(line, coding)``.
    """

    # forwarded to tqdm as ``total``; None leaves the progress bar length unset
    n_of_individuals = None

    def process_pedfile(self, coding="top"):
        """Feed every PED line to ``_process_genotypes`` with ``coding``.

        Returns True after the last line. Exceptions raised while
        processing a line are not caught here.
        """
        for record in tqdm(self.read_pedfile(), total=self.n_of_individuals):
            self._process_genotypes(record, coding)

        return True

    def _pedfile_has_coding(self, coding):
        """Return the result of ``process_pedfile`` or False on CodingException."""
        try:
            return self.process_pedfile(coding=coding)

        except CodingException:
            return False

    def is_top(self):
        """Check the PED file against the 'top' coding."""
        return self._pedfile_has_coding('top')

    def is_forward(self):
        """Check the PED file against the 'forward' coding."""
        return self._pedfile_has_coding('forward')

    def is_affymetrix(self):
        """Check the PED file against the 'affymetrix' coding."""
        return self._pedfile_has_coding('affymetrix')
        
class CustomTextPlinkIO(CustomMixin, TextPlinkIO):
    """``TextPlinkIO`` with the coding checks from ``CustomMixin`` mixed in."""

<a id='welsh_breeds'></a>
## Welsh sheep breeds
This dataset comes from [Beynon, Sarah E. et al. (2016)](https://bmcgenomdata.biomedcentral.com/articles/10.1186/s12863-015-0216-x), in which they genotyped 353 individuals from 18 native Welsh sheep breeds using the Illumina OvineSNP50 array:

In [4]:
# look up the Dataset record whose ``file`` is the Welsh sheep archive
welsh_dataset = Dataset.objects.get(file="Welsh_sheep_genotyping.zip")
# last expression of the cell: display the dataset contents
welsh_dataset.contents

['genotyping data/',
 'genotyping data/WelshSheepBreeds2015.map',
 'genotyping data/WelshSheepBreeds2015.ped',
 'welsh-metadata.openrefine.tar.gz',
 'welsh-metadata.xlsx']

OK, open the dataset and start exploring the data:

In [5]:
# common prefix of the .map/.ped pair inside the dataset working directory
prefix = str(welsh_dataset.working_dir / "genotyping data/WelshSheepBreeds2015")

plinkio = CustomTextPlinkIO(
    prefix=prefix,
    species=welsh_dataset.species,
    chip_name=welsh_dataset.chip_name,
)

# used as the progress bar total when the PED file is scanned
plinkio.n_of_individuals = welsh_dataset.n_of_individuals

In [6]:
# read the .map file (the next cell inspects ``plinkio.mapdata``)
plinkio.read_mapfile()
# fetch SNP coordinates, with OAR3 as the source assembly
plinkio.fetch_coordinates(src_assembly=OAR3)

In [7]:
# number of SNPs listed in the map file
total_snps = len(plinkio.mapdata)

# SNPs left once the entries in ``plinkio.filtered`` are subtracted
snps_found = total_snps - len(plinkio.filtered)
perc_missing = round(100 - (snps_found / total_snps * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs using 'name' ({perc_missing}% missing)")

I can retrieve 51135 of 51135 SNPs using 'name' (0.0% missing)


Is this dataset in *top* coding?

In [8]:
# True when the whole PED file is processed with coding='top' without raising CodingException
plinkio.is_top()

  0%|          | 0/353 [00:00<?, ?it/s]

True

Good. This file is already in *top* coding. What about breeds?

In [9]:
# the first column of each PED line is used as the breed name
breeds = {record[0] for record in plinkio.read_pedfile()}

print(f"Got {breeds} breeds")

Got {'BlackWelshMountain', 'ClunForest', 'DolgellauWelshMountain', 'Balwen', 'Beulah', 'Llawenog', 'ImprovedWelshMountain', 'SouthWalesWelshMountain', 'BadgerFaced', 'Llanwenog', 'KerryHill', 'BrecknockHillCheviot', 'Lleyn', 'TalybontWelshMountain', 'LlandoveryWhiteFaced', 'HillRadnor', 'TregaronWelshMountain', 'WelshMountainHillFlock', 'HardySpeckledFaced'} breeds


Try to split breed names:

In [10]:
# https://stackoverflow.com/a/29920015
def camel_case_split(identifier):
    """Split a CamelCase string into its words.

    A word ends at a lower-to-upper transition, before the last capital of
    an upper-case run that is followed by a lower-case letter, or at the
    end of the string. Returns the list of words (empty for an empty string).
    """
    # pattern taken from https://stackoverflow.com/a/29920015
    # no capturing groups, so findall returns the whole of each match
    return re.findall('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)

In [11]:
# map each CamelCase breed label to its space-separated form
breeds2dict = {
    label: " ".join(camel_case_split(label))
    for label in breeds
}

print(breeds2dict)

{'BlackWelshMountain': 'Black Welsh Mountain', 'ClunForest': 'Clun Forest', 'DolgellauWelshMountain': 'Dolgellau Welsh Mountain', 'Balwen': 'Balwen', 'Beulah': 'Beulah', 'Llawenog': 'Llawenog', 'ImprovedWelshMountain': 'Improved Welsh Mountain', 'SouthWalesWelshMountain': 'South Wales Welsh Mountain', 'BadgerFaced': 'Badger Faced', 'Llanwenog': 'Llanwenog', 'KerryHill': 'Kerry Hill', 'BrecknockHillCheviot': 'Brecknock Hill Cheviot', 'Lleyn': 'Lleyn', 'TalybontWelshMountain': 'Talybont Welsh Mountain', 'LlandoveryWhiteFaced': 'Llandovery White Faced', 'HillRadnor': 'Hill Radnor', 'TregaronWelshMountain': 'Tregaron Welsh Mountain', 'WelshMountainHillFlock': 'Welsh Mountain Hill Flock', 'HardySpeckledFaced': 'Hardy Speckled Faced'}


Try to create a sample metadata table:

In [12]:
# column name -> list of values, one entry per PED line
data = defaultdict(list)

for record in plinkio.read_pedfile():
    fid, sample_id = record[0], record[1]
    row = (
        ("breed", breeds2dict[fid]),
        ("fid", fid),
        ("original_id", sample_id),
    )

    for column, value in row:
        data[column].append(value)

welsh_metadata = pd.DataFrame(data=data)

# written to the current working directory, without the index column
welsh_metadata.to_excel("welsh_metadata.xlsx", index=False)

This file will be imported into OpenRefine in order to fix values and add a breed code for each breed.