# Hungarian sheep
Try to describe the Hungarian sheep dataset

In [1]:
import logging

from tqdm.notebook import tqdm

from src.features.smarterdb import global_connection, Dataset
from src.features.plinkio import TextPlinkIO, CodingException
from src.data.common import WORKING_ASSEMBLIES

In [2]:
# Raise the threshold of the PLINK I/O module logger so that only CRITICAL
# records are emitted while the genotypes are scanned below.
logger = logging.getLogger('src.features.plinkio')
logger.setLevel(logging.CRITICAL)

# NOTE(review): presumably sets up the database connection needed by the
# Dataset query below; the return value is deliberately discarded -- confirm.
_ = global_connection()
# Assembly entry whose fields are later passed as keyword arguments to
# plinkio.fetch_coordinates() through _asdict().
OAR3 = WORKING_ASSEMBLIES["OAR3"]

In [3]:
class CustomMixin():
    """Add allele-coding checks on top of a PLINK text reader.

    The host class must provide ``read_pedfile()`` and
    ``_process_genotypes(line, coding)``; in this notebook they are
    inherited through ``TextPlinkIO``.
    """

    # Passed as ``total`` to the progress bar in process_pedfile(); set it on
    # the instance once the number of samples is known.
    n_of_individuals = None

    def process_pedfile(self, coding="top"):
        """Run every PED line through the genotype check for ``coding``.

        Args:
            coding: name of the coding convention, forwarded unchanged to
                ``_process_genotypes``.

        Returns:
            True once every line has been processed without an exception.

        Raises:
            Any exception raised while reading or checking a line is
            propagated (the ``is_*`` methods rely on ``CodingException``).
        """
        for line in tqdm(self.read_pedfile(), total=self.n_of_individuals):
            # The return value is irrelevant here: a mismatch is signalled by
            # an exception, not by the result.
            self._process_genotypes(line, coding)

        return True

    def _pedfile_has_coding(self, coding):
        """Return True if the whole PED file passes the check for ``coding``.

        A ``CodingException`` raised during the scan is translated into
        False; any other exception is propagated.
        """
        try:
            return self.process_pedfile(coding=coding)

        except CodingException:
            return False

    def is_top(self):
        """Tell whether the PED file passes the 'top' coding check."""
        return self._pedfile_has_coding('top')

    def is_forward(self):
        """Tell whether the PED file passes the 'forward' coding check."""
        return self._pedfile_has_coding('forward')

    def is_affymetrix(self):
        """Tell whether the PED file passes the 'affymetrix' coding check."""
        return self._pedfile_has_coding('affymetrix')
        
class CustomTextPlinkIO(CustomMixin, TextPlinkIO):
    """``TextPlinkIO`` combined with the coding checks of ``CustomMixin``.

    ``CustomMixin`` comes first in the bases, so its ``process_pedfile``
    takes precedence over any method of the same name in ``TextPlinkIO``.
    """

Read the dataset from the database:

In [4]:
# Database record describing the Hungarian archive
hungarian_dataset = Dataset.objects.get(file="NativesheepBreeds_Hu.zip")

# Prefix of the PLINK text files, relative to the dataset working directory
genotypes_prefix = hungarian_dataset.working_dir / "NativesheepBreeds_Hu/NativeSheepGenotypes"

plinkio = CustomTextPlinkIO(
    prefix=str(genotypes_prefix),
    species=hungarian_dataset.species,
    chip_name=hungarian_dataset.chip_name,
)

# Used as the progress bar total while the PED file is scanned
plinkio.n_of_individuals = hungarian_dataset.n_of_individuals

In [5]:
# Read the map file first, then fetch SNP coordinates using the fields of the
# OAR3 assembly entry as keyword arguments.
plinkio.read_mapfile()
plinkio.fetch_coordinates(**OAR3._asdict())

In [6]:
# Entries in plinkio.filtered are counted as SNPs that could not be retrieved
total_snps = len(plinkio.mapdata)
snps_found = total_snps - len(plinkio.filtered)
perc_missing = round(100 - (snps_found / total_snps * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs using 'name' ({perc_missing}% missing)")

I can retrieve 53446 of 53516 SNPs using 'name' (0.13% missing)


Is this dataset in TOP coding?

In [7]:
# True when the whole PED file passes the 'top' coding check, False as soon as
# a CodingException is raised for a genotype.
plinkio.is_top()

  0%|          | 0/259 [00:00<?, ?it/s]

Error for SNP 1:DU415336_399.1: G/G <> A/C


False

So, not in TOP. Is this in forward?

In [8]:
# Same scan as before, this time checking genotypes against the 'forward' coding.
plinkio.is_forward()

  0%|          | 0/259 [00:00<?, ?it/s]

True

OK, it's in forward coding. What about the new breeds I received?

In [9]:
# The first field of every PED line is used as the breed code; a set keeps
# each code only once.
breeds = {ped_line[0] for ped_line in plinkio.read_pedfile()}

print(f"Got {breeds} breeds")

Got {'Tsigai', 'Suffolk', 'R.Tsigai', 'I.France', 'W.Dorper', 'Turcana', 'Dorper', 'Tetra', 'Merino', 'Racka'} breeds


Some breeds are completely new and need to be added to the database.