# Hungarian sheep
Describe the Hungarian sheep dataset.

In [1]:
import logging

import pandas as pd
import geopy

from tqdm.notebook import tqdm

from src.features.smarterdb import global_connection, Dataset
from src.features.plinkio import TextPlinkIO, CodingException
from src.data.common import WORKING_ASSEMBLIES

In [2]:
# Only let CRITICAL records through for the plinkio module logger
logger = logging.getLogger('src.features.plinkio')
logger.setLevel(logging.CRITICAL)

# Return value is not needed; presumably this sets up the database connection
# used by the `Dataset.objects` queries below -- TODO confirm
_ = global_connection()
# Assembly settings, later expanded as keyword arguments for fetch_coordinates
OAR3 = WORKING_ASSEMBLIES["OAR3"]

In [3]:
class CustomMixin():
    """Add allele-coding checks on top of a PED file reader.

    The class this is mixed into must provide ``read_pedfile()`` and
    ``_process_genotypes(line, coding)``; both are called here but defined
    elsewhere.
    """

    # forwarded to tqdm as the progress-bar ``total``
    n_of_individuals = None

    def process_pedfile(self, coding="top"):
        """Run every PED line through ``_process_genotypes``.

        Args:
            coding: coding name forwarded unchanged to ``_process_genotypes``.

        Returns:
            True once every line has been processed. Exceptions raised while
            processing a line are not caught here.
        """
        for line in tqdm(self.read_pedfile(), total=self.n_of_individuals):
            # only the absence of an exception matters; the result is discarded
            self._process_genotypes(line, coding)

        return True

    def _pedfile_matches_coding(self, coding):
        """Return True if the PED file is processed without a CodingException."""
        try:
            return self.process_pedfile(coding=coding)

        except CodingException:
            return False

    def is_top(self):
        """Tell whether the whole PED file is accepted with the 'top' coding."""
        return self._pedfile_matches_coding('top')

    def is_forward(self):
        """Tell whether the whole PED file is accepted with the 'forward' coding."""
        return self._pedfile_matches_coding('forward')

    def is_affymetrix(self):
        """Tell whether the whole PED file is accepted with the 'affymetrix' coding."""
        return self._pedfile_matches_coding('affymetrix')
        
class CustomTextPlinkIO(CustomMixin, TextPlinkIO):
    """TextPlinkIO extended with the coding checks from CustomMixin."""

Read dataset from database:

In [4]:
# Dataset record of the Hungarian genotype archive
hungarian_dataset = Dataset.objects.get(file="NativesheepBreeds_Hu.zip")

# Path prefix of the genotype files inside the dataset working directory
ped_prefix = hungarian_dataset.working_dir / "NativesheepBreeds_Hu/NativeSheepGenotypes"

plinkio = CustomTextPlinkIO(
    prefix=str(ped_prefix),
    species=hungarian_dataset.species,
    chip_name=hungarian_dataset.chip_name,
)

# read by CustomMixin.process_pedfile as the progress-bar total
plinkio.n_of_individuals = hungarian_dataset.n_of_individuals

In [5]:
# Read the map file first: the following cells rely on `plinkio.mapdata`
plinkio.read_mapfile()
# Every field of the OAR3 entry is expanded into a keyword argument
plinkio.fetch_coordinates(
    **OAR3._asdict()
)

In [6]:
# `plinkio.filtered` is counted here as the SNPs that could NOT be retrieved
snps_found = len(plinkio.mapdata) - len(plinkio.filtered)
found_ratio = snps_found / len(plinkio.mapdata)
perc_missing = round(100 - (found_ratio * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs using 'name' ({perc_missing}% missing)")

I can retrieve 53446 of 53516 SNPs using 'name' (0.13% missing)


Is this dataset in TOP coding?

In [7]:
# True only if the whole PED file is processed with coding='top';
# a CodingException raised on any line turns the answer into False
plinkio.is_top()

  0%|          | 0/259 [00:00<?, ?it/s]

Error for SNP 1:DU415336_399.1: G/G <> A/C


False

So, not in TOP. Is this in forward?

In [8]:
# Same check with coding='forward': False as soon as a CodingException is raised
plinkio.is_forward()

  0%|          | 0/259 [00:00<?, ?it/s]

True

Ok, it's in forward coding. What about the new breeds I received?

In [9]:
# Column 0 of each PED line is the breed label; the set keeps one entry per breed
breeds = {line[0] for line in plinkio.read_pedfile()}

print(f"Got {breeds} breeds")

Got {'Suffolk', 'Merino', 'Tsigai', 'I.France', 'R.Tsigai', 'Tetra', 'W.Dorper', 'Dorper', 'Racka', 'Turcana'} breeds


Some breeds are completely new and need to be added into database

## Hungarian metadata

I've received a metadata file with GPS coordinates. Breed names are also recorded using their full name. However, GPS coordinates are recorded as *degrees, minutes, seconds* in *North-East* format, so I need to transform them into decimal degrees (floats) using `geopy`. Open the dataset and read the information:

In [10]:
# Dataset record holding the Hungarian phenotype/metadata archive
phenotype_dataset = Dataset.objects.get(country="Hungary", type_="phenotypes")
gps_workbook = phenotype_dataset.working_dir / "NativeSheep GPS.xlsx"

# Binary mode: the workbook is handed to pandas as a file object
with open(gps_workbook, "rb") as handle:
    info = pd.read_excel(handle)

info.head()

Unnamed: 0,Country,Breed,Individua_ID,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,Hungary,Tsigai,1458,47°34′36″N,21°34′46″E,,,,,,,
1,Hungary,Tsigai,1470,47°34′36″N,21°34′46″E,,,,,,,
2,Hungary,Tsigai,1482,47°34′36″N,21°34′46″E,,,,,,,
3,Hungary,Tsigai,1485,47°34′36″N,21°34′46″E,,,,,,,
4,Hungary,Tsigai,1489,47°34′36″N,21°34′46″E,,,,,,,


In [11]:
# Drop every column containing at least one missing value (the default how='any'),
# which leaves the five populated columns shown above
info = info.dropna(axis=1)
info.columns = ['country', 'breed', 'original_id', 'north', 'east']
info.head()

Unnamed: 0,country,breed,original_id,north,east
0,Hungary,Tsigai,1458,47°34′36″N,21°34′46″E
1,Hungary,Tsigai,1470,47°34′36″N,21°34′46″E
2,Hungary,Tsigai,1482,47°34′36″N,21°34′46″E
3,Hungary,Tsigai,1485,47°34′36″N,21°34′46″E
4,Hungary,Tsigai,1489,47°34′36″N,21°34′46″E


It's time to convert coordinates using `geopy`:

In [12]:
def degrees2float(nord: str, east: str):
    """Build a ``geopy`` point from two degrees/minutes/seconds strings.

    Args:
        nord: latitude string, e.g. ``47°34′36″N``.
        east: longitude string, e.g. ``21°34′46″E``.

    Returns:
        The ``geopy.point.Point`` built from ``"<nord> <east>"``; decimal
        values are read from its ``latitude`` and ``longitude`` attributes.
    """
    return geopy.point.Point(f"{nord} {east}")

In [13]:
# One geopy point per row, built from the two DMS string columns
points = info[['north', 'east']].apply(
    lambda row: degrees2float(row['north'], row['east']),
    axis=1,
)

# Decimal-degree values are exposed as attributes of each point
info["latitude"] = points.map(lambda pt: pt.latitude)
info["longitude"] = points.map(lambda pt: pt.longitude)
info.head()

Unnamed: 0,country,breed,original_id,north,east,latitude,longitude
0,Hungary,Tsigai,1458,47°34′36″N,21°34′46″E,47.576667,21.579444
1,Hungary,Tsigai,1470,47°34′36″N,21°34′46″E,47.576667,21.579444
2,Hungary,Tsigai,1482,47°34′36″N,21°34′46″E,47.576667,21.579444
3,Hungary,Tsigai,1485,47°34′36″N,21°34′46″E,47.576667,21.579444
4,Hungary,Tsigai,1489,47°34′36″N,21°34′46″E,47.576667,21.579444


Ok, now check that the original IDs from the metadata are also present in the genotype file:

In [14]:
# Column 1 of each PED line is used as the sample ID (column 0 holds the breed label)
samples = [line[1] for line in plinkio.read_pedfile()]
# Compare the number of genotyped samples with the number of metadata rows
print(f"Got {len(samples)} from plink file and {info.shape[0]} from xlsx file")

Got 259 from plink file and 259 from xlsx file


In [15]:
samples_set = set(samples)

# PED sample IDs are strings, so compare against the string form of each metadata ID
info['in_ped'] = info['original_id'].astype(str).isin(samples_set)

# Metadata rows with no matching sample in the PED file
info[~info['in_ped']]

Unnamed: 0,country,breed,original_id,north,east,latitude,longitude,in_ped
195,Hungary,Hortobágy Racka,44,46°50′55″N,17°36′22″E,46.848611,17.606111,False
197,Hungary,Suffolk,114,48°23′57″N,20°50′46″E,48.399167,20.846111,False


Ok try to change the `original_id` of these two samples, and put a `0` in front of them to see if I match a sample name:

In [16]:
# `original_id` was read as a numeric column. Writing a string into it relies on
# silent upcasting, which pandas deprecated (FutureWarning since 2.1, an error in
# later releases). Casting to object first gives the same resulting column.
info['original_id'] = info['original_id'].astype(object)

# Row labels of the two samples reported as missing above: try the same IDs
# with a leading zero
info.at[195, 'original_id'] = '044'
info.at[197, 'original_id'] = '0114'

# Recompute the membership flag with the patched IDs
info['in_ped'] = info['original_id'].apply(lambda sample: str(sample) in samples_set)
print(f"Got {len(info[info['in_ped'] == False].index)} mismatches between samples")

Got 0 mismatches between samples


So now all the sample IDs match between the two data files. I can remove the column I no longer need:

In [17]:
# The helper flag has served its purpose; keep it out of the exported metadata
info = info.drop(columns='in_ped')

I need to check the breed codes and verify that they are not already used in my database; otherwise I need to define *FID* aliases:

In [18]:
# Map each sample ID (PED column 1) to the FID found in PED column 0
samples2fid = { line[1]: line[0] for line in plinkio.read_pedfile()}
# Distinct FIDs used by the genotype file
set(samples2fid.values())

{'Dorper',
 'I.France',
 'Merino',
 'R.Tsigai',
 'Racka',
 'Suffolk',
 'Tetra',
 'Tsigai',
 'Turcana',
 'W.Dorper'}

Ok and use full name for *White Dorper* and fix *Île de France*:

In [19]:
# Value -> replacement, applied to the whole frame in a single pass
breed_renames = {
    'W.Dorper': 'White Dorper',
    'Ile de France': 'Île de France',
}
info = info.replace(breed_renames)

In [20]:
# Breed names present in the metadata once the replacements above are applied
info["breed"].unique()

array(['Tsigai', 'Merino', 'Dorper', 'White Dorper', 'Bábolna Tetra',
       'Île de France', 'Suffolk', 'Hortobágy Racka', 'Rusty Tsigai',
       'Turcana'], dtype=object)

Ok, define another dictionary that maps each full breed name to its FID and breed code:

In [21]:
# full breed name (as in the metadata) -> (FID used in the PED file, three-letter code)
breed2code = {
    'Tsigai': ('Tsigai', 'TSI'),
    'Merino': ('Merino', 'MER'),
    'Dorper': ('Dorper', 'DRP'),
    'White Dorper': ('W.Dorper', 'WDR'),
    'Bábolna Tetra': ('Tetra', 'BAT'),
    'Île de France': ('I.France', 'IDF'),
    'Suffolk': ('Suffolk', 'SUF'),
    'Hortobágy Racka': ('Racka', 'HRR'),
    'Rusty Tsigai': ('R.Tsigai', 'RST'),
    'Turcana': ('Turcana', 'TRC'),
}

In [22]:
# Each breed2code value is a (fid, code) pair; an unknown breed name raises KeyError
breed_names = info["breed"]
info['fid'] = breed_names.map(lambda name: breed2code[name][0])
info['code'] = breed_names.map(lambda name: breed2code[name][1])
info.head()

Unnamed: 0,country,breed,original_id,north,east,latitude,longitude,fid,code
0,Hungary,Tsigai,1458,47°34′36″N,21°34′46″E,47.576667,21.579444,Tsigai,TSI
1,Hungary,Tsigai,1470,47°34′36″N,21°34′46″E,47.576667,21.579444,Tsigai,TSI
2,Hungary,Tsigai,1482,47°34′36″N,21°34′46″E,47.576667,21.579444,Tsigai,TSI
3,Hungary,Tsigai,1485,47°34′36″N,21°34′46″E,47.576667,21.579444,Tsigai,TSI
4,Hungary,Tsigai,1489,47°34′36″N,21°34′46″E,47.576667,21.579444,Tsigai,TSI


Ok, time to write metadata in file:

In [23]:
# Relative path: the workbook is written to the current working directory
outfile = "nativesheeps_hu_fixed.xlsx"
# index=False keeps the DataFrame index out of the exported sheet
info.to_excel(outfile, index=False)

This fixed metadata file will be placed in phenotype archive