# High density genotypes of French Sheep populations
This notebook describes the `High density genotypes of French Sheep populations.zip` dataset.

In [1]:
import io
import csv
import itertools

from collections import Counter
from pathlib import Path

import pandas as pd
from plinkio import plinkfile

from src.features.smarterdb import VariantSheep, global_connection, Dataset

Get the dataset information using the project classes:

In [2]:
# Open the database connection, then look up the dataset record by the name
# of its source archive.
global_connection()
dataset_archive = "High density genotypes of French Sheep populations.zip"
dataset = Dataset.objects.filter(file=dataset_archive).get()

Display directory content (of selected dataset):

In [3]:
# Print one path per line for every entry in the dataset working directory.
working_dir_entries = dataset.working_dir.iterdir()
for entry in working_dir_entries:
    print(entry)

/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/frenchsheep_HD.bed
/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/info.txt
/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/frenchsheep_HD.bim
/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/frenchsheep_HD.fam
/home/paolo/Projects/SMARTER-database/data/interim/604f75a61a08c53cebd09b58/Populations_infos.xlsx


In [4]:
# Show the full content of the info.txt file shipped with the dataset.
info_path = dataset.working_dir / "info.txt"
print(info_path.read_text())

https://zenodo.org/record/237116#.XlUezRdG3OQ


This dataset is in PLINK binary format, with population information in an `.xlsx` file. Data were downloaded from a Zenodo project (see the link above).

In [5]:
# Load the population metadata spreadsheet into a DataFrame and preview it.
populations_path = dataset.working_dir / "Populations_infos.xlsx"
with populations_path.open("rb") as xlsx_file:
    infos = pd.read_excel(xlsx_file)
infos.head()

Unnamed: 0,Code,Population Name,Link,Latitude,Longitude,Color,POP_GROUP_CODE,POP_GROUP_NAME,Unnamed: 8
0,BER,Berrichon du Cher,http://en.france-genetique-elevage.org/Berrich...,47.081012,2.398782,,NORTH,NORTH,
1,BMC,Blanc du Massif Central,http://en.france-genetique-elevage.org/Blanche...,44.517611,3.501873,,SOUTH,SOUTH,
2,CDL,Causses du Lot,http://en.france-genetique-elevage.org/Causse-...,44.799383,1.617901,,SOUTH,SOUTH,
3,CHA,Mouton Charollais,http://en.france-genetique-elevage.org/Charoll...,46.435442,4.277004,,NORTH,NORTH,
4,CHR,Charmoise,http://en.france-genetique-elevage.org/Charmoi...,47.390249,1.254324,,NORTH,NORTH,


The data include breed information (with a code) and GPS coordinates. What about SNP coordinates? Open the `.bim` file (which is a plain-text file) and check the SNP positions against the database:

In [6]:
# Compare the first 200 records of the .bim file with the positions stored in
# the database for the SNPs having the same name. Only mismatches (or SNPs
# lacking a "SNPchiMp v.3" location) are reported.
counter = Counter()
with open(dataset.working_dir / "frenchsheep_HD.bim") as f:
    reader = csv.reader(f, delimiter="\t")
    print("Reading first 200 lines of bim file")
    for line in itertools.islice(reader, 200):
        # columns used below: line[0] chromosome, line[1] SNP name, line[3] position
        qs = VariantSheep.objects(name=line[1])
        if qs.count() == 0:
            # SNP not found in the database: nothing to compare
            continue
        variant = qs.get()
        counter.update([variant.name])
        # Pass a default to next(): without it, a variant lacking a
        # "SNPchiMp v.3" location would abort the whole cell with StopIteration.
        location = next(
            (loc for loc in variant.locations if loc.imported_from == "SNPchiMp v.3"),
            None,
        )
        if location is None:
            print(f"snp {line[1]} has no 'SNPchiMp v.3' location")
            continue
        if line[0] != location.chrom or int(line[3]) != location.position:
            print(f"snp {line[1]} with different positions: {line[0]}:{line[3]}<>{location.chrom}:{location.position}")
                

Reading first 200 lines of bim file


Coordinates seem to match the latest *SNPchiMp v.3* database (at least for the first 200 records checked above).

## Getting info on breeds

Open genotypes with `plinkio` and get information on samples:

In [7]:
# Open the genotype file set through its common path prefix (passed as a
# string) and read the list of samples.
plink_prefix = dataset.working_dir / "frenchsheep_HD"
plink_file = plinkfile.open(str(plink_prefix))
sample_list = plink_file.get_samples()

Get `fid` from samples

In [8]:
# Collect the distinct `fid` values found in the samples and wrap them in a
# Series named "Code" (the name of the matching column in `infos`).
# Row order follows the iteration order of the set.
unique_fids = {sample.fid for sample in sample_list}
fids = pd.Series(list(unique_fids), name="Code")

In [9]:
# Join the sample codes with the population table on "Code" (default inner
# join) and show the code next to the population name.
merged = fids.to_frame().merge(infos, on="Code")
merged[["Code", "Population Name"]]

Unnamed: 0,Code,Population Name
0,CDL,Causses du Lot
1,TEX,Texel
2,SUF,Suffolk
3,CHR,Charmoise
4,ROM,Romanov
5,COR,Corse
6,RWE,Rouge de l'Ouest
7,TAR,Tarasconnaise
8,LAC,Lacaune (milk)
9,RAV,Rava
