# High density genotypes of French Sheep populations
This notebook describes the contents of `High density genotypes of French Sheep populations.zip`.

In [1]:
import io
import csv
import itertools

from collections import Counter
from zipfile import ZipFile
from pathlib import Path

import pandas as pd

from src.features.smarterdb import VariantSheep, global_connection

In [2]:
# The project root is two levels above the current working directory;
# the raw archive lives under data/raw/background inside it.
project_dir = Path.cwd().parents[1]
datafile = project_dir.joinpath(
    "data/raw/background/High density genotypes of French Sheep populations.zip"
)

In [3]:
# Open the archive read-only; the handle is kept open because the
# following cells read individual members from it.
handle = ZipFile(datafile, mode="r")

# List the archive members (name, modification date, size)
handle.printdir()

File Name                                             Modified             Size
frenchsheep_HD.bed                             2020-03-03 11:19:06     71783931
frenchsheep_HD.bim                             2020-03-03 11:16:56     19346943
frenchsheep_HD.fam                             2020-03-03 11:16:22        10062
info.txt                                       2020-03-10 14:32:58           45
Populations_infos.xlsx                         2020-03-10 14:33:34        12465


In [4]:
# Archive members are opened as binary streams: wrap the member in a
# UTF-8 text layer before printing its content.
info_stream = io.TextIOWrapper(handle.open("info.txt"), encoding="utf-8")
with info_stream:
    print(info_stream.read())

https://zenodo.org/record/237116#.XlUezRdG3OQ


This dataset is in PLINK binary format, with population information in an `.xlsx` file. The data were downloaded from a Zenodo project.

In [5]:
# Population metadata ships as an Excel sheet inside the archive
with handle.open("Populations_infos.xlsx") as xlsx_member:
    infos = pd.read_excel(xlsx_member)

# Preview the first rows of the population table
infos.head()

Unnamed: 0,Code,Population Name,Link,Latitude,Longitude,Color,POP_GROUP_CODE,POP_GROUP_NAME,Unnamed: 8
0,BER,Berrichon du Cher,http://en.france-genetique-elevage.org/Berrich...,47.081012,2.398782,,NORTH,NORTH,
1,BMC,Blanc du Massif Central,http://en.france-genetique-elevage.org/Blanche...,44.517611,3.501873,,SOUTH,SOUTH,
2,CDL,Causses du Lot,http://en.france-genetique-elevage.org/Causse-...,44.799383,1.617901,,SOUTH,SOUTH,
3,CHA,Mouton Charollais,http://en.france-genetique-elevage.org/Charoll...,46.435442,4.277004,,NORTH,NORTH,
4,CHR,Charmoise,http://en.france-genetique-elevage.org/Charmoi...,47.390249,1.254324,,NORTH,NORTH,


The data include information on breed (with code) and GPS coordinates. What about SNP coordinates? Open the `.bim` file (which is a plain-text file) and compare the SNP positions with those stored in the database.

In [6]:
# Connect to the database before querying VariantSheep documents
global_connection()

# Names of the bim SNPs that are also present in the database
counter = Counter()

with io.TextIOWrapper(handle.open("frenchsheep_HD.bim"), encoding="utf-8") as f:
    # Fields used below: line[0] chromosome, line[1] SNP name, line[3] position
    reader = csv.reader(f, delimiter="\t")
    print("Reading first 200 lines of bim file")

    for line in itertools.islice(reader, 200):
        qs = VariantSheep.objects(name=line[1])
        if qs.count() == 0:
            # SNP unknown to the database: nothing to compare against
            continue

        variant = qs.get()
        counter.update([variant.name])

        # A bare next(filter(...)) raises StopIteration and aborts the whole
        # cell when a variant has no "SNPchiMp v.3" location: fall back to
        # None and report the SNP instead.
        location = next(
            (loc for loc in variant.locations if loc.imported_from == "SNPchiMp v.3"),
            None,
        )
        if location is None:
            print(f"snp {line[1]} has no 'SNPchiMp v.3' location in db")
            continue

        if line[0] != location.chrom or int(line[3]) != location.position:
            print(f"snp {line[1]} with different positions: {line[0]}:{line[3]}<>{location.chrom}:{location.position}")

print(f"SNPs found in db: {','.join(counter.keys())}")

Reading first 200 lines of bim file
SNPs found in db: s64199.1,OAR19_64803054.1,DU281551_498.1,s18939.1,OAR1_88143.1,s36301.1,s34880.1,s68493.1,OAR1_420114.1,OAR1_537224_X.1,s35460.1


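Coordinates seem to match the latest *SNPchiMp v.3* database: no position mismatch was reported for the 11 SNPs found in the database among the first 200 lines of the `.bim` file.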