# ovine_SNP50HapMap_data
Describing the `ovine_SNP50HapMap_data.zip` dataset. The original files contained several subarchives: I normalized all the contents into a single archive with subfolders in order to make it easier to use

In [1]:
import io
import csv
import itertools

from collections import Counter
from zipfile import ZipFile
from pathlib import Path

import src.features.illumina
from src.features.smarterdb import VariantSheep, global_connection

In [2]:
# The project root is taken to be the grandparent of the current working directory
project_dir = Path.cwd().parents[1]
# Archive with the background dataset, addressed relative to the project root
datafile = project_dir.joinpath("data/raw/background/ovine_SNP50HapMap_data.zip")

In [3]:
# The archive is opened read-only and left open on purpose: the following
# cells read its members through this same handle.
handle = ZipFile(datafile, mode="r")
# Show the archive table of contents (name, modification date, size)
handle.printdir()

File Name                                             Modified             Size
ovine_SNP50HapMap_data/                        2021-03-16 10:23:24            0
ovine_SNP50HapMap_data/SNP50_Breedv2/          2021-03-16 10:23:16            0
ovine_SNP50HapMap_data/SNP50_Breedv2/SNP50_Breedv2.map 2010-02-12 13:36:06      1610011
ovine_SNP50HapMap_data/SNP50_Breedv2/SNP50_Breedv2.ped 2010-02-12 13:36:26     27070481
ovine_SNP50HapMap_data/SNP50_Breedv2/ovine SNP50 Breedv2 data release.pdf 2010-02-12 13:21:02        46116
ovine_SNP50HapMap_data/Heaton/                 2021-03-16 10:22:46            0
ovine_SNP50HapMap_data/Heaton/Mike Heaton Sheep 07may2009_LocusSummary.csv 2009-05-29 16:03:28      9801077
ovine_SNP50HapMap_data/Heaton/Sample_Map.txt   2009-05-29 16:02:34          429
ovine_SNP50HapMap_data/Heaton/SNP_Map.txt      2009-05-29 16:02:38      3053566
ovine_SNP50HapMap_data/Heaton/Mike Heaton Sheep 07may2009_LocusXDNA.csv 2009-05-29 16:03:46      6480220
ovine_SNP50HapMap_data/H

This dataset has many subarchives inside. Let's do a quick check

In [4]:
# Collect the members of the main archive that are themselves zip files
archives = [
    member.filename
    for member in handle.infolist()
    if member.filename.endswith('.zip')
]

In [5]:
# Print the table of contents of every nested archive collected in `archives`
for archive in archives:
    print(f"Opening {archive}\n")
    # The nested archive is read straight from the parent zip, without
    # extracting it to disk first
    with handle.open(archive) as subhandle, ZipFile(subhandle) as subarchive:
        subarchive.printdir()
    # Visual separator between one nested archive and the next
    print("="*80 + "\n")

The files are quite old. I suppose they are in the old 3.1 coordinate system

In [6]:
chip_dir = project_dir / "data/external/SHE/ILLUMINA/"
old_chip3_file = chip_dir / "ovinesnp50_b.csv"

# Lookup table built from the old chip file: SNP name -> (chr, mapinfo),
# as reported by each record read from the file
old_chip3 = {
    record.name: (record.chr, record.mapinfo)
    for record in src.features.illumina.read_snpChip(old_chip3_file)
}

## Heaton

This dataset has no map/ped files. I have no information about breeds or countries. I need to process the SNP_Map file

In [7]:
data_coordinates = dict()
# NOTE(review): presumably sets up the database connection used by the
# VariantSheep queries below -- confirm in src.features.smarterdb
global_connection()

# Compare the first 10 data rows of the Heaton SNP map against the
# "SNPchiMp v.3" location stored in the database for the same SNP name.
# Columns 1, 2 and 3 of each row are used as SNP name, chromosome and position.
with io.TextIOWrapper(handle.open("ovine_SNP50HapMap_data/Heaton/SNP_Map.txt"), encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    # the first row is the header: read it so that it is not treated as data
    header = next(reader)
    # print(header)
    for line in itertools.islice(reader, 10):
        qs = VariantSheep.objects(name=line[1])
        if qs.count() > 0:
            variant = qs.get()
            # A variant may have no location imported from this source: use a
            # default so that next() doesn't abort the cell with StopIteration
            location = next(
                (loc for loc in variant.locations if loc.imported_from == "SNPchiMp v.3"),
                None,
            )
            if location is None:
                print(f"snp {line[1]} has no SNPchiMp v.3 location")
                continue
            if line[2] != location.chrom or int(line[3]) != location.position:
                print(f"snp {line[1]} with different positions: {line[2]}:{line[3]} <> {location.chrom}:{location.position}")

snp 250506CS3900065000002_1238.1 with different positions: 15:5327353 <> 15:5870057
snp 250506CS3900140500001_312.1 with different positions: 23:27428869 <> 23:26298017
snp 250506CS3900176800001_906.1 with different positions: 7:89002990 <> 7:81648528
snp 250506CS3900211600001_1041.1 with different positions: 16:44955568 <> 16:41355381
snp 250506CS3900218700001_1294.1 with different positions: 2:157820235 <> 2:148802744
snp 250506CS3900283200001_442.1 with different positions: 1:203289635 <> 99:0
snp 250506CS3900371000001_1255.1 with different positions: 11:37632867 <> 11:35339123
snp 250506CS3900386000001_696.1 with different positions: 16:68297712 <> 16:62646307
snp 250506CS3900414400001_1178.1 with different positions: 1:111100644 <> 1:103396552
snp 250506CS3900435700001_1658.1 with different positions: 12:50140951 <> 99:0


Heaton coordinates are in the outdated 3.1 coordinate system

## Parentage_04_may_09

This file seems to contain only PED information

In [8]:
data = []

# Load the whole PED file in memory, one list of columns per row
with io.TextIOWrapper(handle.open("ovine_SNP50HapMap_data/Parentage_04_may_09.PED"), encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    data.extend(reader)

# Columns are tab-separated, while the two alleles of a genotype are separated
# by a single space and so stay together in the same column. The 6 leading
# columns are not counted as SNPs (presumably the standard PED sample fields).
print(f"Data have {len(data)} samples and {len(data[0])-6} snps")

Data have 97 samples and 49034 snps


I suppose that these data come from the Illumina 50K sheep chip; however, I have no information on the map (SNP names and coordinates), so this dataset must be discarded

## SNP50_Breedv1
This seems to be the first release of [SheepHapMap](http://www.sheephapmap.org) data. There are multiple breeds coming from different countries; however, I can't get information on the countries, nor find all the breeds in DAD-IS. Some of them are cross-breeds

In [9]:
# Read the SNP50_Breedv1 map file into a lookup table. Each line is split on
# whitespace: field 1 is used as the SNP name (the key), field 0 and field 3
# (converted to int) as its coordinates.
with io.TextIOWrapper(handle.open("ovine_SNP50HapMap_data/SNP50_Breedv1/SNP50_Breedv1.map"), encoding="utf-8") as f:
    data_coordinates = {
        fields[1]: (fields[0], int(fields[3]))
        for fields in map(str.split, f)
    }

In [10]:
count = 0
missing = 0

# Walk through the old chip SNPs and compare their coordinates with the ones
# read from the SNP50_Breedv1 map file
for key, value in old_chip3.items():
    if key not in data_coordinates:
        # SNP present in the chip but not provided by the dataset
        missing += 1
        continue

    if value != data_coordinates[key]:
        count += 1
        # print only the first 10 mismatches to keep the output short
        if count <= 10:
            print(key, value, data_coordinates[key])

print(f"\nN of SNPs in different positions in SNP50_Breedv1 and old chip3: {count}")
# `missing` counts chip SNPs that are absent from the dataset, not the reverse
print(f"\nN of chip SNPs not in SNP50_Breedv1: {missing}")

250506CS3900539000001_471.1 ('X', 74622875) ('0', 0)
CL635944_160.1 ('0', 0) ('6', 0)
Contig35697_5761.1 ('0', 0) ('6', 0)
CZ925803_293.1 ('0', 0) ('6', 0)
DU178311_404.1 ('0', 0) ('6', 0)
DU185362_365.1 ('0', 0) ('23', 0)
DU189586_521.1 ('0', 0) ('27', 0)
DU199514_430.1 ('X', 20036615) ('27', 20036615)
DU202534_254.1 ('0', 0) ('1', 0)
DU205124_325.1 ('0', 0) ('5', 0)

N of SNPs in different positions in SNP50_Breedv1 and old chip3: 1578

N of SNPs in SNP50_Breedv1 not in chip: 5207


Those coordinates need to be checked

## SNP50_Breedv2
This is an update of the sheep HapMap data, with a few more animals. The same considerations made for the first release also apply here

In [11]:
# Read the SNP50_Breedv2 map file into a lookup table. Each line is split on
# whitespace: field 1 is used as the SNP name (the key), field 0 and field 3
# (converted to int) as its coordinates.
with io.TextIOWrapper(handle.open("ovine_SNP50HapMap_data/SNP50_Breedv2/SNP50_Breedv2.map"), encoding="utf-8") as f:
    data_coordinates = {
        fields[1]: (fields[0], int(fields[3]))
        for fields in map(str.split, f)
    }

In [12]:
count = 0
missing = 0

# Walk through the old chip SNPs and compare their coordinates with the ones
# read from the SNP50_Breedv2 map file
for key, value in old_chip3.items():
    if key not in data_coordinates:
        # SNP present in the chip but not provided by the dataset
        missing += 1
        continue

    if value != data_coordinates[key]:
        count += 1
        # print only the first 10 mismatches to keep the output short
        if count <= 10:
            print(key, value, data_coordinates[key])

print(f"\nN of SNPs in different positions in SNP50_Breedv2 and old chip3: {count}")
# `missing` counts chip SNPs that are absent from the dataset, not the reverse
print(f"\nN of chip SNPs not in SNP50_Breedv2: {missing}")

250506CS3900539000001_471.1 ('X', 74622875) ('27', 74622875)
DU199514_430.1 ('X', 20036615) ('27', 20036615)
DU211594_392.1 ('X', 42984493) ('27', 42984493)
DU225785_452.1 ('X', 31318207) ('27', 31318207)
DU240765_244.1 ('X', 80622390) ('27', 80622390)
DU275403_434.1 ('X', 112895373) ('27', 112895373)
DU278506_495.1 ('X', 90275952) ('27', 90275952)
DU322639_600.1 ('X', 51771664) ('27', 51771664)
DU330424_111.1 ('X', 38136047) ('27', 38136047)
DU364592_286.1 ('X', 79173304) ('27', 79173304)

N of SNPs in different positions in SNP50_Breedv2 and old chip3: 1258

N of SNPs in SNP50_Breedv2 not in chip: 5207


Those coordinates need to be checked