# MERINO_INIA_UY
Describing the `MERINO_INIA_UY.zip` file

In [1]:
import io
import csv
import itertools

from collections import Counter
from zipfile import ZipFile
from pathlib import Path

import src.features.illumina
from src.features.smarterdb import VariantSheep, global_connection

In [2]:
# The notebook is expected to run two directory levels below the project root,
# so the root is the second ancestor of the current working directory.
project_dir = Path.cwd().parents[1]

# Zip archive described in this notebook
datafile = project_dir.joinpath("data/raw/background/MERINO_INIA_UY.zip")

In [3]:
# Open the archive read-only. The handle is intentionally left open because
# the following cells read individual members through it.
handle = ZipFile(datafile, mode="r")

# List the archive members (name, modification time, size)
handle.printdir()

File Name                                             Modified             Size
Merino_21_12_17_OV54k_FinalReport.txt          2020-12-03 14:37:44    383851065
Merino_21_12_17_OV54k_SNP_Map.txt              2020-12-03 14:37:54      3012148
MERINO_UY_96_21_12_17_OV54k.ped                2020-12-09 17:24:20     20831616
MERINO_UY_96_21_12_17_OV54k.map                2020-12-09 17:24:22      1502260


The Merino dataset has map/ped files along with a final report and a SNP map. Get info from the final report:

In [4]:
# Peek at the beginning of the final report without extracting the archive:
# the zip member is a binary stream, so wrap it to decode it as UTF-8 text.
report_stream = handle.open("Merino_21_12_17_OV54k_FinalReport.txt")

with io.TextIOWrapper(report_stream, encoding="utf-8") as report:
    # only the first 12 lines are shown
    for raw_line in itertools.islice(report, 12):
        print(raw_line.strip())

[Header]
GSGT Version	2.0.3
Processing Date	12/21/2017 11:39 AM
Content		ovinesnp50_b.bpm
Num SNPs	54241
Total SNPs	54241
Num Samples	96
Total Samples	96
[Data]
SNP Name	Sample ID	Allele1 - Forward	Allele2 - Forward	Allele1 - AB	Allele2 - AB	Allele1 - Top	Allele2 - Top	GC Score	X	Y	B Allele Freq	Log R Ratio
250506CS3900065000002_1238.1	201711200001	T	C	A	B	A	G	0.9239	0.727	0.647	0.5027	-0.1292
250506CS3900140500001_312.1	201711200001	T	T	A	A	A	A	0.9613	0.654	0.005	0.0000	-0.1364


The data seem to be recent (2017). Are the coordinates in the latest 3.1 version?

In [5]:
# Build a lookup from the second column of the .map file to a
# (first column, fourth column as int) tuple. Later cells compare these
# tuples with (chromosome, position) pairs, with the second column being the
# SNP name. When a name occurs more than once, the last occurrence wins.
with io.TextIOWrapper(handle.open("MERINO_UY_96_21_12_17_OV54k.map"), encoding="utf-8") as map_file:
    data_coordinates = {
        fields[1]: (fields[0], int(fields[3]))
        for fields in map(str.split, map_file)
    }

In [6]:
# Directory holding the Illumina chip files and the manifest to load
chip_dir = project_dir / "data/external/SHE/ILLUMINA/"
old_chip3_file = chip_dir / "ovinesnp50_b.csv"

# SNP name -> (chr, mapinfo) for every record yielded by the manifest reader.
# When a name occurs more than once, the last occurrence wins.
old_chip3 = {
    snp.name: (snp.chr, snp.mapinfo)
    for snp in src.features.illumina.read_snpChip(old_chip3_file)
}

In [7]:
# Compare the coordinates declared in the chip manifest with the ones read
# from the merino .map file.
#   count   -> SNPs present in both sources but with a different coordinate tuple
#   missing -> SNPs listed in the chip manifest that are absent from the merino data
count = 0
missing = 0

for key, value in old_chip3.items():
    # The loop walks the chip manifest, so a failed lookup means that the SNP
    # is in the chip but not in the merino map file.
    if key not in data_coordinates:
        missing += 1
        continue

    if value != data_coordinates[key]:
        count += 1
        # print only the first 10 mismatches to keep the output short
        if count <= 10:
            print(key, value, data_coordinates[key])

print(f"\nN of SNPs in different positions in merino and old chip3: {count}")
# The label states the direction that is actually tested above (chip -> merino)
print(f"\nN of SNPs in chip not in merino: {missing}")

s17862.1 ('Contig', 0) ('CONTIG', 0)

N of SNPs in different positions in merino and old chip3: 1

N of SNPs in merino not in chip: 0


Although the data were produced in 2017, the coordinates seem to be in the old reference. Check against the database:

In [8]:
# Open the database connection used by the VariantSheep documents
global_connection()

# Spot check: only the first 20 SNPs of the merino map are compared with the
# database.
for key, value in itertools.islice(data_coordinates.items(), 20):
    qs = VariantSheep.objects(name=key)

    # SNPs that are not in the database are skipped silently
    if qs.count() == 0:
        continue

    variant = qs.get()

    # First location imported from "SNPchiMp v.3".
    # NOTE(review): next() has no default, so a variant lacking such a
    # location raises StopIteration and stops the cell.
    location = next(
        loc for loc in variant.locations if loc.imported_from == "SNPchiMp v.3"
    )

    same_chrom = value[0] == location.chrom
    same_position = int(value[1]) == location.position
    if not (same_chrom and same_position):
        print(f"snp {key} with different positions: {value[0]}:{value[1]} <> {location.chrom}:{location.position}")

snp 250506CS3900065000002_1238.1 with different positions: 15:5327353 <> 15:5870057
snp 250506CS3900140500001_312.1 with different positions: 23:27428869 <> 23:26298017
snp 250506CS3900176800001_906.1 with different positions: 7:89002990 <> 7:81648528
snp 250506CS3900211600001_1041.1 with different positions: 16:44955568 <> 16:41355381
snp 250506CS3900218700001_1294.1 with different positions: 2:157820235 <> 2:148802744
snp 250506CS3900283200001_442.1 with different positions: 1:203289635 <> 99:0
snp 250506CS3900371000001_1255.1 with different positions: 11:37632867 <> 11:35339123
snp 250506CS3900386000001_696.1 with different positions: 16:68297712 <> 16:62646307
snp 250506CS3900414400001_1178.1 with different positions: 1:111100644 <> 1:103396552
snp 250506CS3900435700001_1658.1 with different positions: 12:50140951 <> 99:0
snp 250506CS3900464100001_519.1 with different positions: 1:91075445 <> 1:85767398
snp 250506CS3900487100001_1521.1 with different positions: 14:1552575 <> 14:111