# CREOLE_INIA_UY
Describe the `CREOLE_INIA_UY.zip` file.

In [1]:
import io
import csv
import itertools

from collections import Counter
from zipfile import ZipFile
from pathlib import Path

from src.features.smarterdb import VariantSheep, global_connection

In [2]:
# The grandparent of the current working directory is taken as the project root.
project_dir = Path.cwd().parents[1]
# Location of the zipped dataset, relative to the project root.
datafile = project_dir.joinpath("data/raw/background/CREOLE_INIA_UY.zip")

Try to inspect the dataset:

In [3]:
# Open the archive read-only; the following cells read its members through `handle`.
handle = ZipFile(datafile, mode="r")
# Print the table of contents of the archive to stdout.
handle.printdir()

File Name                                             Modified             Size
JCM2357_UGY_FinalReport1.txt                   2020-12-10 10:09:26   1964410641
JCM2357_UGY_FinalReport2.txt                   2020-12-10 10:09:34   1964401618
OvineHDSNPList.txt                             2020-12-10 10:09:44     31795931


This time I don't have *map/ped* files, but two final reports

In [4]:
# Peek at the beginning of the first final report: decode the zipped member
# as UTF-8 text and print its first 15 lines without surrounding whitespace.
with io.TextIOWrapper(handle.open("JCM2357_UGY_FinalReport1.txt"), encoding="utf-8") as report:
    for lineno, row in enumerate(report):
        if lineno >= 15:
            break
        print(row.strip())

[Header]
GSGT Version	1.9.4
Processing Date	11/29/2013 4:47 PM
Content		SheepHD_AgResearch_Cons_15041608_A.bpm
Num SNPs	606006
Total SNPs	606006
Num Samples	174
Total Samples	182
File 	1 of 4
[Data]
SNP Name	Sample ID	Allele1 - AB	Allele2 - AB	X	Y	GC Score
250506CS3900140500001_312.1	JC2356_B01_20131108002	B	B	0.013	0.950	0.9403
250506CS3900176800001_906.1	JC2356_B01_20131108002	B	B	0.008	0.957	0.9288
250506CS3900211600001_1041.1	JC2356_B01_20131108002	A	B	0.681	0.620	0.9250
250506CS3900218700001_1294.1	JC2356_B01_20131108002	A	B	0.300	0.517	0.8089


It seems that I have only 2 of the 4 total files. Genotypes are described in *AB* format. How many SNPs do I have for each sample?

In [5]:
# Count how many SNP rows the first final report contains for each sample.
counts1 = Counter()
with io.TextIOWrapper(handle.open("JCM2357_UGY_FinalReport1.txt"), encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    # skip the 10 lines that precede the column names ("[Header]" ... "[Data]")
    for _ in range(10):
        next(reader)
    header = next(reader)
    print(header)
    # the second column ("Sample ID") identifies the sample of each row
    counts1.update(line[1] for line in reader)
print("N of samples: %s" % len(counts1))
# highest number of SNPs seen for a single sample (0 if the report has no data rows)
max_snps = max(counts1.values(), default=0)
# the report header declares 606006 SNPs: fail loudly if the data disagree
assert max_snps == 606006, f"expected 606006 SNPs per sample, got {max_snps}"
# report every sample with fewer SNPs than the most complete one
for key, count in counts1.items():
    if count < max_snps:
        print(f"Sample {key} has {count} snps")

['SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB', 'X', 'Y', 'GC Score']
N of samples: 49


It seems that in the first file we have only 49 samples. Each sample has 606006 SNPs. What about the second file?

In [6]:
# Count how many SNP rows the second final report contains for each sample.
counts2 = Counter()
with io.TextIOWrapper(handle.open("JCM2357_UGY_FinalReport2.txt"), encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    # skip the 10 lines that precede the column names
    for _ in range(10):
        next(reader)
    header = next(reader)
    print(header)
    # the second column ("Sample ID") identifies the sample of each row
    counts2.update(line[1] for line in reader)
print("N of samples: %s" % len(counts2))
# highest number of SNPs seen for a single sample (0 if the report has no data rows)
max_snps = max(counts2.values(), default=0)
# make the expected SNP count an actual check instead of a discarded comparison
assert max_snps == 606006, f"expected 606006 SNPs per sample, got {max_snps}"
# report every sample with fewer SNPs than the most complete one
for key, count in counts2.items():
    if count < max_snps:
        print(f"Sample {key} has {count} snps")

['SNP Name', 'Sample ID', 'Allele1 - AB', 'Allele2 - AB', 'X', 'Y', 'GC Score']
N of samples: 49


The second file also has 49 samples. Are the samples different?

In [7]:
# Sample IDs found in each report (the keys of the two counters).
s1, s2 = set(counts1), set(counts2)
# IDs present in both reports, and IDs present in at least one of them.
common = s1 & s2
samples = s1 | s2
print(f"Samples in common: {common}")
print(f"Samples total: {len(samples)}")

Samples in common: set()
Samples total: 98


I have 98 samples as described in the dataset for `CREOLE_INIA_UY.zip`. Are coordinates updated (at least for the set of SNPs in common between 50K and HD chips)?

In [8]:
# Compare the positions listed in OvineHDSNPList.txt with the "SNPchiMp v.3"
# locations stored in the database, for the first 200 records of the list.
# NOTE(review): global_connection() presumably opens the database connection
# used by VariantSheep -- confirm in src.features.smarterdb.
global_connection()
with io.TextIOWrapper(handle.open("OvineHDSNPList.txt"), encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    header = next(reader)
    print(header)
    for line in itertools.islice(reader, 200):
        # first three columns are 'Name', 'Chr' and 'Position'
        name, chrom, position = line[0], line[1], line[2]
        qs = VariantSheep.objects(name=name)
        if qs.count() == 0:
            # SNP not tracked in the database: nothing to compare
            continue
        variant = qs.get()
        # a variant may lack a SNPchiMp v.3 location: use a default instead of
        # letting next() abort the whole cell with StopIteration
        location = next(
            (loc for loc in variant.locations if loc.imported_from == "SNPchiMp v.3"),
            None,
        )
        if location is None:
            print(f"snp {name} has no SNPchiMp v.3 location")
            continue
        if chrom != location.chrom or int(position) != location.position:
            print(f"snp {name} with different positions: {chrom}:{position}<>{location.chrom}:{location.position}")

['Name', 'Chr', 'Position', 'Index', 'SNP', 'Customer Strand', 'ILMN Strand']
snp 250506CS3900283200001_442.1 with different positions: 1:188498238<>99:0
snp 250506CS3900371000001_1255.1 with different positions: 11:35339124<>11:35339123
snp DU176899_379.1 with different positions: 4:109822674<>99:0
snp DU186191_327.1 with different positions: 26:4328182<>26:4328183
snp DU191809_420.1 with different positions: 1:187088011<>1:187087905
snp DU205548_223.1 with different positions: 19:28181245<>99:0
snp DU206996_498.1 with different positions: 5:33163650<>5:33163649
snp DU235701_99.1 with different positions: 11:25545773<>99:0
snp DU247686_322.1 with different positions: 2:23366203<>99:0
snp DU281388_299.1 with different positions: 24:5384735<>99:0
snp DU287575_503.1 with different positions: 0:0<>99:0
snp DU287626_225.1 with different positions: 1:186538026<>99:0
snp DU289160_204.1 with different positions: 3:177231430<>3:177231431
snp DU299150_230.1 with different positions: 2:104364614

It seems to me that positions should be updated. Can I infer genotypes from this file?

In [9]:
# Show, for the first 10 records of OvineHDSNPList.txt, the allele/strand
# columns of the file next to the "SNPchiMp v.3" information from the database.
with io.TextIOWrapper(handle.open("OvineHDSNPList.txt"), encoding="utf-8") as f:
    reader = csv.reader(f, delimiter="\t")
    # consume the column names of the file; a custom legend is printed instead
    header = next(reader)
    print(["name", "SNP", "Customer Strand", "ILMN Strand", "alleles", "illumina_top", "strand", "ilmnstrand"])
    for line in itertools.islice(reader, 10):
        qs = VariantSheep.objects(name=line[0])
        if qs.count() == 0:
            # SNP not tracked in the database: nothing to compare
            continue
        variant = qs.get()
        # a variant may lack a SNPchiMp v.3 location: use a default instead of
        # letting next() abort the whole cell with StopIteration
        location = next(
            (loc for loc in variant.locations if loc.imported_from == "SNPchiMp v.3"),
            None,
        )
        if location is None:
            print(f"{line[0]}: no SNPchiMp v.3 location")
            continue
        # columns from index 4 on are 'SNP', 'Customer Strand', 'ILMN Strand'
        file_fields = ",".join(line[4:])
        db_fields = ",".join(
            str(value)
            for value in (
                location.alleles,
                location.illumina_top,
                location.strand,
                location.ilmnstrand,
            )
        )
        print(f"{line[0]}: {file_fields} -> {db_fields}")

['name', 'SNP', 'Customer Strand', 'ILMN Strand', 'alleles', 'illumina_top', 'strand', 'ilmnstrand']
250506CS3900140500001_312.1: [A/G],BOT,TOP -> C/T,A/G,bottom,forward
250506CS3900176800001_906.1: [T/C],BOT,BOT -> C/T,A/G,bottom,forward
250506CS3900211600001_1041.1: [A/C],BOT,TOP -> G/T,A/C,bottom,forward
250506CS3900218700001_1294.1: [A/G],BOT,TOP -> C/T,A/G,bottom,forward
250506CS3900283200001_442.1: [A/C],BOT,TOP -> None,A/C,None,None
250506CS3900371000001_1255.1: [T/C],BOT,BOT -> C/T,A/G,bottom,forward
250506CS3900386000001_696.1: [A/G],TOP,TOP -> A/G,A/G,top,forward
250506CS3900487100001_1521.1: [A/G],TOP,TOP -> A/G,A/G,top,forward
250506CS3901300500001_1084.1: [T/C],BOT,BOT -> C/T,A/G,bottom,forward
CL635241_413.1: [A/G],TOP,TOP -> A/G,A/G,top,forward
