# About Sheep Coordinates

In [1]:
import os
import itertools

from pathlib import Path
from importlib import reload

import src.features.illumina
import src.features.snpchimp

In [2]:
# Re-execute the two project helper modules so the kernel uses their current
# source; the last call's return value (the module object) is what the cell
# displays.
reload(src.features.illumina)
reload(src.features.snpchimp)

<module 'src.features.snpchimp' from '/home/paolo/Projects/SMARTER-database/src/features/snpchimp.py'>

In [3]:
# The project root is two levels above the current working directory; the
# chip files live under data/external/SHE/ILLUMINA inside it.
project_dir = Path.cwd().resolve().parents[1]
chip_dir = os.path.join(project_dir, "data", "external", "SHE", "ILLUMINA")

# Show which files are available in the chip folder.
os.listdir(chip_dir)

['ovinesnp50-genome-assembly-oar-v3-1.csv',
 'ovinesnp50_b.csv',
 'OvineSNP50_B2.csv']

I've found 3 different versions of the chip manifest. `ovinesnp50_b.csv` is the oldest version, which I found in the *SNPchiMp* raw data; `ovinesnp50-genome-assembly-oar-v3-1.csv` is the **OAR_v3** manifest file downloaded from *ILLUMINA*; and `OvineSNP50_B2.csv` is the latest **OAR_v4** manifest file downloaded from *ILLUMINA*.

Let's start by reading the oldest chip I have and getting its size:

In [4]:
# SNP name -> (chromosome, position) for every record of the oldest chip file.
old_chip3 = {
    record.name: (record.chr, record.mapinfo)
    for record in src.features.illumina.read_snpChip(
        os.path.join(chip_dir, "ovinesnp50_b.csv")
    )
}

In [5]:
# Peek at the first 10 SNP names, in insertion order.
print(list(itertools.islice(old_chip3, 10)))

['250506CS3900065000002_1238.1', '250506CS3900140500001_312.1', '250506CS3900176800001_906.1', '250506CS3900211600001_1041.1', '250506CS3900218700001_1294.1', '250506CS3900283200001_442.1', '250506CS3900371000001_1255.1', '250506CS3900386000001_696.1', '250506CS3900414400001_1178.1', '250506CS3900435700001_1658.1']


In [6]:
# Number of distinct SNP names read from the oldest chip file.
print("Old chip size: %s" % len(old_chip3))

Old chip size: 54241


Now read the newer **OAR_v3** manifest I downloaded from Illumina

In [7]:
# SNP name -> (chromosome, position) for every record of the OAR_v3 manifest.
new_chip3 = {
    record.name: (record.chr, record.mapinfo)
    for record in src.features.illumina.read_snpChip(
        os.path.join(chip_dir, "ovinesnp50-genome-assembly-oar-v3-1.csv")
    )
}

In [8]:
# Peek at the first 10 SNP names, in insertion order.
print(list(itertools.islice(new_chip3, 10)))

['250506CS3900065000002_1238.1', '250506CS3900140500001_312.1', '250506CS3900176800001_906.1', '250506CS3900211600001_1041.1', '250506CS3900218700001_1294.1', '250506CS3900283200001_442.1', '250506CS3900371000001_1255.1', '250506CS3900386000001_696.1', '250506CS3900414400001_1178.1', '250506CS3900435700001_1658.1']


In [9]:
# Number of distinct SNP names read from the OAR_v3 manifest.
print("New chip size: %s" % len(new_chip3))

New chip size: 54241


The chips have the same size. Check that the keys (SNP names) are the same too:

In [10]:
# Dictionary key views compare like sets, so this is True exactly when both
# chips contain the same SNP names.
old_chip3.keys() == new_chip3.keys()

True

Print out the first 10 SNP positions for both chips:

In [11]:
# For the first 10 SNPs show the old location next to the new one.
for snp_name in itertools.islice(old_chip3, 10):
    print(snp_name, old_chip3[snp_name], new_chip3[snp_name])

250506CS3900065000002_1238.1 ('15', 5327353) ('15', 5870057)
250506CS3900140500001_312.1 ('23', 27428869) ('23', 26298017)
250506CS3900176800001_906.1 ('7', 89002990) ('7', 81648528)
250506CS3900211600001_1041.1 ('16', 44955568) ('16', 41355381)
250506CS3900218700001_1294.1 ('2', 157820235) ('2', 148802744)
250506CS3900283200001_442.1 ('1', 203289635) ('1', 188498238)
250506CS3900371000001_1255.1 ('11', 37632867) ('11', 35339123)
250506CS3900386000001_696.1 ('16', 68297712) ('16', 62646307)
250506CS3900414400001_1178.1 ('1', 111100644) ('1', 103396552)
250506CS3900435700001_1658.1 ('12', 50140951) ('12', 45221821)


Now count how many SNPs have different positions in the two chips. SNPs whose positions are identical are printed out:

In [12]:
# Walk every SNP of the old chip: unchanged locations are printed, changed
# ones are only counted.
count = 0

for snp_name, old_location in old_chip3.items():
    new_location = new_chip3[snp_name]
    if old_location == new_location:
        print(snp_name, old_location, new_location)
        continue
    count += 1

print(f"\nN of SNPs in different positions in the two file versions: {count}")

CR_594.1 ('0', 0) ('0', 0)
mt_12362.1 ('0', 0) ('0', 0)
mt_5800.1 ('0', 0) ('0', 0)
mt_7729.1 ('0', 0) ('0', 0)
OARUn.1162_15670.1 ('0', 0) ('0', 0)
s05987.1 ('0', 0) ('0', 0)
s15970.1 ('0', 0) ('0', 0)
s20217.1 ('0', 0) ('0', 0)
s23436.1 ('0', 0) ('0', 0)
s24503.1 ('0', 0) ('0', 0)
s26614.1 ('0', 0) ('0', 0)
s27919.1 ('0', 0) ('0', 0)
s37920.1 ('0', 0) ('0', 0)
s38683.1 ('0', 0) ('0', 0)
s42402.1 ('0', 0) ('0', 0)
s44857.1 ('0', 0) ('0', 0)
s51062.1 ('0', 0) ('0', 0)
s51315.1 ('0', 0) ('0', 0)
s59566.1 ('0', 0) ('0', 0)
s63164.1 ('0', 0) ('0', 0)
s73227.1 ('0', 0) ('0', 0)

N of SNPs in different positions in the two file versions: 54220


Get a list of all chromosomes present in the two chips

In [13]:
# Collect the distinct chromosome labels (first element of each
# (chromosome, position) tuple) used by each chip version.
new_chroms = {location[0] for location in new_chip3.values()}
old_chroms = {location[0] for location in old_chip3.values()}

# The original cell also evaluated `new_chroms == old_chroms` here, but as a
# bare expression that is not the last statement of the cell its result was
# discarded and never displayed, so it has been dropped. Printing both sets
# makes any difference visible.
print(old_chroms)
print(new_chroms)

{'X', '7', '5', 'OAR', '19', '22', '1', '16', '10', '21', '25', '12', '26', '23', '17', '6', '14', '18', 'Y', '24', '4', '3', '9', '2', 'Contig', '0', '11', '20', '15', '8', '13'}
{'X', '7', '5', '19', '22', '1', '16', '10', '21', '25', '12', '26', '23', '17', '6', '14', '18', '24', '4', '3', '9', '2', '0', '11', 'M', '20', '15', '8', '13'}


How many chromosomes are in common? Which chromosomes are in one version but not in the other?

In [14]:
# Set algebra on the chromosome labels: shared ones, then the ones exclusive
# to each release.
print("Chromosomes in common: %s" % (old_chroms & new_chroms))
print("Chromosomes only in old release: %s" % (old_chroms - new_chroms))
print("Chromosomes only in new release: %s" % (new_chroms - old_chroms))

Chromosomes in common: {'X', '7', '5', '19', '22', '1', '16', '10', '21', '12', '25', '26', '23', '17', '6', '14', '18', '24', '4', '3', '9', '2', '0', '11', '20', '15', '8', '13'}
Chromosomes only in old release: {'Contig', 'Y', 'OAR'}
Chromosomes only in new release: {'M'}


Chromosomes are different in the two chips. Print out some records selected by chromosome name:

In [15]:
# List the SNPs of the old chip whose chromosome label is 'Contig'.
for snp_name, location in old_chip3.items():
    if location[0] != 'Contig':
        continue
    print(snp_name, location)

s17862.1 ('Contig', 0)


## Is SNPchimp updated?
### Does SNPChimp store the latest information for *oarv_3.1*?

In [16]:
# Folder with the SNPchimp downloads and the OAR v3.1 file inside it.
snpchimp_dir = project_dir / "data" / "external" / "SHE" / "SNPCHIMP"
snpchimp3_file = snpchimp_dir.joinpath("SNPchimp_SHE_SNP50v1_oar3.1.csv")

# SNP name -> (chromosome, position) as reported by SNPchimp.
snpchimp3 = {
    record.snp_name: (record.chromosome, record.position)
    for record in src.features.snpchimp.read_snpChimp(snpchimp3_file)
}

In [17]:
# Peek at the first 10 SNP names, in insertion order.
print(list(itertools.islice(snpchimp3, 10)))

['DU179070_177.1', 'DU412523_531.1', 'DU435204_267.1', 'DU348827_210.1', 'DU351298_316.1', 'DU427993_302.1', 'DU378652_409.1', 'DU299578_392.1', 'OAR1_181838398.1', 'DU518561_359.1']


In [18]:
# Number of distinct SNP names read from the SNPchimp OAR v3.1 file.
print("SNPchimp snps for oarv3: %s" % len(snpchimp3))

SNPchimp snps for oarv3: 54241


In [19]:
# Dictionary key views compare like sets, so this is True exactly when the
# chip and SNPchimp contain the same SNP names.
old_chip3.keys() == snpchimp3.keys()

True

SNPchimp seems to have the same SNP names as the oarv3 illumina chips. Display some SNPs from the old and new illumina chips together with the SNPchimp coordinates:

In [20]:
# For the first 10 SNPs show the old chip, new chip and SNPchimp locations
# side by side.
for snp_name in itertools.islice(old_chip3, 10):
    print(snp_name, old_chip3[snp_name], new_chip3[snp_name], snpchimp3[snp_name])

250506CS3900065000002_1238.1 ('15', 5327353) ('15', 5870057) ('15', 5870057)
250506CS3900140500001_312.1 ('23', 27428869) ('23', 26298017) ('23', 26298017)
250506CS3900176800001_906.1 ('7', 89002990) ('7', 81648528) ('7', 81648528)
250506CS3900211600001_1041.1 ('16', 44955568) ('16', 41355381) ('16', 41355381)
250506CS3900218700001_1294.1 ('2', 157820235) ('2', 148802744) ('2', 148802744)
250506CS3900283200001_442.1 ('1', 203289635) ('1', 188498238) ('99', 0)
250506CS3900371000001_1255.1 ('11', 37632867) ('11', 35339123) ('11', 35339123)
250506CS3900386000001_696.1 ('16', 68297712) ('16', 62646307) ('16', 62646307)
250506CS3900414400001_1178.1 ('1', 111100644) ('1', 103396552) ('1', 103396552)
250506CS3900435700001_1658.1 ('12', 50140951) ('12', 45221821) ('99', 0)


If the coordinates downloaded from illumina are correct, **it seems clear that SNPchimp coordinates need to be updated**. However, which SNPs differ between SNPchimp and the illumina manifest?

In [21]:
# Preview: the first 10 SNPs with the illumina location next to the SNPchimp
# one, followed by an ellipsis when there are more records than shown.
for snp_name, illumina_location in itertools.islice(new_chip3.items(), 10):
    print(snp_name, illumina_location, snpchimp3[snp_name])
if len(new_chip3) > 10:
    print("...")

# Tally: every SNP whose (chromosome, position) differs between the sources.
count = sum(
    1
    for snp_name, illumina_location in new_chip3.items()
    if illumina_location != snpchimp3[snp_name]
)

print(f"\nN of SNPs in different positions from illumina to SNPchimp: {count}")

250506CS3900065000002_1238.1 ('15', 5870057) ('15', 5870057)
250506CS3900140500001_312.1 ('23', 26298017) ('23', 26298017)
250506CS3900176800001_906.1 ('7', 81648528) ('7', 81648528)
250506CS3900211600001_1041.1 ('16', 41355381) ('16', 41355381)
250506CS3900218700001_1294.1 ('2', 148802744) ('2', 148802744)
250506CS3900283200001_442.1 ('1', 188498238) ('99', 0)
250506CS3900371000001_1255.1 ('11', 35339123) ('11', 35339123)
250506CS3900386000001_696.1 ('16', 62646307) ('16', 62646307)
250506CS3900414400001_1178.1 ('1', 103396552) ('1', 103396552)
250506CS3900435700001_1658.1 ('12', 45221821) ('99', 0)
...

N of SNPs in different positions from illumina to SNPchimp: 6463


Last consideration: is the illumina allele coding the same as SNPchimp's? I expect so for *TOP/BOT*

In [22]:
# SNP name -> (strand, alleles, alleles_a_b_top) as reported by SNPchimp.
snpchimp3_code = {
    record.snp_name: (record.strand, record.alleles, record.alleles_a_b_top)
    for record in src.features.snpchimp.read_snpChimp(snpchimp3_file)
}

# SNP name -> (ilmnstrand, snp, sourcestrand) from the OAR_v3 illumina manifest.
new_chip3_code = {
    record.name: (record.ilmnstrand, record.snp, record.sourcestrand)
    for record in src.features.illumina.read_snpChip(
        Path(chip_dir, "ovinesnp50-genome-assembly-oar-v3-1.csv")
    )
}

# Show the two codings side by side for the first 20 SNPs.
for snp_name, illumina_code in itertools.islice(new_chip3_code.items(), 20):
    print(snp_name, illumina_code, snpchimp3_code[snp_name])

250506CS3900065000002_1238.1 ('TOP', '[A/G]', 'BOT') ('bottom', 'C/T', 'A/G')
250506CS3900140500001_312.1 ('TOP', '[A/G]', 'BOT') ('bottom', 'C/T', 'A/G')
250506CS3900176800001_906.1 ('BOT', '[T/C]', 'BOT') ('bottom', 'C/T', 'A/G')
250506CS3900211600001_1041.1 ('TOP', '[A/C]', 'BOT') ('bottom', 'G/T', 'A/C')
250506CS3900218700001_1294.1 ('TOP', '[A/G]', 'BOT') ('bottom', 'C/T', 'A/G')
250506CS3900283200001_442.1 ('TOP', '[A/C]', 'BOT') ('NULL', 'NULL', 'A/C')
250506CS3900371000001_1255.1 ('BOT', '[T/C]', 'BOT') ('bottom', 'C/T', 'A/G')
250506CS3900386000001_696.1 ('TOP', '[A/G]', 'TOP') ('top', 'A/G', 'A/G')
250506CS3900414400001_1178.1 ('BOT', '[T/C]', 'TOP') ('top', 'A/G', 'A/G')
250506CS3900435700001_1658.1 ('BOT', '[T/C]', 'TOP') ('NULL', 'NULL', 'A/G')
250506CS3900464100001_519.1 ('TOP', '[A/G]', 'TOP') ('top', 'A/G', 'A/G')
250506CS3900487100001_1521.1 ('TOP', '[A/G]', 'TOP') ('top', 'A/G', 'A/G')
250506CS3900539000001_471.1 ('TOP', '[A/G]', 'BOT') ('bottom', 'C/T', 'A/G')
250506

Every time illumina has BOT in `ilmnstrand`, `alleles_a_b_top` in SNPchimp is the reverse complement of the illumina `snp` alleles

### Does SNPChimp store the latest information for *oarv_4*?

In [23]:
# SNP name -> (chromosome, position) for every record of the OAR_v4 manifest.
chip4 = {
    record.name: (record.chr, record.mapinfo)
    for record in src.features.illumina.read_snpChip(
        os.path.join(chip_dir, "OvineSNP50_B2.csv"), size=256
    )
}

# First 10 SNP names and the total number of SNPs read.
print(list(itertools.islice(chip4, 10)))
print("Oarv4 chip size: %s" % len(chip4))

['250506CS3900065000002_1238.1', '250506CS3900140500001_312.1', '250506CS3900176800001_906.1', '250506CS3900211600001_1041.1', '250506CS3900218700001_1294.1', '250506CS3900283200001_442.1', '250506CS3900371000001_1255.1', '250506CS3900386000001_696.1', '250506CS3900414400001_1178.1', '250506CS3900435700001_1658.1']
Oarv4 chip size: 54241


Are the keys the same? using `old_chip3` as a reference (`new_chip3` has the same keys)

In [24]:
# Dictionary key views compare like sets, so this is True exactly when both
# chips contain the same SNP names.
old_chip3.keys() == chip4.keys()

True

Does SNPchimp have the same keys and positions as `chip4`?

In [25]:
# SNPchimp download for the OAR v4.0 assembly.
snpchimp4_file = snpchimp_dir.joinpath("SNPchimp_SHE_SNP50v1_oar4.0.csv")

# SNP name -> (chromosome, position) as reported by SNPchimp.
snpchimp4 = {
    record.snp_name: (record.chromosome, record.position)
    for record in src.features.snpchimp.read_snpChimp(snpchimp4_file)
}
print("SNPchimp snps for oarv4: %s" % len(snpchimp4))

# Key views compare like sets: True exactly when the SNP names coincide.
chip4.keys() == snpchimp4.keys()

SNPchimp snps for oarv4: 54241


True

Should SNPchimp oarv4 coordinates be updated?

In [26]:
# Preview: the first 10 SNPs with the illumina location next to the SNPchimp
# one, followed by an ellipsis when there are more records than shown.
for snp_name, illumina_location in itertools.islice(chip4.items(), 10):
    print(snp_name, illumina_location, snpchimp4[snp_name])
if len(chip4) > 10:
    print("...")

# Tally: every SNP whose (chromosome, position) differs between the sources.
count = sum(
    1
    for snp_name, illumina_location in chip4.items()
    if illumina_location != snpchimp4[snp_name]
)

print(f"\nN of SNPs in different positions from illumina to SNPchimp: {count}")

250506CS3900065000002_1238.1 ('15', 5859890) ('15', 5859890)
250506CS3900140500001_312.1 ('23', 26243215) ('23', 26243215)
250506CS3900176800001_906.1 ('7', 81590897) ('7', 81590897)
250506CS3900211600001_1041.1 ('16', 41363310) ('16', 41363310)
250506CS3900218700001_1294.1 ('2', 148834939) ('2', 148834939)
250506CS3900283200001_442.1 ('1', 188328803) ('99', 0)
250506CS3900371000001_1255.1 ('11', 35291132) ('11', 35291132)
250506CS3900386000001_696.1 ('16', 62648296) ('16', 62648296)
250506CS3900414400001_1178.1 ('1', 103285485) ('1', 103285485)
250506CS3900435700001_1658.1 ('12', 45150716) ('99', 0)
...

N of SNPs in different positions from illumina to SNPchimp: 6471


There are ~ 6500 SNPs in SNPchimp that need to be updated