# SMARTER and dbSNP152
This is an attempt to investigate the *dbSNP* XML files and to determine whether this information can be integrated into the **SMARTER** database. Ok, import some libraries:

In [1]:
from functools import partial

import json
import copy
import logging

from src.features.smarterdb import VariantSheep, VariantGoat, global_connection, Location
from src.features.dbsnp import read_dbSNP, search_chip_snps
from src.features.illumina import IlluSNP

In [2]:
# Presumably establishes the database connection the Variant* queries rely on
# (helper imported from src.features.smarterdb) — TODO confirm
conn = global_connection()
# Only let ERROR (and above) records from the dbSNP parsing module through
logging.getLogger('src.features.dbsnp').setLevel(logging.ERROR)
# Logger used by the helper functions defined in this notebook
logger = logging.getLogger(__name__)

## Sheep test
Focusing on Sheep, first. Try to extract all the SNP names I have from illumina manifest:

In [3]:
# Names of every sheep variant on the two Illumina ovine chips. Kept as a set
# because find_SNPs intersects the dbSNP locSnpIds with it for every record.
all_snp_names = {
    variant.name
    for variant in VariantSheep.objects.filter(
        chip_name__in=["IlluminaOvineSNP50", "IlluminaOvineHDSNP"]
    ).fields(name=1)
}

Try to define some functions which can help me to find SNPs of interest

In [4]:
def filter_ss(snp: dict, handle: str) -> dict:
    """Return a copy of ``snp`` keeping only the relevant SS entries.

    An SS entry is kept when its ``ssId`` equals the exemplar SS id
    (``snp['exemplar']['exemplarSs']``) or when its ``handle`` equals
    ``handle``. If the exemplar was itself submitted by ``handle``, the
    result therefore holds that single entry. Entries sharing the same
    ``ssId`` are kept only once (first occurrence wins) and the original
    order is preserved.

    Args:
        snp: a SNP record providing at least the ``exemplar`` and ``ss`` keys.
        handle: the submitter handle whose SS entries must be retained.

    Returns:
        A deep copy of ``snp`` whose ``ss`` list is replaced by the filtered
        entries. ``snp`` itself is not modified; the retained SS dicts are
        the same objects held by ``snp['ss']`` (they are not copied).
    """
    exemplar = snp['exemplar']['exemplarSs']

    # a set gives O(1) duplicate detection (ssId values must be hashable)
    seen_ids = set()
    new_ss = []

    for ss in snp['ss']:
        ss_id = ss['ssId']

        if ss_id == exemplar or ss['handle'] == handle:
            if ss_id not in seen_ids:
                seen_ids.add(ss_id)
                new_ss.append(ss)

    new_snp = copy.deepcopy(snp)
    new_snp['ss'] = new_ss
    return new_snp


def find_SNPs(filename, handle, version, variantspecie, source="dbSNP151"):
    """Yield database variants found in a dbSNP dump with a rebuilt location.

    Records from ``read_dbSNP(filename)`` are first screened with
    ``search_chip_snps(..., handle=handle)``. For each remaining record the
    SS entries submitted by ``handle`` are collected and their ``locSnpId``
    values are compared with the module-level ``all_snp_names`` set: records
    sharing no name with it are skipped. The matching documents are loaded
    through ``variantspecie.objects`` and, for each of them, a new
    ``Location`` is built from the record's ``assembly`` data, or with the
    placeholder ``chrom="0"`` / ``position=0`` when the record has no unique
    chromosome placement. A warning is logged when the new location differs
    from the one stored for ``version`` and imported from "SNPchiMp v.3".

    ``all_snp_names`` is looked up while the generator runs, so it must hold
    the names of the species being processed at iteration time.

    Args:
        filename: path of the dbSNP dump handed to ``read_dbSNP``.
        handle: submitter handle identifying the SS entries of interest.
        version: assembly version used both to fetch the stored location and
            to label the new one.
        variantspecie: document class queried through ``.objects``
            (``VariantSheep`` or ``VariantGoat`` in this notebook).
        source: value stored as ``imported_from`` in the new location.
            NOTE(review): the default is "dbSNP151" although this notebook
            reads BUILD152 dumps — confirm the intended label.

    Yields:
        ``(variant.name, new_location, filtered_snp)`` tuples, where
        ``filtered_snp`` is the output of ``filter_ss`` for the record.
    """
    handle_filter = partial(search_chip_snps, handle=handle)

    for snp in filter(handle_filter, read_dbSNP(filename)):
        # keep only the exemplar SS and the SS submitted by the handle
        filtered_snp = filter_ss(snp, handle)

        # now get only the SS objects with the required handle
        sss = [ss for ss in snp['ss'] if ss['handle'] == handle]

        # names under which this SNP was submitted by the handle
        loc_snp_ids = {ss['locSnpId'] for ss in sss}

        # Skip variants not in database
        if not loc_snp_ids.intersection(all_snp_names):
            continue

        if len(sss) > 1:
            logger.debug(f"More than 1 ss found for 'rs{snp['rsId']}'")

        # a single name__in query covers both the one-SS and the many-SS
        # case; sorting only makes the issued query deterministic
        variants = variantspecie.objects.filter(name__in=sorted(loc_snp_ids))

        if len(variants) > 1:
            logger.warning(f"Got {len(variants)} Variants for 'rs{snp['rsId']}'")

        for variant in variants:
            # get the SS relying on ss['locSnpId']. A default is required:
            # an exhausted next() inside a generator would otherwise surface
            # as an opaque RuntimeError (PEP 479)
            ss = next(
                (
                    candidate for candidate in filtered_snp['ss']
                    if candidate['locSnpId'] == variant.name
                ),
                None
            )

            if ss is None:
                logger.warning(
                    f"No SS with locSnpId '{variant.name}' found for "
                    f"'rs{snp['rsId']}': skipping")
                continue

            assembly = filtered_snp.get('assembly')

            # dbSNP is the primary source of SNPchiMp
            location = variant.get_location(version, imported_from="SNPchiMp v.3")

            # get illumina sequence: prefer the HD ovine chip, then the goat
            # chip, and fall back to the ovine 50K chip
            if "IlluminaOvineHDSNP" in variant.sequence:
                sequence = variant.sequence["IlluminaOvineHDSNP"]

            elif "IlluminaGoatSNP50" in variant.sequence:
                sequence = variant.sequence["IlluminaGoatSNP50"]

            else:
                sequence = variant.sequence["IlluminaOvineSNP50"]

            illu_snp = IlluSNP(sequence=sequence, max_iter=25)

            is_mapped = (
                assembly
                and 'chromosome' in assembly['component']
                and assembly['snpstat']['mapWeight'] == 'unique-in-contig'
            )

            if is_mapped:
                # read chromosome and position
                chromosome = assembly['component']['chromosome']
                # presumably physMapInt is 0-based and +1 makes the position
                # 1-based — TODO confirm against the dbSNP schema
                position = int(assembly['component']['maploc']['physMapInt']) + 1

            else:
                # placeholder coordinates for a not-mapped SNP
                chromosome = "0"
                position = 0

            # create a new location object
            new_location = Location(
                ss_id=f"ss{ss['ssId']}",
                version=version,
                imported_from=source,
                chrom=chromosome,
                position=position,
                alleles=ss['observed'],
                illumina_strand=ss.get('strand', illu_snp.strand),
                strand=ss.get('orient'),
                illumina=illu_snp.illumina
            )

            # test for equality
            if new_location != location:
                logger.warning(f"Locations differ for 'rs{snp['rsId']}': {location} <> {new_location}")

            yield variant.name, new_location, filtered_snp
            

# Sheep flavour of find_SNPs: SS entries with the 'AGR_BS' handle, Oar_v4.0
find_AGRBS_SNPs = partial(
    find_SNPs,
    variantspecie=VariantSheep,
    version="Oar_v4.0",
    handle='AGR_BS',
)


# Goat flavour of find_SNPs: SS entries with the 'IGGC' handle, CHI1.0
find_IGGC_SNPs = partial(
    find_SNPs,
    variantspecie=VariantGoat,
    version="CHI1.0",
    handle='IGGC',
)

Ok try to read the *dbSNP* dump file for *sheep* chromosome `24`:

In [5]:
# Materialise the generator so the records can be counted and reused below
data = [
    [name, location, snp]
    for name, location, snp in find_AGRBS_SNPs(
        filename="/home/cozzip/SNPchimp/dbSNP/SHE/BUILD152/ds_ch24.xml.gz")
]

In [6]:
# Report how many records find_AGRBS_SNPs yielded for the chromosome 24 dump
print(f"There are {len(data)} SNPs read from chromosome 24")

There are 10546 SNPs read from chromosome 24


Are there any SNPs placed on chromosome 24 that were not updated by this process? First collect all variant names from the previous step

In [7]:
# First field of every [name, location, snp] record collected above
names = [variant_name for variant_name, *_ in data]

Now search for SNP in database not in the latest *dbSNP*

In [8]:
# Sheep variants of the 50K chip having an Oar_v4.0 / chromosome 24 location
# imported from SNPchiMp v.3, whose name was not collected in `names`
not_in_dbsnp = VariantSheep.objects.filter(
    name__nin=names,
    chip_name="IlluminaOvineSNP50",
    locations__match={
        "version": "Oar_v4.0",
        "chrom": "24",
        "imported_from": "SNPchiMp v.3",
    },
)
not_in_dbsnp

[<VariantSheep: name='s03210.1', rs_id='['rs423317607']', illumina_top='A/G'>, <VariantSheep: name='s34597.1', rs_id='['rs424386834']', illumina_top='A/G'>, <VariantSheep: name='DU443720_334.1', rs_id='['rs411374027']', illumina_top='A/G'>]

I found SNP `s03210.1` in the `ds_chMulti.xml.gz` file: this SNP seems to have been removed from NCBI because of multiple mapping. The same *rsId* is absent even in EVA. Are there any other SNPs removed from dbSNP?

In [9]:
# Same scan on the dump named "chMulti"; materialised to count the records
multi = [
    [name, location, snp]
    for name, location, snp in find_AGRBS_SNPs(
        filename="/home/cozzip/SNPchimp/dbSNP/SHE/BUILD152/ds_chMulti.xml.gz")
]
print(f"There are {len(multi)} SNP with multiple mapping positions")

Locations differ for 'rs160794292': (SNPchiMp v.3:Oar_v4.0) 12:49430945 [A/G] <> (dbSNP151:Oar_v4.0) 0:0 [A/G]
Locations differ for 'rs398625579': (SNPchiMp v.3:Oar_v4.0) 14:39009087 [A/G] <> (dbSNP151:Oar_v4.0) 0:0 [A/G]
Locations differ for 'rs399204443': (SNPchiMp v.3:Oar_v4.0) 15:16260018 [A/G] <> (dbSNP151:Oar_v4.0) 0:0 [A/G]
Locations differ for 'rs399565101': (SNPchiMp v.3:Oar_v4.0) 15:3426211 [A/G] <> (dbSNP151:Oar_v4.0) 0:0 [A/G]
Locations differ for 'rs399927424': (SNPchiMp v.3:Oar_v4.0) 2:236844974 [A/G] <> (dbSNP151:Oar_v4.0) 0:0 [A/G]
Locations differ for 'rs400201062': (SNPchiMp v.3:Oar_v4.0) 26:2270791 [A/G] <> (dbSNP151:Oar_v4.0) 0:0 [A/G]
Locations differ for 'rs400254267': (SNPchiMp v.3:Oar_v4.0) 6:9752440 [A/G] <> (dbSNP151:Oar_v4.0) 0:0 [A/G]
Locations differ for 'rs400434071': (SNPchiMp v.3:Oar_v4.0) 21:952413 [C/G] <> (dbSNP151:Oar_v4.0) 0:0 [C/G]
Locations differ for 'rs400456207': (SNPchiMp v.3:Oar_v4.0) 10:57742973 [A/G] <> (dbSNP151:Oar_v4.0) 0:0 [A/G]
Locatio

There are 191 SNP with multiple mapping positions


And are there any SNPs with no chromosome positions (unmapped)?

In [10]:
# Same scan on the dump named "chNotOn"; materialised to count the records
noton = [
    [name, location, snp]
    for name, location, snp in find_AGRBS_SNPs(
        filename="/home/cozzip/SNPchimp/dbSNP/SHE/BUILD152/ds_chNotOn.xml.gz")
]
print(f"There are {len(noton)} SNP with no mapping positions")

Got 2 Variants for 'rs160403113'
Got 2 Variants for 'rs402137533'
Got 2 Variants for 'rs411572125'
Got 2 Variants for 'rs414994086'
Got 2 Variants for 'rs417009700'
Got 2 Variants for 'rs419271878'
Got 2 Variants for 'rs421030064'
Got 2 Variants for 'rs424177120'
Got 2 Variants for 'rs424922202'
Got 2 Variants for 'rs429936770'


There are 167 SNP with no mapping positions


## Goat Test
Try to reapply the same but for goat, this time:

In [11]:
# Rebind the module-level set read by find_SNPs to the goat chip names, so
# the goat scans below are matched against goat variants
all_snp_names = {
    variant.name
    for variant in VariantGoat.objects.filter(
        chip_name="IlluminaGoatSNP50"
    ).fields(name=1)
}

Ok try to read the *dbSNP* dump file for *goat* chromosome `25`:

In [12]:
# Materialise the generator so the records can be counted and reused below
data = [
    [name, location, snp]
    for name, location, snp in find_IGGC_SNPs(
        filename="/home/cozzip/SNPchimp/dbSNP/GOA/BUILD152/ds_ch25.xml.gz")
]

In [13]:
# Report how many records find_IGGC_SNPs yielded for the chromosome 25 dump
print(f"There are {len(data)} SNPs read from chromosome 25")

There are 853 SNPs read from chromosome 25


Are there any SNPs placed on chromosome 25 that were not updated by this process? First collect all variant names from the previous step

In [14]:
# First field of every [name, location, snp] record collected above
names = [variant_name for variant_name, *_ in data]

Now search for SNP in database not in the latest *dbSNP*

In [15]:
# Goat variants of the 50K chip having a CHI1.0 / chromosome 25 location
# imported from SNPchiMp v.3, whose name was not collected in `names`
not_in_dbsnp = VariantGoat.objects.filter(
    name__nin=names,
    chip_name="IlluminaGoatSNP50",
    locations__match={
        "version": "CHI1.0",
        "chrom": "25",
        "imported_from": "SNPchiMp v.3",
    },
)
not_in_dbsnp

[<VariantGoat: name='snp25070-scaffold2586-209927', rs_id='['rs268257454']', illumina_top='A/G'>, <VariantGoat: name='snp16184-scaffold1701-94158', rs_id='['rs268291835']', illumina_top='A/G'>]

I found SNP `snp25070-scaffold2586-209927` in the `ds_chNotOn.xml.gz` file: this SNP seems to have been removed from NCBI because it has no mapping. The same *rsId* is absent even in EVA. Are there any other SNPs removed from dbSNP?

In [16]:
# Same scan on the goat dump named "chMulti"; materialised to count the records
multi = [
    [name, location, snp]
    for name, location, snp in find_IGGC_SNPs(
        filename="/home/cozzip/SNPchimp/dbSNP/GOA/BUILD152/ds_chMulti.xml.gz")
]
print(f"There are {len(multi)} SNP with multiple mapping positions")

Locations differ for 'rs268235075': (SNPchiMp v.3:CHI1.0) 2:131228065 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268235275': (SNPchiMp v.3:CHI1.0) 2:68801803 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268238038': (SNPchiMp v.3:CHI1.0) 4:47678708 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268239832': (SNPchiMp v.3:CHI1.0) 22:1242308 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268241044': (SNPchiMp v.3:CHI1.0) X:121318575 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268244874': (SNPchiMp v.3:CHI1.0) X:85504653 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268244930': (SNPchiMp v.3:CHI1.0) 17:61881296 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268245436': (SNPchiMp v.3:CHI1.0) 26:15128227 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268245460': (SNPchiMp v.3:CHI1.0) 5:92356456 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268247005': (SNPch

There are 70 SNP with multiple mapping positions


And are there any SNPs with no chromosome positions (unmapped)?

In [17]:
# Same scan on the goat dump named "chNotOn"; materialised to count the records
noton = [
    [name, location, snp]
    for name, location, snp in find_IGGC_SNPs(
        filename="/home/cozzip/SNPchimp/dbSNP/GOA/BUILD152/ds_chNotOn.xml.gz")
]
print(f"There are {len(noton)} SNP with no mapping positions")

Locations differ for 'rs268233477': (SNPchiMp v.3:CHI1.0) 9:47141083 [A/C] <> (dbSNP151:CHI1.0) 0:0 [A/C]
Locations differ for 'rs268234405': (SNPchiMp v.3:CHI1.0) X:49874736 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268236164': (SNPchiMp v.3:CHI1.0) 12:30613418 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268236323': (SNPchiMp v.3:CHI1.0) X:56494934 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268239094': (SNPchiMp v.3:CHI1.0) 13:10594982 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268240691': (SNPchiMp v.3:CHI1.0) X:69206854 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268243362': (SNPchiMp v.3:CHI1.0) 4:97644567 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268244618': (SNPchiMp v.3:CHI1.0) 1:27102919 [A/C] <> (dbSNP151:CHI1.0) 0:0 [A/C]
Locations differ for 'rs268247290': (SNPchiMp v.3:CHI1.0) 29:44983365 [A/C] <> (dbSNP151:CHI1.0) 0:0 [A/C]
Locations differ for 'rs268252073': (SNPchi

There are 44 SNP with no mapping positions


There are SNPs in unknown chromosomes:

In [18]:
# Same scan on the goat dump named "chUn"; materialised to count the records
unknown = [
    [name, location, snp]
    for name, location, snp in find_IGGC_SNPs(
        filename="/home/cozzip/SNPchimp/dbSNP/GOA/BUILD152/ds_chUn.xml.gz")
]
print(f"There are {len(unknown)} SNP with unknown positions")

Locations differ for 'rs268289076': (SNPchiMp v.3:CHI1.0) X:7799714 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268291307': (SNPchiMp v.3:CHI1.0) X:76324533 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]
Locations differ for 'rs268291954': (SNPchiMp v.3:CHI1.0) X:65611494 [A/G] <> (dbSNP151:CHI1.0) 0:0 [A/G]


There are 1401 SNP with unknown positions
