# Sheep Illumina chip aligned
I've aligned the sheep chip probe set to the oldest `OAR4` assembly with megablast. I want to compare my results with the results I get from dbSNP.

In [1]:
from collections import defaultdict

from src.features.smarterdb import global_connection, VariantSheep
from src.data.common import AssemblyConf

import pandas as pd

In [2]:
# Presumably opens the database connection the VariantSheep queries rely on
# (global_connection comes from src.features.smarterdb -- confirm there).
conn = global_connection()

# Assembly version / import source pair used to select dbSNP152 locations.
dbSNP152 = AssemblyConf(
    version="Oar_v4.0",
    imported_from="dbSNP152",
)

First, load my aligned data and set `snp_name` as the index:

In [3]:
# Load my megablast-based placements, indexed by SNP name.
# `chrom` mixes numeric labels with names such as 'X' or unplaced scaffolds,
# and later cells compare it against the string '0' and against `ncbi_chrom`:
# force it to `str` so chunked type inference can never yield a mix of
# integers and strings (which would show up as spurious differences).
results = pd.read_csv(
    "ovinesnp50-genome-assembly-oar-v4-0.csv-GCA_000298735.2_Oar_v4.0_genomic.fna.blastn.csv",
    index_col="snp_name",
    dtype={"chrom": str},
)
results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
250506CS3900065000002_1238.1,15,5859890,C/T,T/C,T/C,BOT,forward,C,T
250506CS3900140500001_312.1,23,26243215,C/T,T/C,T/C,BOT,forward,C,T
250506CS3900176800001_906.1,7,81590897,C/T,T/C,T/C,BOT,forward,C,T
250506CS3900211600001_1041.1,16,41363310,G/T,T/G,T/G,BOT,forward,G,T
250506CS3900218700001_1294.1,2,148834939,C/T,T/C,T/C,BOT,forward,C,T


Next, load the errors: if I can't place a SNP on a chromosome, it will have no position in the results table, and the reason will be recorded in this table:

In [4]:
# Load the per-SNP failure reasons written by the alignment step,
# indexed by SNP name so they can be joined onto the results.
errors = pd.read_csv(
    "ovinesnp50-genome-assembly-oar-v4-0.csv-GCA_000298735.2_Oar_v4.0_genomic.fna.blastn.err",
    index_col="snp_name",
)
errors.head()

Unnamed: 0_level_0,illumina,illumina_strand,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
250506CS3900371000001_1255.1,T/C,BOT,Cannot determine a unique position for SNP T/C...
CR_594.1,T/C,BOT,Too many alignments after filtering
CR_816.1,T/C,BOT,Too many alignments after filtering
CytB_1406.1,T/C,BOT,No valid alignments after filtering
CytB_1505.1,A/G,TOP,No valid alignments after filtering


Ok, now get my Sheep variants and focus on *NCBI* data: I could have more variants than *NCBI* if there are probes more recent than dbSNP152:

In [5]:
# Variants of the IlluminaOvineSNP50 chip that have an rs_id and a location
# matching the dbSNP152 assembly configuration.
ncbi_variants = VariantSheep.objects.filter(
    chip_name="IlluminaOvineSNP50",
    locations__match=dbSNP152._asdict(),
    rs_id__exists=True,
)
ncbi_variants.count()

48224

Ok now extract dbSNP locations from my `ncbi_variants`:

In [6]:
# Keyword arguments selecting the dbSNP152 location; they do not change
# between iterations, so build them once outside the loop.
dbsnp_location_kwargs = dbSNP152._asdict()

# Column-oriented accumulator with explicit keys, so that even an empty
# queryset produces a frame with the expected columns (and `set_index`
# below does not fail with a KeyError).
location_columns = {
    "snp_name": [],
    "rs_id": [],
    "ncbi_chrom": [],
    "ncbi_position": [],
}

for variant in ncbi_variants:
    location = variant.get_location(**dbsnp_location_kwargs)
    location_columns["snp_name"].append(variant.name)
    # rs_id holds one or more identifiers: flatten them into a single
    # comma-separated string
    location_columns["rs_id"].append(",".join(variant.rs_id))
    location_columns["ncbi_chrom"].append(location.chrom)
    location_columns["ncbi_position"].append(location.position)

# One row per variant, indexed by SNP name like the `results` table.
ncbi_locations = pd.DataFrame(location_columns).set_index("snp_name")
ncbi_locations.head()

Unnamed: 0_level_0,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
OAR16_74703566.1,"rs406297509,rs1087899539",16,68777502
OAR5_38485386.1,"rs402039066,rs1093088087",5,34727924
OAR5_87409839_X.1,rs119102657,5,79472633
250506CS3900414400001_1178.1,rs119102699,1,103285485
s05189.1,rs159412897,1,121010442


Ok, merge these data into a new dataframe and get rid of the SNPs not in *NCBI*. Mind that the left merge leaves *NA* values in `ncbi_position` for those SNPs, so after dropping them the column needs to be converted back to *integer*:

In [7]:
# Left-join the NCBI locations onto my placements (both indexed by snp_name).
results_with_ncbi = results.merge(ncbi_locations, how="left", on="snp_name")

# Keep only the SNPs that have an NCBI chromosome, then restore the integer
# dtype of `ncbi_position`.
has_ncbi_location = results_with_ncbi["ncbi_chrom"].notna()
ncbi_results = results_with_ncbi[has_ncbi_location].astype({"ncbi_position": "int"})
ncbi_results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
250506CS3900065000002_1238.1,15,5859890,C/T,T/C,T/C,BOT,forward,C,T,rs55630613,15,5859890
250506CS3900140500001_312.1,23,26243215,C/T,T/C,T/C,BOT,forward,C,T,rs55630642,23,26243215
250506CS3900176800001_906.1,7,81590897,C/T,T/C,T/C,BOT,forward,C,T,rs55630654,7,81590897
250506CS3900211600001_1041.1,16,41363310,G/T,T/G,T/G,BOT,forward,G,T,rs55630658,16,41363310
250506CS3900218700001_1294.1,2,148834939,C/T,T/C,T/C,BOT,forward,C,T,rs55630663,2,148834939


Ok focus on the differences between my alignment and NCBI:

In [8]:
# A SNP differs when either the chromosome or the position disagrees
# between my alignment and NCBI.
chrom_differs = ncbi_results["chrom"] != ncbi_results["ncbi_chrom"]
position_differs = ncbi_results["position"] != ncbi_results["ncbi_position"]
differences = ncbi_results[chrom_differs | position_differs]
differences.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
250506CS3900371000001_1255.1,0,0,,T/C,,BOT,,,,rs417377113,11,35291132
DU172264_319.1,25,20596182,A/G,T/C,A/G,BOT,reverse,G,A,rs55632153,25,20596183
DU175804_598.1,0,0,,T/C,,BOT,,,,rs409850824,13,12526490
DU178311_404.1,0,0,,T/C,,BOT,,,,rs55631803,6,36768153
DU179070_177.1,0,0,,A/G,,TOP,,,,rs55628106,1,111700859


In [9]:
# Summarise the differing SNPs: number of rows, non-null counts and dtypes.
differences.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3626 entries, 250506CS3900371000001_1255.1 to s75909.1
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   chrom             3626 non-null   object
 1   position          3626 non-null   int64 
 2   alleles           582 non-null    object
 3   illumina          3626 non-null   object
 4   illumina_forward  582 non-null    object
 5   illumina_strand   3626 non-null   object
 6   strand            582 non-null    object
 7   ref               582 non-null    object
 8   alt               582 non-null    object
 9   rs_id             3626 non-null   object
 10  ncbi_chrom        3626 non-null   object
 11  ncbi_position     3626 non-null   int64 
dtypes: int64(2), object(10)
memory usage: 368.3+ KB


I have ~3600 differences with respect to NCBI; let's break them down by the chromosome I assigned:

In [10]:
# Count the differing SNPs per chromosome assigned by my alignment.
differences["chrom"].value_counts()

0                 3044
1                   98
2                   98
3                   95
X                   28
6                   19
4                   19
13                  18
9                   18
5                   17
18                  17
19                  14
10                  11
11                  11
12                  10
14                  10
7                   10
22                  10
16                   9
26                   9
15                   9
20                   8
8                    8
24                   8
23                   7
17                   7
25                   7
21                   6
AMGL02044162.1       1
Name: chrom, dtype: int64

Ok, tell me how many SNPs I can't place, while *NCBI* can:

In [11]:
# Differing SNPs whose chromosome in my alignment is '0'.
unplaced_mask = differences["chrom"] == '0'
not_placed = differences.loc[unplaced_mask]
print(f"There are {len(not_placed)} SNPs that I can't map to genome")

# Attach the reason recorded in the errors table and order the rows by it.
not_placed_with_reason = not_placed.merge(errors["reason"], how="left", on="snp_name")
not_placed_with_reason.sort_values("reason")

There are 3044 SNPs that I can't map to genome


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
s42617.1,0,0,,A/G,,TOP,,,,rs416057465,21,8471571,Allele doesn't match to reference
OAR8_57948794.1,0,0,,T/C,,BOT,,,,rs408606108,8,54001673,Allele doesn't match to reference
OAR3_104223609.1,0,0,,A/C,,TOP,,,,rs426632185,3,97812165,Allele doesn't match to reference
OAR8_86082249.1,0,0,,T/C,,BOT,,,,rs426732819,8,79765523,Allele doesn't match to reference
s53053.1,0,0,,T/C,,BOT,,,,rs404810128,5,55369772,Allele doesn't match to reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...
OAR1_108911481.1,0,0,,A/G,,TOP,,,,rs426253458,1,101365237,Too many alignments after filtering
OAR10_52490243.1,0,0,,T/C,,BOT,,,,rs413505848,10,51482563,Too many alignments after filtering
OAR1_185135450.1,0,0,,A/G,,TOP,,,,rs415780145,1,171523306,Too many alignments after filtering
s34796.1,0,0,,A/C,,TOP,,,,rs421565268,18,67809480,Too many alignments after filtering


Well, there are a lot of SNPs I cannot match. Group them by reason:

In [12]:
# Count the unplaced SNPs by the reason recorded in the errors table.
(
    not_placed
    .merge(errors["reason"], how="left", on="snp_name")
    ["reason"]
    .value_counts()
)

No valid alignments after filtering                    2907
Allele doesn't match to reference                        72
Too many alignments after filtering                      39
Cannot determine a unique position for SNP A/G (61)       5
Cannot determine a unique position for SNP T/C (60)       4
Cannot determine a unique position for SNP T/G (60)       3
Cannot determine a unique position for SNP A/G (60)       3
Cannot determine a unique position for SNP T/C (61)       3
Cannot determine a unique position for SNP A/C (60)       2
Cannot determine a unique position for SNP T/G (61)       2
Cannot determine a unique position for SNP T/C (63)       1
Cannot determine a unique position for SNP A/G (62)       1
Cannot determine a unique position for SNP A/C (61)       1
Cannot determine a unique position for SNP A/C (59)       1
Name: reason, dtype: int64

In [13]:
# Unplaced SNPs joined with their error reason; show only those discarded
# because no alignment survived filtering.
tmp = not_placed.merge(errors["reason"], how="left", on="snp_name")
no_valid_alignment = tmp["reason"] == "No valid alignments after filtering"
tmp.loc[no_valid_alignment]

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
DU178311_404.1,0,0,,T/C,,BOT,,,,rs55631803,6,36768153,No valid alignments after filtering
DU179070_177.1,0,0,,A/G,,TOP,,,,rs55628106,1,111700859,No valid alignments after filtering
DU189586_521.1,0,0,,A/G,,TOP,,,,rs405647639,X,64170493,No valid alignments after filtering
DU191809_420.1,0,0,,T/C,,BOT,,,,"rs428367938,rs405711682",1,186920065,No valid alignments after filtering
DU192841_628.1,0,0,,T/C,,BOT,,,,rs55632389,23,19282729,No valid alignments after filtering
...,...,...,...,...,...,...,...,...,...,...,...,...,...
s75759.1,0,0,,T/C,,BOT,,,,rs429941367,6,108740789,No valid alignments after filtering
s75799.1,0,0,,T/C,,BOT,,,,rs400816683,21,31837963,No valid alignments after filtering
s75819.1,0,0,,T/C,,BOT,,,,rs413852725,22,23316563,No valid alignments after filtering
s75898.1,0,0,,T/C,,BOT,,,,rs410893602,5,33620418,No valid alignments after filtering


Are there any *SNPs* which I map to a different position than *NCBI*?

In [14]:
# Differing SNPs whose chromosome is not '0' on either side, i.e. both my
# alignment and NCBI report a chromosome but the locations disagree.
placed_by_me = differences["chrom"] != '0'
placed_by_ncbi = differences["ncbi_chrom"] != '0'
different = differences[placed_by_me & placed_by_ncbi]
print(f"There are {len(different)} SNPs that I can map to a different position")

# Join the error reasons (if any) and order the rows by them.
different_with_reason = different.merge(errors["reason"], how="left", on="snp_name")
different_with_reason.sort_values("reason")

There are 571 SNPs that I can map to a different position


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
DU172264_319.1,25,20596182,A/G,T/C,A/G,BOT,reverse,G,A,rs55632153,25,20596183,
DU206996_498.1,5,33118535,A/C,T/G,A/C,BOT,reverse,A,C,"rs403872294,rs421000549",5,33118534,
DU240765_244.1,X,121221683,C/T,A/G,T/C,TOP,reverse,C,T,rs55630244,X,121221682,
DU266660_200.1,2,119804090,C/T,A/G,T/C,TOP,reverse,C,T,rs55630890,2,119804089,
DU286106_170.1,26,3412784,C/T,A/G,T/C,TOP,reverse,T,C,rs55628429,26,3412783,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
s74894.1,19,52486395,A/G,T/C,A/G,BOT,reverse,A,G,rs419340186,19,52486393,
s75063.1,7,97723185,C/T,A/G,T/C,TOP,reverse,C,T,rs402289455,7,97723186,
s75140.1,1,190436466,A/G,T/C,A/G,BOT,reverse,A,G,rs411885717,1,190436464,
s75196.1,12,50493324,C/T,A/G,T/C,TOP,reverse,C,T,rs416182289,12,50493323,
