# Sheep Affymetrix chip aligned
I've aligned with megablast the affymetrix sheep chip probeset to the oldest `OAR4` assembly. I want to compare my results with result I get from dbSNP

In [1]:
from collections import defaultdict

from src.features.smarterdb import global_connection, VariantSheep
from src.data.common import AssemblyConf

import pandas as pd

In [2]:
conn = global_connection()
dbSNP152 = AssemblyConf(version="Oar_v4.0", imported_from="dbSNP152")

First, get my data aligned and set `snp_name` as a index:

In [3]:
results = pd.read_csv("Axiom_Ovi_Can.na35.r3.a3.annot.csv-GCA_000298735.2_Oar_v4.0_genomic.fna.blastn.csv")
results.set_index("snp_name", inplace=True)
results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Affx-293815543,0,0,,T/C,,BOT,,,
Affx-139979198,0,0,,T/G,,BOT,,,
Affx-139969918,0,0,,C/G,,TOP,,,
Affx-139932950,0,0,,T/C,,BOT,,,
Affx-139939859,0,0,,A/G,,TOP,,,


Next load errors: If I can't place a SNP on a chromosome, I will have no position in results table and I will have a reason in this table:

In [4]:
errors = pd.read_csv("Axiom_Ovi_Can.na35.r3.a3.annot.csv-GCA_000298735.2_Oar_v4.0_genomic.fna.blastn.err")
errors.set_index("snp_name", inplace=True)
errors.head()

Unnamed: 0_level_0,illumina,illumina_strand,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Affx-293815543,T/C,BOT,No valid alignments after filtering
Affx-139979198,T/G,BOT,No valid alignments after filtering
Affx-139969918,C/G,TOP,No valid alignments after filtering
Affx-139932950,T/C,BOT,No valid alignments after filtering
Affx-139939859,A/G,TOP,No valid alignments after filtering


Ok, now get my Sheep variants and focus on *NCBI* data: I could have more variants than *NCBI* if there are probes more recent than dbSNP152:

In [5]:
ncbi_variants = VariantSheep.objects.filter(chip_name="AffymetrixAxiomOviCan", locations__match=dbSNP152._asdict(), rs_id__exists=True)
ncbi_variants.count()

39105

Ok now extract dbSNP locations from my `ncbi_variants`:

In [6]:
tmp = defaultdict(list)

for variant in ncbi_variants:
    location = variant.get_location(**dbSNP152._asdict())
    tmp["snp_name"].append(variant.affy_snp_id)
    tmp["rs_id"].append(",".join(variant.rs_id))
    tmp["ncbi_chrom"].append(location.chrom)
    tmp["ncbi_position"].append(location.position)
    
    
ncbi_locations = pd.DataFrame.from_dict(tmp)
ncbi_locations.set_index('snp_name', inplace=True)
ncbi_locations.head()

Unnamed: 0_level_0,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Affx-256854517,rs10721113,18,64294536
Affx-122852950,"rs406297509,rs1087899539",16,68777502
Affx-122806470,"rs402039066,rs1093088087",5,34727924
Affx-122839502,rs119102699,1,103285485
Affx-122821645,rs159412897,1,121010442


Ok, merge these data in a new dataframe. Get rid of SNPs not in *NCBI*, mind that since `ncbi_position` could have *NA* values, need to be converted as *integer*:

In [7]:
tmp = results.merge(ncbi_locations, how="left", on="snp_name")
ncbi_results = tmp[tmp['ncbi_chrom'].isna() == False].astype({'ncbi_position':'int'})
ncbi_results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Affx-122847494,3,30880580,A/G,A/G,A/G,TOP,forward,A,G,rs424489686,3,30880580
Affx-122829181,2,219365951,A/G,A/G,A/G,TOP,forward,G,A,rs401909860,2,219365951
Affx-122816720,1,120533735,A/G,A/G,A/G,TOP,forward,A,G,rs398687222,1,120533735
Affx-122808678,0,0,,T/C,,BOT,,,,rs415806402,16,22212130
Affx-122814061,1,4556384,C/T,T/C,T/C,BOT,forward,C,T,rs55630584,1,4556384


Ok focus on the differences between my alignment and NCBI:

In [8]:
differences = ncbi_results.query("chrom != ncbi_chrom | position != ncbi_position")
differences.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Affx-122808678,0,0,,T/C,,BOT,,,,rs415806402,16,22212130
Affx-122826280,0,0,,T/C,,BOT,,,,rs421131731,1,159615082
Affx-122808678,0,0,,T/C,,BOT,,,,rs415806402,16,22212130
Affx-122859209,0,0,,T/C,,BOT,,,,rs418973572,12,44759238
Affx-122859207,0,0,,A/C,,TOP,,,,rs418656823,21,40958663


In [9]:
differences.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6135 entries, Affx-122808678 to Affx-122805046
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   chrom             6135 non-null   object
 1   position          6135 non-null   int64 
 2   alleles           32 non-null     object
 3   illumina          6135 non-null   object
 4   illumina_forward  32 non-null     object
 5   illumina_strand   6135 non-null   object
 6   strand            32 non-null     object
 7   ref               32 non-null     object
 8   alt               32 non-null     object
 9   rs_id             6135 non-null   object
 10  ncbi_chrom        6135 non-null   object
 11  ncbi_position     6135 non-null   int64 
dtypes: int64(2), object(10)
memory usage: 623.1+ KB


I have ~6000 differences between NCBI, let's focus on different chromosome types:

In [10]:
differences["chrom"].value_counts()

0                 6103
5                    4
2                    4
13                   3
11                   2
16                   2
18                   2
26                   2
24                   2
4                    2
1                    2
3                    2
21                   1
AMGL02043384.1       1
19                   1
6                    1
8                    1
Name: chrom, dtype: int64

Ok, tell me how many SNPs I can't place, while *NCBI* can:

In [11]:
not_placed = differences[differences["chrom"] == '0']
print(f"There are {not_placed.shape[0]} SNPs that I can't map to genome")
not_placed.merge(errors["reason"], how="left", on="snp_name").sort_values("reason")

There are 6103 SNPs that I can't map to genome


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Affx-122856522,0,0,,A/G,,TOP,,,,rs428537703,3,21069340,Allele doesn't match to reference
Affx-122828089,0,0,,A/C,,TOP,,,,rs402974360,3,16674065,Allele doesn't match to reference
Affx-122828089,0,0,,A/C,,TOP,,,,rs402974360,3,16674065,Allele doesn't match to reference
Affx-122857399,0,0,,T/C,,BOT,,,,rs428522734,4,34062779,Allele doesn't match to reference
Affx-122846606,0,0,,T/C,,BOT,,,,rs402456023,3,138263177,Allele doesn't match to reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Affx-122828030,0,0,,T/C,,BOT,,,,rs426199307,26,11423020,Too many alignments after filtering
Affx-122854474,0,0,,T/G,,BOT,,,,rs419149720,5,76839248,Too many alignments after filtering
Affx-122818977,0,0,,A/G,,TOP,,,,rs412796743,2,238241826,Too many alignments after filtering
Affx-122856442,0,0,,A/G,,TOP,,,,rs415266032,1,225134615,Too many alignments after filtering


Well, there are a lot of SNPs I cannot match. Group them by reason:

In [12]:
not_placed.merge(errors["reason"], how="left", on="snp_name")["reason"].value_counts()

No valid alignments after filtering    6932
Allele doesn't match to reference        45
Too many alignments after filtering      44
Name: reason, dtype: int64

In [13]:
tmp = not_placed.merge(errors["reason"], how="left", on="snp_name")
tmp[tmp["reason"] == "No valid alignments after filtering"]

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Affx-122808678,0,0,,T/C,,BOT,,,,rs415806402,16,22212130,No valid alignments after filtering
Affx-122808678,0,0,,T/C,,BOT,,,,rs415806402,16,22212130,No valid alignments after filtering
Affx-122826280,0,0,,T/C,,BOT,,,,rs421131731,1,159615082,No valid alignments after filtering
Affx-122808678,0,0,,T/C,,BOT,,,,rs415806402,16,22212130,No valid alignments after filtering
Affx-122808678,0,0,,T/C,,BOT,,,,rs415806402,16,22212130,No valid alignments after filtering
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Affx-122805190,0,0,,T/G,,BOT,,,,rs417635596,1,148823820,No valid alignments after filtering
Affx-122805160,0,0,,A/G,,TOP,,,,rs405555407,2,170857298,No valid alignments after filtering
Affx-122805160,0,0,,A/G,,TOP,,,,rs405555407,2,170857298,No valid alignments after filtering
Affx-122805107,0,0,,T/C,,BOT,,,,rs409682103,6,96607286,No valid alignments after filtering


Are then any *SNP* which I map to a different position than ncbi??

In [14]:
different = differences.query("chrom != '0' and ncbi_chrom != '0'")
print(f"There are {different.shape[0]} SNPs that I can map to a different position")
different.merge(errors["reason"], how="left", on="snp_name").sort_values("reason")

There are 27 SNPs that I can map to a different position


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Affx-122852870,4,103901630,C/T,T/C,T/C,BOT,forward,T,C,rs412596792,4,103901753,
Affx-122848400,8,38497065,C/T,T/C,T/C,BOT,forward,T,C,"rs417617472,rs421866224",8,38497064,
Affx-122846978,3,19932892,A/G,T/C,A/G,BOT,reverse,G,A,rs413817657,3,19932658,
Affx-122845290,1,15134526,A/G,A/G,A/G,TOP,forward,A,G,rs424329901,1,15134525,
Affx-122844107,4,5551297,A/G,A/G,A/G,TOP,forward,A,G,rs417507408,4,5551154,
Affx-122839909,18,1045231,C/G,G/C,C/G,BOT,reverse,C,G,rs418781790,18,1050724,
Affx-122835219,2,131930488,C/T,T/C,T/C,BOT,forward,C,T,"rs413523273,rs417777911",2,131930487,
Affx-122834085,13,23871319,C/T,T/C,T/C,BOT,forward,T,C,rs417731167,13,23871320,
Affx-122834071,11,52254468,G/T,T/G,T/G,BOT,forward,G,T,"rs405212442,rs604907159",11,52254467,
Affx-122832972,19,28786029,A/G,A/G,A/G,TOP,forward,A,G,rs411941882,19,28786157,
