# Goat chip aligned
Using megablast, I've aligned the goat chip probe set to the oldest `CHIR1.0` assembly. I want to compare my results with the positions I get from dbSNP.

In [1]:
from collections import defaultdict

from src.features.smarterdb import global_connection, VariantGoat
from src.data.common import AssemblyConf

import pandas as pd

In [2]:
# Set up the connection object returned by the project's smarterdb helper;
# presumably required before VariantGoat can be queried -- TODO confirm.
conn = global_connection()
# Assembly descriptor (version + import source) reused below to filter variants
# and to fetch their dbSNP152 locations.
# NOTE(review): the version is spelled "CHI1.0" here while the prose and the
# file names use "CHIR1.0"/"CHIR_1.0"; the query below does return matches, so
# this appears to be the label stored in the database -- confirm.
dbSNP152 = AssemblyConf(version="CHI1.0", imported_from="dbSNP152")

First, load my alignment results and set `snp_name` as the index:

In [3]:
# Placements produced by my megablast alignment, keyed by SNP name.
blast_results_csv = "Goat_IGGC_65K_v2_15069617X365016_A2.csv-GCA_000317765.1_CHIR_1.0_genomic.fna.blastn.csv"
results = pd.read_csv(blast_results_csv).set_index("snp_name")
results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1_101941444_AF-PAKI,1,100567497,C/T,T/C,T/C,BOT,forward,C,T
1_10408764_AF-PAKI,1,11183359,C/T,T/C,T/C,BOT,forward,T,C
1_104453302_AF-PAKI,0,0,,A/G,,TOP,,,
1_107080965_AF-PAKI,0,0,,A/G,,TOP,,,
1_109839943_AF-PAKI,1,108210531,C/T,T/C,T/C,BOT,forward,T,C


Next, load the errors: if I can't place a SNP on a chromosome, it has no position in the results table, and the reason is recorded in this table:

In [4]:
# Per-SNP failure reasons written by the alignment step, keyed by SNP name.
blast_errors_csv = "Goat_IGGC_65K_v2_15069617X365016_A2.csv-GCA_000317765.1_CHIR_1.0_genomic.fna.blastn.err"
errors = pd.read_csv(blast_errors_csv).set_index("snp_name")
errors.head()

Unnamed: 0_level_0,illumina,illumina_strand,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1_104453302_AF-PAKI,A/G,TOP,No valid alignments after filtering
1_107080965_AF-PAKI,A/G,TOP,Too many alignments after filtering
1_117945786_AF-PAKI,T/C,BOT,No valid alignments after filtering
1_137034442_AF-PAKI,T/C,BOT,No valid alignments after filtering
1_3368511_RH-map,T/G,BOT,No valid alignments after filtering


Ok, now get my Goat variants and focus on *NCBI* data: I could have more variants than *NCBI* if there are probes more recent than dbSNP152:

In [5]:
# Keep only the variants that carry a location matching the dbSNP152 descriptor.
assembly_filter = dbSNP152._asdict()
ncbi_variants = VariantGoat.objects.filter(locations__match=assembly_filter)
ncbi_variants.count()

53347

Ok now extract dbSNP locations from my `ncbi_variants`:

In [6]:
# Build one record per variant with its dbSNP152 location, then create the
# DataFrame in a single step. Declaring the columns explicitly keeps
# `set_index("snp_name")` working even when the queryset is empty (the previous
# defaultdict-based version produced a column-less frame and raised KeyError).
location_key = dbSNP152._asdict()
location_columns = ["snp_name", "rs_id", "ncbi_chrom", "ncbi_position"]

location_records = []

for variant in ncbi_variants:
    location = variant.get_location(**location_key)
    location_records.append(
        (
            variant.name,
            # a variant may list several rs identifiers: flatten them in one cell
            ",".join(variant.rs_id),
            location.chrom,
            location.position,
        )
    )

ncbi_locations = (
    pd.DataFrame(location_records, columns=location_columns)
    .set_index("snp_name")
)
ncbi_locations.head()

Unnamed: 0_level_0,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
snp1-scaffold1-2170,rs268233143,22,27222753
snp1-scaffold708-1421224,rs268293133,14,90885671
snp10-scaffold1-352655,rs268233152,22,26872268
snp1000-scaffold1026-533890,rs268291433,8,68958341
snp10000-scaffold1356-652219,rs268242876,7,50027003


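Ok, merge these data into a new dataframe and get rid of the SNPs that are not in *NCBI*. Mind that the left merge introduces *NA* values in `ncbi_position` (which turns the column into floats), so once the missing rows are dropped it needs to be converted back to *integer*: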

In [7]:
# Attach the NCBI locations to my placements, keep only the SNPs known to NCBI,
# and restore the integer dtype that the left merge lost on `ncbi_position`
# (rows without a match get NA, which forces the column to float).
ncbi_results = (
    results
    .merge(ncbi_locations, how="left", on="snp_name")
    .loc[lambda merged: merged["ncbi_chrom"].notna()]
    .astype({"ncbi_position": "int"})
)
ncbi_results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
snp1-scaffold1-2170,22,27222753,G/T,A/C,T/G,TOP,reverse,T,G,rs268233143,22,27222753
snp1-scaffold708-1421224,14,90885671,A/G,T/C,A/G,BOT,reverse,G,A,rs268293133,14,90885671
snp10-scaffold1-352655,22,26872268,C/T,A/G,T/C,TOP,reverse,C,T,rs268233152,22,26872268
snp1000-scaffold1026-533890,8,68958341,C/T,A/G,T/C,TOP,reverse,C,T,rs268291433,8,68958341
snp10000-scaffold1356-652219,7,50027003,C/T,A/G,T/C,TOP,reverse,C,T,rs268242876,7,50027003


Ok focus on the differences between my alignment and NCBI:

In [8]:
# A SNP counts as a difference when either the chromosome or the position
# disagrees between my alignment and NCBI.
chrom_differs = ncbi_results["chrom"] != ncbi_results["ncbi_chrom"]
position_differs = ncbi_results["position"] != ncbi_results["ncbi_position"]
differences = ncbi_results[chrom_differs | position_differs]
differences.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
snp10058-scaffold1356-3300452,0,0,,T/C,,BOT,,,,rs268242932,7,47378770
snp10134-scaffold1361-15149,scaffold1361,15149,C/T,T/C,T/C,BOT,forward,C,T,rs268243007,0,0
snp10135-scaffold1361-44576,scaffold1361,44576,A/G,A/G,A/G,TOP,forward,G,A,rs268243008,0,0
snp10136-scaffold1361-91495,scaffold1361,91495,C/T,T/C,T/C,BOT,forward,T,C,rs268243009,0,0
snp10146-scaffold1362-451932,0,0,,T/C,,BOT,,,,rs268243018,23,25326839


In [9]:
# Summarise the mismatching rows: entry count, per-column non-null counts and dtypes.
differences.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1490 entries, snp10058-scaffold1356-3300452 to snp9688-scaffold1349-11048
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   chrom             1490 non-null   object
 1   position          1490 non-null   int64 
 2   alleles           1387 non-null   object
 3   illumina          1490 non-null   object
 4   illumina_forward  1387 non-null   object
 5   illumina_strand   1490 non-null   object
 6   strand            1387 non-null   object
 7   ref               1387 non-null   object
 8   alt               1387 non-null   object
 9   rs_id             1490 non-null   object
 10  ncbi_chrom        1490 non-null   object
 11  ncbi_position     1490 non-null   int64 
dtypes: int64(2), object(10)
memory usage: 151.3+ KB


I have ~1500 differences between my alignment and NCBI, and some of them are on scaffolds. Let's look at the different chromosome types:

In [10]:
# Count the mismatching SNPs per chromosome/scaffold assigned by my alignment
# ('0' is the chromosome value of the SNPs I could not place).
differences["chrom"].value_counts()

0                103
scaffold398       40
scaffold1221      37
scaffold280       29
scaffold428       25
                ... 
scaffold17692      1
scaffold8909       1
scaffold4239       1
scaffold2906       1
scaffold4300       1
Name: chrom, Length: 599, dtype: int64

Ok, tell me how many SNPs I can't place, while *NCBI* can:

In [11]:
# SNPs that my alignment left unplaced (chromosome '0'), shown together with
# the failure reason recorded in the errors table.
unplaced_mask = differences["chrom"] == "0"
not_placed = differences.loc[unplaced_mask]
print(f"There are {not_placed.shape[0]} SNPs that I can't map to genome")
not_placed_with_reason = not_placed.merge(errors["reason"], how="left", on="snp_name")
not_placed_with_reason.sort_values("reason")

There are 103 SNPs that I can't map to genome


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
snp35134-scaffold420-1473256,0,0,,A/C,,TOP,,,,rs268267259,7,62824842,Allele doesn't match to reference
snp22554-scaffold2231-222589,0,0,,A/C,,TOP,,,,rs268255006,29,21348390,Allele doesn't match to reference
snp2311-scaffold107-898972,0,0,,A/C,,TOP,,,,rs268235378,5,75925245,Allele doesn't match to reference
snp25036-scaffold258-708362,0,0,,T/C,,BOT,,,,rs268257421,21,5751513,Allele doesn't match to reference
snp25134-scaffold259-2529688,0,0,,A/G,,TOP,,,,rs268257516,10,57055305,Allele doesn't match to reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...
snp21086-scaffold2064-17345,0,0,,A/G,,TOP,,,,rs268253581,24,33218491,Too many alignments after filtering
snp20280-scaffold2008-130142,0,0,,A/G,,TOP,,,,rs268252794,24,130142,Too many alignments after filtering
snp20279-scaffold2008-65783,0,0,,T/C,,BOT,,,,rs268292025,24,65783,Too many alignments after filtering
snp25730-scaffold265-1002523,0,0,,T/C,,BOT,,,,rs268258099,13,5998258,Too many alignments after filtering


Are there any *SNPs* that I map to a different position than NCBI?

In [12]:
# SNPs placed by both my alignment and NCBI (neither side reports chromosome
# '0') whose coordinates disagree.
placed_by_me = differences["chrom"] != "0"
placed_by_ncbi = differences["ncbi_chrom"] != "0"
different = differences[placed_by_me & placed_by_ncbi]
print(f"There are {different.shape[0]} SNPs that I can map to a different position")
different_with_reason = different.merge(errors["reason"], how="left", on="snp_name")
different_with_reason.sort_values("reason")

There are 23 SNPs that I can map to a different position


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
snp11387-scaffold1412-819858,20,64931642,C/T,T/C,T/C,BOT,forward,T,C,rs268244224,9,4980459,
snp12720-scaffold1489-261736,2,75886910,A/G,A/G,A/G,TOP,forward,A,G,rs268245515,6,750543,
snp1383-scaffold1038-1302154,19,2141759,A/G,A/G,A/G,TOP,forward,G,A,rs268234480,12,82532879,
snp13939-scaffold1546-56443,scaffold1546,56443,C/T,T/C,T/C,BOT,forward,C,T,rs268246703,28,8945165,
snp13967-scaffold155-43026,scaffold155,43026,C/T,T/C,T/C,BOT,forward,T,C,rs268246731,4,45122153,
snp18748-scaffold19-612874,X,68490467,A/G,T/C,A/G,BOT,reverse,G,A,rs268251339,2,13366147,
snp18789-scaffold19-4948313,X,64155028,A/G,T/C,A/G,BOT,reverse,G,A,rs268291960,2,2811210,
snp18799-scaffold19-5810951,X,63292390,C/T,A/G,T/C,TOP,reverse,C,T,rs268251371,11,57505834,
snp18804-scaffold19-6240051,X,62863290,A/C,T/G,A/C,BOT,reverse,A,C,rs268291970,27,3981284,
snp18805-scaffold19-6334113,X,62769228,G/T,A/C,T/G,TOP,reverse,T,G,rs268291971,X,14645425,


How many SNPs can I map to a scaffold? Does NCBI map them to a different position? Yes, and they are a subset of the previous query:

In [13]:
# SNPs that I place on an unassembled scaffold, narrowed to those NCBI places
# on a chromosome (ncbi_chrom other than '0').
on_scaffold = differences["chrom"].str.startswith("scaffold")
scaffolds = differences[on_scaffold]
query = scaffolds.query("ncbi_chrom != '0'")
print(f"There are {query.shape[0]} SNPs that I can map to a scaffold, while NCBI map them in a different position")
query

There are 4 SNPs that I can map to a scaffold, while NCBI map them in a different position


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
snp13939-scaffold1546-56443,scaffold1546,56443,C/T,T/C,T/C,BOT,forward,C,T,rs268246703,28,8945165
snp13967-scaffold155-43026,scaffold155,43026,C/T,T/C,T/C,BOT,forward,T,C,rs268246731,4,45122153
snp35357-scaffold4246-28158,scaffold4246,28158,C/T,T/C,T/C,BOT,forward,T,C,rs268267476,16,71363317
snp6032-scaffold1211-25168,scaffold1211,25168,A/G,A/G,A/G,TOP,forward,A,G,rs268239009,7,28487917
