# Goat chip aligned
## CHIR1.0
I've aligned the goat chip probe set to the oldest `CHIR1.0` assembly with megablast. I want to compare my results with the results I get from dbSNP.

In [1]:
import os
import pymongo
import pandas as pd

from dotenv import find_dotenv, load_dotenv
from pymongo import MongoClient
from pymongoarrow.monkey import patch_all
from pymongoarrow.api import Schema

In [2]:
# load the environment variables from the .env file located by find_dotenv()
# (the MongoDB credentials are read from the environment further down)
load_dotenv(find_dotenv())
# apply the pymongoarrow monkey patch (`aggregate_pandas_all` is called on the collection later on)
patch_all()

First, load my aligned data and set `snp_name` as the index:

In [3]:
# my CHIR1.0 alignment results, indexed by SNP name
results = (
    pd.read_csv("Goat_IGGC_65K_v2_15069617X365016_A2.csv-GCA_000317765.1_CHIR_1.0_genomic.fna.blastn.csv")
    .set_index("snp_name")
)
results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1_101941444_AF-PAKI,1,100567497,C/T,T/C,T/C,BOT,forward,C,T
1_10408764_AF-PAKI,1,11183359,C/T,T/C,T/C,BOT,forward,T,C
1_104453302_AF-PAKI,0,0,,A/G,,TOP,,,
1_107080965_AF-PAKI,0,0,,A/G,,TOP,,,
1_109839943_AF-PAKI,1,108210531,C/T,T/C,T/C,BOT,forward,T,C


Next load errors: If I can't place a SNP on a chromosome, I will have no position in results table and I will have a reason in this table:

In [4]:
# reasons why a SNP could not be placed on CHIR1.0, indexed by SNP name
errors = (
    pd.read_csv("Goat_IGGC_65K_v2_15069617X365016_A2.csv-GCA_000317765.1_CHIR_1.0_genomic.fna.blastn.err")
    .set_index("snp_name")
)
errors.head()

Unnamed: 0_level_0,illumina,illumina_strand,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1_104453302_AF-PAKI,A/G,TOP,No valid alignments after filtering
1_107080965_AF-PAKI,A/G,TOP,Too many alignments after filtering
1_117945786_AF-PAKI,T/C,BOT,No valid alignments after filtering
1_137034442_AF-PAKI,T/C,BOT,No valid alignments after filtering
1_3368511_RH-map,T/G,BOT,No valid alignments after filtering


Ok, now get my Goat variants and focus on *NCBI* data: I could have more variants than *NCBI* if there are probes more recent than dbSNP152. Using `pymongoarrow` to collect data, first connect to database and get a collection:

In [5]:
# connect to the local MongoDB instance; credentials are taken from the
# environment (os.getenv returns None when a variable is not set)
conn = MongoClient(
    'mongodb://localhost:27017/',
    username=os.getenv("MONGODB_SMARTER_USER"),
    password=os.getenv("MONGODB_SMARTER_PASS")
)
# select the 'smarter' database and its 'variantGoat' collection
smarter = conn['smarter']
variantGoat = smarter['variantGoat']

Now define a *MongoDB* pipeline which collects and transforms data in the simplest way:

In [6]:
def get_pipeline(imported_from: str, version: str, columns: list[str]):
    """Build the MongoDB aggregation pipeline that flattens variant locations.

    The pipeline selects the variants having a location for the requested
    source and assembly, then emits one document per matching location with
    the SNP name, the rs identifiers joined into a single string, and the
    chromosome/position stored under the requested field names.

    Args:
        imported_from: value the ``imported_from`` field of a location must have.
        version: value the ``version`` field of a location must have.
        columns: output field names; ``columns[0]`` receives the chromosome
            and ``columns[1]`` receives the position.

    Returns:
        The list of aggregation stages, in execution order.
    """
    # match the SNPs having at least one location for this assembly and source
    wanted_location = {"version": version, "imported_from": imported_from}
    match_stage = {"$match": {"locations": {"$elemMatch": wanted_location}}}

    # join the list of rs_id like ", ".join(list): the separator is added
    # only when something has already been accumulated
    separator = {"$cond": [{"$eq": ["$$value", ""]}, "", ", "]}
    joined_rs_id = {
        "$reduce": {
            "input": "$rs_id",
            "initialValue": "",
            "in": {"$concat": ["$$value", separator, "$$this"]},
        }
    }

    # $filter is how to do an $elemMatch in a projection step of a pipeline
    matching_locations = {
        "$filter": {
            "input": "$locations",
            "as": "location",
            "cond": {
                "$and": [
                    {"$eq": ["$$location.imported_from", imported_from]},
                    {"$eq": ["$$location.version", version]},
                ]
            },
        }
    }

    # limit the output to the fields I need
    project_stage = {
        "$project": {
            "snp_name": "$name",
            "rs_id": joined_rs_id,
            "locations": matching_locations,
        }
    }

    # get one document for each item of the filtered locations array
    unwind_stage = {"$unwind": "$locations"}

    # copy chromosome and position into the requested output fields
    chrom_column, position_column = columns[0], columns[1]
    set_stage = {
        "$set": {
            chrom_column: "$locations.chrom",
            position_column: "$locations.position",
        }
    }

    # the nested location document is no longer needed
    unset_stage = {"$unset": "locations"}

    return [match_stage, project_stage, unwind_stage, set_stage, unset_stage]

Ok, now get my Goat variants and focus on *NCBI* data: I could have more variants than *NCBI* if there are probes more recent than dbSNP152:

In [7]:
# NCBI placements: dbSNP152 locations stored for the CHI1.0 assembly
schema = Schema({"snp_name": str, "rs_id": str, "ncbi_chrom": str, "ncbi_position": int})
pipeline = get_pipeline(
    imported_from="dbSNP152",
    version="CHI1.0",
    columns=["ncbi_chrom", "ncbi_position"],
)

# run the aggregation straight into a dataframe indexed by SNP name
ncbi_locations = (
    variantGoat
    .aggregate_pandas_all(pipeline, schema=schema)
    .set_index('snp_name')
)
ncbi_locations.head()

Unnamed: 0_level_0,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
snp1-scaffold1-2170,rs268233143,22,27222753
snp1-scaffold708-1421224,rs268293133,14,90885671
snp10-scaffold1-352655,rs268233152,22,26872268
snp1000-scaffold1026-533890,rs268291433,8,68958341
snp10000-scaffold1356-652219,rs268242876,7,50027003


Ok, merge these data in a new dataframe. Get rid of the SNPs not in *NCBI*; mind that the left merge fills `ncbi_position` with *NA* for those SNPs (which turns the column into float), so it needs to be converted back to *integer* once they are dropped:

In [8]:
# left merge: every aligned SNP is kept, the ones unknown to NCBI get NA in the ncbi_* columns
tmp = results.merge(ncbi_locations, how="left", on="snp_name")
# keep only the SNPs found in NCBI, then restore the integer dtype of the position
ncbi_results = tmp.loc[tmp['ncbi_chrom'].notna()].astype({'ncbi_position': 'int'})
ncbi_results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
snp1-scaffold1-2170,22,27222753,G/T,A/C,T/G,TOP,reverse,T,G,rs268233143,22,27222753
snp1-scaffold708-1421224,14,90885671,A/G,T/C,A/G,BOT,reverse,G,A,rs268293133,14,90885671
snp10-scaffold1-352655,22,26872268,C/T,A/G,T/C,TOP,reverse,C,T,rs268233152,22,26872268
snp1000-scaffold1026-533890,8,68958341,C/T,A/G,T/C,TOP,reverse,C,T,rs268291433,8,68958341
snp10000-scaffold1356-652219,7,50027003,C/T,A/G,T/C,TOP,reverse,C,T,rs268242876,7,50027003


Ok focus on the differences between my alignment and NCBI:

In [9]:
# keep the SNPs where the chromosome OR the position differs between my alignment and NCBI
differences = ncbi_results.query("chrom != ncbi_chrom | position != ncbi_position")
differences.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
snp10058-scaffold1356-3300452,0,0,,T/C,,BOT,,,,rs268242932,7,47378770
snp10134-scaffold1361-15149,scaffold1361,15149,C/T,T/C,T/C,BOT,forward,C,T,rs268243007,0,0
snp10135-scaffold1361-44576,scaffold1361,44576,A/G,A/G,A/G,TOP,forward,G,A,rs268243008,0,0
snp10136-scaffold1361-91495,scaffold1361,91495,C/T,T/C,T/C,BOT,forward,T,C,rs268243009,0,0
snp10146-scaffold1362-451932,0,0,,T/C,,BOT,,,,rs268243018,23,25326839


In [10]:
# number of mismatching SNPs, with dtype and non-null count of each column
differences.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1490 entries, snp10058-scaffold1356-3300452 to snp9688-scaffold1349-11048
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   chrom             1490 non-null   object
 1   position          1490 non-null   int64 
 2   alleles           1387 non-null   object
 3   illumina          1490 non-null   object
 4   illumina_forward  1387 non-null   object
 5   illumina_strand   1490 non-null   object
 6   strand            1387 non-null   object
 7   ref               1387 non-null   object
 8   alt               1387 non-null   object
 9   rs_id             1490 non-null   object
 10  ncbi_chrom        1490 non-null   object
 11  ncbi_position     1490 non-null   int64 
dtypes: int64(2), object(10)
memory usage: 151.3+ KB


I have ~1500 differences with NCBI; some of them are on scaffolds. Let's focus on the different chromosome types:

In [11]:
# count the mismatching SNPs by the chromosome assigned by my alignment
differences["chrom"].value_counts()

0                103
scaffold398       40
scaffold1221      37
scaffold280       29
scaffold428       25
                ... 
scaffold3484       1
scaffold1548       1
scaffold12878      1
scaffold4399       1
scaffold2955       1
Name: chrom, Length: 599, dtype: int64

Ok, tell me how many SNPs I can't place, while *NCBI* can:

In [12]:
# chromosome '0' marks the SNPs my alignment could not place
not_placed = differences.loc[differences["chrom"].eq('0')]
print(f"There are {not_placed.shape[0]} SNPs that I can't map to genome")
# attach the failure reason from the errors table and sort so equal reasons are adjacent
(
    not_placed
    .merge(errors["reason"], how="left", on="snp_name")
    .sort_values("reason")
)

There are 103 SNPs that I can't map to genome


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
snp35134-scaffold420-1473256,0,0,,A/C,,TOP,,,,rs268267259,7,62824842,Allele doesn't match to reference
snp22554-scaffold2231-222589,0,0,,A/C,,TOP,,,,rs268255006,29,21348390,Allele doesn't match to reference
snp2311-scaffold107-898972,0,0,,A/C,,TOP,,,,rs268235378,5,75925245,Allele doesn't match to reference
snp25036-scaffold258-708362,0,0,,T/C,,BOT,,,,rs268257421,21,5751513,Allele doesn't match to reference
snp25134-scaffold259-2529688,0,0,,A/G,,TOP,,,,rs268257516,10,57055305,Allele doesn't match to reference
...,...,...,...,...,...,...,...,...,...,...,...,...,...
snp21086-scaffold2064-17345,0,0,,A/G,,TOP,,,,rs268253581,24,33218491,Too many alignments after filtering
snp20280-scaffold2008-130142,0,0,,A/G,,TOP,,,,rs268252794,24,130142,Too many alignments after filtering
snp20279-scaffold2008-65783,0,0,,T/C,,BOT,,,,rs268292025,24,65783,Too many alignments after filtering
snp25730-scaffold265-1002523,0,0,,T/C,,BOT,,,,rs268258099,13,5998258,Too many alignments after filtering


Here are the SNPs I cannot match. Group them by reason:

In [13]:
# attach the failure reason to each unplaced SNP and count the SNPs per reason
not_placed.merge(errors["reason"], how="left", on="snp_name")["reason"].value_counts()

Too many alignments after filtering    61
Allele doesn't match to reference      27
No valid alignments after filtering    15
Name: reason, dtype: int64

Are there any *SNPs* which I map to a different position than NCBI?

In [14]:
# among the differences, keep the SNPs that have a chromosome other than '0'
# on both sides (in my results '0' is the chromosome of the SNPs I could not place)
different = differences.query("chrom != '0' and ncbi_chrom != '0'")
print(f"There are {different.shape[0]} SNPs that I can map to a different position")
different

There are 23 SNPs that I can map to a different position


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
snp11387-scaffold1412-819858,20,64931642,C/T,T/C,T/C,BOT,forward,T,C,rs268244224,9,4980459
snp12720-scaffold1489-261736,2,75886910,A/G,A/G,A/G,TOP,forward,A,G,rs268245515,6,750543
snp1383-scaffold1038-1302154,19,2141759,A/G,A/G,A/G,TOP,forward,G,A,rs268234480,12,82532879
snp13939-scaffold1546-56443,scaffold1546,56443,C/T,T/C,T/C,BOT,forward,C,T,rs268246703,28,8945165
snp13967-scaffold155-43026,scaffold155,43026,C/T,T/C,T/C,BOT,forward,T,C,rs268246731,4,45122153
snp18748-scaffold19-612874,X,68490467,A/G,T/C,A/G,BOT,reverse,G,A,rs268251339,2,13366147
snp18789-scaffold19-4948313,X,64155028,A/G,T/C,A/G,BOT,reverse,G,A,rs268291960,2,2811210
snp18799-scaffold19-5810951,X,63292390,C/T,A/G,T/C,TOP,reverse,C,T,rs268251371,11,57505834
snp18804-scaffold19-6240051,X,62863290,A/C,T/G,A/C,BOT,reverse,A,C,rs268291970,27,3981284
snp18805-scaffold19-6334113,X,62769228,G/T,A/C,T/G,TOP,reverse,T,G,rs268291971,X,14645425


How many SNPs can I map to a scaffold? Does NCBI map them to a different position? Yes, and they are a subset of the previous query:

In [15]:
# SNPs that my alignment places on a sequence whose name starts with 'scaffold'
scaffolds = differences.loc[differences["chrom"].str.startswith('scaffold')]
# among them, keep those for which NCBI reports a chromosome other than '0'
query = scaffolds.loc[scaffolds["ncbi_chrom"].ne('0')]
print(f"There are {query.shape[0]} SNPs that I can map to a scaffold, while NCBI map them in a different position")
query

There are 4 SNPs that I can map to a scaffold, while NCBI map them in a different position


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
snp13939-scaffold1546-56443,scaffold1546,56443,C/T,T/C,T/C,BOT,forward,C,T,rs268246703,28,8945165
snp13967-scaffold155-43026,scaffold155,43026,C/T,T/C,T/C,BOT,forward,T,C,rs268246731,4,45122153
snp35357-scaffold4246-28158,scaffold4246,28158,C/T,T/C,T/C,BOT,forward,T,C,rs268267476,16,71363317
snp6032-scaffold1211-25168,scaffold1211,25168,A/G,A/G,A/G,TOP,forward,A,G,rs268239009,7,28487917


## ARS1.2
I've done the alignments for the ARS1.2 assembly. Let's compare those data with the data used for the SMARTER database. First, load my aligned data and set `snp_name` as the index:

In [16]:
# my ARS1.2 alignment results, indexed by SNP name
results = (
    pd.read_csv("Goat_IGGC_65K_v2_15069617X365016_A2.csv-GCA_001704415.2_ARS1.2_genomic.fna.blastn.csv")
    .set_index("snp_name")
)
results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1_101941444_AF-PAKI,1,101941444,C/T,T/C,T/C,BOT,forward,C,T
1_10408764_AF-PAKI,1,10408764,C/T,T/C,T/C,BOT,forward,C,T
1_104453302_AF-PAKI,1,104453302,A/G,A/G,A/G,TOP,forward,G,A
1_107080965_AF-PAKI,1,107080965,A/G,A/G,A/G,TOP,forward,A,G
1_109839943_AF-PAKI,1,109839943,C/T,T/C,T/C,BOT,forward,C,T


Next load errors: If I can't place a SNP on a chromosome, I will have no position in results table and I will have a reason in this table:

In [17]:
# reasons why a SNP could not be placed on ARS1.2, indexed by SNP name
errors = (
    pd.read_csv("Goat_IGGC_65K_v2_15069617X365016_A2.csv-GCA_001704415.2_ARS1.2_genomic.fna.blastn.err")
    .set_index("snp_name")
)
errors.head()

Unnamed: 0_level_0,illumina,illumina_strand,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1_3570452_RH-map,T/G,BOT,Too many alignments after filtering
1_63835619_AF-PAKI,A/C,TOP,Too many alignments after filtering
10_26570_AF-PAKI,A/G,TOP,Too many alignments after filtering
10_4536087_RH-map,A/G,TOP,Too many alignments after filtering
10_4730976_RH-map,A/C,TOP,Too many alignments after filtering


Ok, now get my Goat variants and focus on *SMARTER* data:

In [18]:
# SMARTER placements: locations imported from the manifest for the ARS1 assembly
schema = Schema({"snp_name": str, "smarter_chrom": str, "smarter_position": int})
pipeline = get_pipeline(
    imported_from="manifest",
    version="ARS1",
    columns=["smarter_chrom", "smarter_position"],
)

# run the aggregation straight into a dataframe indexed by SNP name
smarter_locations = (
    variantGoat
    .aggregate_pandas_all(pipeline, schema=schema)
    .set_index('snp_name')
)
smarter_locations.head()

Unnamed: 0_level_0,smarter_chrom,smarter_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1
1_101941444_AF-PAKI,1,101941444
1_10408764_AF-PAKI,1,10408764
1_104453302_AF-PAKI,1,104453302
1_107080965_AF-PAKI,1,107080965
1_109839943_AF-PAKI,1,109839943


Ok, merge these data in a new dataframe:

In [19]:
# left merge on the SNP name: every SNP of my alignment is kept, with the
# SMARTER chromosome and position added when the SNP is found in smarter_locations
smarter_results = results.merge(smarter_locations, how="left", on="snp_name")
smarter_results.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,smarter_chrom,smarter_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1_101941444_AF-PAKI,1,101941444,C/T,T/C,T/C,BOT,forward,C,T,1,101941444
1_10408764_AF-PAKI,1,10408764,C/T,T/C,T/C,BOT,forward,C,T,1,10408764
1_104453302_AF-PAKI,1,104453302,A/G,A/G,A/G,TOP,forward,G,A,1,104453302
1_107080965_AF-PAKI,1,107080965,A/G,A/G,A/G,TOP,forward,A,G,1,107080965
1_109839943_AF-PAKI,1,109839943,C/T,T/C,T/C,BOT,forward,C,T,1,109839943


Ok, focus on the differences between my alignment and SMARTER:

In [20]:
# keep the SNPs where the chromosome OR the position differs between my alignment and SMARTER
differences = smarter_results.query("chrom != smarter_chrom | position != smarter_position")
differences.head()

Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,smarter_chrom,smarter_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1_3570452_RH-map,0,0,,T/G,,BOT,,,,,0
1_63835619_AF-PAKI,0,0,,A/C,,TOP,,,,1.0,63835619
10_26570_AF-PAKI,0,0,,A/G,,TOP,,,,10.0,26570
10_4536087_RH-map,0,0,,A/G,,TOP,,,,10.0,4536087
10_4730976_RH-map,0,0,,A/C,,TOP,,,,10.0,4730976


In [21]:
# number of mismatching SNPs, with dtype and non-null count of each column
differences.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5555 entries, 1_3570452_RH-map to snp9857-scaffold1352-576383
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   chrom             5555 non-null   object
 1   position          5555 non-null   int64 
 2   alleles           2926 non-null   object
 3   illumina          5555 non-null   object
 4   illumina_forward  2926 non-null   object
 5   illumina_strand   5555 non-null   object
 6   strand            2926 non-null   object
 7   ref               2926 non-null   object
 8   alt               2926 non-null   object
 9   smarter_chrom     5555 non-null   object
 10  smarter_position  5555 non-null   int64 
dtypes: int64(2), object(9)
memory usage: 520.8+ KB


I have ~5500 differences with SMARTER; some of them are on scaffolds. Let's focus on the different chromosome types:

In [22]:
# count the mismatching SNPs by the chromosome assigned by my alignment
differences["chrom"].value_counts()

0                 2629
LWLT01000021.1    1090
LWLT01000027.1     843
7                   56
1                   55
                  ... 
scaffold_245         1
unplaced_18835       1
scaffold_51          1
scaffold_93          1
unplaced_7           1
Name: chrom, Length: 284, dtype: int64

Ok, tell me how many SNPs I can't place:

In [23]:
# chromosome '0' marks the SNPs my alignment could not place
not_placed = differences.loc[differences["chrom"].eq('0')]
print(f"There are {not_placed.shape[0]} SNPs that I can't map to genome")
# attach the failure reason from the errors table and sort so equal reasons are adjacent
(
    not_placed
    .merge(errors["reason"], how="left", on="snp_name")
    .sort_values("reason")
)

There are 2629 SNPs that I can't map to genome


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,smarter_chrom,smarter_position,reason
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
snp42591-scaffold566-479327,0,0,,T/C,,BOT,,,,14,18627872,Allele doesn't match to reference
QTLSaanen19.178,0,0,,A/G,,TOP,,,,19,28920263,Allele doesn't match to reference
QTLSaanen19.177_O,0,0,,T/C,,BOT,,,,19,28918313,Allele doesn't match to reference
QTLSaanen19.177,0,0,,T/C,,BOT,,,,19,28918313,Allele doesn't match to reference
QTLSaanen19.176_O,0,0,,C/G,,TOP,,,,19,28910602,Allele doesn't match to reference
...,...,...,...,...,...,...,...,...,...,...,...,...
snp30500-scaffold3364-56517,0,0,,A/G,,TOP,,,,0.NW_scaffold,23574,Too many alignments after filtering
snp3041-scaffold1092-9565,0,0,,A/G,,TOP,,,,18,61489143,Too many alignments after filtering
snp30303-scaffold333-3762473,0,0,,T/C,,BOT,,,,27,34812829,Too many alignments after filtering
snp30181-scaffold332-490100,0,0,,A/G,,TOP,,,,1,100639919,Too many alignments after filtering


Well, there are a lot of SNPs I cannot match. Group them by reason:

In [24]:
# attach the failure reason to each unplaced SNP and count the SNPs per reason
not_placed.merge(errors["reason"], how="left", on="snp_name")["reason"].value_counts()

Too many alignments after filtering       1182
No valid alignments after filtering        959
Allele doesn't match to reference          478
Cannot determine a unique SNP position      10
Name: reason, dtype: int64

Are there any *SNPs* which I map to a different position?

In [25]:
# among the differences, keep the SNPs that have a chromosome other than '0'
# on both sides (in my results '0' is the chromosome of the SNPs I could not place)
different = differences.query("chrom != '0' and smarter_chrom != '0'")
print(f"There are {different.shape[0]} SNPs that I can map to a different position")
different

There are 2926 SNPs that I can map to a different position


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,smarter_chrom,smarter_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
chr-Y-toconfirm-1,scaffold_43,11760,G/T,A/C,T/G,TOP,reverse,T,G,Y,11760
chr-Y-toconfirm-1_O,scaffold_43,11760,G/T,A/C,T/G,TOP,reverse,T,G,Y,11760
chr-Y-toconfirm-10,scaffold_14,280433,A/G,A/G,A/G,TOP,forward,G,A,Y,280433
chr-Y-toconfirm-10_O,scaffold_14,280433,A/G,A/G,A/G,TOP,forward,G,A,Y,280433
chr-Y-toconfirm-11,scaffold_25,43279,A/C,A/C,A/C,TOP,forward,A,C,Y,43279
...,...,...,...,...,...,...,...,...,...,...,...
snp9349-scaffold1340-278609,LWLT01000027.1,45533783,C/T,A/G,T/C,TOP,reverse,T,C,X,45533783
snp9350-scaffold1340-311407,LWLT01000027.1,45500958,A/C,T/G,A/C,BOT,reverse,C,A,X,45500958
snp9351-scaffold1340-356267,LWLT01000027.1,45457985,C/T,A/G,T/C,TOP,reverse,C,T,X,45457985
snp9687-scaffold1348-7500,unplaced_188,21157,G/T,A/C,T/G,TOP,reverse,T,G,0.NW_scaffold,21155


How many SNPs can I map to a scaffold, while SMARTER maps them to a different position? They are a subset of the previous query:

In [26]:
# SNPs that my alignment places on a sequence whose name starts with 'scaffold'
scaffolds = differences.loc[differences["chrom"].str.startswith('scaffold')]
# among them, keep those for which SMARTER reports a chromosome other than '0'
query = scaffolds.loc[scaffolds["smarter_chrom"].ne('0')]
print(f"There are {query.shape[0]} SNPs that I can map to a scaffold, while SMARTER map them in a different position")
query

There are 606 SNPs that I can map to a scaffold, while SMARTER map them in a different position


Unnamed: 0_level_0,chrom,position,alleles,illumina,illumina_forward,illumina_strand,strand,ref,alt,smarter_chrom,smarter_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
chr-Y-toconfirm-1,scaffold_43,11760,G/T,A/C,T/G,TOP,reverse,T,G,Y,11760
chr-Y-toconfirm-1_O,scaffold_43,11760,G/T,A/C,T/G,TOP,reverse,T,G,Y,11760
chr-Y-toconfirm-10,scaffold_14,280433,A/G,A/G,A/G,TOP,forward,G,A,Y,280433
chr-Y-toconfirm-10_O,scaffold_14,280433,A/G,A/G,A/G,TOP,forward,G,A,Y,280433
chr-Y-toconfirm-11,scaffold_25,43279,A/C,A/C,A/C,TOP,forward,A,C,Y,43279
...,...,...,...,...,...,...,...,...,...,...,...
snp8716-scaffold1310-23370,scaffold_37,37877,A/G,A/G,A/G,TOP,forward,A,G,0.NW_scaffold,37877
snp8812-scaffold1316-38259,scaffold_31,160485,A/G,T/C,A/G,BOT,reverse,G,A,0.NW_scaffold,160485
snp8813-scaffold1316-69235,scaffold_31,129542,A/G,T/C,A/G,BOT,reverse,G,A,0.NW_scaffold,129542
snp8814-scaffold1316-122968,scaffold_31,75704,G/T,A/C,T/G,TOP,reverse,G,T,0.NW_scaffold,75704
