# dbSNP152 vs SNPchiMpv3
I've uploaded data from the latest dbSNP backup and I'm using it to check my alignments, so I want to see whether it agrees with the SNPchiMp data, which is the reference I use to provide genotypes with updated coordinates:

In [1]:
import os
import pymongo
import pandas as pd

from dotenv import find_dotenv, load_dotenv
from pymongo import MongoClient
from pymongoarrow.monkey import patch_all
from pymongoarrow.api import Schema

In [2]:
# load environment variables from the nearest .env file (presumably where the
# MONGODB_SMARTER_* credentials read with os.getenv are defined)
load_dotenv(find_dotenv())
# patch pymongo so that collections expose the pymongoarrow helpers
# (e.g. aggregate_pandas_all)
patch_all()

In [3]:
# connect to the local MongoDB instance; username and password are read from
# the environment (os.getenv returns None when a variable is not set)
conn = MongoClient(
    'mongodb://localhost:27017/',
    username=os.getenv("MONGODB_SMARTER_USER"),
    password=os.getenv("MONGODB_SMARTER_PASS")
)
# handle to the 'smarter' database
smarter = conn['smarter']

Now define a *MongoDB* aggregation pipeline which collects and transforms the data in the simplest way:

In [4]:
def get_pipeline(imported_from: str, version: str, columns: list[str]):
    """Build a MongoDB aggregation pipeline returning one location per SNP.

    The pipeline selects the variants having a location for the requested
    source and assembly, then flattens that location into two top-level
    fields named after ``columns``.

    NOTE(review): assumes variant documents have a ``name`` field, an
    ``rs_id`` array of strings and a ``locations`` array of sub-documents
    with ``imported_from``, ``version``, ``chrom`` and ``position`` keys;
    verify against the collection schema.

    Args:
        imported_from: value of ``locations.imported_from`` to select.
        version: value of ``locations.version`` (the assembly) to select.
        columns: output field names; ``columns[0]`` receives the chromosome
            and ``columns[1]`` the position. Extra items are ignored.

    Returns:
        A list with the five pipeline stages (``$match``, ``$project``,
        ``$unwind``, ``$set``, ``$unset``), in this order.

    Raises:
        IndexError: if ``columns`` has fewer than two items.
    """
    # stage 1: keep only the variants with at least one location coming
    # from the requested source and assembly
    select_variants = {"$match": {
        "locations": {
            "$elemMatch": {"version": version, "imported_from": imported_from}
        }
    }}

    # join the rs_id array in a single string, like ", ".join(list): the
    # separator is skipped while the accumulated value is still empty
    separator = {"$cond": [{"$eq": ["$$value", ""]}, "", ", "]}
    joined_rs_id = {"$reduce": {
        "input": "$rs_id",
        "initialValue": "",
        "in": {"$concat": ["$$value", separator, "$$this"]}
    }}

    # the $filter below is the projection-side equivalent of the $elemMatch
    # used in the $match stage
    same_source = {"$eq": ["$$location.imported_from", imported_from]}
    same_version = {"$eq": ["$$location.version", version]}
    matching_locations = {"$filter": {
        "input": "$locations",
        "as": "location",
        "cond": {"$and": [same_source, same_version]}
    }}

    # stage 2: limit the output to the fields needed
    keep_fields = {"$project": {
        "snp_name": "$name",
        "rs_id": joined_rs_id,
        "locations": matching_locations
    }}

    # stage 3: one document for each item of the filtered locations array
    one_row_per_location = {"$unwind": "$locations"}

    # stage 4: copy chromosome and position into the caller-named fields
    rename_coordinates = {"$set": {
        columns[0]: "$locations.chrom",
        columns[1]: "$locations.position"
    }}

    # stage 5: the nested location is not needed anymore
    drop_locations = {"$unset": "locations"}

    return [
        select_variants,
        keep_fields,
        one_row_per_location,
        rename_coordinates,
        drop_locations,
    ]

## Sheep
Here I execute the aggregation pipeline and set the index as I did for the other chips:

In [5]:
# collection queried for the sheep SNPs
variantSheep = smarter['variantSheep']

In [6]:
# dbSNP152 locations on the Oar_v4.0 assembly; chromosome and position are
# returned as ncbi_chrom and ncbi_position
pipeline = get_pipeline(imported_from="dbSNP152", version="Oar_v4.0", columns=["ncbi_chrom", "ncbi_position"])
# fields and types expected in the resulting table
schema = Schema({"snp_name": str, "rs_id": str, "ncbi_chrom": str, "ncbi_position": int})

# run the aggregation and load the result into a pandas DataFrame
ncbi_locations = variantSheep.aggregate_pandas_all(pipeline, schema=schema)
# index by SNP name, which is the key used later to join the two sources
ncbi_locations.set_index('snp_name', inplace=True)
ncbi_locations.head()

Unnamed: 0_level_0,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
250506CS3900065000002_1238.1,rs55630613,15,5859890
250506CS3900140500001_312.1,rs55630642,23,26243215
250506CS3900176800001_906.1,rs55630654,7,81590897
250506CS3900211600001_1041.1,rs55630658,16,41363310
250506CS3900218700001_1294.1,rs55630663,2,148834939


OK, time to collect the same data, but for SNPchiMp coordinates:

In [7]:
# SNPchiMp v.3 locations on the Oar_v4.0 assembly; chromosome and position are
# returned as snpchimp_chrom and snpchimp_position
pipeline = get_pipeline(imported_from="SNPchiMp v.3", version="Oar_v4.0", columns=["snpchimp_chrom", "snpchimp_position"])
# fields and types expected in the resulting table
schema = Schema({"snp_name": str, "rs_id": str, "snpchimp_chrom": str, "snpchimp_position": int})

# run the aggregation and load the result into a pandas DataFrame
snpchimp_locations = variantSheep.aggregate_pandas_all(pipeline, schema=schema)
# index by SNP name, which is the key used later to join the two sources
snpchimp_locations.set_index('snp_name', inplace=True)
snpchimp_locations.head()

Unnamed: 0_level_0,rs_id,snpchimp_chrom,snpchimp_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
250506CS3900065000002_1238.1,rs55630613,15,5859890
250506CS3900140500001_312.1,rs55630642,23,26243215
250506CS3900176800001_906.1,rs55630654,7,81590897
250506CS3900211600001_1041.1,rs55630658,16,41363310
250506CS3900218700001_1294.1,rs55630663,2,148834939


Now join the two tables on snp_name. An inner join will do what I need:

In [8]:
# inner join on the snp_name index: only SNPs present in both tables are kept.
# Both tables have an rs_id column, so pandas suffixes them as rs_id_x
# (SNPchiMp, left table) and rs_id_y (NCBI, right table)
merged = snpchimp_locations.merge(ncbi_locations, on='snp_name', how='inner')
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 610849 entries, 250506CS3900065000002_1238.1 to oar3_scaffold008050_3569
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   rs_id_x            610849 non-null  object
 1   snpchimp_chrom     610849 non-null  object
 2   snpchimp_position  610849 non-null  int64 
 3   rs_id_y            610849 non-null  object
 4   ncbi_chrom         610849 non-null  object
 5   ncbi_position      610849 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 32.6+ MB


In [9]:
merged.head()

Unnamed: 0_level_0,rs_id_x,snpchimp_chrom,snpchimp_position,rs_id_y,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
250506CS3900065000002_1238.1,rs55630613,15,5859890,rs55630613,15,5859890
250506CS3900140500001_312.1,rs55630642,23,26243215,rs55630642,23,26243215
250506CS3900176800001_906.1,rs55630654,7,81590897,rs55630654,7,81590897
250506CS3900211600001_1041.1,rs55630658,16,41363310,rs55630658,16,41363310
250506CS3900218700001_1294.1,rs55630663,2,148834939,rs55630663,2,148834939


OK, let's focus on the differences between SNPchiMp and NCBI:

In [10]:
# keep the SNPs where the chromosome or the position differs between sources
differences = merged.query("snpchimp_chrom != ncbi_chrom | snpchimp_position != ncbi_position")
differences.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81 entries, DU443720_334.1 to oar3_scaffold006612_5276
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   rs_id_x            81 non-null     object
 1   snpchimp_chrom     81 non-null     object
 2   snpchimp_position  81 non-null     int64 
 3   rs_id_y            81 non-null     object
 4   ncbi_chrom         81 non-null     object
 5   ncbi_position      81 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 4.4+ KB


In [11]:
differences.head()

Unnamed: 0_level_0,rs_id_x,snpchimp_chrom,snpchimp_position,rs_id_y,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DU443720_334.1,rs411374027,24,41255643,rs411374027,0,0
OAR1_116586605.1,rs405276567,1,107625664,rs405276567,0,0
OAR1_133336476.1,rs415844707,1,122909689,rs415844707,0,0
OAR1_178163941.1,rs419744835,1,164945412,rs419744835,0,0
OAR1_211799990.1,rs409415213,1,195993950,rs409415213,0,0


In [12]:
# count the discordant SNPs for each NCBI chromosome
differences["ncbi_chrom"].value_counts()

0    81
Name: ncbi_chrom, dtype: int64

So, I see 81 differences, all of which are on chromosome 0 for NCBI (maybe they have been removed from the alignments). I'm pretty confident about my dbSNP import.

## Goat
Doing the same, but for goats:

In [13]:
# collection queried for the goat SNPs
variantGoat = smarter['variantGoat']

In [14]:
# dbSNP152 locations on the CHI1.0 assembly; chromosome and position are
# returned as ncbi_chrom and ncbi_position
pipeline = get_pipeline(imported_from="dbSNP152", version="CHI1.0", columns=["ncbi_chrom", "ncbi_position"])
# fields and types expected in the resulting table
schema = Schema({"snp_name": str, "rs_id": str, "ncbi_chrom": str, "ncbi_position": int})

# run the aggregation and load the result into a pandas DataFrame
ncbi_locations = variantGoat.aggregate_pandas_all(pipeline, schema=schema)
# index by SNP name, which is the key used later to join the two sources
ncbi_locations.set_index('snp_name', inplace=True)
ncbi_locations.head()

Unnamed: 0_level_0,rs_id,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
snp1-scaffold1-2170,rs268233143,22,27222753
snp1-scaffold708-1421224,rs268293133,14,90885671
snp10-scaffold1-352655,rs268233152,22,26872268
snp1000-scaffold1026-533890,rs268291433,8,68958341
snp10000-scaffold1356-652219,rs268242876,7,50027003


OK, time to collect the same data, but for SNPchiMp coordinates:

In [15]:
# SNPchiMp v.3 locations on the CHI1.0 assembly; chromosome and position are
# returned as snpchimp_chrom and snpchimp_position
pipeline = get_pipeline(imported_from="SNPchiMp v.3", version="CHI1.0", columns=["snpchimp_chrom", "snpchimp_position"])
# fields and types expected in the resulting table
schema = Schema({"snp_name": str, "rs_id": str, "snpchimp_chrom": str, "snpchimp_position": int})

# run the aggregation and load the result into a pandas DataFrame
snpchimp_locations = variantGoat.aggregate_pandas_all(pipeline, schema=schema)
# index by SNP name, which is the key used later to join the two sources
snpchimp_locations.set_index('snp_name', inplace=True)
snpchimp_locations.head()

Unnamed: 0_level_0,rs_id,snpchimp_chrom,snpchimp_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
snp1-scaffold1-2170,rs268233143,22,27222753
snp1-scaffold708-1421224,rs268293133,14,90885671
snp10-scaffold1-352655,rs268233152,22,26872268
snp1000-scaffold1026-533890,rs268291433,8,68958341
snp10000-scaffold1356-652219,rs268242876,7,50027003


Now join the two tables on snp_name. An inner join will do what I need:

In [16]:
# inner join on the snp_name index: only SNPs present in both tables are kept.
# Both tables have an rs_id column, so pandas suffixes them as rs_id_x
# (SNPchiMp, left table) and rs_id_y (NCBI, right table)
merged = snpchimp_locations.merge(ncbi_locations, on='snp_name', how='inner')
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53347 entries, snp1-scaffold1-2170 to snp9999-scaffold1356-622362
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   rs_id_x            53347 non-null  object
 1   snpchimp_chrom     53347 non-null  object
 2   snpchimp_position  53347 non-null  int64 
 3   rs_id_y            53347 non-null  object
 4   ncbi_chrom         53347 non-null  object
 5   ncbi_position      53347 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 2.8+ MB


In [17]:
merged.head()

Unnamed: 0_level_0,rs_id_x,snpchimp_chrom,snpchimp_position,rs_id_y,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
snp1-scaffold1-2170,rs268233143,22,27222753,rs268233143,22,27222753
snp1-scaffold708-1421224,rs268293133,14,90885671,rs268293133,14,90885671
snp10-scaffold1-352655,rs268233152,22,26872268,rs268233152,22,26872268
snp1000-scaffold1026-533890,rs268291433,8,68958341,rs268291433,8,68958341
snp10000-scaffold1356-652219,rs268242876,7,50027003,rs268242876,7,50027003


OK, let's focus on the differences between SNPchiMp and NCBI:

In [18]:
# keep the SNPs where the chromosome or the position differs between sources
differences = merged.query("snpchimp_chrom != ncbi_chrom | snpchimp_position != ncbi_position")
differences.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141 entries, snp10499-scaffold1375-13515 to snp8119-scaffold1297-229804
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   rs_id_x            141 non-null    object
 1   snpchimp_chrom     141 non-null    object
 2   snpchimp_position  141 non-null    int64 
 3   rs_id_y            141 non-null    object
 4   ncbi_chrom         141 non-null    object
 5   ncbi_position      141 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 7.7+ KB


In [19]:
differences.head()

Unnamed: 0_level_0,rs_id_x,snpchimp_chrom,snpchimp_position,rs_id_y,ncbi_chrom,ncbi_position
snp_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
snp10499-scaffold1375-13515,rs268243362,4,97644567,rs268243362,0,0
snp11387-scaffold1412-819858,rs268244224,20,64931642,rs268244224,9,4980459
snp11795-scaffold1437-634114,rs268244618,1,27102919,rs268244618,0,0
snp12056-scaffold1445-162482,rs268244874,X,85504653,rs268244874,0,0
snp12121-scaffold1448-847838,rs268244930,17,61881296,rs268244930,0,0


In [20]:
# count the discordant SNPs for each NCBI chromosome
differences["ncbi_chrom"].value_counts()

0     111
6       3
2       3
20      2
18      2
7       2
X       2
12      2
9       2
1       2
15      1
27      1
11      1
5       1
13      1
23      1
28      1
3       1
16      1
4       1
Name: ncbi_chrom, dtype: int64

Well, for goats there are a few more differences (141 out of 53347 SNPs, 111 of which are on chromosome 0 for NCBI); however, my SNPchiMp data is not so different from the NCBI data.