# Check Sheep coordinates
## Check sheep coordinates with data provided by Sheep genome project
I received a series of `.csv` files with variant coordinates in the major assembly versions of the *Sheep* and *Goat* genomes. I will compare those coordinates with the ones I already have to find potential mismatches.
### Working on OAR3

Set up the imports, the input file paths and the database connection before checking coordinates

In [1]:
import csv
import json
import pprint
import random
import pandas

from collections import namedtuple, defaultdict
from ensemblrest import EnsemblRest

from src.data.common import WORKING_ASSEMBLIES
from src.features.utils import get_project_dir, text_or_gzip_open
from src.features.smarterdb import global_connection, VariantSheep

# helper object used later to query Ensembl through its REST API
ensRest = EnsemblRest()

# folder (inside the project directory) holding the consortium files,
# and the OvineSNP50 coordinates file checked against the OAR3 assembly
data_dir = get_project_dir() / "data/external/SHE/CONSORTIUM"
ovine_snp50_oar3 = data_dir / "OvineSNP50_B.csv_v3.1_pos_20190513.csv.gz"

# make a connection to the smarter database (needed before querying VariantSheep)
global_connection()

# get the smarter assembly I want to test: it is used to select the
# matching location of each variant
OAR3 = WORKING_ASSEMBLIES['OAR3']

Let's define some helper functions:

In [2]:
def check_coordinates(record, variant):
    """Tell whether a file record and a database variant share coordinates.

    Only the first location of ``variant`` is considered: its ``chrom`` and
    ``position`` are compared with ``record.chrom`` and ``record.pos``.
    Alleles are deliberately left out of the comparison (the record carries
    the values read by Illumina).

    Returns:
        bool: ``True`` when both chromosome and position are equal,
        ``False`` otherwise.
    """
    first_location = variant.locations[0]

    chrom_differs = record.chrom != first_location.chrom
    pos_differs = record.pos != first_location.position

    return not (chrom_differs or pos_differs)


def search_mismatches(datafile, smarter_assembly):
    """Collect the variants whose file coordinates differ from the database.

    Every data row of ``datafile`` becomes a ``Record`` named tuple (fields
    named after the CSV header). The ``VariantSheep`` document whose name is
    ``record.entry`` is then fetched and the pair is kept when
    ``check_coordinates`` reports a difference. The number of mismatches is
    printed before returning.

    Args:
        datafile: file to read, opened through ``text_or_gzip_open``; its
            header must provide the ``entry``, ``chrom``, ``pos`` and
            ``alleles`` columns.
        smarter_assembly: object exposing ``_asdict()``; the resulting dict
            is used as ``elemMatch`` condition on the variant locations.

    Returns:
        list: one ``[record, variant]`` pair for every mismatch found.
    """
    mismatches = []

    with text_or_gzip_open(datafile) as handle:
        reader = csv.reader(handle, delimiter=",")
        header = next(reader)
        Record = namedtuple("Record", header)

        # column positions are the same for every row: resolve them once
        # instead of searching the header at each iteration
        pos_idx = header.index('pos')
        alleles_idx = header.index('alleles')

        for line in reader:
            # positions are compared as integers
            line[pos_idx] = int(line[pos_idx])

            # put a slash between the allele characters ("AG" -> "A/G")
            line[alleles_idx] = "/".join(line[alleles_idx])

            record = Record._make(line)

            # get the variant by name, projecting the location selected by
            # the requested assembly
            variant = VariantSheep.objects(name=record.entry).fields(
                pk=0,
                rs_id=1,
                name=1,
                elemMatch__locations=smarter_assembly._asdict(),
            ).get()

            if not check_coordinates(record, variant):
                mismatches.append([record, variant])

    print(f"There are {len(mismatches)} mismatches")

    return mismatches

Now check the provided coordinates against the coordinates stored in the database:

In [3]:
# compare the OAR3 consortium file with the OAR3 locations in the database
mismatches_oar3 = search_mismatches(
    datafile=ovine_snp50_oar3,
    smarter_assembly=OAR3,
)

There are 6036 mismatches


Ok, let's take some random mismatching locations and try to get information about them from Ensembl:

In [4]:
# fix the seed so that the same mismatches are drawn at every run
random.seed(42, version=2)
selected_oar3 = random.sample(mismatches_oar3, k=20)

# keep the rs_id of the sampled variants, skipping the ones without it
selected_rsID = [variant.rs_id for _, variant in selected_oar3 if variant.rs_id]

# get information on those rs_id from Ensembl
result_oar3 = ensRest.getVariationByMultipleIds(
    species="ovis_aries",
    ids=selected_rsID,
)

Now re-arrange the results in a `pandas.DataFrame`:

In [5]:
def to_pd_table(selected, result):
    """Put file, database and Ensembl coordinates side by side in a table.

    Args:
        selected: iterable of ``(record, variant)`` pairs; ``record``
            provides ``entry``, ``chrom`` and ``pos``, ``variant`` provides
            ``rs_id`` and at least one item in ``locations``.
        result: mapping from rs_id to the Ensembl variation data, where the
            coordinates are read from the first item of ``mappings``.

    Returns:
        pandas.DataFrame: one row per pair with the ``entry``, ``chrom``,
        ``pos``, ``rs_id``, ``smarter_chrom``, ``smarter_pos``,
        ``ensembl_chrom`` and ``ensembl_pos`` columns. The Ensembl columns
        are empty when the variant has no rs_id, when the rs_id is missing
        from ``result`` or when it has no mappings.
    """
    data = defaultdict(list)

    for record, variant in selected:
        location = variant.locations[0]

        data["entry"].append(record.entry)
        data["chrom"].append(record.chrom)
        data["pos"].append(record.pos)
        data["rs_id"].append(variant.rs_id)
        data["smarter_chrom"].append(location.chrom)
        data["smarter_pos"].append(location.position)

        # a requested rs_id may be absent from the reply, or be returned
        # without any mapping: don't fail, leave the cells empty instead
        mappings = None
        if variant.rs_id:
            mappings = result.get(variant.rs_id, {}).get('mappings')

        mapping = mappings[0] if mappings else {}

        data["ensembl_chrom"].append(mapping.get('seq_region_name'))
        data["ensembl_pos"].append(mapping.get('start'))

    return pandas.DataFrame.from_dict(data)

# display the sampled OAR3 mismatches next to the Ensembl coordinates
to_pd_table(selected=selected_oar3, result=result_oar3)

Unnamed: 0,entry,chrom,pos,rs_id,smarter_chrom,smarter_pos,ensembl_chrom,ensembl_pos
0,s75780.1,27,54216498,rs161681000,X,54216498,X,54216498.0
1,OAR2_192839652.1,2,181989267,,0,0,,
2,s45548.1,1,105291839,rs428119865,0,0,1,105291839.0
3,OAR7_95892509.1,7,88177594,,0,0,,
4,OAR6_91684840.1,6,83902199,,0,0,,
5,OAR5_100812608.1,5,92563424,rs423024283,0,0,5,92563424.0
6,OAR3_68238991.1,3,64585315,rs430484597,0,0,3,64585315.0
7,OARX_95154058.1,27,135106137,rs401042801,X,135106137,X,135106137.0
8,OAR2_159324874.1,2,150138196,rs400029026,0,0,2,150138196.0
9,OARX_114100188.1,27,92387372,rs422929969,X,92387372,X,92387372.0


### Working with OAR4 coordinates
Now let's check the coordinates of the latest assembly version

In [6]:
# select the smarter assembly to compare against
OAR4 = WORKING_ASSEMBLIES['OAR4']

# consortium coordinates file checked against that assembly
ovine_snp50_oar4 = data_dir / "OvineSNP50_B.csvv4.0_pos_20190513.csv.gz"

Search mismatches in *OAR4* assembly

In [7]:
# compare the OAR4 consortium file with the OAR4 locations in the database
mismatches_oar4 = search_mismatches(
    datafile=ovine_snp50_oar4,
    smarter_assembly=OAR4,
)

There are 6048 mismatches


In [8]:
# fix the seed so that the same mismatches are drawn at every run
random.seed(42, version=2)
selected_oar4 = random.sample(mismatches_oar4, k=20)

In [9]:
# collect, column by column, the file and database coordinates of the sample
data = defaultdict(list)

for record, variant in selected_oar4:
    location = variant.locations[0]

    row = {
        "entry": record.entry,
        "chrom": record.chrom,
        "pos": record.pos,
        "rs_id": variant.rs_id,
        "smarter_chrom": location.chrom,
        "smarter_pos": location.position,
    }

    for column, value in row.items():
        data[column].append(value)

pandas.DataFrame.from_dict(data)

Unnamed: 0,entry,chrom,pos,rs_id,smarter_chrom,smarter_pos
0,s44901.1,27,53433728,rs405378494,X,53433728
1,OAR2_192378013.1,2,181554086,rs425938300,0,0
2,s31596.1,1,104766281,,0,0
3,s53156.1,7,86529139,rs415532834,0,0
4,OAR6_90124627.1,6,82235250,rs414248683,0,0
5,OAR5_100348132.1,5,92019418,,0,0
6,OAR3_67623052.1,3,63888691,rs420144331,0,0
7,OARX_94177459_X.1,27,134081476,rs408436026,X,134081476
8,s37790.1,2,148410633,,0,0
9,OARX_110992791.1,27,91633123,rs416320037,X,91633123
