# Spanish sheep 2
Processing the second batch of data submitted by the Spanish partners:
* [20220326_resultados_SNP](#dataset0)
* [20220428_Smarter_Ovine](#dataset1)
* [20220503_Ovine](#dataset2)

In [1]:
import re
import os
import csv
import logging
import zipfile
from collections import defaultdict, Counter
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

from src.features.affymetrix import read_Manifest
from src.features.smarterdb import global_connection, Dataset
from src.features.plinkio import AffyPlinkIO, TextPlinkIO, CodingException
from src.features.utils import get_interim_dir, get_project_dir
from src.data.common import WORKING_ASSEMBLIES, AssemblyConf

# Presumably opens the connection to the SMARTER database (confirm in
# src.features.smarterdb); the return value is not needed here
_ = global_connection()
# Assembly configurations: the working OAR3 one and the Affymetrix Oar_v3.1 one
OAR3 = WORKING_ASSEMBLIES["OAR3"]
AFFY3 = AssemblyConf('Oar_v3.1','affymetrix')
# Only let CRITICAL messages through from the plinkio module logger
logger = logging.getLogger('src.features.plinkio')
logger.setLevel(logging.CRITICAL)

In [2]:
class CustomMixin():
    """Add genotype-coding detection helpers to a plinkio reader class.

    The host class must provide ``read_pedfile()`` and
    ``_process_genotypes(line, src_coding)``. A ``CodingException`` raised
    while processing a line is taken as "the file is not in this coding".
    """

    # used only as the progress bar total; None gives a bar without a total
    n_of_individuals = None

    def process_pedfile(self, src_coding="top"):
        """Pass every pedfile line through ``_process_genotypes``.

        Args:
            src_coding (str): the coding the genotypes are checked against

        Returns:
            bool: always True, reached only if no line raised an exception

        Raises:
            Anything raised by ``_process_genotypes`` is propagated as is
        """
        for line in tqdm(self.read_pedfile(), total=self.n_of_individuals):
            # only the check matters: the return value is not needed
            self._process_genotypes(line, src_coding)

        return True

    def _matches_coding(self, src_coding):
        """Return True if the whole pedfile passes for ``src_coding``,
        False as soon as a ``CodingException`` is raised."""
        try:
            return self.process_pedfile(src_coding=src_coding)

        except CodingException:
            return False

    def is_top(self):
        """Return True if every genotype passes the 'top' coding check."""
        return self._matches_coding('top')

    def is_forward(self):
        """Return True if every genotype passes the 'forward' coding check."""
        return self._matches_coding('forward')

    def is_affymetrix(self):
        """Return True if every genotype passes the 'affymetrix' coding check."""
        return self._matches_coding('affymetrix')

class CustomTextPlinkIO(CustomMixin, TextPlinkIO):
    """TextPlinkIO extended with the coding checks defined in CustomMixin."""

<a id='dataset0'></a>
## 20220326_resultados_SNP
This dataset is expected to be an Affymetrix dataset and to contain both the *Assaf* and *Castellana* breeds.

In [3]:
# Retrieve the dataset record through the name of its archive
dataset_20220326 = Dataset.objects.get(file="20220326_resultados_SNP.zip")

# Point the reader at the file prefix inside the dataset working directory
plinkio = CustomTextPlinkIO(
    chip_name=dataset_20220326.chip_name,
    species=dataset_20220326.species,
    prefix=str(
        dataset_20220326.working_dir / "20220326_resultados_SNP/20220326_Ovine"
    ),
)

# Used as the progress bar total while scanning the pedfile
plinkio.n_of_individuals = dataset_20220326.n_of_individuals

Start by reading the SNP coordinates, then determine how many of these SNPs are present in the SMARTER database.

In [4]:
# Read the map file first, then fetch coordinates for its SNPs, searching
# by probeset id on the affymetrix Oar_v3.1 assembly
plinkio.read_mapfile()
plinkio.fetch_coordinates(
    chip_name=dataset_20220326.chip_name,
    search_field="probeset_id",
    src_assembly=AFFY3,
)

In [5]:
# Entries in plinkio.filtered are counted as SNPs that can't be retrieved
snps_total = len(plinkio.mapdata)
snps_found = snps_total - len(plinkio.filtered)

# Percentage of missing SNPs, rounded to two decimals
perc_missing = round(100 - (snps_found / snps_total * 100), 2)

print(f"I can retrieve {snps_found} of {snps_total} SNPs ({perc_missing}% missing)")

I can retrieve 49619 of 49702 SNPs (0.17% missing)


Is this file in *affymetrix forward* coding?

In [6]:
# Scans the whole pedfile: True means that no CodingException was raised
# while checking the genotypes against the 'affymetrix' coding
plinkio.is_affymetrix()

  0%|          | 0/96 [00:00<?, ?it/s]

True

Which breeds are currently in this dataset?

In [7]:
# Collect the distinct values of the first two fields of each pedfile line,
# treated here as the breed label and the sample id respectively
breeds_castellana = set()
samples_castellana = set()
for line in plinkio.read_pedfile():
    breed, sample = line[0], line[1]
    # set.add() already ignores items which are present: no membership test
    breeds_castellana.add(breed)
    samples_castellana.add(sample)

print(f"Got {breeds_castellana} breeds")

Got {'SMARTER', 'Assaf'} breeds


<a id='dataset1'></a>
## 20220428_Smarter_Ovine
This dataset is expected to be an Affymetrix dataset and to contain the *Ojalada* breed.

In [8]:
# Retrieve the dataset record through the name of its archive
dataset_20220428 = Dataset.objects.get(file="20220428_Smarter_Ovine.zip")

# Point the reader at the file prefix inside the dataset working directory
plinkio = CustomTextPlinkIO(
    chip_name=dataset_20220428.chip_name,
    species=dataset_20220428.species,
    prefix=str(
        dataset_20220428.working_dir / "20220428_Smarter_Ovine/20220428_Smarter_Ovine"
    ),
)

# Used as the progress bar total while scanning the pedfile
plinkio.n_of_individuals = dataset_20220428.n_of_individuals

Start by reading the SNP coordinates, then determine how many of these SNPs are present in the SMARTER database.

In [9]:
# Read the map file first, then fetch coordinates for its SNPs, searching
# by probeset id on the affymetrix Oar_v3.1 assembly
plinkio.read_mapfile()
plinkio.fetch_coordinates(
    chip_name=dataset_20220428.chip_name,
    search_field="probeset_id",
    src_assembly=AFFY3,
)

In [10]:
# Entries in plinkio.filtered are counted as SNPs that can't be retrieved
snps_total = len(plinkio.mapdata)
snps_found = snps_total - len(plinkio.filtered)

# Percentage of missing SNPs, rounded to two decimals
perc_missing = round(100 - (snps_found / snps_total * 100), 2)

print(f"I can retrieve {snps_found} of {snps_total} SNPs ({perc_missing}% missing)")

I can retrieve 49619 of 49702 SNPs (0.17% missing)


Is this file in *affymetrix forward* coding?

In [11]:
# Scans the whole pedfile: True means that no CodingException was raised
# while checking the genotypes against the 'affymetrix' coding
plinkio.is_affymetrix()

  0%|          | 0/92 [00:00<?, ?it/s]

True

Which breeds are currently in this dataset?

In [12]:
# Collect the distinct values of the first two fields of each pedfile line,
# treated here as the breed label and the sample id respectively
breeds_ojalada = set()
samples_ojalada = set()
for line in plinkio.read_pedfile():
    breed, sample = line[0], line[1]
    # set.add() already ignores items which are present: no membership test
    breeds_ojalada.add(breed)
    samples_ojalada.add(sample)

print(f"Got {breeds_ojalada} breeds")

Got {'Smarter'} breeds


<a id='dataset2'></a>
## 20220503_Ovine
This dataset is expected to be an Affymetrix dataset and to contain the *Assaf* breed.

In [13]:
# Retrieve the dataset record through the name of its archive
dataset_20220503 = Dataset.objects.get(file="20220503_Ovine.zip")

# Point the reader at the file prefix inside the dataset working directory
plinkio = CustomTextPlinkIO(
    chip_name=dataset_20220503.chip_name,
    species=dataset_20220503.species,
    prefix=str(
        dataset_20220503.working_dir / "20220503_Ovine/20220503_Ovine"
    ),
)

# Used as the progress bar total while scanning the pedfile
plinkio.n_of_individuals = dataset_20220503.n_of_individuals

Start by reading the SNP coordinates, then determine how many of these SNPs are present in the SMARTER database.

In [14]:
# Read the map file first, then fetch coordinates for its SNPs, searching
# by probeset id on the affymetrix Oar_v3.1 assembly
plinkio.read_mapfile()
plinkio.fetch_coordinates(
    chip_name=dataset_20220503.chip_name,
    search_field="probeset_id",
    src_assembly=AFFY3,
)

In [15]:
# Entries in plinkio.filtered are counted as SNPs that can't be retrieved
snps_total = len(plinkio.mapdata)
snps_found = snps_total - len(plinkio.filtered)

# Percentage of missing SNPs, rounded to two decimals
perc_missing = round(100 - (snps_found / snps_total * 100), 2)

print(f"I can retrieve {snps_found} of {snps_total} SNPs ({perc_missing}% missing)")

I can retrieve 49619 of 49702 SNPs (0.17% missing)


Is this file in *affymetrix forward* coding?

In [16]:
# Scans the whole pedfile: True means that no CodingException was raised
# while checking the genotypes against the 'affymetrix' coding
plinkio.is_affymetrix()

  0%|          | 0/95 [00:00<?, ?it/s]

True

Which breeds are currently in this dataset?

In [17]:
# Collect the distinct values of the first two fields of each pedfile line,
# treated here as the breed label and the sample id respectively
breeds_assaf = set()
samples_assaf = set()
for line in plinkio.read_pedfile():
    breed, sample = line[0], line[1]
    # set.add() already ignores items which are present: no membership test
    breeds_assaf.add(breed)
    samples_assaf.add(sample)

print(f"Got {breeds_assaf} breeds")

Got {'Assaf'} breeds
