# Guisandesa goats

Explore the latest dataset from Guisandesa goats

In [1]:
from pathlib import Path

from tqdm.notebook import tqdm
import pandas as pd

from src.features.smarterdb import global_connection, Dataset
from src.features.plinkio import TextPlinkIO, CodingException
from src.features.utils import get_interim_dir
from src.data.common import AssemblyConf

In [2]:
# global_connection comes from src.features.smarterdb; presumably it opens
# the database connection that the Dataset queries below rely on.
conn = global_connection()

# Assembly configuration handed to fetch_coordinates() as ``src_assembly``.
ARS1 = AssemblyConf("ARS1", "affymetrix")

class CustomMixin:
    """Add source-coding checks on top of a PLINK text reader.

    The mixin relies on the class it is combined with to provide
    ``read_pedfile()`` and ``_process_genotypes()``.
    """

    # Only used as the ``total`` of the progress bar; ``None`` until set.
    n_of_individuals = None

    def process_pedfile(self, src_coding="top"):
        """Run every PED line through ``_process_genotypes``.

        Args:
            src_coding: coding name forwarded to ``_process_genotypes``.

        Returns:
            ``True`` once every line has been processed. Exceptions raised
            while processing a line propagate to the caller.
        """
        progress = tqdm(self.read_pedfile(), total=self.n_of_individuals)

        for record in progress:
            # Only the absence of an exception matters here, so the
            # processed genotypes are discarded.
            self._process_genotypes(record, src_coding)

        return True

    def _pedfile_accepts_coding(self, src_coding):
        """Return the ``process_pedfile`` result for *src_coding*.

        A ``CodingException`` raised along the way is turned into ``False``.
        """
        try:
            outcome = self.process_pedfile(src_coding=src_coding)
        except CodingException:
            return False

        return outcome

    def is_top(self):
        """Check the PED file against the ``'top'`` coding."""
        return self._pedfile_accepts_coding("top")

    def is_forward(self):
        """Check the PED file against the ``'forward'`` coding."""
        return self._pedfile_accepts_coding("forward")

    def is_affymetrix(self):
        """Check the PED file against the ``'affymetrix'`` coding."""
        return self._pedfile_accepts_coding("affymetrix")

class CustomTextPlinkIO(CustomMixin, TextPlinkIO):
    """``TextPlinkIO`` extended with the coding checks of ``CustomMixin``."""

Get the latest loaded dataset from Guisandesa goats:

In [3]:
# Look the dataset record up by its file name.
guisandesa = Dataset.objects.get(file="Guisandesa.zip")

# Prefix of the PLINK text files, located under the dataset working_dir.
ped_prefix = guisandesa.working_dir / "Guisandesa/Guisandesa Goat"

plinkio = CustomTextPlinkIO(
    prefix=str(ped_prefix),
    species=guisandesa.species,
    chip_name=guisandesa.chip_name,
)

# process_pedfile() uses this value as the total of its progress bar.
plinkio.n_of_individuals = guisandesa.n_of_individuals

Start by reading the coordinates, then determine how many of these SNPs are in the SMARTER database.

In [4]:
# The MAP file has to be read before the SNPs can be looked up.
plinkio.read_mapfile()

# Lookup options; presumably SNPs are matched on their probeset_id within
# the ARS1 assembly for this chip -- confirm against fetch_coordinates().
lookup_options = {
    "src_assembly": ARS1,
    "search_field": "probeset_id",
    "chip_name": guisandesa.chip_name,
}
plinkio.fetch_coordinates(**lookup_options)

In [5]:
# Everything listed in the MAP file minus the entries in plinkio.filtered
# (presumably the SNPs that could not be retrieved -- confirm).
total_snps = len(plinkio.mapdata)
snps_found = total_snps - len(plinkio.filtered)

# Percentage of SNPs that were not retrieved, rounded to two decimals.
found_ratio = snps_found / total_snps
perc_missing = round(100 - (found_ratio * 100), 2)

print(f"I can retrieve {snps_found} of {total_snps} SNPs ({perc_missing}% missing)")

I can retrieve 58854 of 59812 SNPs (1.6% missing)


Is this dataset in *top* coding?

In [6]:
# Process the whole PED file with src_coding='top'; is_top() returns False
# as soon as a CodingException is raised.
plinkio.is_top()

  0%|          | 0/96 [00:00<?, ?it/s]

False

Is this file in *affymetrix forward* coding?

In [7]:
# Same check with src_coding='affymetrix'; the result is True only when the
# whole PED file is processed without a CodingException.
plinkio.is_affymetrix()

  0%|          | 0/96 [00:00<?, ?it/s]

True

The custom Affymetrix chip uploaded into the database seems to fit this genotype file. Now I need to create a metadata file for this dataset:

In [8]:
# One row per sample; every sample gets the same pair of coordinates.
sample_ids = plinkio.get_samples()

metadata = pd.DataFrame(
    {
        "original_id": sample_ids,
        "latitude": [40.2149335] * len(sample_ids),
        "longitude": [-5.1409339] * len(sample_ids),
    }
)
metadata.head()

Unnamed: 0,original_id,latitude,longitude
0,ES080002053909,40.214934,-5.140934
1,ES080002053911,40.214934,-5.140934
2,ES080002053912,40.214934,-5.140934
3,ES080002053913,40.214934,-5.140934
4,ES080004654590,40.214934,-5.140934


In [9]:
# Name the spreadsheet after the dataset file, replacing its last suffix.
outfile = f"{Path(guisandesa.file).stem}.xlsx"
outpath = get_interim_dir() / outfile

# index=False keeps the DataFrame index out of the spreadsheet.
metadata.to_excel(str(outpath), index=False)