# Spanish sheep
This notebook describes the latest data received from Spain, which came as multiple files. Files with the `CHUOJA` prefix contain the same samples for breeds `{'Churra', 'Ojalada'}` already imported from sheephapmap. The other files seem to contain new samples; however, they come from an *affymetrix* array I don't have.
* [SMARTER-500-ASSAF](#dataset0)
* [Castellana](#dataset1)
* [Churra](#dataset2)

In [1]:
import re
import os
import csv
import logging
import zipfile
from collections import defaultdict, Counter
from pathlib import Path

import pandas as pd
from tqdm.notebook import tqdm

from src.features.affymetrix import read_Manifest
from src.features.smarterdb import global_connection, Dataset
from src.features.plinkio import AffyPlinkIO, TextPlinkIO, CodingException
from src.features.utils import get_interim_dir, get_project_dir
from src.data.common import WORKING_ASSEMBLIES, AssemblyConf

# presumably opens the SMARTER database connection; the return value is not used
_ = global_connection()
OAR3 = WORKING_ASSEMBLIES["OAR3"]
# assembly configuration passed as `src_assembly` when fetching SNP coordinates
AFFY3 = AssemblyConf('Oar_v3.1','affymetrix')
# only CRITICAL messages of the plinkio logger are emitted in this notebook,
# so the logger.info() calls made by the custom classes below stay silent
logger = logging.getLogger('src.features.plinkio')
logger.setLevel(logging.CRITICAL)

In [2]:
class CustomMixin():
    """Add genotype coding checks to a plink IO class.

    The class this mixin is combined with is expected to provide
    ``read_pedfile()`` and ``_process_genotypes(line, coding)``; neither is
    defined here.
    """

    # number of samples expected in the ped file: only used as the total of
    # the progress bar, so ``None`` is fine when it is unknown
    n_of_individuals = None

    def process_pedfile(self, coding="top"):
        """Run every ped line through ``_process_genotypes``.

        Args:
            coding: coding convention forwarded to ``_process_genotypes``.

        Returns:
            True once every line has been processed. Exceptions raised by
            ``_process_genotypes`` propagate to the caller.
        """
        for line in tqdm(self.read_pedfile(), total=self.n_of_individuals):
            self._process_genotypes(line, coding)

        return True

    # double underscore (name mangling) so this helper can never shadow a
    # method of the plink IO class the mixin is combined with
    def __matches_coding(self, coding):
        """Return True if the ped file can be processed using ``coding``,
        False when a ``CodingException`` is raised while processing it."""
        try:
            return self.process_pedfile(coding=coding)

        except CodingException:
            return False

    def is_top(self):
        """Return True if the whole ped file passes with coding='top'."""
        return self.__matches_coding('top')

    def is_forward(self):
        """Return True if the whole ped file passes with coding='forward'."""
        return self.__matches_coding('forward')

    def is_affymetrix(self):
        """Return True if the whole ped file passes with
        coding='affymetrix'."""
        return self.__matches_coding('affymetrix')
        
class CustomTextPlinkIO(CustomMixin, TextPlinkIO):
    """TextPlinkIO extended with the coding checks defined in CustomMixin."""


class CustomAffyPlinkIO(CustomMixin, AffyPlinkIO):
    """This is not a cellfile, but a plink made by affymetrix"""

    def read_pedfile(self, *args, **kwargs):
        """Open pedfile for reading return iterator

        Yields one list of string fields for each record of ``self.pedfile``.
        Lines starting with "#" are skipped. Positional and keyword
        arguments are accepted but not used.
        """
        # affy files has both " " and "\t" in their files: split on any run
        # of them (pattern compiled once, outside the loop)
        separator = re.compile('[ \t]+')

        with open(self.pedfile) as handle:
            for record in handle:
                # affy data may have comments in files
                if record.startswith("#"):
                    logger.info("Skipping %s", record)
                    continue

                yield separator.split(record.strip())

    def update_pedfile(self, outputfile: str):
        """Write a copy of the ped file without the filtered SNPs.

        Args:
            outputfile: path of the ped file to create (overwritten if it
                already exists).

        For every record returned by ``read_pedfile`` the two allele columns
        of each SNP position listed in ``self.filtered`` are removed; all the
        other fields are written unchanged, separated by a single space.
        """
        # SNP positions to drop, highest first, so deleting a pair of columns
        # never shifts the columns that still have to be deleted. Computed
        # once here instead of once per individual.
        to_remove = sorted(self.filtered, reverse=True)

        processed = 0

        with open(outputfile, "w") as target:
            writer = csv.writer(
                target, delimiter=' ', lineterminator="\n")

            for line in self.read_pedfile():
                # a new line obj
                new_line = line.copy()

                for index in to_remove:
                    # index is snp position. Genotypes start after the first
                    # 6 ped columns and each SNP takes two fields
                    del new_line[6+index*2+1]
                    del new_line[6+index*2]

                writer.writerow(new_line)
                processed += 1

        logger.info("Processed %s individuals", processed)

<a id='dataset0'></a>
## SMARTER-500-ASSAF
Let's explore the ASSAF dataset. It seems to be an affymetrix dataset; however, the file is in plink text format:

In [3]:
# fetch the dataset record by archive name and point the plink text reader
# to the "SMARTER-500-ASSAF" file prefix inside its working directory
assaf_dataset = Dataset.objects.get(file="SMARTER-500-ASSAF.zip")
plinkio = CustomTextPlinkIO(
    prefix=str(assaf_dataset.working_dir / "SMARTER-500-ASSAF"), 
    species=assaf_dataset.species, 
    chip_name=assaf_dataset.chip_name)
# used as the progress bar total while the ped file is processed
plinkio.n_of_individuals = assaf_dataset.n_of_individuals

Start by reading coordinates. Try to determine how many SNPs I have in SMARTER database

In [4]:
# load the map file, then fetch coordinates from the affymetrix Oar_v3.1
# assembly; presumably SNP names are matched against the 'probeset_id' field
plinkio.read_mapfile()
plinkio.fetch_coordinates(
    src_assembly=AFFY3,
    search_field="probeset_id"
)

In [5]:
# map records whose position is not in plinkio.filtered are counted as found
snps_found = len(plinkio.mapdata)-len(plinkio.filtered)
# percentage of map records that were filtered out, rounded to 2 decimals
perc_missing = round(100 - (snps_found / len(plinkio.mapdata) * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs ({perc_missing}% missing)")

I can retrieve 49589 of 49702 SNPs (0.23% missing)


Is this dataset in *top* coordinates?

In [6]:
# True only if the whole ped file is processed with coding='top' without a CodingException
plinkio.is_top()

  0%|          | 0/504 [00:00<?, ?it/s]

Error for SNP 4:AX-169033323: C/C <> A/G


False

Is this file in *affymetrix forward* coordinates?

In [7]:
# True only if the whole ped file is processed with coding='affymetrix' without a CodingException
plinkio.is_affymetrix()

  0%|          | 0/504 [00:00<?, ?it/s]

True

The custom affymetrix chip uploaded into database seems to fit this genotype file

<a id='dataset1'></a>
## Castellana_Ovine
Let's explore another Spanish dataset. This dataset contains a plink file for the whole affymetrix chip and a subset of samples made to test the creation of a smaller and cheaper chip. Samples and SNPs are the same, so the 10K dataset can be ignored entirely. The 50K file is an affymetrix plink file; however, it doesn't come from a *cell file*: it's a *tab separated* plink file with comments

In [8]:
# fetch the dataset record by archive name; the custom affymetrix reader is
# used because this ped file mixes spaces and tabs and contains comment lines
castellana_ovine = Dataset.objects.get(file="Castellana.zip")
plinkio = CustomAffyPlinkIO(
    prefix=str(castellana_ovine.working_dir / "Castellana/20220131 Ovine"), 
    species=castellana_ovine.species, 
    chip_name=castellana_ovine.chip_name)
# used as the progress bar total while the ped file is processed
plinkio.n_of_individuals = castellana_ovine.n_of_individuals

Start by reading coordinates. Try to determine how many SNPs I have in SMARTER database

In [9]:
# load the map file, then fetch coordinates from the affymetrix Oar_v3.1
# assembly; presumably SNP names are matched against the 'probeset_id' field
plinkio.read_mapfile()
plinkio.fetch_coordinates(
    src_assembly=AFFY3,
    search_field="probeset_id"
)

In [10]:
# map records whose position is not in plinkio.filtered are counted as found
snps_found = len(plinkio.mapdata)-len(plinkio.filtered)
# percentage of map records that were filtered out, rounded to 2 decimals
perc_missing = round(100 - (snps_found / len(plinkio.mapdata) * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs ({perc_missing}% missing)")

I can retrieve 49589 of 49702 SNPs (0.23% missing)


Is this dataset in *top* coordinates?

In [11]:
# True only if the whole ped file is processed with coding='top' without a CodingException
plinkio.is_top()

  0%|          | 0/185 [00:00<?, ?it/s]

Error for SNP 4:AX-169033323: C/C <> A/G


False

Is this file in *affymetrix forward* coordinates?

In [12]:
# True only if the whole ped file is processed with coding='affymetrix' without a CodingException
plinkio.is_affymetrix()

  0%|          | 0/185 [00:00<?, ?it/s]

True

This is the same behaviour seen for the *Assaf* file. Which breeds do I have in this dataset?

In [13]:
# collect the distinct values of the first two ped columns (read here as
# breed and sample id) over all the individuals
breeds_castellana = set()
samples_castellana = set()
for line in plinkio.read_pedfile():
    breed, sample = line[0], line[1]
    # set.add() already ignores values which are present: no need to test
    breeds_castellana.add(breed)
    samples_castellana.add(sample)

print(f"Got {breeds_castellana} breeds")

Got {'SMARTER', 'Assaf'} breeds


<a id='dataset2'></a>
## Churra
Let's explore the Churra dataset. This dataset is an affymetrix plink file, with mixed *rs_id* and *affy ids* as SNP names. We have about 60K SNPs instead of 49K: does this dataset come from a more recent manifest file?

In [14]:
# fetch the dataset record by archive name and point the custom affymetrix
# reader to the "Churra_SMARTER_JJsent" file prefix in its working directory
churra_dataset = Dataset.objects.get(file="Churra.zip")
plinkio = CustomAffyPlinkIO(
    prefix=str(churra_dataset.working_dir / "Churra/Churra_SMARTER_JJsent"), 
    species=churra_dataset.species, 
    chip_name=churra_dataset.chip_name)
# used as the progress bar total while the ped file is processed
plinkio.n_of_individuals = churra_dataset.n_of_individuals

Here we have the problem that we have a mix of `rs_id` and `probeset_id` as snp names:

In [15]:
# first attempt: fetch coordinates searching by 'probeset_id'
plinkio.read_mapfile()
plinkio.fetch_coordinates(
    src_assembly=AFFY3,
    search_field="probeset_id"
)
# map records whose position is not in plinkio.filtered are counted as found
probeset_found = len(plinkio.mapdata)-len(plinkio.filtered)
print(f"Found {probeset_found} SNPs using 'probeset_id'")

Found 14177 SNPs using 'probeset_id'


Most SNPs can't be found using *affymetrix ids*. Let's use `rs_id` instead:

In [16]:
# second attempt: reload the map file and fetch coordinates searching by 'rs_id'
# NOTE(review): from here on plinkio.filtered presumably reflects only this
# rs_id lookup and not the previous probeset_id one — confirm in fetch_coordinates
plinkio.read_mapfile()
plinkio.fetch_coordinates(
    src_assembly=AFFY3,
    search_field="rs_id"
)
# map records whose position is not in plinkio.filtered are counted as found
rs_found = len(plinkio.mapdata)-len(plinkio.filtered)
print(f"Found {rs_found} SNPs using 'rs_id'")

Found 46159 SNPs using 'rs_id'


In [17]:
# NOTE(review): summing the two counts assumes that no map record was matched
# by both the 'probeset_id' and the 'rs_id' lookup — TODO confirm
snps_found = probeset_found + rs_found
perc_missing = round(100 - (snps_found / len(plinkio.mapdata) * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs ({perc_missing}% missing)")

I can retrieve 60336 of 60379 SNPs (0.07% missing)


Is this dataset in *top* coordinates?

In [18]:
# True only if the whole ped file is processed with coding='top' without a CodingException
plinkio.is_top()

  0%|          | 0/150 [00:00<?, ?it/s]

Error for SNP 5:rs409143979: G/G <> A/C


False

Is this file in *affymetrix forward* coordinates?

In [19]:
# True only if the whole ped file is processed with coding='affymetrix' without a CodingException
plinkio.is_affymetrix()

  0%|          | 0/150 [00:00<?, ?it/s]

Error for SNP 3245:rs430225014: C/C <> A/G


False

This is unexpected. Getting more info for `rs430225014` snp:

In [20]:
# (position in map file, map record) pairs for every record named
# 'rs430225014'; the position is what indexes the genotype columns of a ped line
snp_locations = [
    (position, record)
    for position, record in enumerate(plinkio.mapdata)
    if record.name == 'rs430225014'
]
snp_locations

[(3245, MapRecord(chrom='0', name='rs430225014', cm=0.0, position=0)),
 (55956, MapRecord(chrom='20', name='rs430225014', cm=0.0, position=26705026)),
 (55957, MapRecord(chrom='20', name='rs430225014', cm=0.0, position=26705026)),
 (55958, MapRecord(chrom='20', name='rs430225014', cm=0.0, position=26705026)),
 (55959, MapRecord(chrom='20', name='rs430225014', cm=0.0, position=26705026))]

This SNP is duplicated. This could be an issue while importing data. What about the genotypes at those positions? Let's get them for the first sample:

In [21]:
# only the first record of the ped file is inspected
line = next(plinkio.read_pedfile())
print(line[:2])

for location in snp_locations:
    # location is a (map position, map record) pair
    i = location[0]

    # the two alleles of SNP i come after the 6 leading ped columns
    first_allele = 6 + i * 2
    a1, a2 = line[first_allele], line[first_allele + 1]

    genotype = [a1, a2]
    print(location, genotype)

['CHURRA', 'AV11111']
(3245, MapRecord(chrom='0', name='rs430225014', cm=0.0, position=0)) ['C', 'C']
(55956, MapRecord(chrom='20', name='rs430225014', cm=0.0, position=26705026)) ['C', 'C']
(55957, MapRecord(chrom='20', name='rs430225014', cm=0.0, position=26705026)) ['G', 'G']
(55958, MapRecord(chrom='20', name='rs430225014', cm=0.0, position=26705026)) ['C', 'C']
(55959, MapRecord(chrom='20', name='rs430225014', cm=0.0, position=26705026)) ['T', 'T']


Unfortunately, this sample has different genotypes for the same SNP name, so I can't determine which probe I'm referring to. Let's determine how many SNPs are duplicated by `rs_id`: all of those records need to be removed from the dataset.

In [22]:
# count how many times each SNP name occurs in the map file
snp_ids = Counter(snp.name for snp in plinkio.mapdata)

# keep only the names occurring more than once, with their count
duplicates = {x: count for x, count in snp_ids.items() if count > 1}
duplicates

{'rs401726527': 3,
 'rs430225014': 5,
 'rs161899045': 2,
 'rs428531658': 2,
 'rs411063068': 2,
 'rs425852623': 3,
 'rs406168362': 3,
 'rs402561655': 3,
 'rs426229273': 2,
 'rs405371865': 3,
 'rs426285862': 2,
 'rs403670129': 2,
 'rs419280018': 2,
 'rs416539062': 3,
 'rs414674284': 3,
 'rs418010992': 2,
 'rs419405934': 2,
 'rs427574198': 3,
 'rs417119249': 3,
 'rs412768618': 3,
 'rs405030483': 2,
 'rs423896391': 3,
 'rs427466865': 2,
 'rs420539536': 2,
 'rs161142328': 3,
 'rs405589666': 3,
 'rs403084223': 3,
 'rs420329233': 2,
 'rs416711628': 2,
 'rs161200133': 3,
 'rs424129466': 2,
 'rs406328119': 3,
 'rs423593214': 2}

Those are duplicated SNPs. Try to remove them from dataset (updating `filtered` by positions):

In [23]:
# flag the map position of every record whose name is duplicated
plinkio.filtered.update(
    position
    for position, record in enumerate(plinkio.mapdata)
    if record.name in duplicates
)

Test again for affymetrix coordinates:

In [24]:
# same check as before, now with the duplicated SNP names added to plinkio.filtered
plinkio.is_affymetrix()

  0%|          | 0/150 [00:00<?, ?it/s]

Error for SNP 16923:rs414355830: G/C <> T/G


False

Well, `rs414355830` is assigned to multiple probes, some of which are `G/C` and not `T/G` (as expected from the `oar3_OAR2_3369768` illumina probe). One of these probes is tri-allelic in the manifest file and was not imported, so such SNPs have to be removed manually. Let's read the manifest file and search for the SNPs with more than two alleles:

In [25]:
# rs ids of the manifest records having more than two alleles (records
# without an rs id are ignored)
multi_allelic = [
    record.rsid
    for record in read_Manifest(get_project_dir() / "data/external/SHE/AFFYMETRIX/Axiom_BGovisNP_ovine_Annotation.r1.csv.gz")
    if int(record.allele_count) > 2 and record.rsid
]

# membership is tested once per map record: use a set instead of scanning
# the whole list every time
multi_allelic_ids = set(multi_allelic)

# flag the map position of every SNP named like a multi-allelic record
for idx, snp in enumerate(plinkio.mapdata):
    if snp.name in multi_allelic_ids:
        plinkio.filtered.add(idx)

Skipping: ##For information about the Annotation file content, please see the bundled README file.
Skipping: #%guid=00005d49-3207-4b7a-118d-001bbd006ce2
Skipping: #%create_date=Tue Apr  4 11:40:20 2017
Skipping: #%chip_type=Axiom_BGovisNP
Skipping: #%genome-species=Axiom_BGovisNP_ovine.r1
Skipping: #%genome-version=N/A
Skipping: #%genome-version-ucsc=N/A
Skipping: #%genome-version-ncbi=N/A
Skipping: #%genome-version-create_date=N/A
Skipping: #%dbSNP_date=N/A
Skipping: #%dbSNP_version=N/A
Skipping: #%hapmap-date=N/A
Skipping: #%hapmap-version=N/A
Skipping: #%netaffx-annotation-date=N/A
Skipping: #%netaffx-annotation-netaffx-build=N/A
Skipping: #%netaffx-annotation-tabular-format-version=1.0
Skipping: #%netaffx-annotation-docgen-method=com.affymetrix.database.docgen.DocGenDriver
Skipping: #%netaffx-annotation-docgen-version=cluster_friendly_submission


Test again for affymetrix coordinates:

In [26]:
# same check again, now with the multi-allelic SNPs added to plinkio.filtered too
plinkio.is_affymetrix()

  0%|          | 0/150 [00:00<?, ?it/s]

True

This time I can read my source file. However, how many SNPs are left after the custom filtering?

In [27]:
# plinkio.filtered now also holds the positions added by hand in the previous cells
snps_found = len(plinkio.mapdata)-len(plinkio.filtered)
# percentage of map records that were filtered out, rounded to 2 decimals
perc_missing = round(100 - (snps_found / len(plinkio.mapdata) * 100), 2)

print(f"I can retrieve {snps_found} of {len(plinkio.mapdata)} SNPs ({perc_missing}% missing) after filtering")

I can retrieve 46068 of 60379 SNPs (23.7% missing) after filtering


This is more or less the same share of the data I could retrieve at the beginning using `rs_id`. I think I can write a new plink file without the filtered SNPs:

In [28]:
# destination of the fixed map/ped pair inside the interim directory
output_map = get_interim_dir() / "churra_fixed.map"
output_ped = get_interim_dir() / "churra_fixed.ped"
# presumably writes the map file without the filtered SNPs — confirm in update_mapfile
plinkio.update_mapfile(str(output_map))
# writes every ped record without the allele columns of the filtered SNPs
plinkio.update_pedfile(str(output_ped))

Ok, let's create a metadata table that also defines the few GPS coordinates I have:

In [29]:
# (latitude, longitude) by the first two characters of the sample id
coordinates = {'AV': (42.097806, -5.283205), 'VG': (41.86830, -5.39687)}
data = defaultdict(list)
for line in plinkio.read_pedfile():
    fid, original_id = line[0], line[1]

    # determining GPS coordinates from the sample id prefix
    key = original_id[:2]
    latlong = coordinates[key]

    # the minimal set of smarter metadata for this sample; the breed is
    # already in smarter, the remaining values are the other data I know
    sample_metadata = {
        "original_id": original_id,
        "fid": fid,
        "breed_name": "Churra",
        "breed_code": "CHU",
        "country": "Spain",
        "purpose": "Milk",
        "latitude": latlong[0],
        "longitude": latlong[1],
    }

    for column, value in sample_metadata.items():
        data[column].append(value)

# one row per sample, one column per metadata field
df = pd.DataFrame(data=data)

In [30]:
# the spreadsheet is named after the dataset archive and is only a temporary
# file: it is stored in the zip archive as metadata/<name>.xlsx and then removed
interim_dir = get_interim_dir()
outfile = Path(churra_dataset.file).stem + ".xlsx"
outpath = interim_dir / outfile
df.to_excel(str(outpath), index=False)

# full paths are used, so the working directory of the process is not changed;
# the context manager closes the archive even if adding the file fails
with zipfile.ZipFile(interim_dir / "Churra_metadata.zip", "w") as metadata_file:
    metadata_file.write(outpath, arcname=f"metadata/{outfile}")

outpath.unlink()