# Uruguay Foreground data
Pablo sent new foreground data to the SMARTER repository. The data come from an Affymetrix chip; however, the file format is completely new and seems to be an Affymetrix report. Check the data file and try to work out how to import this new data format.

In [1]:
import collections
import csv
from pathlib import Path
from typing import Optional

import pandas as pd
from tqdm.notebook import tqdm

from src.features.smarterdb import global_connection, Dataset, VariantSheep
from src.features.utils import sanitize, text_or_gzip_open
from src.features.affymetrix import skip_comments

In [2]:
# presumably opens the SMARTER database connection (see src.features.smarterdb);
# the result is bound to `_` so the notebook does not echo it as cell output
_ = global_connection()

Some accessory functions:

In [3]:
def find_duplicates(header):
    """Return the indexes of repeated column names in ``header``.

    The first occurrence of every name is kept; the index of each later
    occurrence is reported so that the caller can drop it. Indexes are
    grouped by column name (in order of first appearance) and are ascending
    within each group.
    """
    # map every column name to the positions where it appears
    positions = collections.defaultdict(list)
    for index, column in enumerate(header):
        positions[column].append(index)

    # names seen once contribute nothing; for the others skip the first hit
    return [
        index
        for indexes in positions.values()
        for index in indexes[1:]
    ]


def _search_in_header(header: list, term: str) -> list:
    """Return every record of ``header`` that starts with ``term``."""
    return [record for record in header if record.startswith(term)]


def _search_int_in_header(header: list, term: str) -> Optional[int]:
    """Parse the integer value of the first ``term`` record in ``header``.

    Args:
        header: the comment lines to search (e.g. ``"##snp-count=49636"``).
        term: the record prefix to look for, ``=`` sign included.

    Returns:
        The integer found after the ``=`` sign of the first matching record,
        or ``None`` when no record starts with ``term``.

    Raises:
        ValueError: if the matching record value is not an integer.
    """
    records = _search_in_header(header, term)

    if not records:
        return None

    # only the first matching record is considered
    return int(records[0].split("=")[1])


def search_n_samples(header: list) -> Optional[int]:
    """Return the ``##samples-per-snp=`` value of ``header``, if any.

    Returns ``None`` when the record is missing from ``header``.
    """
    return _search_int_in_header(header, "##samples-per-snp=")


def search_n_snp(header: list) -> Optional[int]:
    """Return the ``##snp-count=`` value of ``header``, if any.

    Returns ``None`` when the record is missing from ``header``.
    """
    return _search_int_in_header(header, "##snp-count=")


def read_affymetrixRow(path: Path, delimiter="\t"):
    """Yield the rows of an affymetrix report as ``Record`` namedtuples.

    The lines skipped by ``skip_comments`` are searched for the number of
    samples and SNPs; both values are added to every yielded record as the
    ``n_samples`` and ``n_snps`` fields (``None`` when not found). Column
    names are passed through ``sanitize`` and, when a name is repeated, only
    its first occurrence is kept (in the header and in every row).

    Args:
        path: the file to read, opened through ``text_or_gzip_open``.
        delimiter: the column separator handed to ``csv.reader``.

    Yields:
        One ``Record`` namedtuple per data row.
    """
    with text_or_gzip_open(path) as handle:
        position, skipped = skip_comments(handle)

        # rewind to the position returned by skip_comments (presumably the
        # beginning of the column header row -- TODO confirm)
        handle.seek(position)

        reader = csv.reader(handle, delimiter=delimiter)

        # the first row holds the column names: sanitize them
        columns = [sanitize(name) for name in next(reader)]

        # values declared in the skipped lines (None if missing)
        n_samples = search_n_samples(skipped)
        n_snps = search_n_snp(skipped)

        # these two extra fields are appended to every record
        columns.extend(["n_samples", "n_snps"])

        # drop repeated columns starting from the rightmost one, so that the
        # remaining indexes are still valid after each deletion
        drop = sorted(find_duplicates(columns), reverse=True)

        for position_to_drop in drop:
            del columns[position_to_drop]

        # define a namedtuple instance
        Record = collections.namedtuple("Record", columns)

        for row in reader:
            # same layout as the header: extra values first, then remove
            # the repeated columns
            row.extend([n_samples, n_snps])

            for position_to_drop in drop:
                del row[position_to_drop]

            yield Record._make(row)

## Placa_Junio_recommended
Let's start from `Placa_Junio_recommended.zip` datafile:

In [4]:
# fetch the Dataset document registered with this archive file name
placa_junio = Dataset.objects.get(file="Placa_Junio_recommended.zip")

In [5]:
# the file to read is the first entry of the dataset contents
path = placa_junio.working_dir / placa_junio.contents[0]
# collect the probeset id of every data row in the file
probeset_ids = [record.probeset_id for record in read_affymetrixRow(path)]

Skipping: ##batch-folder=X:\Resultados genotipados\ovinos INIA\2022\Analisis\OP925-969 1046-1085 1010-1020\Inia_Junio_RF_II
Skipping: ##annotation-file=C:\Users\Public\Documents\AxiomAnalysisSuite\Library\Axiom_Ovi_Can_ovine_Analysis.r5\Axiom_Ovi_Can.na35.r5.a6.annot.db
Skipping: ##export-txt-file=X:\Resultados genotipados\ovinos INIA\2022\Resultados\OP925-969 1046-1085 1010-1020\Placa_Junio_recommended.txt
Skipping: ##snp-count=49636
Skipping: ##samples-per-snp=96


Check if those probeset ids are in database:

In [6]:
# count the probeset ids which have no matching variant in the database
missing = 0

# NOTE: one database query is issued per probeset id
for probeset_id in tqdm(probeset_ids, total=len(probeset_ids)):
    # `probesets__match` presumably maps to the mongoengine `match`
    # operator: a single `probesets` element has to satisfy both conditions
    query = {
        "probesets__match": {
            'chip_name': "AffymetrixAxiomOviCan",
            'probeset_id': probeset_id
        }
    }
    
    # a falsy queryset is counted as a missing SNP
    if not VariantSheep.objects(**query):
        missing += 1
        
print(f"Missing {missing} SNPs of {len(probeset_ids)}")

  0%|          | 0/49636 [00:00<?, ?it/s]

Missing 1 SNPs of 49636


OK, from what I see, all the SNPs except one can be found in the database. What about the sample names?

In [7]:
# read only the first data row: every record carries the sample count
# declared in the file comments (the `n_samples` field)
record = next(read_affymetrixRow(path))
record.n_samples

Skipping: ##batch-folder=X:\Resultados genotipados\ovinos INIA\2022\Analisis\OP925-969 1046-1085 1010-1020\Inia_Junio_RF_II
Skipping: ##annotation-file=C:\Users\Public\Documents\AxiomAnalysisSuite\Library\Axiom_Ovi_Can_ovine_Analysis.r5\Axiom_Ovi_Can.na35.r5.a6.annot.db
Skipping: ##export-txt-file=X:\Resultados genotipados\ovinos INIA\2022\Resultados\OP925-969 1046-1085 1010-1020\Placa_Junio_recommended.txt
Skipping: ##snp-count=49636
Skipping: ##samples-per-snp=96


96

It seems that the data file contains more samples than the metadata does. Let's load the metadata file and check whether I can find those samples:

In [8]:
# fetch the Dataset document registered for the metadata archive
creole_metadata = Dataset.objects.get(file="20220809_105_Creole_Samples_INIA_Uruguay.zip")
# load the excel sheet stored in the dataset working directory
with open(creole_metadata.working_dir / "20220809_105_Creole_Samples_INIA_Uruguay.xlsx", "rb") as handle:
    info = pd.read_excel(handle)
# keep only the rows whose `File` column refers to the placa junio file
placa_junio_metadata = info[info["File"] == "Placa_Junio_recommended.txt"]
print(f"Got {placa_junio_metadata.shape[0]} samples")
placa_junio_metadata

Got 11 samples


Unnamed: 0,N,Lab_ID,Breed,Sex,Site,File
94,1,20220323182,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt
95,2,20220323183,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt
96,3,20220323184,Creole,Male,INIA Las Brujas,Placa_Junio_recommended.txt
97,4,20220323185,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt
98,5,20220323186,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt
99,6,20220323187,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt
100,7,20220323188,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt
101,8,20220323189,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt
102,9,20220323190,Creole,Male,INIA Las Brujas,Placa_Junio_recommended.txt
103,10,20220323191,Creole,Male,INIA Las Brujas,Placa_Junio_recommended.txt


Well, I have fewer *ids* in the metadata than samples in the data file. Get the *lab ids*:

In [9]:
# lab ids as strings, so they can be searched inside the column names
lab_ids = [str(id_) for id_ in placa_junio_metadata["Lab_ID"].tolist()]
lab_ids

['20220323182',
 '20220323183',
 '20220323184',
 '20220323185',
 '20220323186',
 '20220323187',
 '20220323188',
 '20220323189',
 '20220323190',
 '20220323191',
 '20220323192']

Even the sample names are different. I need to match the *file id* with the *lab id*:

In [10]:
# lab id -> column name of the data file; unknown lab ids resolve to None
lab2file_ids = collections.defaultdict(lambda: None)

for lab_id in lab_ids:
    for col in record._fields:
        # substring match: a column is associated with a lab id when its
        # name contains it; if several columns match, the last one wins
        if lab_id in col:
            lab2file_ids[lab_id] = col

Next, I need to track this column in my datatable:

In [11]:
# add the matching data file column for each Lab_ID (looked up as a string);
# lab ids without a matching column get None, the defaultdict default
info["id_column"] = info["Lab_ID"].apply(lambda lab_id: lab2file_ids[str(lab_id)])

Focus only on *placa junio* data:

In [12]:
# select the placa junio rows again, so that the selection includes the
# `id_column` just added to `info`
placa_junio_metadata = info[info["File"] == "Placa_Junio_recommended.txt"]
placa_junio_metadata

Unnamed: 0,N,Lab_ID,Breed,Sex,Site,File,id_column
94,1,20220323182,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt,op1010_20220323182_cel_call_code
95,2,20220323183,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt,op1011_20220323183_cel_call_code
96,3,20220323184,Creole,Male,INIA Las Brujas,Placa_Junio_recommended.txt,op1012_20220323184_cel_call_code
97,4,20220323185,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt,op1013_20220323185_cel_call_code
98,5,20220323186,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt,op1014_20220323186_cel_call_code
99,6,20220323187,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt,op1015_20220323187_cel_call_code
100,7,20220323188,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt,op1016_20220323188_cel_call_code
101,8,20220323189,Creole,Female,INIA Las Brujas,Placa_Junio_recommended.txt,op1017_20220323189_cel_call_code
102,9,20220323190,Creole,Male,INIA Las Brujas,Placa_Junio_recommended.txt,op1018_20220323190_cel_call_code
103,10,20220323191,Creole,Male,INIA Las Brujas,Placa_Junio_recommended.txt,op1019_20220323191_cel_call_code
