# Uruguay Second Upload (Foreground)
Pablo sent new data which were created for other WPs

In [1]:
import csv
import collections
import pandas as pd

from pathlib import Path
from tqdm.notebook import tqdm

from src.features.smarterdb import global_connection, Dataset, VariantSheep
from src.features.utils import skip_comments, text_or_gzip_open, get_interim_dir
from src.features.affymetrix import read_affymetrixRow
from src.features.plinkio import AffyReportIO

In [2]:
def get_header(report):
    """Return the first header row of a report file as a list of strings.

    Sample names are sanitized when the report is parsed through
    ``read_affymetrixRow``, so the original sample names have to be read
    straight from the header row of the file.

    Args:
        report: path of the report file, opened with ``text_or_gzip_open``.

    Returns:
        list: the fields of the first tab-separated row found at the
        position returned by ``skip_comments``.
    """
    with text_or_gzip_open(report) as handle:
        # skip_comments gives back the position of the header section (and
        # the skipped content, which is not needed here)
        header_start, _skipped = skip_comments(handle)

        # rewind to the beginning of the header section
        handle.seek(header_start)

        # the header is the first row of the tab-delimited table
        rows = csv.reader(handle, delimiter="\t")
        return next(rows)

In [3]:
# Call global_connection() once, before any Dataset / VariantSheep query below.
# Presumably this opens the connection to the SMARTER database (confirm in
# src.features.smarterdb); its return value is not used, hence the `_` name.
_ = global_connection()

## Inia_junio_2021_Texel_46_20210409_SMARTER
Let's start with `Inia_junio_2021_Texel_46_20210409_SMARTER.zip`

In [4]:
# Fetch the Dataset document whose `file` field matches this archive name
inia_20210409 = Dataset.objects.get(file="Inia_junio_2021_Texel_46_20210409_SMARTER.zip")

There's only one file in the dataset, the one with the data:

In [5]:
# The report is the first entry of the dataset contents, located under the
# dataset working directory
path = inia_20210409.working_dir / inia_20210409.contents[0]
# Collect the probeset_id of every record yielded by read_affymetrixRow
probeset_ids = [record.probeset_id for record in read_affymetrixRow(path)]

Check if those probeset ids are in the database:

In [6]:
# Count the probeset ids of the report for which no VariantSheep document has
# a `probesets` entry matching both the dataset chip name and the probeset id
chip_name = inia_20210409.chip_name
missing = 0

for probeset_id in tqdm(probeset_ids, total=len(probeset_ids)):
    query = {"probesets__match": dict(chip_name=chip_name, probeset_id=probeset_id)}
    matches = VariantSheep.objects(**query)

    # an empty queryset is falsy: nothing in the database for this probeset
    missing += 0 if matches else 1

print(f"Missing {missing} SNPs of {len(probeset_ids)}")

  0%|          | 0/40204 [00:00<?, ?it/s]

Missing 0 SNPs of 40204


I have all the SNPs in my database. Check for sample names:

In [7]:
# Compare three sample counts: the one declared by the first report record,
# the one registered on the dataset, and the one found in the file header
record = next(read_affymetrixRow(path))
print(f"{record.n_samples} reported in file")
print(f"dataset has {inia_20210409.n_of_individuals} samples")

# keep only the header columns whose name contains 'cel_call_code'
samples = [column for column in get_header(path) if 'cel_call_code' in column]
print(f"I could find only {len(samples)} samples in report file")

81 reported in file
dataset has 46 samples
I could find only 38 samples in report file


Well, even though the report states that there are `81` samples, the dataset has `46` samples and I could find only `38` samples in the file. Try to force reading the report with a custom number of samples:

In [8]:
# The sample count declared in the report does not agree with the number of
# sample columns found in its header, so pass the header-derived count
# explicitly when reading the report file
report = AffyReportIO(report=path)
report.read_reportfile(n_samples=len(samples))

Ok, try to get the metadata and understand which samples are missing:

In [9]:
# The metadata are stored in a separate dataset: fetch it by archive name and
# display its contents
metadata_dataset = Dataset.objects.get(file="INIA_other_WPs_metadata.zip")
metadata_dataset.contents

['20210409_Genexa.xlsx',
 '20210824_Genexa.xlsx',
 '20211110_Genexa.xlsx',
 '20220301_Genexa.xlsx',
 '20220323_Genexa.xlsx',
 '20220810_Genexa.xlsx']

In [10]:
# Pick the spreadsheet whose name carries the same date (20210409) as the
# genotype dataset
metadata_path = metadata_dataset.working_dir / "20210409_Genexa.xlsx"
# Open the workbook in binary mode and let pandas parse it into a DataFrame
with open(metadata_path, "rb") as handle:
    inia_20210409_metadata = pd.read_excel(handle)
# Summarise columns, dtypes and non-null counts
inia_20210409_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   N                46 non-null     int64 
 1   ID               46 non-null     int64 
 2   Breed            46 non-null     object
 3   Sex              46 non-null     object
 4   Stall            46 non-null     object
 5   GPS_Coordinates  46 non-null     object
 6   GPS_2            46 non-null     object
dtypes: int64(2), object(5)
memory usage: 2.6+ KB


In [11]:
# The ID column holds integers: turn it into a list of strings so that each id
# can be searched inside the report column names
ids = inia_20210409_metadata["ID"].astype(str).tolist()

Even the sample names are different. I need to match the *file id* with the *lab id*:

In [12]:
# Despite its name, this maps each metadata id (as a string) to the report
# column whose name contains it. Ids without a matching column are not stored,
# and the defaultdict returns None when they are looked up.
name2id = collections.defaultdict(lambda: None)

for sample_id in ids:
    matching_columns = [column for column in samples if sample_id in column]

    if matching_columns:
        # when several columns contain the id, the last one wins
        name2id[sample_id] = matching_columns[-1]

Next, I need to track this column in my datatable:

In [13]:
# Look up every ID (converted to string, like the name2id keys) in name2id;
# name2id yields None for ids without a matching column, so those rows get a
# null alias
inia_20210409_metadata["alias"] = inia_20210409_metadata["ID"].apply(lambda id_: name2id[str(id_)])

Now try to get rows with missing alias:

In [14]:
# Display the rows whose alias is null, leaving out the two GPS columns
inia_20210409_metadata[inia_20210409_metadata["alias"].isnull()].drop(["GPS_2", "GPS_Coordinates"], axis=1)

Unnamed: 0,N,ID,Breed,Sex,Stall,alias
1,9,20210409009,Texel,Male,INIA Las Brujas,
2,10,20210409010,Texel,Male,INIA Las Brujas,
14,65,20210409065,Texel,Male,INIA Las Brujas,
22,73,20210409073,Texel,Male,INIA Las Brujas,
23,74,20210409074,Texel,Male,INIA Las Brujas,
30,81,20210409081,Texel,Female,INIA Las Brujas,
38,89,20210409089,Texel,Male,INIA Las Brujas,
39,90,20210409090,Texel,Female,INIA Las Brujas,


Ok, there are some missing samples. Try to split the coordinates column so that it can be imported into the database:

In [15]:
# GPS_Coordinates holds two comma-separated values: split each string once,
# then convert the first field to latitude and the second to longitude
gps_fields = inia_20210409_metadata["GPS_Coordinates"].apply(lambda text: text.split(","))
inia_20210409_metadata["latitude"] = gps_fields.apply(lambda fields: float(fields[0].strip()))
inia_20210409_metadata["longitude"] = gps_fields.apply(lambda fields: float(fields[1].strip()))

Ok, write them into a file:

In [16]:
# Save the table, including the alias, latitude and longitude columns added
# above, without the index column. The path is relative, so the file is
# written to the current working directory.
inia_20210409_metadata.to_excel("20210409_Genexa_fix.xlsx", index=False)