# Exploring Greek data
* [AUTH_OVN50KV2_CHI_FRI](#dataset0)
* [AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA](#dataset1)
* [AUTH_OVN50KV2_CHIOS_MYTILINI_BOUTSKO](#dataset2)
* [AUTH_OVN50KV2_CHI_BOU_MYT_FRI](#dataset4)
* [AUTH_GOAT53KV1_EGHORIA_SKOPELOS](#dataset3)

In [1]:
import os
import re
import csv
import zipfile
import shutil
from pathlib import Path

import pandas as pd
import numpy as np
from plinkio import plinkfile

from src.features.smarterdb import global_connection, Dataset
from src.features.utils import get_raw_dir, get_interim_dir

# NOTE(review): presumably sets up the database connection that the `Dataset`
# queries below rely on — confirm in src.features.smarterdb
_ = global_connection()

Read the full metadata file

In [2]:
# Locate the dataset record of the sheep phenotype archive
sheep_phenotype_dataset = Dataset.objects(file="greece_foreground_sheep.zip").get()

# Curated ("fix") metadata spreadsheet and the original one it derives from
metadata_file = (
    sheep_phenotype_dataset.working_dir
    / "greece_foreground_sheep/greece_foreground_metadata_fix.xlsx"
)
original_file = (
    sheep_phenotype_dataset.working_dir
    / "greece_foreground_sheep/greece_foreground_metadata.xlsx"
)

# Only the "sheep" sheet of the curated spreadsheet is loaded
sheep_dataset = pd.read_excel(metadata_file, sheet_name="sheep")

Read *Farm coding* column in a more useful way:

In [3]:
# Matches a range written as "<prefix><start>-<prefix><stop>": a non-digit
# prefix, digits, a dash, another non-digit prefix and digits.
pattern1 = re.compile(r'([\D]+)([\d]+)-([\D]+)([\d]+)')

def custom_split(val):
    """Parse a *Farm Coding* cell into a list that is easier to query.

    The cell is split on commas and every item is stripped. When the first
    item looks like a range (see ``pattern1``) the result is
    ``[code, (start, stop)]``: ``code`` is the prefix of the lower bound and
    ``start``/``stop`` are the bounds, still as strings. Items after the first
    one are discarded in that case. Otherwise the list of stripped items is
    returned as is.

    Args:
        val: the raw cell content (a string).

    Returns:
        Either ``[code, (start, stop)]`` or the list of stripped items.
    """
    items = [el.strip() for el in val.split(",")]
    match = pattern1.search(items[0])
    if match is None:
        return items

    # pattern1 always yields four groups; the prefix of the upper bound is not used
    code, start, _, stop = match.groups()
    return [code, (start, stop)]

# Keep the parsed representation next to the raw "Farm Coding" column
sheep_dataset["parsed_coding"] = sheep_dataset["Farm Coding"].apply(custom_split)

Get the Greek foreground datasets for *Sheep*:

In [4]:
# Greek sheep datasets tagged both "foreground" and "genotypes".
# NOTE(review): `type___all` reads as the `all` operator applied to a `type_`
# field — confirm against the Dataset model.
datasets = Dataset.objects.filter(country="Greece", type___all=["foreground", "genotypes"], species="Sheep")

Prepare a phenotype archive to put all metadata information (for Sheep):

In [5]:
# Move into the interim directory so the archive, opened by a relative name, is created there
os.chdir(get_interim_dir())
# "w" mode starts a new archive; the handle stays open across the next cells
# until it is explicitly closed
phenotype_file = zipfile.ZipFile("greece_foreground_sheep.zip", "w")

Append the fixed and original metadata to the archive

In [6]:
# Store both spreadsheets under the greece_foreground_sheep/ folder of the archive
phenotype_file.write(metadata_file, arcname="greece_foreground_sheep/greece_foreground_metadata_fix.xlsx")
phenotype_file.write(original_file, arcname="greece_foreground_sheep/greece_foreground_metadata.xlsx")

<a id='dataset0'></a>
## AUTH_OVN50KV2_CHI_FRI
Let's start from the first dataset:

In [7]:
# Take the first dataset returned by the query and display it
dataset = datasets[0]
print(dataset)

file=AUTH_OVN50KV2_CHIOS_FRIZARTA.zip, uploader=AUTH


Open plink file

In [8]:
# Open the PLINK file set through its extension-less path (passed as str)
plink_file = plinkfile.open( str(dataset.working_dir / "AUTH_OVN50KV2_CHIOS_FRIZARTA/AUTH_OVN50KV2_CHI_FRI") )

Read the sample names and their breed codes from the *ped* file, then create a *dataframe*:

In [9]:
# One (fid, iid) pair per sample found in the PLINK file
samples = [
    (entry.fid, entry.iid)
    for entry in plink_file.get_samples()
]

In [10]:
# The fid is used as the breed code and the iid as the sample name
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

Now create a function able to retrieve a metadata row from the sample name:

In [11]:
# Splits a sample name into a non-digit prefix (the farm coding) and the
# digits that follow it.
pattern2 = re.compile(r'([\D]+)([\d]+)')

def get_metadata_row(breed_code, sample_name, df=sheep_dataset):
    """Return the index label of the metadata row describing a sample.

    The sample name is split into a farm-coding prefix and a number. For the
    breeds coded by numeric range (``BOU``) a row is a candidate when the
    number falls inside the ``(start, stop)`` range stored in its
    ``parsed_coding``; only the number is compared, not the prefix. For every
    other breed a row is a candidate when the prefix is one of the items in
    its ``parsed_coding``. The first candidate whose ``Code`` equals
    ``breed_code`` wins.

    Side effect: the boolean column ``df["selected"]`` is overwritten on
    every call.

    Args:
        breed_code: breed code of the sample, compared with ``df["Code"]``.
        sample_name: sample identifier, e.g. a prefix followed by digits.
        df: metadata table with ``Code`` and ``parsed_coding`` columns.

    Returns:
        The index label of the first matching row of ``df``.

    Raises:
        ValueError: if ``sample_name`` is not a prefix followed by digits.
        IndexError: if no metadata row matches the sample.
    """
    match = pattern2.search(sample_name)
    if match is None:
        # fail with a readable message instead of an AttributeError on None
        raise ValueError(
            f"cannot parse sample name {sample_name!r}: "
            "expected a non-digit prefix followed by digits"
        )
    farm_coding, number = match.groups()

    if breed_code in ['BOU']:
        number = int(number)

        def in_between(val):
            # only entries parsed as [code, (start, stop)] describe a range
            if len(val) != 2 or not isinstance(val[1], tuple):
                return False
            start, stop = int(val[1][0]), int(val[1][1])
            return start <= number <= stop

        df["selected"] = df["parsed_coding"].apply(in_between)

    else:
        df["selected"] = df["parsed_coding"].apply(lambda val: farm_coding in val)

    candidates = df[(df['Code'] == breed_code) & df['selected']].index.values

    try:
        return candidates[0]
    except IndexError:
        # show which sample could not be matched, then let the error propagate
        print(breed_code, sample_name, farm_coding)
        raise

Apply this function to the samples dataframe and create a new column with the index of the metadata row:

In [12]:
# Look up, sample by sample, the index of the matching metadata row
samples["metadata_idx"] = samples[['breed_code', 'sample_name']].apply(
    lambda row: get_metadata_row(row['breed_code'], row['sample_name']),
    axis=1,
)

Now drop unused columns and save a metadata file in *working directory*:

In [13]:
# Attach the matching metadata row to every sample, then discard the helper columns
merged = samples.join(sheep_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# The spreadsheet is named after the dataset archive and written to the interim directory
outfile = f"{Path(dataset.file).stem}.xlsx"
outpath = get_interim_dir() / outfile
merged.to_excel(str(outpath), index=False)

Add this file to phenotype archive:

In [14]:
# Archive the spreadsheet through the same path it was written to, so this
# step does not depend on the current working directory; then remove the
# temporary file.
phenotype_file.write(str(outpath), arcname=f"greece_foreground_sheep/{outfile}")
outpath.unlink()

<a id='dataset1'></a>
## AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA

Same stuff for the 2nd dataset:

In [15]:
# Take the second dataset returned by the query and display it
dataset = datasets[1]
print(dataset)

file=AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA.zip, uploader=AUTH


In [16]:
# Open the PLINK file set through its extension-less path (passed as str)
plink_file = plinkfile.open( str(dataset.working_dir / "AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA/AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA") )

In [17]:
# One (fid, iid) pair per sample found in the PLINK file
samples = [
    (entry.fid, entry.iid)
    for entry in plink_file.get_samples()
]

In [18]:
# The fid is used as the breed code and the iid as the sample name
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

In [19]:
# Look up, sample by sample, the index of the matching metadata row
samples["metadata_idx"] = samples[['breed_code', 'sample_name']].apply(
    lambda row: get_metadata_row(row['breed_code'], row['sample_name']),
    axis=1,
)

In [20]:
# Attach the matching metadata row to every sample, then discard the helper columns
merged = samples.join(sheep_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# The spreadsheet is named after the dataset archive and written to the interim directory
outfile = f"{Path(dataset.file).stem}.xlsx"
outpath = get_interim_dir() / outfile
merged.to_excel(str(outpath), index=False)

Add this file to phenotype archive:

In [21]:
# Archive the spreadsheet through the same path it was written to, so this
# step does not depend on the current working directory; then remove the
# temporary file.
phenotype_file.write(str(outpath), arcname=f"greece_foreground_sheep/{outfile}")
outpath.unlink()

<a id='dataset2'></a>
## AUTH_OVN50KV2_CHIOS_MYTILINI_BOUTSKO

And now the third dataset:

In [22]:
# Take the third dataset returned by the query and display it
dataset = datasets[2]
print(dataset)

file=AUTH_OVN50KV2_CHIOS_MYTILINI_BOUTSKO.zip, uploader=AUTH


This dataset has SNPs on extra chromosomes. Open it with the custom version of `plinkio`:

In [23]:
# Open the PLINK file set through its extension-less path (passed as str)
plink_file = plinkfile.open( str(dataset.working_dir / "AUTH_OVN50KV2_CHIOS_MYTILINI_BOUTSKO/AUTH_OVN50KV02_CHI_MYT_BOU") )

In [24]:
# One (fid, iid) pair per sample found in the PLINK file
samples = [
    (entry.fid, entry.iid)
    for entry in plink_file.get_samples()
]

In [25]:
# The fid is used as the breed code and the iid as the sample name
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

In [26]:
# Look up, sample by sample, the index of the matching metadata row
samples["metadata_idx"] = samples[['breed_code', 'sample_name']].apply(
    lambda row: get_metadata_row(row['breed_code'], row['sample_name']),
    axis=1,
)

In [27]:
# Attach the matching metadata row to every sample, then discard the helper columns
merged = samples.join(sheep_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# The spreadsheet is named after the dataset archive and written to the interim directory
outfile = f"{Path(dataset.file).stem}.xlsx"
outpath = get_interim_dir() / outfile
merged.to_excel(str(outpath), index=False)

Add this file to phenotype archive:

In [28]:
# Archive the spreadsheet through the same path it was written to, so this
# step does not depend on the current working directory; then remove the
# temporary file.
phenotype_file.write(str(outpath), arcname=f"greece_foreground_sheep/{outfile}")
outpath.unlink()

<a id='dataset4'></a>
## AUTH_OVN50KV2_CHI_BOU_MYT_FRI

In [29]:
# Take the fourth dataset returned by the query and display it
dataset = datasets[3]
print(dataset)

file=AUTH_OVN50KV2_CHI_BOU_MYT_FRI.zip, uploader=AUTH


In [30]:
# Open the PLINK file set through its extension-less path (passed as str)
plink_file = plinkfile.open( str(dataset.working_dir / "AUTH_OVN50KV2_CHI_BOU_MYT_FRI/Aristotle_University_OVN50KV02_20211108") )

In [31]:
# One (fid, iid) pair per sample found in the PLINK file
samples = [
    (entry.fid, entry.iid)
    for entry in plink_file.get_samples()
]

In [32]:
# The fid is used as the breed code and the iid as the sample name
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

In [33]:
# Look up, sample by sample, the index of the matching metadata row
samples["metadata_idx"] = samples[['breed_code', 'sample_name']].apply(
    lambda row: get_metadata_row(row['breed_code'], row['sample_name']),
    axis=1,
)

In [34]:
# Attach the matching metadata row to every sample, then discard the helper columns
merged = samples.join(sheep_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# The spreadsheet is named after the dataset archive and written to the interim directory
outfile = f"{Path(dataset.file).stem}.xlsx"
outpath = get_interim_dir() / outfile
merged.to_excel(str(outpath), index=False)

In [35]:
# Archive the spreadsheet through the same path it was written to, so this
# step does not depend on the current working directory; then remove the
# temporary file.
phenotype_file.write(str(outpath), arcname=f"greece_foreground_sheep/{outfile}")
outpath.unlink()

Closing archive file for sheep

In [36]:
# Closing finalises the sheep archive on disk
phenotype_file.close()

<a id='dataset3'></a>
## AUTH_GOAT53KV1_EGHORIA_SKOPELOS
Goats belong to a different dataset:

In [37]:
# `get` is used instead of `filter`: a single Greek goat genotype dataset is expected here
dataset = Dataset.objects.get(country="Greece", type___all=["foreground", "genotypes"], species="Goat")

Read goat metadata:

In [38]:
# Locate the dataset record of the goat phenotype archive
goat_phenotype_dataset = Dataset.objects(file="greece_foreground_goat.zip").get()

# Curated ("fix") metadata spreadsheet and the original one it derives from
metadata_file = (
    goat_phenotype_dataset.working_dir
    / "greece_foreground_goat/greece_foreground_metadata_fix.xlsx"
)
original_file = (
    goat_phenotype_dataset.working_dir
    / "greece_foreground_goat/greece_foreground_metadata.xlsx"
)

# Only the "goat" sheet of the curated spreadsheet is loaded
goat_dataset = pd.read_excel(metadata_file, sheet_name="goat")

In [39]:
# Keep the parsed representation next to the raw "Farm Coding" column
goat_dataset["parsed_coding"] = goat_dataset["Farm Coding"].apply(custom_split)

Prepare an archive for goat:

In [40]:
# Move into the interim directory so the archive, opened by a relative name, is created there
os.chdir(get_interim_dir())
# "w" mode starts a new archive; the handle stays open across the next cells
# until it is explicitly closed
phenotype_file = zipfile.ZipFile("greece_foreground_goat.zip", "w")

In [41]:
# Open the PLINK file set through its extension-less path (passed as str)
plink_file = plinkfile.open( str(dataset.working_dir / "AUTH_GOAT53KV1_EGHORIA_SKOPELOS/AUTH_GOAT53KV1_SKOPELOS-EGHORIA") )

In [42]:
# One (fid, iid) pair per sample found in the PLINK file
samples = [
    (entry.fid, entry.iid)
    for entry in plink_file.get_samples()
]

In [43]:
# The fid is used as the breed code and the iid as the sample name
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

In [44]:
# Splits a sample name into a non-digit prefix (the farm coding) and the
# digits that follow it.
pattern2 = re.compile(r'([\D]+)([\d]+)')

def get_metadata_row(breed_code, sample_name, df=goat_dataset):
    """Return the index label of the metadata row describing a sample.

    The sample name is split into a farm-coding prefix and a number. For the
    breeds coded by numeric range (``BOU``, ``EGH``, ``SKO``) a row is a
    candidate when the number falls inside the ``(start, stop)`` range stored
    in its ``parsed_coding``; only the number is compared, not the prefix.
    For every other breed a row is a candidate when the prefix is one of the
    items in its ``parsed_coding``. The first candidate whose ``Code`` equals
    ``breed_code`` wins.

    Side effect: the boolean column ``df["selected"]`` is overwritten on
    every call.

    Args:
        breed_code: breed code of the sample, compared with ``df["Code"]``.
        sample_name: sample identifier, e.g. a prefix followed by digits.
        df: metadata table with ``Code`` and ``parsed_coding`` columns.

    Returns:
        The index label of the first matching row of ``df``.

    Raises:
        ValueError: if ``sample_name`` is not a prefix followed by digits.
        IndexError: if no metadata row matches the sample.
    """
    match = pattern2.search(sample_name)
    if match is None:
        # fail with a readable message instead of an AttributeError on None
        raise ValueError(
            f"cannot parse sample name {sample_name!r}: "
            "expected a non-digit prefix followed by digits"
        )
    farm_coding, number = match.groups()

    if breed_code in ['BOU', 'EGH', 'SKO']:
        number = int(number)

        def in_between(val):
            # only entries parsed as [code, (start, stop)] describe a range
            if len(val) != 2 or not isinstance(val[1], tuple):
                return False
            start, stop = int(val[1][0]), int(val[1][1])
            return start <= number <= stop

        df["selected"] = df["parsed_coding"].apply(in_between)

    else:
        df["selected"] = df["parsed_coding"].apply(lambda val: farm_coding in val)

    candidates = df[(df['Code'] == breed_code) & df['selected']].index.values

    try:
        return candidates[0]
    except IndexError:
        # show which sample could not be matched, then let the error propagate
        print(breed_code, sample_name, farm_coding)
        raise

In [45]:
# Look up, sample by sample, the index of the matching metadata row
samples["metadata_idx"] = samples[['breed_code', 'sample_name']].apply(
    lambda row: get_metadata_row(row['breed_code'], row['sample_name']),
    axis=1,
)

In [46]:
# Attach the matching metadata row to every sample, then discard the helper columns
merged = samples.join(goat_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# The spreadsheet is named after the dataset archive and written to the interim directory
outfile = f"{Path(dataset.file).stem}.xlsx"
outpath = get_interim_dir() / outfile
merged.to_excel(str(outpath), index=False)

Add this file to phenotype archive:

In [47]:
# Archive the spreadsheet through the same path it was written to, so this
# step does not depend on the current working directory; then remove the
# temporary file.
phenotype_file.write(str(outpath), arcname=f"greece_foreground_goat/{outfile}")
outpath.unlink()

Append the fixed and original metadata to the archive

In [48]:
# Store both spreadsheets under the greece_foreground_goat/ folder of the archive
phenotype_file.write(metadata_file, arcname="greece_foreground_goat/greece_foreground_metadata_fix.xlsx")
phenotype_file.write(original_file, arcname="greece_foreground_goat/greece_foreground_metadata.xlsx")

Closing archive file

In [49]:
# Closing finalises the goat archive on disk
phenotype_file.close()