# Exploring Greek data
* [AUTH_OVN50KV2_CHI_FRI](#dataset0)
* [AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA](#dataset1)
* [AUTH_OVN50KV2_CHIOS_MYTILINI_BOUTSKO](#dataset2)
* [AUTH_GOAT53KV1_EGHORIA_SKOPELOS](#dataset3)

In [1]:
import re
import csv
from pathlib import Path

import pandas as pd
import numpy as np
from plinkio import plinkfile

from src.features.smarterdb import global_connection, Dataset
from src.features.utils import get_raw_dir

# Call the project's connection helper before any `Dataset` query is issued
# (presumably it opens the database connection those queries need — confirm).
# The return value is not used, hence the throwaway name.
_ = global_connection()

Read the full metadata file

In [2]:
# Metadata spreadsheet, looked up inside the directory returned by get_raw_dir().
metadata_file = get_raw_dir() / "greece_foreground_metadata_fix.xlsx"
# Load only the "sheep" sheet here; the "goat" sheet is read further down.
sheep_dataset = pd.read_excel(metadata_file, sheet_name="sheep")

Read *Farm coding* column in a more useful way:

In [3]:
pattern1 = re.compile(r'([\D]+)([\d]+)-([\D]+)([\d]+)')


def custom_split(val):
    """Parse one *Farm Coding* cell into a list that is easy to match against.

    The cell is split on commas and every item is stripped. When the first
    item looks like a range (``<code><start>-<code><stop>``, as described by
    ``pattern1``) the result is ``[code, (start, stop)]``, where ``start``
    and ``stop`` are kept as strings and any item after the first is
    discarded. Otherwise the list of stripped items is returned as is.

    Args:
        val: the raw cell content. Anything that is not a string (for
            example the NaN pandas uses for an empty cell) is treated as
            "no farm code".

    Returns:
        list: ``[code, (start, stop)]`` for a range, the stripped items for
        a plain comma separated value, ``[]`` for a non-string cell.
    """
    # empty spreadsheet cells arrive as float NaN, which has no .split()
    if not isinstance(val, str):
        return []

    items = [el.strip() for el in val.split(",")]
    match = pattern1.search(items[0])

    if match is None:
        return items

    # the pattern has exactly four groups, so this unpacking cannot fail;
    # the code repeated before the stop number is not needed
    code, start, _, stop = match.groups()
    return [code, (start, stop)]

# Keep the parsed farm codes next to the raw "Farm Coding" column; the
# matching function defined further down reads "parsed_coding" row by row.
sheep_dataset["parsed_coding"] = sheep_dataset["Farm Coding"].apply(custom_split)

Get the Greek foreground datasets for *Sheep*:

In [4]:
# Greek foreground sheep datasets. No explicit ordering is requested here,
# although the cells below pick datasets from this result by position.
datasets = Dataset.objects.filter(country="Greece", type_="foreground", species="Sheep")

<a id='dataset0'></a>
## AUTH_OVN50KV2_CHI_FRI
OK, let's start from the first dataset I have:

In [5]:
# NOTE(review): positional access assumes the query returns the datasets in
# the same order as the section headings — confirm.
dataset = datasets[0]

Open plink file

In [6]:
# The path is given without a file extension, relative to the dataset working directory.
# NOTE(review): no close() call for this handle is visible in the notebook.
plink_file = plinkfile.open( str(dataset.working_dir / "AUTH_OVN50KV2_CHIOS_FRIZARTA/AUTH_OVN50KV2_CHI_FRI") )

Read the sample names and their breed codes from *ped*, then create a *dataframe*:

In [7]:
# One (fid, iid) tuple per sample; the next cell labels them as breed code
# and sample name respectively.
samples = [(sample.fid, sample.iid) for sample in plink_file.get_samples()]

In [8]:
# Replace the list of tuples with a two-column dataframe (same variable name).
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

Now create a function able to retrieve a metadata row relying on the sample name:

In [9]:
pattern2 = re.compile(r'([\D]+)([\d]+)')


def get_metadata_row(breed_code, sample_name, df=sheep_dataset):
    """Return the index label of the metadata row matching a sample, or -1.

    ``sample_name`` is searched with ``pattern2`` to obtain a non-digit
    prefix (the farm code) and the digits following it. For the ``BOU``
    breed a row is selected when that number lies inside the inclusive
    ``(start, stop)`` range stored in its ``parsed_coding``; for any other
    breed a row is selected when the farm code is one of the items of
    ``parsed_coding``. The first selected row whose ``Code`` equals
    ``breed_code`` is returned.

    Side effect: the ``selected`` column of ``df`` is overwritten on every
    call. The column is kept because the cells that join ``df`` to the
    samples drop it by name afterwards.

    Args:
        breed_code: breed code of the sample, compared with ``df['Code']``.
        sample_name: sample identifier, e.g. letters followed by digits.
        df: metadata dataframe with ``Code`` and ``parsed_coding`` columns.

    Returns:
        The index label of the matching row, or ``-1`` when no row matches
        (the offending sample is printed). ``-1`` keeps the return type
        uniform for ``np.vectorize`` instead of falling through to ``None``.
    """
    match = pattern2.search(sample_name)

    if match is None:
        # the name holds no "<non-digits><digits>" part: nothing can match
        df["selected"] = False
        print(breed_code, sample_name, None)
        return -1

    farm_coding, digits = match.groups()

    if breed_code in ('BOU',):
        number = int(digits)

        def in_between(val):
            # only entries parsed as [code, (start, stop)] describe a range
            if len(val) == 2 and isinstance(val[1], tuple):
                start, stop = int(val[1][0]), int(val[1][1])
                return start <= number <= stop

            return False

        df["selected"] = df["parsed_coding"].apply(in_between)

    else:
        df["selected"] = df["parsed_coding"].apply(lambda val: farm_coding in val)

    candidates = df[(df['Code'] == breed_code) & df['selected'].eq(True)].index.values

    if candidates.size == 0:
        print(breed_code, sample_name, farm_coding)
        return -1

    return candidates[0]

Apply this function to the samples dataframe and create a new column with the metadata row index:

In [10]:
# np.vectorize calls get_metadata_row once per (breed_code, sample_name) pair;
# it is a convenience wrapper around a Python loop, not a speed-up.
samples["metadata_idx"] = np.vectorize(get_metadata_row)(samples['breed_code'], samples['sample_name'])

Now drop the unused columns and save a metadata file in the *working directory*:

In [11]:
# Attach the metadata row found for each sample, then remove in one step the
# three helper columns that were only needed to perform the match.
merged = samples.join(sheep_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# The spreadsheet goes into the dataset working directory and is named after
# the stem of `dataset.file`.
outfile = f"{dataset.working_dir / Path(dataset.file).stem!s}.xlsx"
merged.to_excel(outfile, index=False)

<a id='dataset1'></a>
## AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA

Same stuff for the 2nd dataset:

In [12]:
# NOTE(review): second item of the sheep query; same ordering assumption as
# for the first dataset — confirm.
dataset = datasets[1]

In [13]:
# Rebinds plink_file to the second fileset; the path again carries no extension.
plink_file = plinkfile.open( str(dataset.working_dir / "AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA/AUTH_OVN50KV2_CHIOS_FRIZARTA_PELAGONIA") )

In [14]:
# One (fid, iid) tuple per sample of the second fileset.
samples = [(sample.fid, sample.iid) for sample in plink_file.get_samples()]

In [15]:
# Same two-column layout as for the first dataset.
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

In [16]:
# Look up the metadata row of every sample, one get_metadata_row call per row.
samples["metadata_idx"] = np.vectorize(get_metadata_row)(samples['breed_code'], samples['sample_name'])

In [17]:
# Join each sample with its metadata row and strip the helper columns that
# were only used for the lookup.
merged = samples.join(sheep_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# Output file: working directory + stem of `dataset.file` + ".xlsx".
outfile = f"{dataset.working_dir / Path(dataset.file).stem!s}.xlsx"
merged.to_excel(outfile, index=False)

<a id='dataset2'></a>
## AUTH_OVN50KV2_CHIOS_MYTILINI_BOUTSKO

And now the last sheep dataset:

In [18]:
# NOTE(review): third item of the sheep query; relies on the query order
# matching the section headings — confirm.
dataset = datasets[2]

This dataset has SNPs with extra chroms. So, the only way to get a sample list is to parse the first two columns of `.fam` file:

In [19]:
fam_file = dataset.working_dir / "AUTH_OVN50KV2_CHIOS_MYTILINI_BOUTSKO/AUTH_OVN50KV02_CHI_MYT_BOU.fam"

# The csv module asks for files to be opened with newline="" so that it can
# handle line endings itself.
with open(fam_file, newline="") as handle:
    reader = csv.reader(handle, delimiter="\t")
    # keep the first two tab-separated fields of every record; the reader
    # yields an empty list for a blank line, which is skipped
    samples = [[record[0], record[1]] for record in reader if record]

In [20]:
# The two fields read from the .fam file get the same column names used for
# the other datasets.
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

In [21]:
# Look up the metadata row of every sample, one get_metadata_row call per row.
samples["metadata_idx"] = np.vectorize(get_metadata_row)(samples['breed_code'], samples['sample_name'])

In [22]:
# Bring in the metadata of each sample and discard, in a single call, the
# helper columns created for the lookup.
merged = samples.join(sheep_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# Write the result next to the data, named after the stem of `dataset.file`.
outfile = f"{dataset.working_dir / Path(dataset.file).stem!s}.xlsx"
merged.to_excel(outfile, index=False)

<a id='dataset3'></a>
## AUTH_GOAT53KV1_EGHORIA_SKOPELOS
Goats belong to a different dataset:

In [23]:
# `get` instead of `filter`: a single Greek foreground goat dataset is
# expected (presumably `get` fails when there is none or more than one — confirm).
dataset = Dataset.objects.get(country="Greece", type_="foreground", species="Goat")

Read goat metadata:

In [24]:
# Same spreadsheet as for sheep, this time reading the "goat" sheet.
goat_dataset = pd.read_excel(metadata_file, sheet_name="goat")

In [25]:
# Parse "Farm Coding" with the same helper used for the sheep metadata.
goat_dataset["parsed_coding"] = goat_dataset["Farm Coding"].apply(custom_split)

In [26]:
# Rebinds plink_file to the goat fileset; the path again carries no extension.
plink_file = plinkfile.open( str(dataset.working_dir / "AUTH_GOAT53KV1_EGHORIA_SKOPELOS/AUTH_GOAT53KV1_SKOPELOS-EGHORIA") )

In [27]:
# One (fid, iid) tuple per goat sample.
samples = [(sample.fid, sample.iid) for sample in plink_file.get_samples()]

In [28]:
# Same two-column layout used for the sheep samples.
samples = pd.DataFrame(data=samples, columns=["breed_code", "sample_name"])

In [29]:
pattern2 = re.compile(r'([\D]+)([\d]+)')


def get_metadata_row(breed_code, sample_name, df=goat_dataset):
    """Return the index label of the goat metadata row matching a sample, or -1.

    This redefinition replaces the sheep version: it defaults to the goat
    metadata and uses the range lookup for the ``BOU``, ``EGH`` and ``SKO``
    breeds.

    ``sample_name`` is searched with ``pattern2`` to obtain a non-digit
    prefix (the farm code) and the digits following it. For the breeds
    listed above a row is selected when that number lies inside the
    inclusive ``(start, stop)`` range stored in its ``parsed_coding``; for
    any other breed a row is selected when the farm code is one of the items
    of ``parsed_coding``. The first selected row whose ``Code`` equals
    ``breed_code`` is returned.

    Side effect: the ``selected`` column of ``df`` is overwritten on every
    call. The column is kept because the cell that joins ``df`` to the
    samples drops it by name afterwards.

    Args:
        breed_code: breed code of the sample, compared with ``df['Code']``.
        sample_name: sample identifier, e.g. letters followed by digits.
        df: metadata dataframe with ``Code`` and ``parsed_coding`` columns.

    Returns:
        The index label of the matching row, or ``-1`` when no row matches.
        Misses are silent here.
    """
    match = pattern2.search(sample_name)

    if match is None:
        # the name holds no "<non-digits><digits>" part: nothing can match
        df["selected"] = False
        return -1

    farm_coding, digits = match.groups()

    if breed_code in ('BOU', 'EGH', 'SKO'):
        number = int(digits)

        def in_between(val):
            # only entries parsed as [code, (start, stop)] describe a range
            if len(val) == 2 and isinstance(val[1], tuple):
                start, stop = int(val[1][0]), int(val[1][1])
                return start <= number <= stop

            return False

        df["selected"] = df["parsed_coding"].apply(in_between)

    else:
        df["selected"] = df["parsed_coding"].apply(lambda val: farm_coding in val)

    candidates = df[(df['Code'] == breed_code) & df['selected'].eq(True)].index.values

    if candidates.size == 0:
        return -1

    return candidates[0]

In [30]:
# Uses the goat version of get_metadata_row defined in the previous cell;
# samples without a matching metadata row get -1.
samples["metadata_idx"] = np.vectorize(get_metadata_row)(samples['breed_code'], samples['sample_name'])

  outputs = ufunc(*inputs)


In [31]:
# Join each goat sample with its metadata row and strip, in a single call,
# the helper columns that were only used for the lookup.
merged = samples.join(goat_dataset, on="metadata_idx").drop(
    columns=["selected", "parsed_coding", "metadata_idx"]
)

# Output file: working directory + stem of `dataset.file` + ".xlsx".
outfile = f"{dataset.working_dir / Path(dataset.file).stem!s}.xlsx"
merged.to_excel(outfile, index=False)