# MERINO_INIA_UY
Describing `MERINO_INIA_UY.zip` file

In [1]:
import io
import itertools
import pandas as pd

from zipfile import ZipFile

import src.features.illumina
from src.features.smarterdb import VariantSheep, global_connection, Dataset, Breed
from src.features.utils import get_project_dir
from src.features.plinkio import TextPlinkIO

In [2]:
# Resolve the raw archive relative to the project root, so the notebook does
# not depend on the directory it is launched from.
project_dir = get_project_dir()
archive_relpath = "data/raw/background/MERINO_INIA_UY.zip"
datafile = project_dir / archive_relpath

In [3]:
# The archive stays open: the following cells read single members through
# `handle` without extracting the whole zip.
handle = ZipFile(datafile, mode="r")
handle.printdir()

File Name                                             Modified             Size
Merino_21_12_17_OV54k_FinalReport.txt          2020-12-03 14:37:44    383851065
Merino_21_12_17_OV54k_SNP_Map.txt              2020-12-03 14:37:54      3012148
MERINO_UY_96_21_12_17_OV54k.ped                2020-12-09 17:24:20     20831616
MERINO_UY_96_21_12_17_OV54k.map                2020-12-09 17:24:22      1502260


The Merino dataset provides map/ped files together with a final report and a SNP map. Get info from the final report:

In [4]:
# Preview the first 12 lines of the final report: the [Header] section, the
# column names and the first genotype rows.
report_stream = handle.open("Merino_21_12_17_OV54k_FinalReport.txt")
with io.TextIOWrapper(report_stream, encoding="utf-8") as report:
    preview = [row.strip() for row in itertools.islice(report, 12)]

for row in preview:
    print(row)

[Header]
GSGT Version	2.0.3
Processing Date	12/21/2017 11:39 AM
Content		ovinesnp50_b.bpm
Num SNPs	54241
Total SNPs	54241
Num Samples	96
Total Samples	96
[Data]
SNP Name	Sample ID	Allele1 - Forward	Allele2 - Forward	Allele1 - AB	Allele2 - AB	Allele1 - Top	Allele2 - Top	GC Score	X	Y	B Allele Freq	Log R Ratio
250506CS3900065000002_1238.1	201711200001	T	C	A	B	A	G	0.9239	0.727	0.647	0.5027	-0.1292
250506CS3900140500001_312.1	201711200001	T	T	A	A	A	A	0.9613	0.654	0.005	0.0000	-0.1364


The data seem to be recent (2017). Are the coordinates in the latest 3.1 version?

In [5]:
# Map every SNP name (2nd column of the .map file) to a (chromosome, position)
# tuple taken from the 1st and 4th columns; the position is stored as int.
with io.TextIOWrapper(handle.open("MERINO_UY_96_21_12_17_OV54k.map"), encoding="utf-8") as map_file:
    data_coordinates = {
        fields[1]: (fields[0], int(fields[3]))
        for fields in map(str.split, map_file)
    }

In [6]:
# Build the same name -> (chromosome, position) lookup from the chip manifest,
# so it can be compared with the coordinates found in the map file.
chip_dir = project_dir / "data/external/SHE/ILLUMINA/"
old_chip3_file = chip_dir / "ovinesnp50_b.csv.gz"
manifest_records = src.features.illumina.read_Manifest(old_chip3_file, delimiter=",")
old_chip3 = {snp.name: (snp.chr, snp.mapinfo) for snp in manifest_records}

Skipping: Illumina, Inc.,,,,,,,,,,,,,,,,,,,
Skipping: [Heading],,,,,,,,,,,,,,,,,,,,
Skipping: Descriptor File Name,OvineSNP50_B.bpm,,,,,,,,,,,,,,,,,,,
Skipping: Assay Format,Infinium HD Ultra,,,,,,,,,,,,,,,,,,,
Skipping: Date Manufactured,1/7/2009,,,,,,,,,,,,,,,,,,,
Skipping: Loci Count ,54241,,,,,,,,,,,,,,,,,,,


Skipping: [Assay],,,,,,,,,,,,,,,,,,,,


In [7]:
# Compare the manifest coordinates with the ones shipped in the merino map file.
count = 0    # SNPs found in both sources but with a different (chrom, position)
missing = 0  # manifest SNPs that have no entry in the merino map file

for key, value in old_chip3.items():
    if key not in data_coordinates:
        missing += 1
        continue

    if value != data_coordinates[key]:
        count += 1
        # show only the first mismatches to keep the output short
        if count <= 10:
            print(key, value, data_coordinates[key])

print(f"\nN of SNPs in different positions in merino and old chip3: {count}")
# The loop walks the chip manifest, so `missing` counts chip SNPs that are
# absent from the merino map (the previous label stated the opposite).
print(f"\nN of SNPs in chip not in merino: {missing}")

s17862.1 ('Contig', 0) ('CONTIG', 0)

N of SNPs in different positions in merino and old chip3: 1

N of SNPs in merino not in chip: 0


Although the data were generated in 2017, the coordinates seem to be in the old reference. Check against the database:

In [8]:
# Compare the first 20 SNPs of the map file with the positions stored in the
# database for the "SNPchiMp v.3" source.
global_connection()
for key, value in itertools.islice(data_coordinates.items(), 20):
    qs = VariantSheep.objects(name=key)

    if qs.count() > 0:
        variant = qs.get()
        # A default of None avoids a bare StopIteration aborting the whole
        # cell when a variant has no location imported from this source.
        location = next(
            (loc for loc in variant.locations if loc.imported_from == "SNPchiMp v.3"),
            None,
        )
        if location is None:
            print(f"snp {key} has no 'SNPchiMp v.3' location in database")
            continue

        if value[0] != location.chrom or int(value[1]) != location.position:
            print(f"snp {key} with different positions: {value[0]}:{value[1]} <> {location.chrom}:{location.position}")

snp 250506CS3900065000002_1238.1 with different positions: 15:5327353 <> 15:5870057
snp 250506CS3900140500001_312.1 with different positions: 23:27428869 <> 23:26298017
snp 250506CS3900176800001_906.1 with different positions: 7:89002990 <> 7:81648528
snp 250506CS3900211600001_1041.1 with different positions: 16:44955568 <> 16:41355381
snp 250506CS3900218700001_1294.1 with different positions: 2:157820235 <> 2:148802744
snp 250506CS3900283200001_442.1 with different positions: 1:203289635 <> 0:0
snp 250506CS3900371000001_1255.1 with different positions: 11:37632867 <> 11:35339123
snp 250506CS3900386000001_696.1 with different positions: 16:68297712 <> 16:62646307
snp 250506CS3900414400001_1178.1 with different positions: 1:111100644 <> 1:103396552
snp 250506CS3900435700001_1658.1 with different positions: 12:50140951 <> 0:0
snp 250506CS3900464100001_519.1 with different positions: 1:91075445 <> 1:85767398
snp 250506CS3900487100001_1521.1 with different positions: 14:1552575 <> 14:11103

## Fix metadata

Some animals are imported (their GPS coordinates are not in Uruguay). So read the metadata and write a new sample metadata file with the proper country. First, get samples and breed from the ped file:

In [9]:
# Fetch the dataset record and point the PLINK reader at the map/ped pair
# found in the dataset working directory.
dataset = Dataset.objects.get(file="MERINO_INIA_UY.zip")
plink_prefix = dataset.working_dir / "MERINO_UY_96_21_12_17_OV54k"
plinkio = TextPlinkIO(
    prefix=str(plink_prefix),
    species=dataset.species,
    chip_name=dataset.chip_name,
)

Now get `fid` and `iid`:

In [10]:
def _fid_and_iid(ped_row):
    """Return the first two fields of a ped row as a ``[fid, iid]`` list."""
    fid, iid, *_ = ped_row
    return [fid, iid]


samples = list(map(_fid_and_iid, plinkio.read_pedfile()))
samples[:10]

[['MERINO_UY', '201711200001'],
 ['MERINO_UY', '201711200002'],
 ['MERINO_UY', '201711200003'],
 ['MERINO_UY', '201711200004'],
 ['MERINO_UY', '201711200005'],
 ['MERINO_UY', '201711200006'],
 ['MERINO_UY', '201711200007'],
 ['MERINO_UY', '201711200008'],
 ['MERINO_UY', '201711200009'],
 ['MERINO_UY', '201711200010']]

OK, now I need to open the proper metadata file in order to select the animals I need:

In [11]:
# The sample metadata (breed, stall, GPS coordinates) is stored in a separate
# dataset: load its spreadsheet from that dataset's working directory.
metadata = Dataset.objects.get(file="Smarter_Ids_Uploaded_with_GPSCordinates_FINAL.zip")
metadata_xlsx = metadata.working_dir / "Smarter_Ids_Uploaded_with_GPSCordinates_FINAL.xlsx"
df = pd.read_excel(metadata_xlsx)
df.head()

Unnamed: 0,ID,Breed,Stall,GPS_Coordinates,GPS_2
0,1,Corriedale,CIEDAG,"-33.86937579589417, -55.57265008365528",https://www.google.com/maps/place/CIEDAG+-+Sec...
1,10,Corriedale,CIEDAG,"-33.86937579589417, -55.57265008365528",https://www.google.com/maps/place/CIEDAG+-+Sec...
2,11,Corriedale,CIEDAG,"-33.86937579589417, -55.57265008365528",https://www.google.com/maps/place/CIEDAG+-+Sec...
3,12,Corriedale,CIEDAG,"-33.86937579589417, -55.57265008365528",https://www.google.com/maps/place/CIEDAG+-+Sec...
4,13,Corriedale,CIEDAG,"-33.86937579589417, -55.57265008365528",https://www.google.com/maps/place/CIEDAG+-+Sec...


OK, now I need to select from the metadata the samples I have in this dataset:

In [12]:
# Turn the [fid, iid] pairs into a table, so they can be matched against the
# metadata spreadsheet.
sample_columns = ["fid", "iid"]
samples = pd.DataFrame(data=samples, columns=sample_columns)
samples.head()

Unnamed: 0,fid,iid
0,MERINO_UY,201711200001
1,MERINO_UY,201711200002
2,MERINO_UY,201711200003
3,MERINO_UY,201711200004
4,MERINO_UY,201711200005


In [13]:
# Compare IDs as strings, so they can be matched with the `iid` values read
# from the ped file.
df["ID"] = df["ID"].astype(str)

# Keep only the metadata rows describing animals of this dataset
in_dataset = df["ID"].isin(samples["iid"])
df_selected = df.loc[in_dataset]

print(f"Number of samples in the metadata: {len(df_selected)}")
print(f"Number of samples in the plink file: {len(samples)}")
df_selected.head()

Number of samples in the metadata: 96
Number of samples in the plink file: 96


Unnamed: 0,ID,Breed,Stall,GPS_Coordinates,GPS_2
158,201711200065,Merino,EEFAS,"-31.38766026946753, -57.71601394419916",https://earth.google.com/web/search/Estacion+E...
159,201711200066,Merino,EEFAS,"-31.38766026946753, -57.71601394419916",https://earth.google.com/web/search/Estacion+E...
160,201711200005,Merino,IMPORTADO,"-30.945528469563605, 151.2477073694806",https://www.google.com/maps/place/Nerstane+Mer...
161,201711200012,Merino,IMPORTADO,"-30.945528469563605, 151.2477073694806",https://www.google.com/maps/place/Nerstane+Mer...
162,201711200013,Merino,IMPORTADO,"-30.945528469563605, 151.2477073694806",https://www.google.com/maps/place/Nerstane+Mer...


Select the proper code for this breed:

In [14]:
# Look up the breed record to reuse the code already stored in the database
breed = Breed.objects.get(species="Sheep", name="Merino")
breed.code

'MER'

In [15]:
# Every animal starts with the same breed code and country; the imported
# animals are corrected in a later step.
default_columns = {"code": breed.code, "country": "Uruguay"}
for column_name, default_value in default_columns.items():
    samples[column_name] = default_value

samples.head()

Unnamed: 0,fid,iid,code,country
0,MERINO_UY,201711200001,MER,Uruguay
1,MERINO_UY,201711200002,MER,Uruguay
2,MERINO_UY,201711200003,MER,Uruguay
3,MERINO_UY,201711200004,MER,Uruguay
4,MERINO_UY,201711200005,MER,Uruguay


Now select all the animals whose `Stall` is `IMPORTADO` and change their country to `Australia`:

In [16]:
# IDs of the animals flagged as imported in the metadata
imported_ids = df_selected.loc[df_selected["Stall"] == "IMPORTADO", "ID"]

# Override the default country only for those animals
is_imported = samples["iid"].isin(imported_ids)
samples.loc[is_imported, "country"] = "Australia"

print(samples["country"].value_counts())
samples.head()

country
Uruguay      84
Australia    12
Name: count, dtype: int64


Unnamed: 0,fid,iid,code,country
0,MERINO_UY,201711200001,MER,Uruguay
1,MERINO_UY,201711200002,MER,Uruguay
2,MERINO_UY,201711200003,MER,Uruguay
3,MERINO_UY,201711200004,MER,Uruguay
4,MERINO_UY,201711200005,MER,Australia


Write this metadata to a new file:

In [17]:
# Save the corrected sample sheet in the metadata working directory, without
# the DataFrame index.
samples_xlsx = metadata.working_dir / "MERINO_UY_96_21_12_17_OV54k_samples.xlsx"
samples.to_excel(samples_xlsx, index=False)