# ISHEEP dataset
Attempt to explore the ISHEEP dataset to understand if it can be imported into *SMARTER-database*

In [1]:
import re
import functools
import requests
from pathlib import Path

import pandas as pd
from plinkio import plinkfile
from tqdm.notebook import tqdm
from geopy.geocoders import Nominatim

from src.features.utils import get_project_dir

Define some useful functions:

In [2]:
# Nominatim geocoder client, identified by a custom user agent string
geolocator = Nominatim(user_agent="SMARTER-database")
# Folder holding the ISHEEP files, resolved from the project directory
isheep_data_path = get_project_dir() / "data/external/SHE/ISHEEP"


@functools.cache
def get_location(sample_location):
    """Geocode a free-text location with the module-level ``geolocator``.

    Results are memoised by ``functools.cache``, so each distinct
    ``sample_location`` is sent to the geocoder only once.

    Returns:
        An ``(address, latitude, longitude)`` tuple taken from the geocoder
        result, or ``(None, None, None)`` when the geocoder returns a falsy
        result.
    """
    match = geolocator.geocode(sample_location)

    if match:
        return match.address, match.latitude, match.longitude

    # nothing usable came back: keep the three-slot shape for callers
    return None, None, None


# https://stackoverflow.com/a/20558779
def stripNone(data):
    """Recursively remove ``None`` entries from nested containers.

    Dicts lose every item whose key or value is ``None``; lists, tuples and
    sets lose their ``None`` members. Surviving values are cleaned the same
    way. Any other object (including a bare ``None``) is returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if key is None or value is None:
                continue
            cleaned[key] = stripNone(value)
        return cleaned

    if isinstance(data, list):
        return [stripNone(element) for element in data if element is not None]

    if isinstance(data, tuple):
        return tuple(stripNone(element) for element in data if element is not None)

    if isinstance(data, set):
        return {stripNone(element) for element in data if element is not None}

    # scalars and unknown types pass through untouched
    return data

    
@functools.cache
def fetch_cncb_biosample(biosample_id: str, timeout: float = 30):
    """Fetch a BioSample record from the CNCB/NGDC public API.

    Results are memoised by ``functools.cache``, so repeated IDs do not
    trigger a new request.

    Args:
        biosample_id: accession appended to the ``bioSample`` endpoint URL.
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the notebook forever.

    Returns:
        The JSON payload with ``None`` entries stripped and the
        ``submitter`` and ``taxon`` keys removed, or ``None`` when the
        response status is not 200.
    """
    response = requests.get(
        f"https://ngdc.cncb.ac.cn/gwh/api/public/bioSample/{biosample_id}",
        timeout=timeout,
    )

    if response.status_code != 200:
        return None

    data = stripNone(response.json())

    # only a dict payload can carry (and pop) these keys
    if isinstance(data, dict):
        for key in ("submitter", "taxon"):
            data.pop(key, None)

    return data

@functools.cache
def fetch_cncb_bioproject(bioproject_id, timeout: float = 30):
    """Fetch a BioProject record from the CNCB/NGDC public API.

    Results are memoised by ``functools.cache``, so each project accession
    is requested only once even when many samples share it.

    Args:
        bioproject_id: accession appended to the ``bioProject`` endpoint URL.
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection raises instead of blocking the notebook forever.

    Returns:
        The JSON payload with ``None`` entries stripped, or ``None`` when
        the response status is not 200.
    """
    response = requests.get(
        f"https://ngdc.cncb.ac.cn/gwh/api/public/bioProject/{bioproject_id}",
        timeout=timeout,
    )

    if response.status_code != 200:
        return None

    return stripNone(response.json())

Read all data from *ISHEEP dataset*

In [3]:
# Load the first sheet of the refined ISHEEP spreadsheet as the sample metadata table
tablefile = str(isheep_data_path / "isheep_refined.xlsx")
metadata = pd.read_excel(tablefile, sheet_name=0)

In [4]:
metadata.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
0,AUMEHM01,ERS1460923,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.95,GVM
1,AUMEHM02,ERS1460904,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.87,GVM
2,AUMEHM03,ERS1460978,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.42,GVM
3,AUMEHM04,ERS1461020,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.45,GVM
4,AUMEPM05,ERS1461055,SRP078481,Ovis aries,Merino Polled,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.46,GVM


Those are the different data types present in the isheep dataset

In [5]:
metadata["Assay type"].unique()

array(['WGS', '50K chip', '600K chip'], dtype=object)

# 50K data
ok, let's subset the 50k dataset

In [6]:
# .copy() gives an independent frame, so columns added later do not touch `metadata`
isheep_50K = metadata[metadata["Assay type"] == "50K chip"].copy()

In [7]:
isheep_50K.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1512 entries, 355 to 1866
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        1512 non-null   object
 1   Biosample ID     1512 non-null   object
 2   Bioproject ID    1512 non-null   object
 3   Species          1512 non-null   object
 4   Breed            1512 non-null   object
 5   Sex              419 non-null    object
 6   Sample location  1512 non-null   object
 7   Material         710 non-null    object
 8   Technology       1512 non-null   object
 9   Assay type       1512 non-null   object
 10  Coverage         0 non-null      object
 11  data resource    1512 non-null   object
dtypes: object(12)
memory usage: 153.6+ KB


In [8]:
isheep_50K.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
355,ARG10,SAMC060792,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
356,ARG11,SAMC060793,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
357,ARG9,SAMC060794,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
358,ARG6,SAMC060795,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
359,ARG8,SAMC060796,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"


There are species other than *Ovis aries* in the dataset

In [9]:
isheep_50K["Species"].unique()

array(['Ovis ammon', 'Ovis orientalis', 'Ovis aries'], dtype=object)

Get the different *breeds* from dataset. This dataset was modified with [OpenRefine](https://openrefine.org/)

In [10]:
isheep_50K["Breed"].unique()

array(['Argali', 'Mouflon', 'Baerchuke', 'Bashbay', 'Celei black',
       'Diqing', 'Guide Black Fur', 'Guangling fat-tail', 'Hulun Buir',
       'Hetian', 'Hanzhong', 'Jingzhong', 'Kirghiz',
       'Lanzhou Large-tailed', 'Lop', 'Luzhong Mountain',
       'Minxian Black Fur', 'Ninglang Black', 'Sunite', 'Shiping Gray',
       'Tan', 'Tengchong', 'Taihang Fur', 'Turfan Black', 'Tong',
       'Tashkurgan', 'Lanping Black-bone', 'Weining', 'Wuranke',
       'Ujimqin', 'Yecheng', 'Yuxi Fat-tailed', 'Zhaotong', 'Altay',
       'Bayinbuluke', 'Tibetan', 'Duolang', 'Kazakh'], dtype=object)

Try to collect data from CNCB biosample or load data from file

In [11]:
# Pickle used as a local cache for the CNCB biosample records of the 50K samples
biosample_pkl = isheep_data_path / "isheep_50K_biosample.pkl"

if not biosample_pkl.exists():
    # no cache yet: look up every sample row, then persist pickle + spreadsheet
    biosample_data = [
        fetch_cncb_biosample(accession)
        for accession in tqdm(isheep_50K["Biosample ID"])
    ]
    isheep_50K_biosample = pd.json_normalize(biosample_data)
    isheep_50K_biosample.to_pickle(biosample_pkl)
    isheep_50K_biosample.to_excel(str(isheep_data_path / "isheep_50K_biosample.xlsx"))
else:
    isheep_50K_biosample = pd.read_pickle(biosample_pkl)

In [12]:
isheep_50K_biosample.head()

Unnamed: 0,accession,message,name,sampleId,title,userId,sampleAttribute.attributeId,sampleAttribute.biomaterialProvider,sampleAttribute.breed,sampleAttribute.sample.sampleId,sampleAttribute.sex,sampleAttribute.taxon.name,sampleAttribute.taxon.taxonId,sampleAttribute.tissue,sampletype.attributeTable,sampletype.sampleTypeId,sampletype.sampleTypeName,sampleAttribute.birthLocation,sampleAttribute.geographicLocation
0,SAMC060792,SUCCESS,ARG10,63830,we get the Illumina Ovine SNP50K BeadChip from...,987,8594,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63830,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,
1,SAMC060793,SUCCESS,ARG11,63831,we get the Illumina Ovine SNP50K BeadChip from...,987,8595,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63831,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,
2,SAMC060794,SUCCESS,ARG9,63832,we get the Illumina Ovine SNP50K BeadChip from...,987,8596,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63832,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,
3,SAMC060795,SUCCESS,ARG6,63833,we get the Illumina Ovine SNP50K BeadChip from...,987,8597,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63833,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,
4,SAMC060796,SUCCESS,ARG8,63834,we get the Illumina Ovine SNP50K BeadChip from...,987,8598,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63834,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,


Data collected from biosamples are not different from data collected from metadata. Try to get information from BioProject:

In [13]:
# Pickle used as a local cache for the CNCB bioproject records of the 50K samples
bioproject_pkl = isheep_data_path / "isheep_50K_bioproject.pkl"

if not bioproject_pkl.exists():
    # no cache yet: look up the project of every sample row, then persist the table
    bioproject_data = [
        fetch_cncb_bioproject(accession)
        for accession in tqdm(isheep_50K["Bioproject ID"])
    ]
    isheep_50K_bioproject = pd.json_normalize(bioproject_data)
    isheep_50K_bioproject.to_pickle(bioproject_pkl)
    isheep_50K_bioproject.to_excel(str(isheep_data_path / "isheep_50K_bioproject.xlsx"))
else:
    isheep_50K_bioproject = pd.read_pickle(bioproject_pkl)

In [23]:
# The lookup is made once per sample row, so keep a single row per project accession
isheep_50K_bioproject = isheep_50K_bioproject.drop_duplicates(subset=["prjAccession"])
isheep_50K_bioproject.head()

Unnamed: 0,biomaterialProvider,dataTypes,description,listConsortium,listDataProviders,listExternalLinks,listGrants,listPublication,listRefProjects,message,...,submitter.firstName,submitter.lastName,submitter.middleName,submitter.organization,submitter.phone,submitter.postalCode,submitter.state,submitter.street,submitter.submitOrganizationUrl,submitter.submitterId
0,Juha Kantanen,"[{'dataTypeId': 7, 'dataTypeName': 'Phenotype ...",The genome landscape of Tibetan sheep reveals ...,[],[],[],[{'agency': 'National Natural Science Foundati...,[{'articleTitle': 'Whole-Genome Sequencing of ...,[],SUCCESS,...,Qianghui,Zhu,,"Institute of Zoology, Chinese Academy of Sciences",15071296586,100101,,"beichen west road, Chaoyang District",http://www.ioz.ac.cn/,7187


Try to open the processed plink file and check that all sample ids are represented:

In [17]:
# Open the 50K PLINK fileset; the path is given as a prefix, without a file extension
plinkio = plinkfile.open(str(isheep_data_path / "50K-all/50K-all"))

In [18]:
# Individual IDs (iid) of the samples stored in the PLINK fileset
samples = [sample.iid for sample in plinkio.get_samples()]

# A set gives constant-time membership tests instead of scanning the list per row
plink_ids = set(samples)

# Report every metadata sample that has no counterpart in the PLINK file
for sample_id in isheep_50K["Sample ID"]:
    if sample_id not in plink_ids:
        print(f"Missing {sample_id}")

Sample names between metadata and plink file are identical. Try to derive sample location using geocoding:

In [19]:
# Geocode each sample location; get_location returns an (address, latitude, longitude) tuple
test = isheep_50K["Sample location"].apply(get_location)
# isheep_50K["address"] = test.apply(lambda x: x[0])
# Unpack the tuple positions into separate columns (the address is deliberately left out)
isheep_50K["latitude"] = test.apply(lambda x: x[1])
isheep_50K["longitude"] = test.apply(lambda x: x[2])
isheep_50K.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource,latitude,longitude
355,ARG10,SAMC060792,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346
356,ARG11,SAMC060793,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346
357,ARG9,SAMC060794,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346
358,ARG6,SAMC060795,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346
359,ARG8,SAMC060796,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346


Try to merge this table with biosample information:

In [25]:
# Inner join on the biosample accession: rows without a matching biosample record are dropped.
# NOTE(review): this rebinds isheep_50K, so re-running the cell merges the already-merged frame again.
isheep_50K= pd.merge(isheep_50K, isheep_50K_biosample, left_on="Biosample ID", right_on="accession", how="inner")

In [27]:
# Save the merged 50K table next to the other ISHEEP files
isheep_50K.to_excel(str(isheep_data_path / "isheep_50K.xlsx"))

Although it is possible to derive coordinates from the sample location, I don't want to store a latitude and longitude in my database: anyone who wants to use coordinates derived from a location needs to understand their precision level (I can't treat a coordinate standing for the whole of Russia at the same level as one standing for a Chinese county)

## 600K data
next, explore the 600k dataset

In [None]:
# The "Assay type" values are 'WGS', '50K chip' and '600K chip': the comparison is
# case-sensitive, so the label must be spelled with an upper-case K or the subset is empty.
# .copy() gives an independent frame, as done for the other subsets.
isheep_600K = metadata[metadata["Assay type"] == "600K chip"].copy()
isheep_600K.info()

In [None]:
isheep_600K.head()

In [None]:
isheep_600K["Species"].unique()

In [None]:
isheep_600K["Breed"].unique()

Try to collect data from CNCB biosample or load data from file

In [None]:
# Keep the cache beside the other ISHEEP files (as done for the 50K data) instead of
# in whatever directory the kernel happens to be running from
biosample_600K_pkl = isheep_data_path / "isheep_600K_biosample.pkl"

if biosample_600K_pkl.exists():
    isheep_600K_biosample = pd.read_pickle(biosample_600K_pkl)
else:
    # no cache yet: look up every sample row, then persist pickle + spreadsheet
    biosample_data = [fetch_cncb_biosample(biosample_id) for biosample_id in tqdm(isheep_600K["Biosample ID"])]
    isheep_600K_biosample = pd.json_normalize(biosample_data)
    isheep_600K_biosample.to_pickle(biosample_600K_pkl)
    isheep_600K_biosample.to_excel(str(isheep_data_path / "isheep_600K_biosample.xlsx"))

This time I can find coordinates in the biosamples in N/E format (e.g. *38.37 N 77.23 E*). The sample name is the same one I found in the data file. I need to define a sample alias equal to the sample name I find in the plink file. Try to open the processed plink file and check that all sample IDs are represented

In [None]:
# Open the 600K PLINK fileset from the shared ISHEEP data folder
plinkio = plinkfile.open(str(isheep_data_path / "600K-all/600K-all"))

Check that sample names in plink file and metadata are the same

In [None]:
# Individual IDs (iid) of the samples stored in the PLINK fileset (kept as a list: order matters later)
samples = [sample.iid for sample in plinkio.get_samples()]

# A set gives constant-time membership tests instead of scanning the list per row
plink_ids = set(samples)

# Number of metadata sample IDs that do not appear in the PLINK file
count = sum(sample_id not in plink_ids for sample_id in isheep_600K["Sample ID"])

if count > 0:
    print(f"{count} samples name don't match between metadata and plink file")

In [None]:
# Put metadata sample IDs and PLINK sample IDs side by side, in their respective orders
test = pd.DataFrame({"samples_id": isheep_600K["Sample ID"], "samples_vcf": samples})

# Write beside the other ISHEEP files rather than into the current working directory;
# an existing spreadsheet is left untouched
samples_600K_xlsx = isheep_data_path / "samples_id-vcf-600K.xlsx"
if not samples_600K_xlsx.exists():
    test.to_excel(str(samples_600K_xlsx))

Save only 600K sample in a different file

In [None]:
# Save beside the other ISHEEP files (as done for the 50K table), not in the working directory
isheep_600K.to_excel(str(isheep_data_path / "isheep_600K.xlsx"))

## WGS dataset
finally, get WGS metadata:

In [None]:
# Independent copy of the whole-genome-sequencing rows of the metadata table
isheep_WGS = metadata[metadata["Assay type"] == "WGS"].copy()
isheep_WGS.info()

In [None]:
isheep_WGS.head()

In [None]:
# NOTE(review): this cell repeats the WGS subset built just above. Keep .copy() here too,
# so that rebinding isheep_WGS does not swap the independent frame for a slice of metadata.
isheep_WGS = metadata[metadata["Assay type"] == "WGS"].copy()
isheep_WGS.info()

In [None]:
isheep_WGS.head()

In [None]:
isheep_WGS["Species"].unique()

In [None]:
isheep_WGS["Breed"].unique()

There are some breeds from WGS data that are unknown. Should I consider such animals?

In [None]:
# Rows whose breed is the literal placeholder " -" (note the leading space)
isheep_WGS[isheep_WGS["Breed"] == " -"]

Some breeds are also missing in the original dataset, for example the "*IROA\**" samples (which seem to come from the nextgen dataset):

In [None]:
# Sample index file, stored in the shared ISHEEP data folder
nextgenfile = str(isheep_data_path / "ovis.sample_index")
# the first 10 lines of the file are skipped before parsing the table
nextgen = pd.read_table(nextgenfile, skiprows=10)
nextgen.head()

However, I could collect coordinates from this datafile

Try to collect data from CNCB biosample or load data from file

In [None]:
# Keep the cache beside the other ISHEEP files (as done for the 50K data) instead of
# in whatever directory the kernel happens to be running from
biosample_WGS_pkl = isheep_data_path / "isheep_WGS_biosample.pkl"

if biosample_WGS_pkl.exists():
    isheep_WGS_biosample = pd.read_pickle(biosample_WGS_pkl)
else:
    # no cache yet: look up every sample row, then persist pickle + spreadsheet
    biosample_data = [fetch_cncb_biosample(biosample_id) for biosample_id in tqdm(isheep_WGS["Biosample ID"])]
    isheep_WGS_biosample = pd.json_normalize(biosample_data)
    isheep_WGS_biosample.to_pickle(biosample_WGS_pkl)
    isheep_WGS_biosample.to_excel(str(isheep_data_path / "isheep_WGS_biosample.xlsx"))

In [None]:
# Open the WGS PLINK fileset from the shared ISHEEP data folder
plinkio = plinkfile.open(str(isheep_data_path / "WGS-all/WGS-all.smarter"))

There are some samples which I can't find in CNCB biosamples: they are annotated using SRA accessions, so I can't fetch a record for more than 100 samples. Other samples seem to have geographic coordinates in the same format as the 600K data. Check that sample names in the plink file and metadata are the same

In [None]:
# Individual IDs (iid) of the samples stored in the PLINK fileset (kept as a list: order matters later)
samples = [sample.iid for sample in plinkio.get_samples()]

# A set gives constant-time membership tests instead of scanning the list per row
plink_ids = set(samples)

# Number of metadata sample IDs that do not appear in the PLINK file
count = sum(sample_id not in plink_ids for sample_id in isheep_WGS["Sample ID"])

if count > 0:
    print(f"{count} samples name don't match between metadata and plink file")
        

In [None]:
# Put metadata sample IDs and PLINK sample IDs side by side, in their respective orders
test = pd.DataFrame({"samples_id": isheep_WGS["Sample ID"], "samples_vcf": samples})

# Write beside the other ISHEEP files rather than into the current working directory;
# an existing spreadsheet is left untouched
samples_WGS_xlsx = isheep_data_path / "samples_id-vcf-WGS.xlsx"
if not samples_WGS_xlsx.exists():
    test.to_excel(str(samples_WGS_xlsx))

IDs are different in metadata and datafiles. Save only WGS sample in a different file

In [None]:
# Save beside the other ISHEEP files (as done for the 50K table), not in the working directory
isheep_WGS.to_excel(str(isheep_data_path / "isheep_WGS.xlsx"))