# ISHEEP dataset
Attempt to explore the ISHEEP dataset to understand if it can be imported into *SMARTER-database*

In [1]:
import re
import functools
import requests
from pathlib import Path

import pandas as pd
from plinkio import plinkfile
from tqdm.notebook import tqdm
from geopy.geocoders import Nominatim
from geopy.point import Point

from src.features.utils import get_project_dir

Define some useful functions:

In [2]:
# geopy Nominatim geocoder client; requests are identified with the
# "SMARTER-database" user agent
geolocator = Nominatim(user_agent="SMARTER-database")
# folder with the ISHEEP files read and written by this notebook, resolved
# from the directory returned by get_project_dir()
isheep_data_path = get_project_dir() / "data/external/SHE/ISHEEP"


@functools.cache
def get_location(sample_location):
    """Geocode a free-text location through the module-level ``geolocator``.

    Results are memoized per distinct ``sample_location``, so a location
    repeated across many samples is sent to the geocoder only once.

    Returns:
        tuple: ``(address, latitude, longitude)`` of the geocoder match, or
        ``(None, None, None)`` when the geocoder returns no result.
    """
    match = geolocator.geocode(sample_location)

    if match:
        return match.address, match.latitude, match.longitude

    # nothing found: keep the three-item shape so callers can always index it
    return None, None, None


# https://stackoverflow.com/a/20558779
def stripNone(data):
    """Recursively remove ``None`` entries from nested containers.

    Dict items are dropped when either the key or the value is ``None``;
    lists, tuples and sets lose their ``None`` members. Every surviving
    element is cleaned recursively. Any other object is returned unchanged.

    Returns:
        A new dict/list/tuple/set of the same kind as ``data``, or ``data``
        itself when it is not one of those containers.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if key is None or value is None:
                continue
            cleaned[key] = stripNone(value)
        return cleaned

    # list, tuple and set share the same filtering rule: rebuild the
    # container from its cleaned, non-None members
    for container in (list, tuple, set):
        if isinstance(data, container):
            return container(stripNone(entry) for entry in data if entry is not None)

    return data

    
@functools.cache
def fetch_cncb_biosample(biosample_id: str, timeout: float = 30.0):
    """Fetch one BioSample record from the CNCB/NGDC GWH public API.

    Results are memoized per argument combination for the life of the
    kernel. That includes ``None`` results, so a failed lookup is not retried
    unless the cache is cleared with ``fetch_cncb_biosample.cache_clear()``.
    The returned dict is the cached object itself: mutating it affects later
    calls with the same arguments.

    Args:
        biosample_id: accession interpolated into the request URL.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook forever.

    Returns:
        The decoded JSON with ``None`` entries stripped and the top-level
        ``submitter`` and ``taxon`` entries removed, or ``None`` when the
        HTTP status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    response = requests.get(
        f"https://ngdc.cncb.ac.cn/gwh/api/public/bioSample/{biosample_id}",
        timeout=timeout,
    )

    if response.status_code != 200:
        return None

    data = stripNone(response.json())

    # discard the top-level blocks that are not kept in the output table
    for key in ("submitter", "taxon"):
        if key in data:
            data.pop(key)

    return data


@functools.cache
def fetch_cncb_bioproject(bioproject_id, timeout: float = 30.0):
    """Fetch one BioProject record from the CNCB/NGDC GWH public API.

    Results are memoized per argument combination for the life of the
    kernel (``None`` results included), so the same project id repeated
    over many samples triggers a single HTTP request.

    Args:
        bioproject_id: accession interpolated into the request URL.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook forever.

    Returns:
        The decoded JSON with ``None`` entries stripped, or ``None`` when the
        HTTP status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    response = requests.get(
        f"https://ngdc.cncb.ac.cn/gwh/api/public/bioProject/{bioproject_id}",
        timeout=timeout,
    )

    if response.status_code != 200:
        return None

    return stripNone(response.json())


def stripTextEBI(data):
    """Try to clean EBI data by flattening its wrapper objects.

    Rules, applied recursively:

    * a dict with a ``"text"`` key collapses to that value;
    * otherwise a dict with a ``"url"`` key collapses to that value;
    * any other dict is rebuilt with cleaned values, dropping items whose
      key or value is ``None``;
    * a non-empty list collapses to its cleaned FIRST element (any further
      elements are discarded);
    * an empty list is returned unchanged;
    * anything else is returned as is.
    """
    if isinstance(data, dict):
        if "text" in data:
            return data["text"]

        if "url" in data:
            return data["url"]

        return {k: stripTextEBI(v) for k, v in data.items() if k is not None and v is not None}

    if isinstance(data, list):
        # an empty list has no first element to collapse to: keep it rather
        # than failing with IndexError
        if not data:
            return data

        return stripTextEBI(data[0])

    return data
        

@functools.cache
def fetch_ebi_biosample(text: str, timeout: float = 30.0):
    """Look up an EBI BioSamples record by its ``SRA accession`` attribute.

    Results are memoized per argument combination for the life of the
    kernel (``None`` results included; raised exceptions are not cached).

    Args:
        text: accession used in the ``attr:SRA accession`` filter.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            raises instead of blocking the notebook forever.

    Returns:
        * ``None`` when the HTTP status is not 200;
        * ``{"accession": text}`` when the search has no hits, so the caller
          still gets one record per queried accession;
        * the single matching sample, without its ``_links`` entry and
          flattened by ``stripTextEBI``, when there is exactly one hit.

    Raises:
        Exception: when more than one sample matches ``text``.
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    response = requests.get(
        f"https://www.ebi.ac.uk/biosamples/samples?filter=attr:SRA%20accession:{text}",
        timeout=timeout,
    )

    if response.status_code != 200:
        return None

    data = stripNone(response.json())

    # how many results did I get?
    totalElements = data['page']['totalElements']

    if totalElements == 0:
        return {"accession": text}

    if totalElements == 1:
        sample = data['_embedded']['samples'][0]
        # navigation links are not sample metadata; tolerate their absence
        sample.pop("_links", None)
        return stripTextEBI(sample)

    # report the accession that was actually queried
    raise Exception(f"Found {totalElements} results for '{text}'")

In [3]:
# One-off probe of the EBI BioSamples free-text search for a single accession.
# Plain string (nothing to interpolate) and an explicit timeout so a stalled
# connection cannot hang the notebook.
response = requests.get("https://www.ebi.ac.uk/biosamples/samples?text=ERS1460923", timeout=30)

In [4]:
# Decode the probe response and drop None entries in one step
data = stripNone(response.json())

# Take the first returned sample, discard its navigation links and flatten
# the EBI wrapper objects
sample = data['_embedded']['samples'][0]
_ = sample.pop("_links")
sample = stripTextEBI(sample)

# Display the attribute later used to filter samples by accession
sample['characteristics']['SRA accession']

'ERS1460923'

Read all data from *ISHEEP dataset*

In [5]:
tablefile = str(isheep_data_path / "isheep_refined.xlsx")
metadata = pd.read_excel(tablefile, sheet_name=0)

In [6]:
metadata.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
0,AUMEHM01,ERS1460923,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.95,GVM
1,AUMEHM02,ERS1460904,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.87,GVM
2,AUMEHM03,ERS1460978,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.42,GVM
3,AUMEHM04,ERS1461020,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.45,GVM
4,AUMEPM05,ERS1461055,SRP078481,Ovis aries,Merino Polled,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.46,GVM


Those are the different data types present in the isheep dataset

In [7]:
metadata["Assay type"].unique()

array(['WGS', '50K chip', '600K chip'], dtype=object)

## 50K data
ok, let's subset the 50k dataset

In [8]:
isheep_50K = metadata[metadata["Assay type"] == "50K chip"].copy()

In [9]:
isheep_50K.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1512 entries, 355 to 1866
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        1512 non-null   object
 1   Biosample ID     1512 non-null   object
 2   Bioproject ID    1512 non-null   object
 3   Species          1512 non-null   object
 4   Breed            1512 non-null   object
 5   Sex              419 non-null    object
 6   Sample location  1512 non-null   object
 7   Material         710 non-null    object
 8   Technology       1512 non-null   object
 9   Assay type       1512 non-null   object
 10  Coverage         0 non-null      object
 11  data resource    1512 non-null   object
dtypes: object(12)
memory usage: 153.6+ KB


In [10]:
isheep_50K.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
355,ARG10,SAMC060792,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
356,ARG11,SAMC060793,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
357,ARG9,SAMC060794,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
358,ARG6,SAMC060795,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
359,ARG8,SAMC060796,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"


There are species other than *Ovis aries* in the dataset

In [11]:
isheep_50K["Species"].unique()

array(['Ovis ammon', 'Ovis orientalis', 'Ovis aries'], dtype=object)

Get the different *breeds* from dataset. This dataset was modified with [OpenRefine](https://openrefine.org/)

In [12]:
isheep_50K["Breed"].unique()

array(['Argali', 'Mouflon', 'Baerchuke', 'Bashbay', 'Celei black',
       'Diqing', 'Guide Black Fur', 'Guangling fat-tail', 'Hulun Buir',
       'Hetian', 'Hanzhong', 'Jingzhong', 'Kirghiz',
       'Lanzhou Large-tailed', 'Lop', 'Luzhong Mountain',
       'Minxian Black Fur', 'Ninglang Black', 'Sunite', 'Shiping Gray',
       'Tan', 'Tengchong', 'Taihang Fur', 'Turfan Black', 'Tong',
       'Tashkurgan', 'Lanping Black-bone', 'Weining', 'Wuranke',
       'Ujimqin', 'Yecheng', 'Yuxi Fat-tailed', 'Zhaotong', 'Altay',
       'Bayinbuluke', 'Tibetan', 'Duolang', 'Kazakh'], dtype=object)

Try to collect data from CNCB biosample or load data from file

In [13]:
# Local cache of the CNCB BioSample records for the 50K samples
biosample_pkl = isheep_data_path / "isheep_50K_biosample.pkl"

if not biosample_pkl.exists():
    # cache miss: query CNCB once per sample, then persist the table both as
    # a pickle (for later runs) and as a spreadsheet
    biosample_data = [
        fetch_cncb_biosample(biosample_id)
        for biosample_id in tqdm(isheep_50K["Biosample ID"])
    ]
    isheep_50K_biosample = pd.json_normalize(biosample_data)
    isheep_50K_biosample.to_pickle(biosample_pkl)
    isheep_50K_biosample.to_excel(str(isheep_data_path / "isheep_50K_biosample.xlsx"))
else:
    # cache hit: reuse the pickle written by a previous run of this cell
    isheep_50K_biosample = pd.read_pickle(biosample_pkl)

In [14]:
isheep_50K_biosample.head()

Unnamed: 0,accession,message,name,sampleId,title,userId,sampleAttribute.attributeId,sampleAttribute.biomaterialProvider,sampleAttribute.breed,sampleAttribute.sample.sampleId,sampleAttribute.sex,sampleAttribute.taxon.name,sampleAttribute.taxon.taxonId,sampleAttribute.tissue,sampletype.attributeTable,sampletype.sampleTypeId,sampletype.sampleTypeName,sampleAttribute.birthLocation,sampleAttribute.geographicLocation
0,SAMC060792,SUCCESS,ARG10,63830,we get the Illumina Ovine SNP50K BeadChip from...,987,8594,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63830,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,
1,SAMC060793,SUCCESS,ARG11,63831,we get the Illumina Ovine SNP50K BeadChip from...,987,8595,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63831,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,
2,SAMC060794,SUCCESS,ARG9,63832,we get the Illumina Ovine SNP50K BeadChip from...,987,8596,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63832,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,
3,SAMC060795,SUCCESS,ARG6,63833,we get the Illumina Ovine SNP50K BeadChip from...,987,8597,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63833,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,
4,SAMC060796,SUCCESS,ARG8,63834,we get the Illumina Ovine SNP50K BeadChip from...,987,8598,"Genomics and Animal Breeding Research Group, C...",Argali (Ovis ammon),63834,6,Ovis ammon,30527,Ear,sample_attr_model_animal,4,Model organism or animal sample,,


Data collected from BioSamples add nothing to what is already in the metadata table. Try to get information from BioProject:

In [15]:
# Local cache of the CNCB BioProject records for the 50K samples
bioproject_pkl = isheep_data_path / "isheep_50K_bioproject.pkl"

if not bioproject_pkl.exists():
    # cache miss: one lookup per sample row (the fetch function is memoized,
    # so repeated project ids do not hit the network again), then persist
    bioproject_data = [
        fetch_cncb_bioproject(bioproject_id)
        for bioproject_id in tqdm(isheep_50K["Bioproject ID"])
    ]
    isheep_50K_bioproject = pd.json_normalize(bioproject_data)
    isheep_50K_bioproject.to_pickle(bioproject_pkl)
    isheep_50K_bioproject.to_excel(str(isheep_data_path / "isheep_50K_bioproject.xlsx"))
else:
    # cache hit: reuse the pickle written by a previous run of this cell
    isheep_50K_bioproject = pd.read_pickle(bioproject_pkl)

In [16]:
isheep_50K_bioproject = isheep_50K_bioproject.drop_duplicates(subset=["prjAccession"])
isheep_50K_bioproject.head()

Unnamed: 0,biomaterialProvider,dataTypes,description,listConsortium,listDataProviders,listExternalLinks,listGrants,listPublication,listRefProjects,message,...,submitter.firstName,submitter.lastName,submitter.middleName,submitter.organization,submitter.phone,submitter.postalCode,submitter.state,submitter.street,submitter.submitOrganizationUrl,submitter.submitterId
0,Juha Kantanen,"[{'dataTypeId': 7, 'dataTypeName': 'Phenotype ...",The genome landscape of Tibetan sheep reveals ...,[],[],[],[{'agency': 'National Natural Science Foundati...,[{'articleTitle': 'Whole-Genome Sequencing of ...,[],SUCCESS,...,Qianghui,Zhu,,"Institute of Zoology, Chinese Academy of Sciences",15071296586,100101,,"beichen west road, Chaoyang District",http://www.ioz.ac.cn/,7187


Try to open the processed plink file and check that all sample ids are represented:

In [17]:
plinkio = plinkfile.open(str(isheep_data_path / "50K-all/50K-all"))

In [18]:
# Individual ids as stored in the plink file (list kept for inspection)
samples = [sample.iid for sample in plinkio.get_samples()]
# set for constant-time membership tests instead of scanning the list per row
plink_ids = set(samples)

# Report every metadata sample id that is absent from the plink file
for sample_id in isheep_50K["Sample ID"]:
    if sample_id not in plink_ids:
        print(f"Missing {sample_id}")

Sample names between metadata and plink file are identical. Try to derive sample location using geocoding:

In [19]:
# Geocode each sample location; every item is (address, latitude, longitude)
test = isheep_50K["Sample location"].apply(get_location)

# Only the coordinates are stored; the resolved address (item 0) is
# deliberately left out
isheep_50K = isheep_50K.assign(
    latitude=test.apply(lambda coords: coords[1]),
    longitude=test.apply(lambda coords: coords[2]),
)
isheep_50K.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource,latitude,longitude
355,ARG10,SAMC060792,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346
356,ARG11,SAMC060793,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346
357,ARG9,SAMC060794,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346
358,ARG6,SAMC060795,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346
359,ARG8,SAMC060796,PRJCA001252,Ovis ammon,Argali,,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017",42.480495,85.463346


Try to merge this table with biosample information:

In [20]:
isheep_50K = pd.merge(isheep_50K, isheep_50K_biosample, left_on="Biosample ID", right_on="accession", how="inner")

In [21]:
isheep_50K.to_excel(str(isheep_data_path / "isheep_50K.xlsx"))

Although it is possible to derive coordinates from the locations I have, I don't want to store latitude and longitude in my database: anyone who wants to use coordinates derived from a location needs to understand their precision level (I can't treat coordinates for a whole country like Russia at the same level as those for a Chinese county)

## 600K data
next, explore the 600k dataset

In [22]:
isheep_600K = metadata[metadata["Assay type"] == "600K chip"].copy()
isheep_600K.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 911 entries, 1867 to 2777
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        911 non-null    object
 1   Biosample ID     911 non-null    object
 2   Bioproject ID    911 non-null    object
 3   Species          911 non-null    object
 4   Breed            911 non-null    object
 5   Sex              911 non-null    object
 6   Sample location  911 non-null    object
 7   Material         911 non-null    object
 8   Technology       911 non-null    object
 9   Assay type       911 non-null    object
 10  Coverage         0 non-null      object
 11  data resource    911 non-null    object
dtypes: object(12)
memory usage: 92.5+ KB


In [23]:
isheep_600K.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
1867,DLS DLS355,SAMC062304,PRJCA001253,Ovis aries,Duolang,Female,"Aksu region of Xinjiang province, China",Ear,Illumina Ovine Infinium HD SNP BeadChip,600K chip,,"Gao et al., 2018"
1868,DLS DLS.DLS249,SAMC062305,PRJCA001253,Ovis aries,Duolang,Female,"Aksu region of Xinjiang province, China",Ear,Illumina Ovine Infinium HD SNP BeadChip,600K chip,,"Gao et al., 2018"
1869,DLS DLS.DLS302,SAMC062306,PRJCA001253,Ovis aries,Duolang,Male,"Aksu region of Xinjiang province, China",Ear,Illumina Ovine Infinium HD SNP BeadChip,600K chip,,"Gao et al., 2018"
1870,DLS DLS.DLS305,SAMC062307,PRJCA001253,Ovis aries,Duolang,Male,"Aksu region of Xinjiang province, China",Ear,Illumina Ovine Infinium HD SNP BeadChip,600K chip,,"Gao et al., 2018"
1871,DLS DLS.DLS309,SAMC062308,PRJCA001253,Ovis aries,Duolang,Male,"Aksu region of Xinjiang province, China",Ear,Illumina Ovine Infinium HD SNP BeadChip,600K chip,,"Gao et al., 2018"


In [24]:
isheep_600K["Species"].unique()

array(['Ovis aries'], dtype=object)

In [25]:
isheep_600K["Breed"].unique()

array(['Duolang', 'Finnsheep', 'Texel', 'Large Tailed Han', 'Hu', 'Wadi',
       'Icelandic', 'Romanov', 'Sishui Fur', 'Small Tailed Han'],
      dtype=object)

Try to collect data from CNCB biosample or load data from file

In [26]:
# Local cache of the CNCB BioSample records for the 600K samples
biosample_pkl = isheep_data_path / "isheep_600K_biosample.pkl"

if not biosample_pkl.exists():
    # cache miss: query CNCB once per sample, then persist the table both as
    # a pickle (for later runs) and as a spreadsheet
    biosample_data = [
        fetch_cncb_biosample(biosample_id)
        for biosample_id in tqdm(isheep_600K["Biosample ID"])
    ]
    isheep_600K_biosample = pd.json_normalize(biosample_data)
    isheep_600K_biosample.to_pickle(biosample_pkl)
    isheep_600K_biosample.to_excel(str(isheep_data_path / "isheep_600K_biosample.xlsx"))
else:
    # cache hit: reuse the pickle written by a previous run of this cell
    isheep_600K_biosample = pd.read_pickle(biosample_pkl)

In [27]:
isheep_600K_biosample.head()

Unnamed: 0,accession,message,name,sampleId,title,userId,sampleAttribute.attributeId,sampleAttribute.biomaterialProvider,sampleAttribute.breed,sampleAttribute.breedHistory,sampleAttribute.geographicLocation,sampleAttribute.latitudeLongitude,sampleAttribute.sample.sampleId,sampleAttribute.sex,sampleAttribute.taxon.name,sampleAttribute.taxon.taxonId,sampleAttribute.tissue,sampletype.attributeTable,sampletype.sampleTypeId,sampletype.sampleTypeName
0,SAMC062304,SUCCESS,DLS DLS355,65342,we get the Ovine Infinium HD SNP BeadChip from...,987,10106,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,East and Central Asia,38.37 N 77.23 E,65342,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample
1,SAMC062305,SUCCESS,DLS DLS.DLS249,65343,we get the Ovine Infinium HD SNP BeadChip from...,987,10107,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,East and Central Asia,38.37 N 77.23 E,65343,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample
2,SAMC062306,SUCCESS,DLS DLS.DLS302,65344,we get the Ovine Infinium HD SNP BeadChip from...,987,10108,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,East and Central Asia,38.37 N 77.23 E,65344,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample
3,SAMC062307,SUCCESS,DLS DLS.DLS305,65345,we get the Ovine Infinium HD SNP BeadChip from...,987,10109,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,East and Central Asia,38.37 N 77.23 E,65345,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample
4,SAMC062308,SUCCESS,DLS DLS.DLS309,65346,we get the Ovine Infinium HD SNP BeadChip from...,987,10110,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,East and Central Asia,38.37 N 77.23 E,65346,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample


This time I can find coordinates in BioSamples in NE format (i.e. *38.37 N 77.23 E*). The sample name is the same as the one I found in the data file. Try to split geographic coordinates using geopy:

In [28]:
# Parse every "lat N lon E" string exactly once with geopy's Point; a missing
# value is kept as None here and becomes NaN in both coordinate columns
# instead of making the parser raise
isheep_600K_points = [
    Point(value) if pd.notna(value) else None
    for value in isheep_600K_biosample["sampleAttribute.latitudeLongitude"]
]

isheep_600K_biosample["latitude"] = [
    point.latitude if point is not None else float("nan")
    for point in isheep_600K_points
]
isheep_600K_biosample["longitude"] = [
    point.longitude if point is not None else float("nan")
    for point in isheep_600K_points
]
isheep_600K_biosample.head()

Unnamed: 0,accession,message,name,sampleId,title,userId,sampleAttribute.attributeId,sampleAttribute.biomaterialProvider,sampleAttribute.breed,sampleAttribute.breedHistory,...,sampleAttribute.sample.sampleId,sampleAttribute.sex,sampleAttribute.taxon.name,sampleAttribute.taxon.taxonId,sampleAttribute.tissue,sampletype.attributeTable,sampletype.sampleTypeId,sampletype.sampleTypeName,latitude,longitude
0,SAMC062304,SUCCESS,DLS DLS355,65342,we get the Ovine Infinium HD SNP BeadChip from...,987,10106,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,...,65342,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample,38.37,77.23
1,SAMC062305,SUCCESS,DLS DLS.DLS249,65343,we get the Ovine Infinium HD SNP BeadChip from...,987,10107,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,...,65343,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample,38.37,77.23
2,SAMC062306,SUCCESS,DLS DLS.DLS302,65344,we get the Ovine Infinium HD SNP BeadChip from...,987,10108,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,...,65344,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample,38.37,77.23
3,SAMC062307,SUCCESS,DLS DLS.DLS305,65345,we get the Ovine Infinium HD SNP BeadChip from...,987,10109,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,...,65345,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample,38.37,77.23
4,SAMC062308,SUCCESS,DLS DLS.DLS309,65346,we get the Ovine Infinium HD SNP BeadChip from...,987,10110,"Genomics and Animal Breeding Research Group, C...",Duolang sheep,China,...,65346,6,Ovis aries,9940,missing,sample_attr_model_animal,4,Model organism or animal sample,38.37,77.23


In [29]:
# Local cache of the CNCB BioProject records for the 600K samples
bioproject_pkl = isheep_data_path / "isheep_600K_bioproject.pkl"

if not bioproject_pkl.exists():
    # cache miss: one lookup per sample row (the fetch function is memoized,
    # so repeated project ids do not hit the network again), then persist
    bioproject_data = [
        fetch_cncb_bioproject(bioproject_id)
        for bioproject_id in tqdm(isheep_600K["Bioproject ID"])
    ]
    isheep_600K_bioproject = pd.json_normalize(bioproject_data)
    isheep_600K_bioproject.to_pickle(bioproject_pkl)
    isheep_600K_bioproject.to_excel(str(isheep_data_path / "isheep_600K_bioproject.xlsx"))
else:
    # cache hit: reuse the pickle written by a previous run of this cell
    isheep_600K_bioproject = pd.read_pickle(bioproject_pkl)

In [30]:
isheep_600K_bioproject = isheep_600K_bioproject.drop_duplicates(subset=["prjAccession"])
isheep_600K_bioproject.head()

Unnamed: 0,biomaterialProvider,dataTypes,description,listConsortium,listDataProviders,listExternalLinks,listGrants,listPublication,listRefProjects,message,...,submitter.firstName,submitter.lastName,submitter.middleName,submitter.organization,submitter.phone,submitter.postalCode,submitter.state,submitter.street,submitter.submitOrganizationUrl,submitter.submitterId
0,,"[{'dataTypeId': 7, 'dataTypeName': 'Phenotype ...",Genome-wide association analysis and Illumina ...,[],[],[],[{'agency': ' Special Project for Innovation o...,[{'articleTitle': 'A genome-wide association s...,[],SUCCESS,...,Qianghui,Zhu,,"Institute of Zoology, Chinese Academy of Sciences",15071296586,100101,,"beichen west road, Chaoyang District",http://www.ioz.ac.cn/,7224


Try to open the processed plink file and check that all sample ids are represented:

In [31]:
plinkio = plinkfile.open(str(isheep_data_path / "600K-all/600K-all"))

Check that sample names in plink file and metadata are the same

In [32]:
# Individual ids in plink file order (the list itself is reused further down)
samples = [sample.iid for sample in plinkio.get_samples()]
# set for constant-time membership tests instead of scanning the list per row
plink_ids = set(samples)

# Metadata sample ids that have no identical id in the plink file
missing_samples = [
    sample_id
    for sample_id in isheep_600K["Sample ID"]
    if sample_id not in plink_ids
]
count = len(missing_samples)

if count > 0:
    print(f"{count} samples name don't match between metadata and plink file")

121 samples name don't match between metadata and plink file


Let's print out the first 10 sample names from the metadata table which have no correspondence in the plink file

In [33]:
missing_samples[:10]

['DLS DLS355',
 'DLS DLS.DLS249',
 'DLS DLS.DLS302',
 'DLS DLS.DLS305',
 'DLS DLS.DLS309',
 'DLS DLS.DLS311',
 'DLS DLS.DLS313',
 'DLS DLS.DLS316',
 'DLS DLS.DLS317',
 'DLS DLS.DLS325']

In [34]:
# Destination spreadsheet for the metadata-id / plink-id lookup table
id2vcf_file = isheep_data_path / "samples_id-vcf-600K.xlsx"

# Put the two id lists side by side. The pairing is purely positional:
# NOTE(review): this assumes the plink file lists samples in the same order
# as the metadata rows - confirm before relying on the table
test = (
    isheep_600K[["Sample ID"]]
    .rename(columns={"Sample ID": "samples_id"})
    .assign(samples_vcf=samples)
)
test.to_excel(str(id2vcf_file))

Try to merge this table with biosample information:

In [35]:
isheep_600K = pd.merge(isheep_600K, isheep_600K_biosample, left_on="Biosample ID", right_on="accession", how="inner")

Save only 600K sample in a different file

In [36]:
isheep_600K.to_excel(str(isheep_data_path / "isheep_600K.xlsx"))

## WGS dataset
finally, get WGS metadata:

In [37]:
isheep_WGS = metadata[metadata["Assay type"] == "WGS"].copy()
isheep_WGS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 355 entries, 0 to 354
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        355 non-null    object
 1   Biosample ID     355 non-null    object
 2   Bioproject ID    355 non-null    object
 3   Species          355 non-null    object
 4   Breed            324 non-null    object
 5   Sex              150 non-null    object
 6   Sample location  269 non-null    object
 7   Material         316 non-null    object
 8   Technology       355 non-null    object
 9   Assay type       355 non-null    object
 10  Coverage         355 non-null    object
 11  data resource    355 non-null    object
dtypes: object(12)
memory usage: 36.1+ KB


In [38]:
isheep_WGS.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
0,AUMEHM01,ERS1460923,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.95,GVM
1,AUMEHM02,ERS1460904,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.87,GVM
2,AUMEHM03,ERS1460978,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.42,GVM
3,AUMEHM04,ERS1461020,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.45,GVM
4,AUMEPM05,ERS1461055,SRP078481,Ovis aries,Merino Polled,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.46,GVM


In [39]:
# Subset the WGS rows as an explicit copy, like the 50K and 600K subsets, so
# that later column assignments on isheep_WGS neither raise
# SettingWithCopyWarning nor depend on the parent `metadata` frame
isheep_WGS = metadata[metadata["Assay type"] == "WGS"].copy()
isheep_WGS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 355 entries, 0 to 354
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        355 non-null    object
 1   Biosample ID     355 non-null    object
 2   Bioproject ID    355 non-null    object
 3   Species          355 non-null    object
 4   Breed            324 non-null    object
 5   Sex              150 non-null    object
 6   Sample location  269 non-null    object
 7   Material         316 non-null    object
 8   Technology       355 non-null    object
 9   Assay type       355 non-null    object
 10  Coverage         355 non-null    object
 11  data resource    355 non-null    object
dtypes: object(12)
memory usage: 36.1+ KB


In [40]:
isheep_WGS.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
0,AUMEHM01,ERS1460923,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.95,GVM
1,AUMEHM02,ERS1460904,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.87,GVM
2,AUMEHM03,ERS1460978,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.42,GVM
3,AUMEHM04,ERS1461020,SRP078481,Ovis aries,Merino Horned,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.45,GVM
4,AUMEPM05,ERS1461055,SRP078481,Ovis aries,Merino Polled,Male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.46,GVM


In [41]:
isheep_WGS["Species"].unique()

array(['Ovis aries', 'Ovis canadensis', 'Ovis dalli', 'Ovis ammon',
       'Ovis orientalis'], dtype=object)

In [42]:
isheep_WGS["Breed"].unique()

array(['Merino Horned', 'Merino Polled', nan, 'Katahdin', 'Tibetan',
       'Small Tailed Han', 'Bighorn', 'Afshari', 'Awassi',
       'African White Dorper', 'Turkish Awassi', 'Brazilian Creole',
       'Bangladeshi', 'Morada Nova', 'Santa Inês', 'Castellana',
       'Cine Capari', 'Changthangi', 'Churra', 'Cheviot',
       'Dollgellau Welsh Mountain', 'Ethiopian Menz', 'Finnsheep',
       'Indian Garole', 'Gulf Coast native', 'Garut', 'Karya', 'Karakas',
       'Meat Lacaune', 'Mmilk Lacaune', 'Merino', 'Norduz',
       'Namaqua Afrikaner', 'Ojalada', 'Poll Dorset',
       'Ronderib Afrikaner', 'Romney', 'Salz', 'Scottish Blackface',
       'Sakiz', 'Swiss Mirror', 'Sumatra', 'Swiss White Alpine', 'Texel',
       'Tregaon Welsh Mountain', 'Valais Blacknose',
       'Welsh Hardy Speckled Face', 'Northern Tibetan', 'Eastern Tibetan',
       'Ovis canadensis', 'Ovis dalli', 'North American Soay', 'Argali',
       'Altay', 'Baerchuke', 'Bayinbuluke', 'Diqing', 'Guide Black Fur',
       '

Some WGS samples have an unknown breed. Should I consider such animals? I hope I can find a proper breed from BioSamples

In [43]:
isheep_WGS[isheep_WGS["Breed"].isna()]

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
10,Camb0619497,ERS1171396,ERP015709,Ovis aries,,,,,Illumina HiSeq 2000,WGS,13.68,GVM
11,Camb10114673,ERS1171397,ERP015709,Ovis aries,,,,,Illumina HiSeq 2000,WGS,13.92,GVM
12,Camb12117423,ERS1171398,ERP015709,Ovis aries,,,,,Illumina HiSeq 2000,WGS,12.61,GVM
13,Camb976145,ERS1171399,ERP015709,Ovis aries,,,,,Illumina HiSeq 2000,WGS,12.77,GVM
14,Camb997502,ERS1171400,ERP015709,Ovis aries,,,,,Illumina HiSeq 2000,WGS,13.47,GVM
15,Camb997522,ERS1171401,ERP015709,Ovis aries,,,,,Illumina HiSeq 2000,WGS,12.87,GVM
16,Camb997550,ERS1171402,ERP015709,Ovis aries,,,,,Illumina HiSeq 2000,WGS,12.06,GVM
19,IROA-B2-5037,ERS154864,ERP001582,Ovis aries,,Female,"Southern Asia,Iran,ilbolaghi,ilbolaghi",,Illumina HiSeq 2000,WGS,14.01,GVM
20,IROA-B2-5296,ERS239046,ERP001582,Ovis aries,,Female,"Southern Asia,Iran,Maku,Maku",,Illumina HiSeq 2000,WGS,11.26,GVM
21,IROA-B3-5134,ERS239047,ERP001582,Ovis aries,,Male,"Southern Asia,Iran,khoiy,khoiy",,Illumina HiSeq 2000,WGS,12.31,GVM


Some breeds are also missing in the original dataset, for example the "*IROA\**" samples (which seem to come from the nextgen dataset):

In [44]:
nextgenfile = str(isheep_data_path / "ovis.sample_index")
nextgen = pd.read_table(nextgenfile, skiprows=10)
nextgen.head()

Unnamed: 0,#sample_name,sample_accession,biosamples_id,sample_provider,species,taxonomy_id,breed,country,closest_city,closest_locality,estimated_age_months,sex,longitude,latitude,sampling_date,photographs
0,IROA-B2-5037,ERS154864,SAMEA2012928,NEXTGEN,Ovis aries,9940,.,Iran,ilbolaghi,ilbolaghi,60,female,44.928195,39.044641,2011-10-24,"IROA-B2-5037a.JPG,IROA-B2-5037c.JPG,IROA-B2-50..."
1,IROA-B2-5296,ERS239046,SAMEA2065588,NEXTGEN,Ovis aries,9940,.,Iran,Maku,Maku,36,female,44.929098,39.045764,2011-10-27,"IROA-B2-5296c.JPG,IROA-B2-5296b.JPG,IROA-B2-52..."
2,IROA-B3-5134,ERS239047,SAMEA2065589,NEXTGEN,Ovis aries,9940,.,Iran,khoiy,khoiy,24,male,44.93919,38.656644,2011-10-30,"IROA-B3-5134c.JPG,IROA-B3-5134a.JPG,IROA-B3-51..."
3,IROA-B4-5190,ERS154865,SAMEA2012929,NEXTGEN,Ovis aries,9940,.,Iran,salmas,salmas,36,female,44.838767,38.153121,2011-10-27,"IROA-B4-5190c.JPG,IROA-B4-5190a.JPG,IROA-B4-51..."
4,IROA-B5-5295,ERS154863,SAMEA2012927,NEXTGEN,Ovis aries,9940,.,Iran,urumie,urumie,24,male,44.952849,37.972674,2011-10-26,"IROA-B5-5295a.JPG,IROA-B5-5295c.JPG,IROA-B5-52..."


However, I could collect coordinates from this data file. Try to collect data from BioSamples or load data from file. The point is that some of those records are in EBI BioSamples, so let's focus on *CNCB* first:

In [45]:
# Local cache of the CNCB BioSample lookups for the WGS samples
biosample_pkl = isheep_data_path / "isheep_WGS_cncb_biosample.pkl"

if not biosample_pkl.exists():
    # cache miss: query CNCB once per sample, then persist the table both as
    # a pickle (for later runs) and as a spreadsheet
    biosample_data = [
        fetch_cncb_biosample(biosample_id)
        for biosample_id in tqdm(isheep_WGS["Biosample ID"])
    ]
    isheep_WGS_cncb_biosample = pd.json_normalize(biosample_data)
    isheep_WGS_cncb_biosample.to_pickle(biosample_pkl)
    isheep_WGS_cncb_biosample.to_excel(str(isheep_data_path / "isheep_WGS_cncb_biosample.xlsx"))
else:
    # cache hit: reuse the pickle written by a previous run of this cell
    isheep_WGS_cncb_biosample = pd.read_pickle(biosample_pkl)

In [46]:
isheep_WGS_cncb_biosample.head()

Unnamed: 0,message,sampleId,userId,accession,name,title,sampleAttribute.age,sampleAttribute.attributeId,sampleAttribute.biomaterialProvider,sampleAttribute.birthLocation,...,sampleAttribute.latitudeLongitude,sampleAttribute.sample.sampleId,sampleAttribute.sex,sampleAttribute.storageConditions,sampleAttribute.taxon.name,sampleAttribute.taxon.taxonId,sampleAttribute.tissue,sampletype.attributeTable,sampletype.sampleTypeId,sampletype.sampleTypeName
0,Nonentity,0,0,,,,,,,,...,,,,,,,,,,
1,Nonentity,0,0,,,,,,,,...,,,,,,,,,,
2,Nonentity,0,0,,,,,,,,...,,,,,,,,,,
3,Nonentity,0,0,,,,,,,,...,,,,,,,,,,
4,Nonentity,0,0,,,,,,,,...,,,,,,,,,,


Records with the `Nonentity` message are records I couldn't find in *CNCB*. Let's search for them in *EBI* BioSamples:

In [47]:
# Pickle used as a local cache of the EBI biosample records
biosample_pkl = isheep_data_path / "isheep_WGS_ebi_biosample.pkl"

if not biosample_pkl.exists():
    # no cache yet: query EBI once for every biosample ID in the WGS metadata
    biosample_data = list(map(fetch_ebi_biosample, tqdm(isheep_WGS["Biosample ID"])))
    isheep_WGS_ebi_biosample = pd.json_normalize(biosample_data)

    # store the flattened table as pickle (the cache) and as a spreadsheet copy
    isheep_WGS_ebi_biosample.to_pickle(biosample_pkl)
    isheep_WGS_ebi_biosample.to_excel(str(isheep_data_path / "isheep_WGS_ebi_biosample.xlsx"))
else:
    # cache available: skip the remote queries
    isheep_WGS_ebi_biosample = pd.read_pickle(biosample_pkl)

In [48]:
# Preview the first rows of the EBI biosample table
isheep_WGS_ebi_biosample.head()

Unnamed: 0,name,accession,domain,taxId,release,update,submitted,externalReferences,submittedVia,create,...,characteristics.estimated age,characteristics.geographic location (country and/or sea),characteristics.latitude,characteristics.longitude,characteristics.sampling date,characteristics.pool id,characteristics.biosamplemodel,characteristics.source of DNA,characteristics.breeding_history,characteristics.lat lon
0,AUMEHM000000000001,SAMN05302753,self.BiosampleImportNCBI,9940.0,2016-06-28T20:40:04.450Z,2022-03-29T19:09:00.092Z,2016-06-28T20:40:05.160Z,https://www.ebi.ac.uk/ena/data/view/SAMN05302753,JSON_API,2016-06-28T20:40:05.160Z,...,,,,,,,,,,
1,AUMEHM000000000002,SAMN05302754,self.BiosampleImportNCBI,9940.0,2016-06-28T20:40:04.450Z,2022-03-29T19:09:00.127Z,2016-06-28T20:40:05.483Z,https://www.ebi.ac.uk/ena/data/view/SAMN05302754,JSON_API,2016-06-28T20:40:05.483Z,...,,,,,,,,,,
2,AUMEHM000000000003,SAMN05302755,self.BiosampleImportNCBI,9940.0,2016-06-28T20:40:04.450Z,2022-03-29T19:09:00.127Z,2016-06-28T20:40:05.583Z,https://www.ebi.ac.uk/ena/data/view/SAMN05302755,JSON_API,2016-06-28T20:40:05.583Z,...,,,,,,,,,,
3,AUMEHM000000000004,SAMN05302756,self.BiosampleImportNCBI,9940.0,2016-06-28T20:40:04.450Z,2022-03-29T19:09:00.151Z,2016-06-28T20:40:06.063Z,https://www.ebi.ac.uk/ena/data/view/SAMN05302756,JSON_API,2016-06-28T20:40:06.063Z,...,,,,,,,,,,
4,AUMEPM000000000005,SAMN05302757,self.BiosampleImportNCBI,9940.0,2016-06-28T20:40:04.450Z,2022-03-29T19:09:00.165Z,2016-06-28T20:40:06.130Z,https://www.ebi.ac.uk/ena/data/view/SAMN05302757,JSON_API,2016-06-28T20:40:06.130Z,...,,,,,,,,,,


Let's search for bioproject (or at least, for the samples I can find):

In [49]:
# Pickle used as a local cache of the CNCB bioproject records
bioproject_pkl = isheep_data_path / "isheep_WGS_bioproject.pkl"

if not bioproject_pkl.exists():
    # no cache yet: query CNCB once for every bioproject ID in the WGS metadata
    bioproject_data = list(map(fetch_cncb_bioproject, tqdm(isheep_WGS["Bioproject ID"])))
    isheep_WGS_bioproject = pd.json_normalize(bioproject_data)

    # store the flattened table as pickle (the cache) and as a spreadsheet copy
    isheep_WGS_bioproject.to_pickle(bioproject_pkl)
    isheep_WGS_bioproject.to_excel(str(isheep_data_path / "isheep_WGS_bioproject.xlsx"))
else:
    # cache available: skip the remote queries
    isheep_WGS_bioproject = pd.read_pickle(bioproject_pkl)

In [50]:
# One record was fetched per sample, so projects are repeated:
# keep only the first row seen for each project accession
repeated_project = isheep_WGS_bioproject.duplicated(subset=["prjAccession"])
isheep_WGS_bioproject = isheep_WGS_bioproject[~repeated_project]
isheep_WGS_bioproject.head()

Unnamed: 0,message,prjAccession,biomaterialProvider,dataTypes,description,listConsortium,listDataProviders,listExternalLinks,listGrants,listPublication,...,submitter.firstName,submitter.lastName,submitter.middleName,submitter.organization,submitter.phone,submitter.postalCode,submitter.state,submitter.street,submitter.submitOrganizationUrl,submitter.submitterId
0,Nonentity,SRP078481,,,,,,,,,...,,,,,,,,,,
10,Nonentity,ERP015709,,,,,,,,,...,,,,,,,,,,
17,Nonentity,SRP031497,,,,,,,,,...,,,,,,,,,,
18,Nonentity,SRP096151,,,,,,,,,...,,,,,,,,,,
19,Nonentity,ERP001582,,,,,,,,,...,,,,,,,,,,


Some samples can't be found in CNCB biosamples; the ones that are found seem to have geographic coordinates in the same format as the 600K samples.

In [51]:
# rows whose message is anything other than "Nonentity"
found_in_cncb = isheep_WGS_cncb_biosample["message"] != "Nonentity"
isheep_WGS_cncb_biosample[found_in_cncb].head()

Unnamed: 0,message,sampleId,userId,accession,name,title,sampleAttribute.age,sampleAttribute.attributeId,sampleAttribute.biomaterialProvider,sampleAttribute.birthLocation,...,sampleAttribute.latitudeLongitude,sampleAttribute.sample.sampleId,sampleAttribute.sex,sampleAttribute.storageConditions,sampleAttribute.taxon.name,sampleAttribute.taxon.taxonId,sampleAttribute.tissue,sampletype.attributeTable,sampletype.sampleTypeId,sampletype.sampleTypeName
125,SUCCESS,59201,987,SAMC056212,ARG_0624D,we sequence this wild speices (Ovis ammon) ARG...,missing,8260.0,CAS Key Laboratory of Animal Ecology and Conse...,"Xinjiang, China",...,39.47 N 75.99 E,59201.0,1.0,preserved in 95% ethanol and stored at -80°C,Ovis ammon,30527.0,ear,sample_attr_model_animal,4.0,Model organism or animal sample
126,SUCCESS,59199,987,SAMC056210,ARG_6,we sequence this wild speices (Ovis ammon) ARG...,missing,8258.0,CAS Key Laboratory of Animal Ecology and Conse...,"Xinjiang, China",...,39.47 N 75.99 E,59199.0,1.0,preserved in 95% ethanol and stored at -80°C,Ovis ammon,30527.0,ear,sample_attr_model_animal,4.0,Model organism or animal sample
127,SUCCESS,59200,987,SAMC056211,ARG_8,we sequence this wild speices (Ovis ammon) ARG...,missing,8259.0,CAS Key Laboratory of Animal Ecology and Conse...,"Xinjiang, China",...,39.47 N 75.99 E,59200.0,1.0,preserved in 95% ethanol and stored at -80°C,Ovis ammon,30527.0,ear,sample_attr_model_animal,4.0,Model organism or animal sample
128,SUCCESS,59118,987,SAMC056129,ALS2,we sequence this Altay sheep ALS2 with whole-g...,missing,8177.0,CAS Key Laboratory of Animal Ecology and Conse...,"Aletai, Xinjiang",...,47.48 N 87.43 E,59118.0,6.0,preserved in 95% ethanol and stored at -80°C,Ovis aries,9940.0,missing,sample_attr_model_animal,4.0,Model organism or animal sample
129,SUCCESS,71025,987,SAMC067958,ALS285,sheep whole-genome resequencing,missing,11889.0,CAS Key Laboratory of Animal Ecology and Conse...,"Aletai, Xinjiang",...,,71025.0,2.0,,Ovis aries,9940.0,Ear,sample_attr_model_animal,4.0,Model organism or animal sample


Check that sample names in plink file and metadata are the same:

In [52]:
# Open the WGS genotype fileset with plinkio
wgs_plink_path = isheep_data_path / "WGS-all/WGS-all.smarter"
plinkio = plinkfile.open(str(wgs_plink_path))

In [53]:
# Sample identifiers (iid) read from the PLINK fileset, kept as a list in file order
samples = [sample.iid for sample in plinkio.get_samples()]

# Flag metadata rows whose "Sample ID" is not among the PLINK sample IDs.
# Series.isin hashes the PLINK IDs once, instead of scanning the whole list
# for every metadata row as the row-by-row loop did.
missing_in_plink = ~isheep_WGS["Sample ID"].isin(samples)
count = int(missing_in_plink.sum())

if count > 0:
    print(f"{count} samples name don't match between metadata and plink file")
        

127 samples name don't match between metadata and plink file


In [54]:
# Put metadata IDs and PLINK IDs side by side (paired by position) to compare them by eye
test = pd.DataFrame(
    {
        "samples_id": isheep_WGS["Sample ID"],
        "samples_vcf": samples,
    }
)
samples_comparison_xlsx = isheep_data_path / "samples_id-vcf-WGS.xlsx"
test.to_excel(str(samples_comparison_xlsx))

IDs are different between metadata and datafiles. Save only the WGS samples in a separate file:

In [55]:
# Export the WGS metadata table as a spreadsheet
isheep_WGS_xlsx = isheep_data_path / "isheep_WGS.xlsx"
isheep_WGS.to_excel(str(isheep_WGS_xlsx))

Try to merge this table with biosample information:

In [56]:
# The biosample tables were built by fetching one record per metadata row, so a
# Biosample ID shared by several rows appears several times, and records that
# were not found (e.g. the CNCB "Nonentity" rows) have a missing key.
# pandas also matches null keys against each other, so keep a single row per
# non-null key: the left joins below then cannot multiply the rows of isheep_WGS.
cncb_biosample_unique = (
    isheep_WGS_cncb_biosample
    .dropna(subset=["accession"])
    .drop_duplicates(subset=["accession"])
)
ebi_biosample_unique = (
    isheep_WGS_ebi_biosample
    .dropna(subset=["characteristics.SRA accession"])
    .drop_duplicates(subset=["characteristics.SRA accession"])
)

# NOTE: this cell overwrites isheep_WGS, so it is not idempotent. Re-run the
# notebook from the top instead of re-running this cell alone, otherwise the
# biosample columns are merged a second time.
isheep_WGS = pd.merge(isheep_WGS, cncb_biosample_unique, left_on="Biosample ID", right_on="accession", how="left")
isheep_WGS = pd.merge(isheep_WGS, ebi_biosample_unique, left_on="Biosample ID", right_on="characteristics.SRA accession", how="left")

Save the WGS samples, now merged with the biosample information, overwriting the file written above:

In [57]:
# Export the WGS table (now carrying the merged biosample columns) as a spreadsheet
isheep_WGS_xlsx = isheep_data_path / "isheep_WGS.xlsx"
isheep_WGS.to_excel(str(isheep_WGS_xlsx))