# ISHEEP dataset
Attempt to explore the ISHEEP dataset to understand if it can be imported into *SMARTER-database*

In [1]:
import re
import functools
import requests
from pathlib import Path

import pandas as pd
from plinkio import plinkfile
from tqdm.notebook import tqdm
from geopy.geocoders import Nominatim

from src.features.utils import get_project_dir

Define some useful functions:

In [2]:
# Module-level Nominatim geocoding client, used by get_location below; the
# user agent string is what identifies this notebook to the geocoding service.
geolocator = Nominatim(user_agent="SMARTER-database")


@functools.cache
def get_location(sample_location):
    """Geocode a free-text location through the module-level ``geolocator``.

    Calls are memoized on ``sample_location``, so a location that is repeated
    across samples is sent to the geocoder only once.

    Returns:
        An ``(address, latitude, longitude)`` tuple taken from the geocoder
        result, or ``(None, None, None)`` when the geocoder returns nothing.
    """
    match = geolocator.geocode(sample_location)

    if match:
        return match.address, match.latitude, match.longitude

    return None, None, None


# https://stackoverflow.com/a/20558779
def stripNone(data):
    """Recursively remove ``None`` entries from nested containers.

    * dict: pairs whose key or value is ``None`` are dropped, the remaining
      values are cleaned recursively;
    * list / tuple / set: ``None`` items are dropped, the remaining items are
      cleaned recursively and a plain container of the same kind is returned;
    * anything else is returned unchanged.
    """
    if isinstance(data, dict):
        cleaned = {}
        for key, value in data.items():
            if key is None or value is None:
                continue
            cleaned[key] = stripNone(value)
        return cleaned

    # same precedence as a list -> tuple -> set isinstance chain
    for container in (list, tuple, set):
        if isinstance(data, container):
            return container(stripNone(item) for item in data if item is not None)

    return data

    
@functools.cache
def fetch_cncb_biosample(biosample_id, timeout=30):
    """Fetch one BioSample record from the CNCB/NGDC public API.

    Args:
        biosample_id: accession appended to the ``bioSample/`` endpoint URL.
        timeout: seconds passed to ``requests.get``; without a timeout a
            stalled connection would block the whole fetch loop forever.

    Returns:
        The decoded JSON payload with ``None`` entries stripped and the
        ``submitter`` / ``taxon`` entries removed, or ``None`` when the
        service does not answer with HTTP 200.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires (exceptions are not cached, so the call can
            simply be retried).

    Note:
        Results are memoized: repeated calls return the *same* object, so
        callers should not mutate the returned dict.
    """
    response = requests.get(
        f"https://ngdc.cncb.ac.cn/gwh/api/public/bioSample/{biosample_id}",
        timeout=timeout,
    )

    if response.status_code != 200:
        return None

    data = stripNone(response.json())

    # only a JSON object can carry these keys; any other payload is returned as is
    if isinstance(data, dict):
        for key in ("submitter", "taxon"):
            data.pop(key, None)

    return data

Read all data from *ISHEEP dataset*

In [3]:
# Read sheet "Table S1" of the ISHEEP metadata workbook. skiprows=1 discards the
# first spreadsheet row, so the row after it supplies the column names.
tablefile = str(get_project_dir() / "data/external/SHE/ISHEEP/Table_1_fixed.xlsx")
metadata = pd.read_excel(tablefile, sheet_name="Table S1", skiprows=1)

In [4]:
metadata.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
0,AUMEHM01,ERS1460923,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.95,GVM
1,AUMEHM02,ERS1460904,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.87,GVM
2,AUMEHM03,ERS1460978,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.42,GVM
3,AUMEHM04,ERS1461020,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.45,GVM
4,AUMEPM05,ERS1461055,SRP078481,Ovis aries,Merino Polled,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.46,GVM


Those are the different data types present in the isheep dataset

In [5]:
metadata["Assay type"].unique()

array(['WGS', '50K chip', '600k chip'], dtype=object)

## 50K data
OK, let's subset the 50K dataset

In [6]:
# .copy() makes the subset an independent frame, so it can be modified later
# without touching (or warning about) `metadata`
isheep_50K = metadata[metadata["Assay type"] == "50K chip"].copy()

In [7]:
isheep_50K.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1512 entries, 355 to 1866
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        1512 non-null   object
 1   Biosample ID     1512 non-null   object
 2   Bioproject ID    1512 non-null   object
 3   Species          1512 non-null   object
 4   Breed            1512 non-null   object
 5   Sex              1512 non-null   object
 6   Sample location  1512 non-null   object
 7   Material         1512 non-null   object
 8   Technology       1512 non-null   object
 9   Assay type       1512 non-null   object
 10  Coverage         0 non-null      object
 11  data resource    1512 non-null   object
dtypes: object(12)
memory usage: 153.6+ KB


In [8]:
isheep_50K.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
355,ARG10,SAMC060792,PRJCA001252,Ovis ammon,Argali,-,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
356,ARG11,SAMC060793,PRJCA001252,Ovis ammon,Argali,-,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
357,ARG9,SAMC060794,PRJCA001252,Ovis ammon,Argali,-,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
358,ARG6,SAMC060795,PRJCA001252,Ovis ammon,Argali,-,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"
359,ARG8,SAMC060796,PRJCA001252,Ovis ammon,Argali,-,"Xinjiang, China",Ear,"Illumina Ovine SNP50K (54,241 SNPs) BeadChip",50K chip,,"Zhao et al., 2017"


There are species other than *Ovis aries* in the dataset

In [9]:
isheep_50K["Species"].unique()

array(['Ovis ammon', 'Ovis orientalis', 'Ovis aries'], dtype=object)

Get the different *breeds* from the dataset. I need to remove the *sheep* suffix from the names, check them against my SMARTER breeds, and deal with trailing spaces and odd values:

In [10]:
isheep_50K["Breed"].unique()

array(['Argali ', 'mouflon ', 'Baerchuke sheep', 'Bashbay sheep',
       'Celei black sheep', 'Diqing sheep', 'Guide Black Fur sheep',
       'Guangling fat-tail sheep', 'Hulun Buir sheep', 'Hetian sheep',
       'Hanzhong sheep', 'Jingzhong sheep', 'Kirghiz sheep',
       'Lanzhou Large-tailed sheep', 'Lop sheep',
       'Luzhong Mountain sheep', 'Minxian Black Fur sheep',
       'Ninglang Black sheep', 'Sunite sheep', 'Shiping Gray sheep',
       'Tan sheep', 'Tengchong sheep', 'Taihang Fur sheep',
       'Turfan Black sheep', 'Tong sheep', 'Tashkurgan sheep',
       'Lanping Black-bone sheep', 'Weining sheep', 'Wuranke sheep',
       'Ujimqin sheep', 'Yecheng sheep', 'Yuxi Fat-tailed sheep',
       'Zhaotong sheep', 'Altay sheep', 'Bayinbuluke sheep',
       'Tibetan sheep', 'Duolang sheep', 'Kazakh sheep'], dtype=object)

Try to collect data from CNCB biosample or load data from file

In [11]:
# Query the CNCB API only when no pickle from a previous run is available
if not Path("isheep_50K_biosample.pkl").exists():
    biosample_data = [
        fetch_cncb_biosample(biosample_id)
        for biosample_id in tqdm(isheep_50K["Biosample ID"])
    ]
    isheep_50K_biosample = pd.json_normalize(biosample_data)

    # the pickle is what later runs reload; an Excel copy is written alongside
    isheep_50K_biosample.to_pickle("isheep_50K_biosample.pkl")
    isheep_50K_biosample.to_excel("isheep_50K_biosample.xlsx")
else:
    isheep_50K_biosample = pd.read_pickle("isheep_50K_biosample.pkl")

Data collected from biosamples are not different from data collected from metadata. Try to open the processed plink file and check that all sample ids are represented

In [12]:
# Open the processed 50K PLINK dataset.
# NOTE(review): the path carries no file extension, presumably the fileset
# prefix expected by plinkfile.open — confirm against the plinkio docs.
plinkio = plinkfile.open(str(get_project_dir() / "data/external/SHE/ISHEEP/50K-all/50K-all"))

In [13]:
# Individual IDs stored in the PLINK file, in file order
samples = [sample.iid for sample in plinkio.get_samples()]

# Vectorised membership test: avoids scanning the `samples` list once per
# metadata row (which is what `in` on a list inside iterrows() does)
missing_50K = isheep_50K.loc[~isheep_50K["Sample ID"].isin(samples), "Sample ID"]

# Nothing is printed when every metadata sample is present in the PLINK file
for sample_id in missing_50K:
    print(f"Missing {sample_id}")

Sample names between metadata and plink file are identical. Try to derive sample location using geocoding:

In [14]:
# Geocode every sample location through get_location. The result is only kept
# in `test`: the lines that would add it to the table are deliberately left
# disabled (see the note following this cell).
test = isheep_50K["Sample location"].apply(get_location)
# isheep_50K["address"] = test.apply(lambda x: x[0])
# isheep_50K["latitude"] = test.apply(lambda x: x[1])
# isheep_50K["longitude"] = test.apply(lambda x: x[2])
# isheep_50K.head()
# Export the 50K subset as an Excel file (relative path, i.e. the working directory)
isheep_50K.to_excel("isheep_50K.xlsx")

Although it is possible to derive coordinates from the locations I have, I don't want to store a latitude and longitude in my database: anyone who wants to derive coordinates from a location needs to understand the precision level of that location (I can't treat coordinates for Russia as being at the same level of precision as those for a Chinese county)

## 600K data
next, explore the 600k dataset

In [15]:
# Take an independent copy, as is done for the 50K and WGS subsets, so that any
# later modification of this frame cannot alias `metadata` or raise
# SettingWithCopyWarning
isheep_600K = metadata[metadata["Assay type"] == "600k chip"].copy()
isheep_600K.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 911 entries, 1867 to 2777
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        911 non-null    object
 1   Biosample ID     911 non-null    object
 2   Bioproject ID    911 non-null    object
 3   Species          911 non-null    object
 4   Breed            911 non-null    object
 5   Sex              911 non-null    object
 6   Sample location  911 non-null    object
 7   Material         911 non-null    object
 8   Technology       911 non-null    object
 9   Assay type       911 non-null    object
 10  Coverage         0 non-null      object
 11  data resource    911 non-null    object
dtypes: object(12)
memory usage: 92.5+ KB


In [16]:
isheep_600K.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
1867,DLS DLS355,SAMC062304,PRJCA001253,Ovis aries,Duolang sheep,female,"Aksu region of Xinjiang province, China",ear,Illumina Ovine Infinium HD SNP BeadChip,600k chip,,"Gao et al., 2018"
1868,DLS DLS.DLS249,SAMC062305,PRJCA001253,Ovis aries,Duolang sheep,female,"Aksu region of Xinjiang province, China",ear,Illumina Ovine Infinium HD SNP BeadChip,600k chip,,"Gao et al., 2018"
1869,DLS DLS.DLS302,SAMC062306,PRJCA001253,Ovis aries,Duolang sheep,male,"Aksu region of Xinjiang province, China",ear,Illumina Ovine Infinium HD SNP BeadChip,600k chip,,"Gao et al., 2018"
1870,DLS DLS.DLS305,SAMC062307,PRJCA001253,Ovis aries,Duolang sheep,male,"Aksu region of Xinjiang province, China",ear,Illumina Ovine Infinium HD SNP BeadChip,600k chip,,"Gao et al., 2018"
1871,DLS DLS.DLS309,SAMC062308,PRJCA001253,Ovis aries,Duolang sheep,male,"Aksu region of Xinjiang province, China",ear,Illumina Ovine Infinium HD SNP BeadChip,600k chip,,"Gao et al., 2018"


In [17]:
isheep_600K["Species"].unique()

array(['Ovis aries'], dtype=object)

In [18]:
isheep_600K["Breed"].unique()

array(['Duolang sheep', 'Finnsheep', 'Texel sheep',
       'Large Tailed Han sheep', 'Hu sheep', 'Wadi sheep',
       'Icelandic sheep', 'Romanov sheep', 'Sishui Fur sheep',
       'Small Tailed Han sheep'], dtype=object)

Try to collect data from CNCB biosample or load data from file

In [19]:
# Query the CNCB API only when no pickle from a previous run is available
if not Path("isheep_600K_biosample.pkl").exists():
    biosample_data = [
        fetch_cncb_biosample(biosample_id)
        for biosample_id in tqdm(isheep_600K["Biosample ID"])
    ]
    isheep_600K_biosample = pd.json_normalize(biosample_data)

    # the pickle is what later runs reload; an Excel copy is written alongside
    isheep_600K_biosample.to_pickle("isheep_600K_biosample.pkl")
    isheep_600K_biosample.to_excel("isheep_600K_biosample.xlsx")
else:
    isheep_600K_biosample = pd.read_pickle("isheep_600K_biosample.pkl")

This time I can find coordinates in Biosamples in NE format (i.e. *38.37 N 77.23 E*). The sample name is the same as the one I found in the datafile. I need to define a sample alias equal to the sample name I find in the plink file. Try to open the processed plink file and check that all sample IDs are represented

In [20]:
# Open the processed 600K PLINK dataset.
# NOTE(review): this rebinds `plinkio`; the handle opened earlier for the 50K
# data is not explicitly closed anywhere in the notebook.
plinkio = plinkfile.open(str(get_project_dir() / "data/external/SHE/ISHEEP/600K-all/600K-all"))

Check that sample names in plink file and metadata are the same

In [21]:
# Individual IDs stored in the PLINK file, in file order
samples = [sample.iid for sample in plinkio.get_samples()]

# Count the metadata IDs that are absent from the PLINK file in one vectorised
# pass, instead of scanning the `samples` list once per metadata row
count = int((~isheep_600K["Sample ID"].isin(samples)).sum())

if count > 0:
    print(f"{count} samples name don't match between metadata and plink file")

121 samples name don't match between metadata and plink file


In [22]:
# Put metadata IDs and PLINK IDs side by side for manual comparison.
# NOTE(review): `samples` is a plain list, so it is paired with the metadata
# rows purely by position and must have exactly as many items as `isheep_600K`
# has rows — confirm both sources list the animals in the same order.
test = pd.DataFrame({"samples_id": isheep_600K["Sample ID"], "samples_vcf": samples})
# An existing spreadsheet is never overwritten
if not Path("samples_id-vcf-600K.xlsx").exists():
    test.to_excel("samples_id-vcf-600K.xlsx")

Save only the 600K samples in a separate file

In [23]:
# Relative path: the spreadsheet is written to the current working directory
# and replaces any file of the same name
isheep_600K.to_excel("isheep_600K.xlsx")

## WGS dataset
finally, get WGS metadata:

In [24]:
# Independent copy of the WGS rows (the only assay type with "Coverage" filled in,
# as the info() summary shows)
isheep_WGS = metadata[metadata["Assay type"] == "WGS"].copy()
isheep_WGS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 355 entries, 0 to 354
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        355 non-null    object
 1   Biosample ID     355 non-null    object
 2   Bioproject ID    355 non-null    object
 3   Species          355 non-null    object
 4   Breed            355 non-null    object
 5   Sex              355 non-null    object
 6   Sample location  355 non-null    object
 7   Material         355 non-null    object
 8   Technology       355 non-null    object
 9   Assay type       355 non-null    object
 10  Coverage         355 non-null    object
 11  data resource    355 non-null    object
dtypes: object(12)
memory usage: 36.1+ KB


In [25]:
isheep_WGS.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
0,AUMEHM01,ERS1460923,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.95,GVM
1,AUMEHM02,ERS1460904,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.87,GVM
2,AUMEHM03,ERS1460978,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.42,GVM
3,AUMEHM04,ERS1461020,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.45,GVM
4,AUMEPM05,ERS1461055,SRP078481,Ovis aries,Merino Polled,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.46,GVM


In [26]:
# NOTE(review): this cell repeats the WGS subset made a few cells above. Keep
# the `.copy()` here too; otherwise this re-assignment would replace the
# independent copy with a plain selection of `metadata`.
isheep_WGS = metadata[metadata["Assay type"] == "WGS"].copy()
isheep_WGS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 355 entries, 0 to 354
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sample ID        355 non-null    object
 1   Biosample ID     355 non-null    object
 2   Bioproject ID    355 non-null    object
 3   Species          355 non-null    object
 4   Breed            355 non-null    object
 5   Sex              355 non-null    object
 6   Sample location  355 non-null    object
 7   Material         355 non-null    object
 8   Technology       355 non-null    object
 9   Assay type       355 non-null    object
 10  Coverage         355 non-null    object
 11  data resource    355 non-null    object
dtypes: object(12)
memory usage: 36.1+ KB


In [27]:
isheep_WGS.head()

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
0,AUMEHM01,ERS1460923,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.95,GVM
1,AUMEHM02,ERS1460904,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.87,GVM
2,AUMEHM03,ERS1460978,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.42,GVM
3,AUMEHM04,ERS1461020,SRP078481,Ovis aries,Merino Horned,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,16.45,GVM
4,AUMEPM05,ERS1461055,SRP078481,Ovis aries,Merino Polled,male,"Oceania,Australia",Semen,Illumina HiSeq 2500,WGS,14.46,GVM


In [28]:
isheep_WGS["Species"].unique()

array(['Ovis aries', 'Ovis canadensis', 'Ovis dalli', 'Ovis ammon',
       'Ovis orientalis'], dtype=object)

In [29]:
isheep_WGS["Breed"].unique()

array(['Merino Horned', 'Merino Polled', ' -', 'Katahdin',
       'Tibetan sheep', 'Small tailed han sheep', 'Bighorn sheep',
       'Afshari', 'Awassi', 'African White Dorper', 'Turkish Awassi',
       'Brazilian Creole', 'Bangladeshi', 'Morada Nova', 'Santa Inês',
       'Castellana', 'Cine Capari', 'Changthangi', 'Churra', 'Cheviot',
       'Dollgellau Welsh Mountain', 'Ethiopian Menz', 'Finnsheep',
       'Indian Garole', 'Gulf Coast native', 'Garut', 'Karya', 'Karakas',
       'Meat Lacaune', 'Mmilk Lacaune', 'Merino', 'Norduz',
       'Namaqua Afrikaner', 'Ojalada', 'Poll Dorset',
       'Ronderib Afrikaner', 'Romney', 'Salz', 'Scottish Blackface',
       'Sakiz', 'Swiss Mirror', 'Sumatra', 'Swiss White Alpine', 'Texel',
       'Tregaon Welsh mountain', 'Valais Blacknose',
       'Welsh Hardy Speckled Face', 'Northern Tibetan', 'Eastern Tibetan',
       'Ovis canadensis', 'Ovis dalli', 'North American Soay', 'Argali ',
       'Altay sheep', 'Baerchuke sheep', 'Bayinbuluke sheep',

There are some breeds from WGS data that are unknown. Should I consider such animals?

In [30]:
isheep_WGS[isheep_WGS["Breed"] == " -"]

Unnamed: 0,Sample ID,Biosample ID,Bioproject ID,Species,Breed,Sex,Sample location,Material,Technology,Assay type,Coverage,data resource
10,Camb0619497,ERS1171396,ERP015709,Ovis aries,-,-,-,-,Illumina HiSeq 2000,WGS,13.68,GVM
11,Camb10114673,ERS1171397,ERP015709,Ovis aries,-,-,-,-,Illumina HiSeq 2000,WGS,13.92,GVM
12,Camb12117423,ERS1171398,ERP015709,Ovis aries,-,-,-,-,Illumina HiSeq 2000,WGS,12.61,GVM
13,Camb976145,ERS1171399,ERP015709,Ovis aries,-,-,-,-,Illumina HiSeq 2000,WGS,12.77,GVM
14,Camb997502,ERS1171400,ERP015709,Ovis aries,-,-,-,-,Illumina HiSeq 2000,WGS,13.47,GVM
15,Camb997522,ERS1171401,ERP015709,Ovis aries,-,-,-,-,Illumina HiSeq 2000,WGS,12.87,GVM
16,Camb997550,ERS1171402,ERP015709,Ovis aries,-,-,-,-,Illumina HiSeq 2000,WGS,12.06,GVM
19,IROA-B2-5037,ERS154864,ERP001582,Ovis aries,-,female,"Southern Asia,Iran,ilbolaghi,ilbolaghi",-,Illumina HiSeq 2000,WGS,14.01,GVM
20,IROA-B2-5296,ERS239046,ERP001582,Ovis aries,-,female,"Southern Asia,Iran,Maku,Maku",-,Illumina HiSeq 2000,WGS,11.26,GVM
21,IROA-B3-5134,ERS239047,ERP001582,Ovis aries,-,male,"Southern Asia,Iran,khoiy,khoiy",-,Illumina HiSeq 2000,WGS,12.31,GVM


Some breeds are also missing in the original dataset, for example the "*IROA\**" samples (which seem to come from the nextgen dataset):

In [31]:
# Load the NEXTGEN sample index as a tab-separated table. The first 10 lines are
# skipped so that the "#sample_name ..." line is used as header
# (presumably the skipped lines are a file preamble — TODO confirm).
nextgenfile = str(get_project_dir() / "data/external/SHE/ISHEEP/ovis.sample_index")
nextgen = pd.read_table(nextgenfile, skiprows=10)
nextgen.head()

Unnamed: 0,#sample_name,sample_accession,biosamples_id,sample_provider,species,taxonomy_id,breed,country,closest_city,closest_locality,estimated_age_months,sex,longitude,latitude,sampling_date,photographs
0,IROA-B2-5037,ERS154864,SAMEA2012928,NEXTGEN,Ovis aries,9940,.,Iran,ilbolaghi,ilbolaghi,60,female,44.928195,39.044641,2011-10-24,"IROA-B2-5037a.JPG,IROA-B2-5037c.JPG,IROA-B2-50..."
1,IROA-B2-5296,ERS239046,SAMEA2065588,NEXTGEN,Ovis aries,9940,.,Iran,Maku,Maku,36,female,44.929098,39.045764,2011-10-27,"IROA-B2-5296c.JPG,IROA-B2-5296b.JPG,IROA-B2-52..."
2,IROA-B3-5134,ERS239047,SAMEA2065589,NEXTGEN,Ovis aries,9940,.,Iran,khoiy,khoiy,24,male,44.93919,38.656644,2011-10-30,"IROA-B3-5134c.JPG,IROA-B3-5134a.JPG,IROA-B3-51..."
3,IROA-B4-5190,ERS154865,SAMEA2012929,NEXTGEN,Ovis aries,9940,.,Iran,salmas,salmas,36,female,44.838767,38.153121,2011-10-27,"IROA-B4-5190c.JPG,IROA-B4-5190a.JPG,IROA-B4-51..."
4,IROA-B5-5295,ERS154863,SAMEA2012927,NEXTGEN,Ovis aries,9940,.,Iran,urumie,urumie,24,male,44.952849,37.972674,2011-10-26,"IROA-B5-5295a.JPG,IROA-B5-5295c.JPG,IROA-B5-52..."


However, I could collect coordinates from this datafile

Try to collect data from CNCB biosample or load data from file

In [32]:
# Query the CNCB API only when no pickle from a previous run is available
if not Path("isheep_WGS_biosample.pkl").exists():
    biosample_data = [
        fetch_cncb_biosample(biosample_id)
        for biosample_id in tqdm(isheep_WGS["Biosample ID"])
    ]
    isheep_WGS_biosample = pd.json_normalize(biosample_data)

    # the pickle is what later runs reload; an Excel copy is written alongside
    isheep_WGS_biosample.to_pickle("isheep_WGS_biosample.pkl")
    isheep_WGS_biosample.to_excel("isheep_WGS_biosample.xlsx")
else:
    isheep_WGS_biosample = pd.read_pickle("isheep_WGS_biosample.pkl")

In [33]:
# Open the processed WGS PLINK dataset.
# NOTE(review): this rebinds `plinkio`; the handle opened earlier for the 600K
# data is not explicitly closed anywhere in the notebook.
plinkio = plinkfile.open(str(get_project_dir() / "data/external/SHE/ISHEEP/WGS-all/WGS-all.smarter"))

There are some samples which I can't find in CNCB biosamples: they are annotated using SRA accessions, and so I can't fetch a record for more than 100 samples. Other samples seem to have geographic coordinates in the same format as the 600K data. Check that sample names in the plink file and metadata are the same

In [34]:
# Individual IDs stored in the PLINK file, in file order
samples = [sample.iid for sample in plinkio.get_samples()]

# Count the metadata IDs that are absent from the PLINK file in one vectorised
# pass, instead of scanning the `samples` list once per metadata row
count = int((~isheep_WGS["Sample ID"].isin(samples)).sum())

if count > 0:
    print(f"{count} samples name don't match between metadata and plink file")
        

127 samples name don't match between metadata and plink file


In [35]:
# Put metadata IDs and PLINK IDs side by side for manual comparison.
# NOTE(review): `samples` is a plain list, so it is paired with the metadata
# rows purely by position and must have exactly as many items as `isheep_WGS`
# has rows — confirm both sources list the animals in the same order.
test = pd.DataFrame({"samples_id": isheep_WGS["Sample ID"], "samples_vcf": samples})
# An existing spreadsheet is never overwritten
if not Path("samples_id-vcf-WGS.xlsx").exists():
    test.to_excel("samples_id-vcf-WGS.xlsx")

IDs are different in the metadata and the datafiles. Save only the WGS samples in a separate file

In [36]:
# Relative path: the spreadsheet is written to the current working directory
# and replaces any file of the same name
isheep_WGS.to_excel("isheep_WGS.xlsx")