# Introduction

Match up the sets of Stam lab and Wold lab biosamples for Brian.

In [1]:
import sys

In [2]:
# Install encoded_client (user site) with the same interpreter that runs this kernel.
!$sys.executable -m pip install --user encoded_client



In [3]:
import pandas
from pathlib import Path
from io import StringIO
import re
from tqdm import tqdm
from encoded_client.encoded import ENCODED

In [4]:
# Client for www.encodeproject.org; its get_json() is used below to fetch biosample records.
server = ENCODED("www.encodeproject.org")

In [5]:
# Workbook with one sheet of biosample ids per lab.
book_name = Path("~/proj/encode4-curation/Snyder tissue correspondence Wold Stam August 4 2022.xlsx").expanduser()
# Passing a list of sheet names returns a dict of DataFrames keyed by sheet name.
lab_sheets = pandas.read_excel(book_name, sheet_name=["wold biosamples", "stam biosamples"])
woldlab_ids = lab_sheets["wold biosamples"]
stamlab_ids = lab_sheets["stam biosamples"]

Let's add some biosample metadata to the tables.

We want the parent biosample accession, the term name, and the donor.

In [6]:
def most_parent(biosample, cache):
    """Follow "part_of" links upward and return the top-most biosample.

    Parameters
    ----------
    biosample : dict
        Biosample record. If it has a non-None "part_of" entry, that entry
        must be a mapping with an "accession" key naming the parent.
    cache : dict
        Maps accession -> biosample record. Parents already present are
        reused; parents that are missing are fetched with
        ``server.get_json`` and stored, so the dict is updated in place.

    Returns
    -------
    dict
        The first record in the chain whose "part_of" is missing or None
        (the input itself when it has no parent).
    """
    while True:
        parent_ref = biosample.get("part_of")
        if parent_ref is None:
            return biosample

        accession = parent_ref["accession"]
        # Test membership explicitly: dict.setdefault(key, fetch()) would
        # evaluate the fetch before looking at the cache, issuing a request
        # even when the record is already cached.
        if accession not in cache:
            cache[accession] = server.get_json(accession)
        biosample = cache[accession]

# Smoke test: resolve the top-most parent for one known accession and show
# which ancestor records ended up in the cache.
cache = {}
start_biosample = server.get_json("ENCBS059BTM")
bio = most_parent(start_biosample, cache)
print(bio["accession"], cache.keys())

ENCBS368OMB dict_keys(['ENCBS060XFZ', 'ENCBS368OMB'])


In [7]:
def update_table(table, cache):
    """Add parent, term-name and donor columns to a table of biosamples.

    Each accession in the table's "Biosample" column is fetched with
    ``server.get_json``; its top-most ancestor is resolved with
    ``most_parent``.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain a "Biosample" column of accessions. Modified in place:
        "parent", "biosample_term_name" and "donor" are added, each only
        when a column of that name is not already present.
    cache : dict
        Accession -> biosample record mapping passed through to
        ``most_parent``.

    Returns
    -------
    None
    """
    parents = []
    term_names = []
    donors = []

    for accession in tqdm(table["Biosample"], total=table.shape[0]):
        biosample = server.get_json(accession)
        # Use the cache supplied by the caller rather than a notebook global,
        # so the function works with whatever dict it is given.
        root_biosample = most_parent(biosample, cache)
        parents.append(root_biosample["accession"])
        term_names.append(biosample["biosample_ontology"]["term_name"])
        donors.append(biosample["donor"]["accession"])

    new_columns = {
        "parent": parents,
        "biosample_term_name": term_names,
        "donor": donors,
    }
    for column, values in new_columns.items():
        # Never overwrite a column that already exists in the table.
        if column not in table:
            table[column] = values
        

# Annotate both lab tables with one shared cache, reporting the cache size
# after each table.
biosample_cache = {}
for lab_table in (stamlab_ids, woldlab_ids):
    update_table(lab_table, biosample_cache)
    print(len(biosample_cache))

100%|█████████████████████████████████████████| 144/144 [01:42<00:00,  1.41it/s]


218


100%|███████████████████████████████████████████| 39/39 [00:31<00:00,  1.24it/s]

256





In [8]:
# Parent biosamples that occur in both labs' tables.
common_parents = set(woldlab_ids.parent).intersection(stamlab_ids.parent)
# The message reports samples, so count wold rows whose parent is shared
# instead of unique parents; this keeps the numerator and denominator in the
# same unit even if several samples share a parent.
matched_sample_count = woldlab_ids.parent.isin(common_parents).sum()
print("{}/{} samples matched by parent biosample".format(matched_sample_count, woldlab_ids.shape[0]))

20/39 samples matched by parent biosample


In [9]:
# Parent biosamples seen in the wold table but not in the stam table.
different_parents = set(woldlab_ids.parent).difference(stamlab_ids.parent)
# The message reports samples, so count wold rows with an unshared parent
# instead of unique parents; this matches the row-count denominator.
unmatched_sample_count = woldlab_ids.parent.isin(different_parents).sum()
print("{}/{} samples have different parent biosamples".format(unmatched_sample_count, woldlab_ids.shape[0]))

19/39 samples have different parent biosamples


In [10]:
# Donors that occur in both labs' tables; numerator and denominator are both
# counts of unique donors.
wold_donors = set(woldlab_ids.donor)
common_donors = wold_donors.intersection(stamlab_ids.donor)
print("{}/{} wold donors found in stamlab donors".format(len(common_donors), len(wold_donors)))

15/16 wold donors in found in stamlab donors


In [11]:
# Wold rows whose donor does not appear in the stam table.
woldlab_ids[~woldlab_ids.donor.isin(common_donors)]

Unnamed: 0,Wold bucket delivery date,Biosample,Wold Sample,Wold Donor Accession,Wold ENCBS - parent biosample,parent,biosample_term_name,donor
5,2 May 9 2019,ENCBS387QUR,UW065 heart right ventricle,ENCDO804CLC,,ENCBS209BNQ,heart right ventricle,ENCDO804CLC
12,2 May 9 2019,ENCBS666SUK,UW065 heart left ventricle,ENCDO804CLC,,ENCBS924KUU,heart left ventricle,ENCDO804CLC


In [12]:
# Term names used by each lab, and the ones they have in common.
stam_term_names = set(stamlab_ids["biosample_term_name"])
wold_term_names = set(woldlab_ids["biosample_term_name"])
common_term_names = stam_term_names & wold_term_names
print("{}/{} wold biosample terms found in stamlab terms".format(len(common_term_names), len(wold_term_names)))


14/19 wold biosample terms found in stamlab terms


In [13]:
# Wold rows whose term name also occurs in the stam table but whose parent
# biosample does not.
has_unshared_parent = woldlab_ids.parent.isin(different_parents)
has_shared_term = woldlab_ids.biosample_term_name.isin(common_term_names)
matching_names_different_parents = woldlab_ids[has_unshared_parent & has_shared_term]
print(matching_names_different_parents.shape[0])
matching_names_different_parents

14


Unnamed: 0,Wold bucket delivery date,Biosample,Wold Sample,Wold Donor Accession,Wold ENCBS - parent biosample,parent,biosample_term_name,donor
5,2 May 9 2019,ENCBS387QUR,UW065 heart right ventricle,ENCDO804CLC,,ENCBS209BNQ,heart right ventricle,ENCDO804CLC
8,2 May 9 2019,ENCBS525FTQ,UW067 heart left ventricle,ENCDO926KEV,,ENCBS511IXC,heart left ventricle,ENCDO926KEV
12,2 May 9 2019,ENCBS666SUK,UW065 heart left ventricle,ENCDO804CLC,,ENCBS924KUU,heart left ventricle,ENCDO804CLC
14,2 May 9 2019,ENCBS891CSJ,UW038 heart left ventricle,ENCDO439CAZ,,ENCBS034WHN,heart left ventricle,ENCDO439CAZ
15,2 May 9 2019,ENCBS895SZP,UW036 heart left ventricle,ENCDO808ASZ,,ENCBS856UDQ,heart left ventricle,ENCDO808ASZ
20,3 April 28 2021,ENCBS154IHO,W61 heart left ventricle,ENCDO575WHY,,ENCBS303LEQ,heart left ventricle,ENCDO575WHY
21,3 April 28 2021,ENCBS173PPT,W71 heart right ventricle,ENCDO528BHB,,ENCBS190DRR,heart right ventricle,ENCDO528BHB
23,3 April 28 2021,ENCBS352PQZ,W61 heart right ventricle,ENCDO575WHY,,ENCBS577DQE,heart right ventricle,ENCDO575WHY
24,3 April 28 2021,ENCBS379ZMT,W62 aorta,ENCDO575EGL,,ENCBS064HXH,aorta,ENCDO575EGL
30,3 April 28 2021,ENCBS739VWA,W71 psoas muscle,ENCDO528BHB,,ENCBS166ZJP,psoas muscle,ENCDO528BHB


In [14]:
# Stam rows annotated as heart right ventricle, reduced to the id columns.
is_right_ventricle = stamlab_ids["biosample_term_name"] == "heart right ventricle"
stamlab_ids.loc[is_right_ventricle, ["Biosample", "parent", "donor", "ID", "biosample_term_name"]]


Unnamed: 0,Biosample,parent,donor,ID,biosample_term_name
29,ENCBS059BTM,ENCBS368OMB,ENCDO808ASZ,/experiments/ENCSR694LFE/,heart right ventricle
47,ENCBS216RZX,ENCBS125SYU,ENCDO411EVD,/experiments/ENCSR356RNZ/,heart right ventricle
78,ENCBS424DYU,ENCBS350NZF,ENCDO477WED,/experiments/ENCSR214NLQ/,heart right ventricle
80,ENCBS516UIT,ENCBS710RTI,ENCDO926KEV,/experiments/ENCSR928WMU/,heart right ventricle
85,ENCBS846XLB,ENCBS090VFP,ENCDO967KID,/experiments/ENCSR298OIK/,heart right ventricle
88,ENCBS696YMT,ENCBS259FVD,ENCDO907YUG,/experiments/ENCSR246VSO/,heart right ventricle
97,ENCBS984JWU,ENCBS444SPE,ENCDO039RUH,/experiments/ENCSR792QQE/,heart right ventricle
104,ENCBS211PHF,ENCBS339COK,ENCDO392CRK,/experiments/ENCSR238FMP/,heart right ventricle
132,ENCBS867JKL,ENCBS459AAY,ENCDO439CAZ,/experiments/ENCSR211SUT/,heart right ventricle


In [23]:
# Columns selected from stamlab_ids when building weak_stam_match.
stam_cols = ["Biosample", "Accession", "parent", "donor", "biosample_term_name", "Biosample summary"]

In [29]:
# Stam rows that share both a donor and a term name with the wold table.
stam_has_common_donor = stamlab_ids.donor.isin(common_donors)
stam_has_common_term = stamlab_ids.biosample_term_name.isin(common_term_names)
weak_stam_match = stamlab_ids.loc[stam_has_common_donor & stam_has_common_term, stam_cols]
weak_stam_match

Unnamed: 0,Biosample,Accession,parent,donor,biosample_term_name,Biosample summary
8,ENCBS173BRC,ENCSR138UGH,ENCBS597LVB,ENCDO186XRB,ovary,Homo sapiens ovary tissue female adult (61 years)
11,ENCBS923PBY,ENCSR713KNQ,ENCBS251BGN,ENCDO575WHY,aorta,Homo sapiens aorta tissue female adult (41 years)
13,ENCBS494JWP,ENCSR872DUY,ENCBS858UHJ,ENCDO575WHY,ovary,Homo sapiens ovary tissue female adult (41 years)
14,ENCBS450CEG,ENCSR541JMK,ENCBS948GDT,ENCDO528BHB,adrenal gland,Homo sapiens adrenal gland tissue female adult...
23,ENCBS664ALT,ENCSR933GMM,ENCBS790AKV,ENCDO575WHY,right lobe of liver,Homo sapiens right lobe of liver tissue female...
24,ENCBS651EAK,ENCSR330ZBO,ENCBS367RZG,ENCDO856ZOJ,ovary,Homo sapiens ovary tissue female adult (59 years)
27,ENCBS299VJH,ENCSR108SYM,ENCBS916EKQ,ENCDO186XRB,psoas muscle,Homo sapiens psoas muscle tissue female adult ...
29,ENCBS059BTM,ENCSR694LFE,ENCBS368OMB,ENCDO808ASZ,heart right ventricle,Homo sapiens heart right ventricle tissue male...
33,ENCBS544GDO,ENCSR404TSP,ENCBS599UJX,ENCDO186XRB,mucosa of descending colon,Homo sapiens mucosa of descending colon tissue...
37,ENCBS069DBW,ENCSR612CTW,ENCBS577YUV,ENCDO392CRK,mucosa of descending colon,Homo sapiens mucosa of descending colon tissue...


In [26]:
# Columns selected from woldlab_ids when building weak_wold_match.
wold_cols = ["Biosample", "parent", "donor", "biosample_term_name"]

In [28]:
# Wold rows that share both a donor and a term name with the stam table.
wold_has_common_donor = woldlab_ids.donor.isin(common_donors)
wold_has_common_term = woldlab_ids.biosample_term_name.isin(common_term_names)
weak_wold_match = woldlab_ids.loc[wold_has_common_donor & wold_has_common_term, wold_cols]
weak_wold_match

Unnamed: 0,Biosample,parent,donor,biosample_term_name
0,ENCBS028WAL,ENCBS733JKT,ENCDO221IPH,pancreas
1,ENCBS046OBK,ENCBS350NZF,ENCDO477WED,heart right ventricle
2,ENCBS189LQW,ENCBS259FVD,ENCDO907YUG,heart right ventricle
3,ENCBS199BOZ,ENCBS427KOM,ENCDO477WED,heart left ventricle
4,ENCBS230VLO,ENCBS220WJT,ENCDO967KID,heart left ventricle
6,ENCBS397EAK,ENCBS459AAY,ENCDO439CAZ,heart right ventricle
7,ENCBS488WUX,ENCBS090VFP,ENCDO967KID,heart right ventricle
8,ENCBS525FTQ,ENCBS511IXC,ENCDO926KEV,heart left ventricle
9,ENCBS531TYS,ENCBS558KZM,ENCDO907YUG,heart left ventricle
10,ENCBS605FHL,ENCBS368OMB,ENCDO808ASZ,heart right ventricle


In [35]:
# "Weak" match: pair wold and stam rows that agree on donor and term name.
weak_keys = ["donor", "biosample_term_name"]
weak_merged = weak_wold_match.merge(weak_stam_match, on=weak_keys, suffixes=("_wold", "_stam"))

print(weak_merged.shape)
weak_merged

(20, 8)


Unnamed: 0,Biosample_wold,parent_wold,donor,biosample_term_name,Biosample_stam,Accession,parent_stam,Biosample summary
0,ENCBS028WAL,ENCBS733JKT,ENCDO221IPH,pancreas,ENCBS028WAL,ENCSR056PFI,ENCBS733JKT,Homo sapiens pancreas tissue male adult (26 ye...
1,ENCBS046OBK,ENCBS350NZF,ENCDO477WED,heart right ventricle,ENCBS424DYU,ENCSR214NLQ,ENCBS350NZF,Homo sapiens heart right ventricle tissue male...
2,ENCBS189LQW,ENCBS259FVD,ENCDO907YUG,heart right ventricle,ENCBS696YMT,ENCSR246VSO,ENCBS259FVD,Homo sapiens heart right ventricle tissue fema...
3,ENCBS199BOZ,ENCBS427KOM,ENCDO477WED,heart left ventricle,ENCBS030BAP,ENCSR222CLC,ENCBS427KOM,Homo sapiens heart left ventricle tissue male ...
4,ENCBS230VLO,ENCBS220WJT,ENCDO967KID,heart left ventricle,ENCBS880PTC,ENCSR299QGI,ENCBS220WJT,Homo sapiens heart left ventricle tissue male ...
5,ENCBS397EAK,ENCBS459AAY,ENCDO439CAZ,heart right ventricle,ENCBS867JKL,ENCSR211SUT,ENCBS459AAY,Homo sapiens heart right ventricle tissue male...
6,ENCBS488WUX,ENCBS090VFP,ENCDO967KID,heart right ventricle,ENCBS846XLB,ENCSR298OIK,ENCBS090VFP,Homo sapiens heart right ventricle tissue male...
7,ENCBS531TYS,ENCBS558KZM,ENCDO907YUG,heart left ventricle,ENCBS621PCO,ENCSR805EGZ,ENCBS558KZM,Homo sapiens heart left ventricle tissue femal...
8,ENCBS605FHL,ENCBS368OMB,ENCDO808ASZ,heart right ventricle,ENCBS059BTM,ENCSR694LFE,ENCBS368OMB,Homo sapiens heart right ventricle tissue male...
9,ENCBS655VDJ,ENCBS710RTI,ENCDO926KEV,heart right ventricle,ENCBS516UIT,ENCSR928WMU,ENCBS710RTI,Homo sapiens heart right ventricle tissue male...


In [42]:
# "Strong" match: pair wold and stam rows that agree on parent biosample,
# term name and donor.
strong_keys = ["parent", "biosample_term_name", "donor"]
strong_merged = woldlab_ids.merge(stamlab_ids, on=strong_keys, suffixes=("_wold", "_stam"))
print(strong_merged.shape[0])
strong_merged


20


Unnamed: 0,Wold bucket delivery date,Biosample_wold,Wold Sample,Wold Donor Accession,Wold ENCBS - parent biosample,parent,biosample_term_name,donor,Biosample_stam,ID,...,Project,Status,Files,Related series,Biological replicate,Technical replicate,Organism,Life stage,Biosample age,Replicates
0,2 May 9 2019,ENCBS028WAL,W64 pancreas,ENCDO221IPH,ENCBS733JKT,ENCBS733JKT,pancreas,ENCDO221IPH,ENCBS028WAL,/experiments/ENCSR056PFI/,...,ENCODE,released,"/files/ENCFF859WBE/,/files/ENCFF907SYU/,/files...",,1,1,Homo sapiens,adult,26 years,/replicates/013a846e-4f6a-42b1-a174-a916927003de/
1,2 May 9 2019,ENCBS046OBK,UW040 heart right ventricle,ENCDO477WED,,ENCBS350NZF,heart right ventricle,ENCDO477WED,ENCBS424DYU,/experiments/ENCSR214NLQ/,...,ENCODE,released,"/files/ENCFF860REB/,/files/ENCFF854KSW/,/files...",,1,1,Homo sapiens,adult,69 years,/replicates/10142694-1dfd-47ea-b6fd-80030aa27761/
2,2 May 9 2019,ENCBS189LQW,UW068 heart right ventricle,ENCDO907YUG,,ENCBS259FVD,heart right ventricle,ENCDO907YUG,ENCBS696YMT,/experiments/ENCSR246VSO/,...,ENCODE,released,"/files/ENCFF275ZTU/,/files/ENCFF250LXH/,/files...",,1,1,Homo sapiens,adult,56 years,/replicates/cde1609c-7666-44b4-a3b0-ed9261b36c65/
3,2 May 9 2019,ENCBS199BOZ,UW040 heart left ventricle,ENCDO477WED,,ENCBS427KOM,heart left ventricle,ENCDO477WED,ENCBS030BAP,/experiments/ENCSR222CLC/,...,ENCODE,released,"/files/ENCFF221BAW/,/files/ENCFF906WOJ/,/files...",,1,1,Homo sapiens,adult,69 years,/replicates/5aa71bb7-e8f4-4199-a362-1ffa9210249e/
4,2 May 9 2019,ENCBS230VLO,UW076 heart left ventricle,ENCDO967KID,,ENCBS220WJT,heart left ventricle,ENCDO967KID,ENCBS880PTC,/experiments/ENCSR299QGI/,...,ENCODE,released,"/files/ENCFF075GVG/,/files/ENCFF934ABI/,/files...",,1,1,Homo sapiens,adult,43 years,/replicates/9a01c8cf-4bb5-4a96-a85d-aefdb0127928/
5,2 May 9 2019,ENCBS397EAK,UW038 heart right ventricle,ENCDO439CAZ,,ENCBS459AAY,heart right ventricle,ENCDO439CAZ,ENCBS867JKL,/experiments/ENCSR211SUT/,...,ENCODE,released,"/files/ENCFF474TSR/,/files/ENCFF186NOO/,/files...",,1,1,Homo sapiens,adult,55 years,/replicates/ae178734-8334-4b09-865d-2b84d6ae0a6f/
6,2 May 9 2019,ENCBS488WUX,UW076 heart right ventricle,ENCDO967KID,,ENCBS090VFP,heart right ventricle,ENCDO967KID,ENCBS846XLB,/experiments/ENCSR298OIK/,...,ENCODE,released,"/files/ENCFF287TWO/,/files/ENCFF951NBT/,/files...",,1,1,Homo sapiens,adult,43 years,/replicates/6aecd808-d229-4ab4-9dfd-1de4859d0889/
7,2 May 9 2019,ENCBS531TYS,UW068 heart left ventricle,ENCDO907YUG,,ENCBS558KZM,heart left ventricle,ENCDO907YUG,ENCBS621PCO,/experiments/ENCSR805EGZ/,...,ENCODE,released,"/files/ENCFF978HUW/,/files/ENCFF649BHT/,/files...",,1,1,Homo sapiens,adult,56 years,/replicates/8fe88ec7-8291-4fce-8a06-764f95784a94/
8,2 May 9 2019,ENCBS605FHL,UW036 heart right ventricle,ENCDO808ASZ,,ENCBS368OMB,heart right ventricle,ENCDO808ASZ,ENCBS059BTM,/experiments/ENCSR694LFE/,...,ENCODE,released,"/files/ENCFF627FIO/,/files/ENCFF172GAK/,/files...",,1,1,Homo sapiens,adult,61 years,/replicates/74a5dbc1-4645-41a5-9570-b2af6cfec0ab/
9,2 May 9 2019,ENCBS655VDJ,UW067 heart right ventricle,ENCDO926KEV,,ENCBS710RTI,heart right ventricle,ENCDO926KEV,ENCBS516UIT,/experiments/ENCSR928WMU/,...,ENCODE,released,"/files/ENCFF401UVQ/,/files/ENCFF921IVK/,/files...",,1,1,Homo sapiens,adult,66 years,/replicates/4e178a94-33c6-4eb0-8342-10ff16930d98/


In [55]:
# Sanity check: do the strong and weak merges select the same set of wold biosamples?
set(strong_merged["Biosample_wold"]) == set(weak_merged["Biosample_wold"])

True

In [52]:
# Reduce the strong match to the columns to hand over, labelling the stam
# experiment accession explicitly.
useful_columns = [
    "Wold bucket delivery date",
    "Biosample_wold",
    "Wold Sample",
    "parent",
    "donor",
    "biosample_term_name",
    "Biosample_stam",
    "Accession",
]
brian_useful = strong_merged[useful_columns].rename(columns={"Accession": "Experiment_stam"})
brian_useful


Unnamed: 0,Wold bucket delivery date,Biosample_wold,Wold Sample,parent,donor,biosample_term_name,Biosample_stam,Experiment_stam
0,2 May 9 2019,ENCBS028WAL,W64 pancreas,ENCBS733JKT,ENCDO221IPH,pancreas,ENCBS028WAL,ENCSR056PFI
1,2 May 9 2019,ENCBS046OBK,UW040 heart right ventricle,ENCBS350NZF,ENCDO477WED,heart right ventricle,ENCBS424DYU,ENCSR214NLQ
2,2 May 9 2019,ENCBS189LQW,UW068 heart right ventricle,ENCBS259FVD,ENCDO907YUG,heart right ventricle,ENCBS696YMT,ENCSR246VSO
3,2 May 9 2019,ENCBS199BOZ,UW040 heart left ventricle,ENCBS427KOM,ENCDO477WED,heart left ventricle,ENCBS030BAP,ENCSR222CLC
4,2 May 9 2019,ENCBS230VLO,UW076 heart left ventricle,ENCBS220WJT,ENCDO967KID,heart left ventricle,ENCBS880PTC,ENCSR299QGI
5,2 May 9 2019,ENCBS397EAK,UW038 heart right ventricle,ENCBS459AAY,ENCDO439CAZ,heart right ventricle,ENCBS867JKL,ENCSR211SUT
6,2 May 9 2019,ENCBS488WUX,UW076 heart right ventricle,ENCBS090VFP,ENCDO967KID,heart right ventricle,ENCBS846XLB,ENCSR298OIK
7,2 May 9 2019,ENCBS531TYS,UW068 heart left ventricle,ENCBS558KZM,ENCDO907YUG,heart left ventricle,ENCBS621PCO,ENCSR805EGZ
8,2 May 9 2019,ENCBS605FHL,UW036 heart right ventricle,ENCBS368OMB,ENCDO808ASZ,heart right ventricle,ENCBS059BTM,ENCSR694LFE
9,2 May 9 2019,ENCBS655VDJ,UW067 heart right ventricle,ENCBS710RTI,ENCDO926KEV,heart right ventricle,ENCBS516UIT,ENCSR928WMU


In [53]:
# Write the summary table to a CSV at a relative path, omitting the index column.
brian_useful.to_csv("Snyder tissue correspondence.csv", index=False)