# Introduction

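Brian gave me an Excel workbook listing biosamples, many of which were pooled. This notebook groups the listed accessions by pool, builds one metadata record per pooled biosample, and submits those records to the ENCODE portal.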

In [1]:
import pandas
import sys
from pathlib import Path

In [2]:
# Make the local encoded_client checkout importable. The path is appended only
# when it is not already on sys.path, so re-running this cell adds no duplicates.
# The import below has to come after the path has been added.
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)
from encoded_client.encoded import ENCODED

In [3]:
# Client object for the www.encodeproject.org host; the cells below use it to
# fetch biosample JSON (get_json) and to submit the pooled rows (post_sheet).
server = ENCODED("www.encodeproject.org")

In [4]:
# Directory that holds the mouse biosample spreadsheets (with "~" expanded),
# and the workbook inside it that this notebook reads.
mouse_biosamples = Path("~/woldlab/ENCODE/mouse-biosamples/").expanduser()
book_filename = mouse_biosamples.joinpath(
    "BioSample_Accession_replacements_for_pooled_samples_Jan12_2022.xlsx"
)

In [5]:
# Open the workbook once; later cells parse individual sheets from it by name.
book = pandas.ExcelFile(book_filename)
# Print the sheet names so the sheets to process can be picked out.
print(book.sheet_names)

['Bernstein 2mo Aug31_2021', 'Bernstein 2mo gasthrtlctxSept15', 'PND14 Bernst Aug31_2021', 'Snyder_July22_21_2mos', 'Snyder PND14 Aug31_21', 'Stam 8mo B6Cast_Oct4_21', 'reaccession Rush brain', 'sex corrections']


In [6]:
# Sheets handled by the row-by-row pooling loop: for every row, the value of
# its "accession" column is added as-is to the pool named in its "pool" column.
sheets_to_pool_long = [
    "Bernstein 2mo Aug31_2021",
    "Bernstein 2mo gasthrtlctxSept15",
    "PND14 Bernst Aug31_2021",
    "Snyder_July22_21_2mos",
    "Snyder PND14 Aug31_21",
]

In [7]:
# Map each pool name to the list of biosample accessions that belong to it,
# reading the sheets that have one biosample per row.
pools = {}
for sheet_name in sheets_to_pool_long:
    sheet = book.parse(sheet_name)
    # Sanity limit: stop if the parsed sheet has 100 or more rows or columns.
    row_count, column_count = sheet.shape
    assert row_count < 100
    assert column_count < 100
    for row_label, row in sheet.iterrows():
        pool_name = row["pool"]
        # Rows whose pool cell contains "not pooled" do not belong to any pool.
        if "not pooled" in pool_name:
            continue
        pools.setdefault(pool_name, []).append(row["accession"])

In [8]:
# Display the pool names collected so far.
pools.keys()

dict_keys(['Bernstein pool A', 'Bernstein pool B', 'Bernstein pool C', 'Bernstein pool D', 'Bernstein pool E', 'Bernstein pool F', 'Bernstein pool G', 'Bernstein pool H', 'Bernstein pool I', 'Bernstein pool J', 'Bernstein pool K', 'Bernstein pool L', 'Bernstein pool M', 'Bernstein pool N', 'Bernstein pool O', 'Bernstein pool P', 'Bernstein pool Q', 'Bernstein pool R', 'Snyder pool A', 'Snyder pool B', 'Snyder pool C', 'Snyder pool D', 'Snyder pool E', 'Snyder pool F', 'Snyder pool G', 'Snyder pool H', 'Snyder pool I', 'Snyder pool J', 'Snyder pool K', 'Snyder pool L'])

In [9]:
# Sheets where a single row describes a whole pool: the "accession" cell holds
# a comma-separated list of biosample accessions instead of a single one.
sheets_flattened = ['Stam 8mo B6Cast_Oct4_21',]

for name in sheets_flattened:
    sheet = book.parse(name)
    # Sanity limit: stop if the parsed sheet has 100 or more rows or columns.
    assert sheet.shape[0] < 100
    assert sheet.shape[1] < 100
    for i, row in sheet.iterrows():
        pool_name = row["pool"]
        if "not pooled" not in pool_name:
            print(row["accession"], pool_name, name)
            for accession in row["accession"].split(","):
                accession = accession.strip()
                # A trailing or doubled comma produces an empty token; skip it
                # so no blank accession is added to the pool (it would later be
                # requested from the server and written into pooled_from).
                if accession:
                    pools.setdefault(pool_name, []).append(accession)

ENCBS713HAN, ENCBS923AXP, ENCBS128HYJ, ENCBS286JPU, ENCBS102NBG, ENCBS540KST Stam pool A Stam 8mo B6Cast_Oct4_21
ENCBS596XGN, ENCBS288AEF Stam pool B Stam 8mo B6Cast_Oct4_21
ENCBS672UMT, ENCBS152AZC Stam pool C Stam 8mo B6Cast_Oct4_21
ENCBS250BWL, ENCBS214YGX, ENCBS329DCG, ENCBS040VYS, ENCBS246CMO, ENCBS728BPZ Stam pool D Stam 8mo B6Cast_Oct4_21
ENCBS694OJO, ENCBS325UXQ Stam pool E Stam 8mo B6Cast_Oct4_21
ENCBS801HVK, ENCBS798UHZ Stam pool F Stam 8mo B6Cast_Oct4_21
ENCBS677VID, ENCBS987JNY, ENCBS645LTV, ENCBS501MPV, ENCBS560FHS, ENCBS715NNB Stam pool G Stam 8mo B6Cast_Oct4_21
ENCBS364QXD, ENCBS980CMQ Stam pool H Stam 8mo B6Cast_Oct4_21
ENCBS359HFP, ENCBS257FVH Stam pool I Stam 8mo B6Cast_Oct4_21
ENCBS485RLO, ENCBS985IWD, ENCBS325GDF, ENCBS966NLI, ENCBS236ZJE, ENCBS684CFC Stam pool J Stam 8mo B6Cast_Oct4_21
ENCBS255CHT, ENCBS183YKL Stam pool K Stam 8mo B6Cast_Oct4_21
ENCBS417GKZ, ENCBS504DDU Stam pool L Stam 8mo B6Cast_Oct4_21


In [10]:
# Build one metadata record per pool.
#
# For every pool the first member's biosample JSON is used as the template.
# Each remaining member is fetched and checked against it, and the members'
# starting amounts are summed to give the pooled starting amount.

# Fields whose whole value must equal the first member's value, checked in
# this order. biosample_ontology is checked separately because only its "@id"
# is compared. "date_obtained" is deliberately not checked.
shared_fields = [
    "model_organism_age",
    "model_organism_age_units",
    "model_organism_sex",
    "starting_amount_units",
    "organism",
    "donor",
    "source",
]

pooled_metadata = []
for pool_name, accessions in pools.items():
    first_accession = accessions[0]
    first_biosample = server.get_json(first_accession)
    starting_amount = float(first_biosample["starting_amount"])
    for accession in accessions[1:]:
        biosample = server.get_json(accession)
        # Identify the offending pool and accessions in any assertion failure,
        # so a mismatch can be traced without re-running all the requests.
        where = "{} ({} vs {})".format(pool_name, accession, first_accession)
        assert first_biosample["biosample_ontology"]["@id"] == biosample["biosample_ontology"]["@id"], \
            "biosample_ontology mismatch in " + where
        for field in shared_fields:
            assert first_biosample[field] == biosample[field], \
                "{} mismatch in {}".format(field, where)
        starting_amount += float(biosample["starting_amount"])
    pooled_metadata.append({
        # uuid and accession are left empty in the record.
        "uuid": None,
        "accession": None,
        "biosample_ontology": first_biosample["biosample_ontology"]["@id"],
        "description": "{} of {} {} biosamples".format(pool_name, len(accessions), first_biosample["biosample_ontology"]["term_name"]),
        "model_organism_age": first_biosample["model_organism_age"],
        "model_organism_age_units": first_biosample["model_organism_age_units"],
        "model_organism_sex": first_biosample["model_organism_sex"],
        # NOTE: "{:g}" keeps at most 6 significant digits of the summed amount.
        "starting_amount:number": "{:g}".format(starting_amount),
        "starting_amount_units": first_biosample["starting_amount_units"],
        # organism, donor and source are embedded objects; only "@id" is stored.
        "organism": first_biosample["organism"]["@id"],
        "donor": first_biosample["donor"]["@id"],
        "source": first_biosample["source"]["@id"],
        "pooled_from:array": ",".join(accessions),
        "lab": "/labs/barbara-wold/",
        "award": "UM1HG009443"
    })
    print(pool_name, ",".join(accessions))

Bernstein pool A ENCBS675DZY,ENCBS479TPY,ENCBS478GKF,ENCBS817QTL,ENCBS579UFD,ENCBS790OBW,ENCBS399UGJ,ENCBS518RSF,ENCBS517ZBU,ENCBS759FCF,ENCBS731DAE
Bernstein pool B ENCBS783CWJ,ENCBS225XSK,ENCBS016JOU,ENCBS351CWK,ENCBS258NPS,ENCBS009NGR,ENCBS086MRV,ENCBS597ONP,ENCBS114DTD,ENCBS306YXN,ENCBS548RHR
Bernstein pool C ENCBS389YDP,ENCBS153NQL,ENCBS157YXR,ENCBS232DDN,ENCBS955QYO,ENCBS238THT,ENCBS622KLI
Bernstein pool D ENCBS818DRF,ENCBS760ORH,ENCBS758BYF,ENCBS520AUC,ENCBS509XWN,ENCBS886RXP,ENCBS019ODD
Bernstein pool E ENCBS529HXA,ENCBS250NYR
Bernstein pool F ENCBS099JCP,ENCBS622LWQ
Bernstein pool G ENCBS957OTW,ENCBS757ZZT
Bernstein pool H ENCBS396JJT,ENCBS170UTC
Bernstein pool I ENCBS653COX,ENCBS075IUK,ENCBS627LAM,ENCBS622XYF
Bernstein pool J ENCBS670RYX,ENCBS715GXX,ENCBS685IAU,ENCBS159LYL
Bernstein pool K ENCBS570FGG,ENCBS407IAX
Bernstein pool L ENCBS696LCJ,ENCBS953SDI
Bernstein pool M ENCBS476MRT,ENCBS091DDP
Bernstein pool N ENCBS008GFW,ENCBS743INW
Bernstein pool O ENCBS075KUA,ENCBS476UVL,E

In [11]:
# Turn the list of per-pool records into a DataFrame and display it.
# NOTE: this rebinds pooled_metadata from a list of dicts to a DataFrame; the
# later cells that use pooled_metadata rely on the DataFrame form.
pooled_metadata = pandas.DataFrame(pooled_metadata)
pooled_metadata

Unnamed: 0,uuid,accession,biosample_ontology,description,model_organism_age,model_organism_age_units,model_organism_sex,starting_amount:number,starting_amount_units,organism,donor,source,pooled_from:array,lab,award
0,,,/biosample-types/tissue_UBERON_0002369/,Bernstein pool A of 11 adrenal gland biosamples,2,month,male,132,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS675DZY,ENCBS479TPY,ENCBS478GKF,ENCBS817QT...",/labs/barbara-wold/,UM1HG009443
1,,,/biosample-types/tissue_UBERON_0002369/,Bernstein pool B of 11 adrenal gland biosamples,2,month,male,132,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS783CWJ,ENCBS225XSK,ENCBS016JOU,ENCBS351CW...",/labs/barbara-wold/,UM1HG009443
2,,,/biosample-types/tissue_UBERON_0002305/,Bernstein pool C of 7 layer of hippocampus bio...,2,month,male,168,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS389YDP,ENCBS153NQL,ENCBS157YXR,ENCBS232DD...",/labs/barbara-wold/,UM1HG009443
3,,,/biosample-types/tissue_UBERON_0002305/,Bernstein pool D of 7 layer of hippocampus bio...,2,month,male,168,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS818DRF,ENCBS760ORH,ENCBS758BYF,ENCBS520AU...",/labs/barbara-wold/,UM1HG009443
4,,,/biosample-types/tissue_UBERON_0001388/,Bernstein pool E of 2 gastrocnemius biosamples,2,month,female,322,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS529HXA,ENCBS250NYR",/labs/barbara-wold/,UM1HG009443
5,,,/biosample-types/tissue_UBERON_0001388/,Bernstein pool F of 2 gastrocnemius biosamples,2,month,female,322,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS099JCP,ENCBS622LWQ",/labs/barbara-wold/,UM1HG009443
6,,,/biosample-types/tissue_UBERON_0000948/,Bernstein pool G of 2 heart biosamples,2,month,female,340,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS957OTW,ENCBS757ZZT",/labs/barbara-wold/,UM1HG009443
7,,,/biosample-types/tissue_UBERON_0000948/,Bernstein pool H of 2 heart biosamples,2,month,female,340,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS396JJT,ENCBS170UTC",/labs/barbara-wold/,UM1HG009443
8,,,/biosample-types/tissue_NTR_0000646/,Bernstein pool I of 4 left cerebral cortex bio...,2,month,female,260,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS653COX,ENCBS075IUK,ENCBS627LAM,ENCBS622XYF",/labs/barbara-wold/,UM1HG009443
9,,,/biosample-types/tissue_NTR_0000646/,Bernstein pool J of 4 left cerebral cortex bio...,2,month,female,260,mg,/organisms/mouse/,/mouse-donors/ENCDO509HIY/,/sources/jackson-labs/,"ENCBS670RYX,ENCBS715GXX,ENCBS685IAU,ENCBS159LYL",/labs/barbara-wold/,UM1HG009443


In [12]:
#pooled_metadata.to_excel(mouse_biosamples / "BioSample_metadata_for_pooled_samples_Jan12_2022.xlsx", index=False)

In [13]:
# Submit the pooled biosample rows to the /biosamples/ collection.
# NOTE(review): dry_run=True is passed here, yet the saved output of this cell
# reports "row N created: ENCBS..." accessions. Presumably the cell was last
# executed with dry_run=False and the flag was switched back afterwards;
# confirm before re-running or relying on this cell.
created = server.post_sheet("/biosamples/", pooled_metadata, dry_run=True)
print(created)

row 0 created: ENCBS064TED
row 1 created: ENCBS546ALS
row 2 created: ENCBS512CTC
row 3 created: ENCBS169TOO
row 4 created: ENCBS707VDM
row 5 created: ENCBS703UIG
row 6 created: ENCBS786IOZ
row 7 created: ENCBS509WQM
row 8 created: ENCBS593FWT
row 9 created: ENCBS590UIC
row 10 created: ENCBS040FAJ
row 11 created: ENCBS243LNW
row 12 created: ENCBS013HSH
row 13 created: ENCBS912QHS
row 14 created: ENCBS185EUZ
row 15 created: ENCBS226RRL
row 16 created: ENCBS602PZD
row 17 created: ENCBS722QLJ
row 18 created: ENCBS232QNB
row 19 created: ENCBS489VYH
row 20 created: ENCBS674BEF
row 21 created: ENCBS281JLB
row 22 created: ENCBS055NMZ
row 23 created: ENCBS337TEC
row 24 created: ENCBS927UQN
row 25 created: ENCBS703FZP
row 26 created: ENCBS717IRX
row 27 created: ENCBS030UBL
row 28 created: ENCBS070KBE
row 29 created: ENCBS098OFG
row 30 created: ENCBS642YJZ
row 31 created: ENCBS796RXY
row 32 created: ENCBS101NOF
row 33 created: ENCBS238RCY
row 34 created: ENCBS776BLV
row 35 created: ENCBS581NAA
ro

In [14]:
# Write the pooled metadata table into the mouse_biosamples directory, without
# the DataFrame index column.
# NOTE(review): presumably saved after post_sheet so the file carries the newly
# assigned accessions — TODO confirm that post_sheet updates the frame in place.
pooled_metadata.to_excel(mouse_biosamples / "BioSample_metadata_for_pooled_samples_Jan12_2022.xlsx", index=False)