# Introduction

Step one in replicating the plots is to grab the data.

In [1]:
import sys

!$sys.executable -m pip install --user encoded_client



In [2]:
from encoded_client.encoded import ENCODED
import os
from pathlib import Path
import pandas
import shutil
from tqdm import tqdm

# Client for the ENCODE portal; later cells call its get_json() and
# get_response() methods to read metadata and download files.
server = ENCODED("www.encodeproject.org")

In [3]:
# URL of the cart whose "elements" list names the experiments to download.
cart_url = "https://www.encodeproject.org/carts/829d339c-913c-4773-8001-80130796a367/"

In [4]:
# Show the documentation for the download helper used below. Plain help()
# is used instead of the IPython-only `?` suffix so the cell stays valid
# Python when the notebook is exported as a script.
help(server.get_response)

In [5]:
def _download_to_cache(href, target, expected_size):
    """Stream ``href`` from ``server`` into ``target`` without leaving partial files.

    The body is written to a sibling ``<name>.part`` file first and only moved
    onto ``target`` once its size equals ``expected_size``. On any failure
    (including an interrupted kernel) the ``.part`` file is removed and the
    exception is re-raised.

    Raises
    ------
    OSError
        If the number of bytes written differs from ``expected_size``.
    """
    target.parent.mkdir(parents=True, exist_ok=True)
    partial = target.with_name(target.name + ".part")
    try:
        with server.get_response(href, stream=True) as response:
            with open(partial, "wb") as outstream:
                shutil.copyfileobj(response.raw, outstream)

        actual_size = partial.stat().st_size
        if actual_size != expected_size:
            raise OSError(
                "Downloaded {} bytes for {} but expected {}".format(
                    actual_size, href, expected_size
                )
            )
        # Move into place only after the size check passed.
        partial.replace(target)
    except BaseException:
        # BaseException so a KeyboardInterrupt also cleans up the partial file.
        if partial.exists():
            partial.unlink()
        raise


def get_genome_bam(experiment_id, cache_dir=None):
    """Cache the reads and current unfiltered alignments of one experiment.

    The experiment JSON is fetched from the module-level ``server``. Every
    file whose ``output_type`` is ``"reads"`` or ``"unfiltered alignments"``
    is downloaded into ``cache_dir`` unless a file of the expected size is
    already there. Unfiltered alignments that are not part of the
    experiment's ``default_analysis`` are skipped.

    Parameters
    ----------
    experiment_id : str
        Identifier handed to ``server.get_json``,
        e.g. ``"/experiments/ENCSR690QHM/"``.
    cache_dir : str or path-like, optional
        Directory that receives the downloads. Defaults to the current
        working directory. It is created when a download needs it.

    Returns
    -------
    list of dict
        One record per kept file with the keys ``experiment``,
        ``description``, ``simple_biosample_summary``, ``file``,
        ``output_type``, ``file_size``, ``bio_rep``, ``tech_rep`` and
        ``target`` (a ``pathlib.Path`` to the cached file).

    Raises
    ------
    AssertionError
        If a kept file does not have exactly one technical replicate.
    KeyError
        If a kept file has a ``file_format`` other than ``fastq`` or ``bam``.
    OSError
        If a download does not produce ``file_size`` bytes.
    """
    extensions = {
        "fastq": ".fastq.gz",
        "bam": ".bam",
    }

    experiment = server.get_json(experiment_id)
    default_analysis = experiment["default_analysis"]
    # Resolve the cache directory once instead of on every loop iteration.
    cache_dir = Path(os.getcwd() if cache_dir is None else cache_dir)

    metadata = []
    for f in experiment["files"]:
        output_type = f["output_type"]
        if output_type not in ("reads", "unfiltered alignments"):
            continue

        if output_type == "unfiltered alignments":
            # The analysis check only applies to processed data, so the
            # "analyses" key is read only here; reads never need it.
            analyses = [x["@id"] for x in f.get("analyses", [])]
            if default_analysis not in analyses:
                # skip alignments for older analyses.
                continue

        reps = f["technical_replicates"]
        assert len(reps) == 1, "expected one technical replicate, got {}".format(reps)
        tech_rep = reps[0]

        extension = extensions[f["file_format"]]
        target_name = "{}_{}_{}_{}{}".format(
            experiment["accession"], tech_rep, f["accession"], output_type, extension
        )
        target = cache_dir / target_name

        # stat() follows symlinks just like exists(), so both halves of the
        # check look at the same file.
        if not target.exists() or target.stat().st_size != f["file_size"]:
            _download_to_cache(f["href"], target, f["file_size"])

        metadata.append({
            "experiment": experiment["accession"],
            "description": experiment["description"],
            "simple_biosample_summary": f["simple_biosample_summary"],
            "file": f["accession"],
            "output_type": output_type,
            "file_size": f["file_size"],
            "bio_rep": f["biological_replicates"][0],
            "tech_rep": tech_rep,
            "target": target,
        })

    return metadata
        
#get_genome_bam("/experiments/ENCSR690QHM/")

In [6]:
# Directory (relative to the notebook) that receives every download.
cache_dir = Path("cache")
cache_dir.mkdir(exist_ok=True)

# The cart JSON lists the experiments to fetch under "elements".
cart = server.get_json(cart_url)

# Collect one record per cached file across all experiments in the cart.
metadata = []
for experiment_id in tqdm(cart["elements"]):
    metadata += get_genome_bam(experiment_id, cache_dir)

100%|██████████| 95/95 [1:14:53<00:00, 47.30s/it]


In [7]:
# One row per cached file, built from the records returned by get_genome_bam.
downloaded_data = pandas.DataFrame(metadata)
# Save the table as a tab-separated file without the index column.
downloaded_data.to_csv("cached_experiments.tsv", sep="\t", index=False)

In [8]:
#for row in tqdm(metadata):
#    index = Path(str(row["target"]) + ".bai")
#    target = str(row["target"])
#    if not index.exists():
#        !samtools index -@ 12 $target

# Well, oops: I named the files wrong, with a doubled dot (`..bam`).

In [9]:
# One-off repair for files that were saved with a "..bam" suffix.
# The loop is deliberately disabled by iterating over an empty list;
# swap in `metadata` to run it again.
for row in tqdm([]): #metadata):
    orig_target = row["target"]
    if orig_target.name.endswith("..bam"):
        # Rewrite only the file-name suffix, so a "..bam" elsewhere in the
        # path is never touched, and keep the result as a Path (the old
        # str-vs-Path inequality test was always true).
        fixed_name = orig_target.name[: -len("..bam")] + ".bam"
        new_target = orig_target.with_name(fixed_name)
        #print("Rename {} {}".format(orig_target, new_target))
        orig_target.rename(new_target)
        row["target"] = new_target
        
        

0it [00:00, ?it/s]


Object `p.rename` not found.


# Well, oops: I downloaded obsolete files.

The next cell deletes every file in the cache directory that is not listed in `downloaded_data`.

In [16]:
# Normalise to Path so the membership test below also works when the
# "target" column holds strings (for example after reloading the TSV);
# comparing Path objects against strings would mark every file for deletion.
expected_targets = {Path(target) for target in downloaded_data["target"]}
if not expected_targets:
    # An empty expected set would wipe the whole cache.
    raise RuntimeError(
        "No expected targets; refusing to delete every file in {}".format(cache_dir)
    )

total_files = 0
deleted_files = 0
for name in cache_dir.glob("*"):
    if name.is_dir():
        # unlink() cannot remove directories; only files are managed here.
        continue
    total_files += 1
    if name not in expected_targets:
        deleted_files += 1
        print("Delete {}".format(name))
        name.unlink()


print("Deleted {} of {} files".format(deleted_files, total_files))

Delete cache/ENCSR589FUJ_1_1_ENCFF965OXS_unfiltered alignments.bam
Delete cache/ENCSR293MOX_1_1_ENCFF483BXP_unfiltered alignments.bam
Delete cache/ENCSR902GAF_1_1_ENCFF049KQS_unfiltered alignments.bam
Delete cache/ENCSR589FUJ_2_1_ENCFF533IBY_unfiltered alignments.bam
Delete cache/ENCSR902GAF_3_1_ENCFF203MFP_unfiltered alignments.bam
Delete cache/ENCSR902GAF_2_1_ENCFF825CMK_unfiltered alignments.bam
Deleted 6 of 282 files
