In [2]:
import requests
import os
import pandas as pd
import json

In [226]:
def get_brents_bams(biosample_id):
    """Return the accessions of the Graveley-lab hg19 alignment BAMs of a dataset.

    Queries the ENCODE file search for BAM files with output type
    "alignments" on assembly hg19 whose dataset is
    ``/experiments/<biosample_id>/``, then keeps only the hits whose ``lab``
    field equals ``/labs/brenton-graveley/``.

    biosample_id -- identifier substituted into the ``/experiments/{}/``
        dataset filter (despite the parameter name, the URL uses it as an
        experiment identifier).

    Returns a list of file accessions in the order of the response's
    ``@graph`` list; the list is empty when no hit matches the lab.

    Reads the module-level HEADERS dict, which must be defined before calling.
    """
    search_url = "https://www.encodeproject.org/search/?type=file&dataset=/experiments/{}/&file_format=bam&output_type=alignments&format=json&assembly=hg19&frame=object&limit=all".format(biosample_id)
    search_hits = requests.get(search_url, headers=HEADERS).json()['@graph']
    return [
        hit['accession']
        for hit in search_hits
        if hit['lab'] == "/labs/brenton-graveley/"
    ]

def get_bams_and_controls(accession):
    """Collect the knockdown BAMs and the paired control BAMs for one accession.

    Fetches ``/biosample/<accession>/?frame=object`` from ENCODE, takes the
    first entry of its ``possible_controls`` list as the paired control, and
    looks up the Graveley-lab BAMs of both the control and ``accession`` with
    get_brents_bams.

    accession -- accession to look up.

    Returns a tuple ``(ctrl_accession, result_dict)`` where ``result_dict``
    maps 'kd_1', 'kd_2', 'ctrl_1' and 'ctrl_2' to the first two BAM
    accessions found for the knockdown and for the control respectively.

    Raises IndexError when ``possible_controls`` is empty or when either
    lookup returns fewer than two BAMs.

    Reads the module-level HEADERS dict, which must be defined before calling.
    """
    record_url = "https://www.encodeproject.org/biosample/{}/?frame=object".format(accession)
    record = requests.get(record_url, headers=HEADERS).json()

    # The second-to-last "/"-separated piece of the first control entry is
    # used as the control accession (presumably entries are paths that end
    # with "/<accession>/" -- confirm against the API response).
    first_control = record['possible_controls'][0]
    ctrl_accession = first_control.split("/")[-2]

    ctrl_ids = get_brents_bams(ctrl_accession)
    kd_ids = get_brents_bams(accession)

    # Only the first two BAMs of each side are kept.
    result_dict = {
        'kd_1': kd_ids[0],
        'kd_2': kd_ids[1],
        'ctrl_1': ctrl_ids[0],
        'ctrl_2': ctrl_ids[1],
    }
    return ctrl_accession, result_dict

def format_bam_for_download(file_id):
    """Build the ``wget`` command line that downloads one ENCODE BAM file.

    file_id -- value inserted twice into the URL: as the ``/files/<id>/``
        path segment and as the ``<id>.bam`` file name.

    Returns the command as a single string (no trailing newline).
    """
    download_url = "https://www.encodeproject.org/files/{fid}/@@download/{fid}.bam".format(fid=file_id)
    return "wget " + download_url


In [None]:
# Get everything: query ENCODE for the Graveley-lab human shRNA RNA-seq
# experiments that have BAM files, then build one manifest row per knockdown
# experiment with its two knockdown BAMs and two control BAMs.
HEADERS = {'accept': 'application/json'}
URL = "https://www.encodeproject.org/search/?type=Experiment&files.file_type=bam&lab.title=Brenton+Graveley%2C+UConn&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&assay_title=shRNA+RNA-seq&limit=all&format=json"

response = requests.get(URL, headers=HEADERS)
response_json_dict = response.json()

dataset_records = []
for count, experiment_json in enumerate(response_json_dict['@graph']):
    # Control experiments are not knockdowns themselves; they are picked up
    # through the knockdown experiments that reference them.
    if experiment_json['target']['label'] == 'Non-specific target control':
        continue

    accession = experiment_json['accession']

    try:
        ctrl_accession, kd_exp = get_bams_and_controls(accession)
    except IndexError:
        # get_bams_and_controls raises IndexError when there is no control or
        # fewer than two BAMs on either side. Report the accession and move
        # on: falling through here would reuse the previous iteration's
        # kd_exp dict (overwriting and re-appending it) or hit a NameError
        # when the very first experiment fails.
        print("Skipping {}: missing control or fewer than two BAMs".format(accession))
        continue

    kd_exp['cell_type'] = experiment_json['biosample_term_name']
    kd_exp['rbp'] = experiment_json['target']['label']
    kd_exp['kd_accession'] = accession
    kd_exp['ctrl_accession'] = ctrl_accession
    dataset_records.append(kd_exp)

# One row per knockdown experiment; columns are the keys of each record.
datasets = pd.DataFrame(dataset_records)

In [None]:
# Save the manifest so that later cells can reload it from disk.
manifest_csv_path = "/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/20160325_rnai_manifest.csv"
datasets.to_csv(manifest_csv_path)

In [225]:
# Collect one wget command per distinct BAM across the knockdown and control
# columns; a set is used because the same BAM can appear in several rows.
all_commands = set()
for bam_column in ['kd_1', 'kd_2', 'ctrl_1', 'ctrl_2']:
    all_commands.update(datasets[bam_column].apply(format_bam_for_download).values)

with open("/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/batch_download.sh", 'w') as outfile:
    # Sorted so the generated script is identical from run to run (set
    # iteration order is not guaranteed to be stable).
    for line in sorted(all_commands):
        outfile.write(line + "\n")

# Build the DESeq input files (count table and sample information)

In [5]:
# Reload the saved manifest; the first CSV column is used as the row index.
manifest = pd.read_csv(
    "/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/20160325_rnai_manifest.csv",
    index_col=0,
)

In [15]:
# Load the combined count table: the first line of the file is skipped and the
# first remaining column becomes the row index.
count_df = pd.read_table(
    "/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/all_counts.txt",
    skiprows=1,
    index_col=0,
)
# Reduce every column label to its file name up to the first "."
# (presumably this turns BAM paths into file accessions -- confirm).
count_df.columns = [os.path.basename(column_name).split(".")[0] for column_name in count_df.columns]
# Export only the columns after the first five
# (NOTE(review): the first five look like per-gene annotation columns -- confirm).
sample_columns = count_df.columns[5:]
count_df[sample_columns].to_csv("/home/gpratt/Dropbox/encode_integration/important_counts.csv")

In [28]:
# Reshape the manifest from one row per experiment into one row per BAM:
# each (BAM column, accession column) pair contributes one slice, renamed to
# the shared "bam_id" / "accession" column names, and the slices are stacked
# in the order listed here.
bam_and_accession_columns = [
    ("ctrl_1", "ctrl_accession"),
    ("ctrl_2", "ctrl_accession"),
    ("kd_1", "kd_accession"),
    ("kd_2", "kd_accession"),
]
sampleData = pd.concat([
    manifest[['cell_type', bam_column, accession_column, 'rbp']].rename(
        columns={bam_column: "bam_id", accession_column: "accession"}
    )
    for bam_column, accession_column in bam_and_accession_columns
])

In [37]:
# Preview the first five manifest rows.
manifest.head(5)

Unnamed: 0,cell_type,ctrl_1,ctrl_2,ctrl_accession,kd_1,kd_2,kd_accession,rbp
0,HepG2,ENCFF350VVR,ENCFF829TUN,ENCSR042QTH,ENCFF450QLW,ENCFF584LUS,ENCSR648BSC,PUF60
1,K562,ENCFF584WAF,ENCFF674JPQ,ENCSR245BNJ,ENCFF860NWB,ENCFF288KRJ,ENCSR410ZPU,BCLAF1
2,HepG2,ENCFF350VVR,ENCFF829TUN,ENCSR042QTH,ENCFF997JOB,ENCFF904ZSQ,ENCSR459EMR,TUFM
3,K562,ENCFF092XAP,ENCFF265ZZB,ENCSR344XID,ENCFF280ZIB,ENCFF708KBX,ENCSR256PLH,QKI
4,HepG2,ENCFF561UZL,ENCFF628UPI,ENCSR689PHN,ENCFF500VFR,ENCFF962VUU,ENCSR771QMJ,GEMIN5


In [38]:
# Keep one row per BAM before exporting: drop_duplicates retains the first
# occurrence of each bam_id.
# NOTE(review): control BAMs are shared by several knockdown experiments, so a
# control row keeps the rbp of whichever experiment listed it first -- confirm
# that downstream code does not rely on rbp for control samples.
unique_samples = sampleData.drop_duplicates(subset="bam_id")
unique_samples.to_csv("/home/gpratt/Dropbox/encode_integration/SampleInfo.csv")

In [36]:
# Preview the first five rows of the per-BAM sample table.
sampleData.head(5)

Unnamed: 0,cell_type,bam_id,accession,rbp
0,HepG2,ENCFF350VVR,ENCSR042QTH,PUF60
1,K562,ENCFF584WAF,ENCSR245BNJ,BCLAF1
2,HepG2,ENCFF350VVR,ENCSR042QTH,TUFM
3,K562,ENCFF092XAP,ENCSR344XID,QKI
4,HepG2,ENCFF561UZL,ENCSR689PHN,GEMIN5


In [43]:
# Show every knockdown experiment that is paired with control ENCSR084SCN.
uses_control = manifest["ctrl_accession"] == "ENCSR084SCN"
manifest.loc[uses_control]

Unnamed: 0,cell_type,ctrl_1,ctrl_2,ctrl_accession,kd_1,kd_2,kd_accession,rbp
38,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF635XVZ,ENCFF312ZOZ,ENCSR608IAI,FASTKD2
83,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF244QXH,ENCFF653PZL,ENCSR268JDD,AGO1
125,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF907AFR,ENCFF174XOJ,ENCSR143UET,EIF3G
132,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF587FDO,ENCFF020XNJ,ENCSR234YMW,SLTM
155,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF459YMO,ENCFF922IOC,ENCSR004RGI,RPS10
167,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF985GAR,ENCFF403FGR,ENCSR410MIQ,RPS3
257,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF983UYL,ENCFF564EFE,ENCSR325OOM,FUS
360,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF774LJE,ENCFF422SED,ENCSR232XRZ,SND1
406,K562,ENCFF178NNK,ENCFF098BEA,ENCSR084SCN,ENCFF749HGR,ENCFF406LTN,ENCSR891AXF,SSB
