In [55]:
import requests
import os
import pandas as pd
import json

from IPython.core.display import HTML
from gscripts.general import region_helpers

In [36]:
def get_brents_bams(biosample_id):
    """Return the accessions of the Graveley-lab hg19 alignment BAMs of one dataset.

    Parameters
    ----------
    biosample_id : str
        Accession substituted into the ``dataset=/experiments/<id>/`` filter of
        the ENCODE file search.  Despite the parameter name, the value is used
        as an experiment path in the query.

    Returns
    -------
    list of str
        The ``accession`` of every search hit whose ``lab`` equals
        ``/labs/brenton-graveley/``, in the order the search returned them;
        empty when no hit matches.

    Notes
    -----
    Reads the module-level ``HEADERS`` dict at call time.
    """
    search_url = "https://www.encodeproject.org/search/?type=file&dataset=/experiments/{}/&file_format=bam&output_type=alignments&format=json&assembly=hg19&frame=object&limit=all".format(biosample_id)
    search_hits = requests.get(search_url, headers=HEADERS).json()['@graph']

    # Keep only the files submitted by the Graveley lab
    return [hit['accession'] for hit in search_hits if hit['lab'] == "/labs/brenton-graveley/"]

def get_bams_and_controls(accession):
    """Look up the knockdown and control BAMs that belong to one accession.

    Fetches ``/biosample/<accession>/?frame=object``, takes the first entry of
    its ``possible_controls`` and the first entry of its ``replicates``, then
    asks ``get_brents_bams`` for the BAMs of the control and of ``accession``.

    Parameters
    ----------
    accession : str
        Accession of the knockdown dataset.

    Returns
    -------
    tuple
        ``(ctrl_accession, replicate, result_dict)``.  ``result_dict`` maps
        ``kd_1``/``kd_2`` to the first two knockdown BAMs and
        ``ctrl_1``/``ctrl_2`` to the first two control BAMs.

    Raises
    ------
    IndexError
        If ``possible_controls`` or ``replicates`` is empty, or if fewer than
        two BAMs are found for the knockdown or for the control.
    """
    record_url = "https://www.encodeproject.org/biosample/{}/?frame=object".format(accession)
    record = requests.get(record_url, headers=HEADERS).json()

    # The control accession is the second-to-last "/"-separated piece of the path
    # (presumably paths look like "/experiments/<accession>/" - confirm)
    ctrl_accession = record['possible_controls'][0].split("/")[-2]
    replicate = record['replicates'][0]

    control_bams = get_brents_bams(ctrl_accession)
    knockdown_bams = get_brents_bams(accession)

    result_dict = {
        'kd_1': knockdown_bams[0],
        'kd_2': knockdown_bams[1],
        'ctrl_1': control_bams[0],
        'ctrl_2': control_bams[1],
    }

    return ctrl_accession, replicate, result_dict

def format_bam_for_download(file_id):
    """Return the ``wget`` command line that downloads ``<file_id>.bam`` from ENCODE."""
    command_template = "wget https://www.encodeproject.org/files/{0}/@@download/{0}.bam"
    return command_template.format(file_id)


In [163]:
# Build the knockdown manifest: one row per experiment returned by the search below,
# holding its two knockdown BAMs, its two control BAMs and some descriptive fields.
HEADERS = {'accept': 'application/json'}
URL = "https://www.encodeproject.org/search/?type=Experiment&files.file_type=bam&lab.title=Brenton+Graveley%2C+UConn&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&assay_title=shRNA+RNA-seq&limit=all&format=json"

response = requests.get(URL, headers=HEADERS)
response_json_dict = response.json()

# Collect plain dicts and build the DataFrame once at the end, so `datasets`
# only ever names the finished DataFrame.
dataset_records = []
for experiment_json in response_json_dict['@graph']:

    # Control experiments are not manifest rows of their own; they are picked up
    # through the possible_controls of each knockdown.
    if experiment_json['target']['label'] == 'Non-specific target control':
        continue

    accession = experiment_json['accession']

    try:
        ctrl_accession, replicate, kd_exp = get_bams_and_controls(accession)
    except IndexError:
        # No control, no replicate, or fewer than two BAMs: report the accession and skip it.
        # print(...) with a single argument behaves the same under Python 2 and Python 3.
        print(accession)
        continue

    # `replicate` is used as a URL path; drop any leading "/" so the request URL
    # does not contain "//". A separate name keeps the search URL above intact.
    replicate_url = "https://www.encodeproject.org/{}".format(replicate.lstrip("/"))
    response_file = requests.get(replicate_url, headers=HEADERS)
    replicate_json = response_file.json()
    selection_method = replicate_json['library']['nucleic_acid_term_name']

    kd_exp['cell_type'] = experiment_json['biosample_term_name']
    kd_exp['rbp'] = experiment_json['target']['label']
    kd_exp['kd_accession'] = accession
    kd_exp['ctrl_accession'] = ctrl_accession
    kd_exp['selection_method'] = selection_method

    dataset_records.append(kd_exp)

datasets = pd.DataFrame(dataset_records)

ENCSR096BEN
ENCSR703QFF


In [196]:
# Save the manifest twice: once in the shared analysis directory, once in Dropbox.
for manifest_path in (
    "/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/20160325_rnai_manifest.csv",
    "/home/gpratt/Dropbox/encode_integration/rnai_processing/20160325_rnai_manifest.csv",
):
    datasets.to_csv(manifest_path)

# Later cells refer to the same DataFrame under the name `manifest`.
manifest = datasets

In [198]:
# Spot-check: manifest rows whose rbp is SRP68.
datasets.loc[datasets["rbp"] == "SRP68"]

Unnamed: 0,cell_type,ctrl_1,ctrl_2,ctrl_accession,kd_1,kd_2,kd_accession,rbp,selection_method
274,HepG2,ENCFF705LNP,ENCFF038LUH,ENCSR856ZRV,ENCFF402SKG,ENCFF544JIA,ENCSR167JPY,SRP68,polyadenylated mRNA
362,K562,ENCFF742XEQ,ENCFF362TIW,ENCSR661HEL,ENCFF778BJG,ENCFF605SWX,ENCSR312SRB,SRP68,polyadenylated mRNA


In [199]:
# One wget command per distinct BAM. A set is used because the same BAM can be
# listed in more than one manifest row.
all_commands = set()
for bam_column in ('kd_1', 'kd_2', 'ctrl_1', 'ctrl_2'):
    all_commands.update(datasets[bam_column].apply(format_bam_for_download).values)

# Set iteration order is arbitrary; write the commands sorted so the generated
# script is identical from run to run.
with open("/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/batch_download.sh", 'w') as outfile:
    for line in sorted(all_commands):
        outfile.write(line + "\n")

# Make the DESeq input files (count matrix and sample sheet)

In [200]:
# Load the count table: the first line of the file is skipped and the first
# column becomes the index.
counts_path = "/projects/ps-yeolab2/encode/analysis/rnaseq_bams_v2/all_counts.txt"
count_df = pd.read_table(counts_path, skiprows=1, index_col=0)

# Shorten every column label to its file basename, cut at the first "."
short_labels = []
for column_label in count_df.columns:
    short_labels.append(os.path.basename(column_label).split(".")[0])
count_df.columns = short_labels

# Export everything except the first five columns.
# NOTE(review): presumably those five columns are per-gene annotation fields
# rather than samples - confirm against the layout of all_counts.txt.
sample_columns = count_df.columns[5:]
count_df[sample_columns].to_csv("/home/gpratt/Dropbox/encode_integration/important_counts.csv")

In [213]:
# Reshape the manifest to one row per BAM with columns
# (cell_type, bam_id, accession, rbp): each BAM column is paired with the
# accession column of the experiment it belongs to.
bam_and_accession_columns = [
    ("ctrl_1", "ctrl_accession"),
    ("ctrl_2", "ctrl_accession"),
    ("kd_1", "kd_accession"),
    ("kd_2", "kd_accession"),
]
sampleData = pd.concat([
    manifest[['cell_type', bam_column, accession_column, 'rbp']].rename(
        columns={bam_column: "bam_id", accession_column: "accession"})
    for bam_column, accession_column in bam_and_accession_columns
])

# Keep only the first row seen for each BAM.
# NOTE(review): a BAM that appears in several manifest rows keeps the rbp of
# its first row only - confirm that is the label wanted for shared BAMs.
sampleData = sampleData.drop_duplicates(subset="bam_id")

In [216]:
# Write the per-BAM sample sheet.
sample_info_path = "/home/gpratt/Dropbox/encode_integration/SampleInfo.csv"
sampleData.to_csv(sample_info_path)

In [215]:
# Spot-check: sample-sheet rows for two specific accessions.
sampleData[sampleData["accession"].isin(["ENCSR856ZRV", "ENCSR573UBF"])]

Unnamed: 0,cell_type,bam_id,accession,rbp
7,HepG2,ENCFF705LNP,ENCSR856ZRV,CSTF2
7,HepG2,ENCFF038LUH,ENCSR856ZRV,CSTF2
69,HepG2,ENCFF788JGG,ENCSR573UBF,TFIP11
69,HepG2,ENCFF132PEP,ENCSR573UBF,TFIP11


# Make master spreadsheet joining all ENCODE RBPs

In [189]:
# Lookup tables built by gscripts' region_helpers from the GENCODE v19 annotation database.
gencode_db_path = "/projects/ps-yeolab/genomes/hg19/gencode/v19/gencode.v19.annotation.gtf.db"
gene_id_to_name = region_helpers.gene_id_to_name(gencode_db_path)
gene_id_to_type = region_helpers.gene_id_to_type(gencode_db_path)

# Same id -> name mapping, keyed by the gene id cut at its first "."
ensembl_id_to_name = dict(
    (gene_id.split(".")[0], gene_name) for gene_id, gene_name in gene_id_to_name.items()
)

# Inverse mapping, name -> gene id. If several ids share a name, whichever is
# iterated last is the one kept.
name_to_gene_id = dict(
    (gene_name, gene_id) for gene_id, gene_name in gene_id_to_name.items()
)

In [190]:
# Load the CLIP file list (read with pandas' default tab delimiter).
clip_filelist_path = "/home/elvannostrand/data/clip/CLIPseq_analysis/ALLCLIP_v12_20160112/encode_v12_filelist.allencode_20160314.txt.fixedgenename.txt"
clip_analysis = pd.read_table(clip_filelist_path)

In [191]:
# Spot-check: the manifest row for knockdown accession ENCSR660MZN.
manifest.loc[manifest["kd_accession"] == "ENCSR660MZN"]

Unnamed: 0,cell_type,ctrl_1,ctrl_2,ctrl_accession,kd_1,kd_2,kd_accession,rbp,selection_method
78,HepG2,ENCFF610JDV,ENCFF481PMW,ENCSR279HMU,ENCFF616QND,ENCFF549PGZ,ENCSR660MZN,HNRNPD,polyadenylated mRNA


In [192]:
# Outer-join the CLIP table with the RNA-seq manifest on (cell line, RBP), so
# rows present in only one of the two tables are kept.
merged = pd.merge(
    clip_analysis,
    manifest,
    how="outer",
    left_on=['Cell line', 'RBP'],
    right_on=['cell_type', 'rbp'],
)

# Coalesce the join keys: take the manifest value and fall back to the CLIP
# value where the manifest value is null.
merged['RBP'] = merged['rbp'].where(merged['rbp'].notnull(), merged['RBP'])
merged['Cell line'] = merged['cell_type'].where(merged['cell_type'].notnull(), merged['Cell line'])

# Gene id for every RBP name; a name missing from name_to_gene_id raises KeyError.
merged['RBP_ENSG'] = [name_to_gene_id[rbp_name] for rbp_name in merged['RBP']]

# The manifest-side key columns are now redundant.
merged = merged.drop(["rbp", 'cell_type'], axis=1)

column_renames = {
    "RBP": "RBP_gID",
    "Cell line": "CellLine",
    "kd_accession": "RNASEQ_ENCODEAccID",
    "ctrl_accession": "RNASEQ_ControlENC",
    'kd_1': "RNASEQ_KDRep1Bam",
    'kd_2': "RNASEQ_KDRep2Bam",
    "ctrl_1": "RNASEQ_ControlRep1Bam",
    "ctrl_2": "RNASEQ_ControlRep2Bam",
}
merged = merged.rename(columns=column_renames)

# Turn the BAM accessions into file names.
for bam_column in (
    "RNASEQ_ControlRep1Bam",
    "RNASEQ_ControlRep2Bam",
    "RNASEQ_KDRep1Bam",
    "RNASEQ_KDRep2Bam",
):
    merged[bam_column] = merged[bam_column] + ".bam"

In [193]:
# Export the master list. The file is tab-separated even though its name ends in .csv.
master_list_path = "/home/gpratt/Dropbox/encode_integration/20160408_ENCODE_MASTER_ID_LIST_AllDatasets.csv"
merged.to_csv(master_list_path, sep="\t")

In [194]:
# Spot-check: the merged row for RNA-seq accession ENCSR624XHG.
merged.loc[merged["RNASEQ_ENCODEAccID"] == "ENCSR624XHG"]

Unnamed: 0,uID,RBP_gID,CellLine,CLIP_rep1,CLIP_rep2,INPUT,RNASEQ_ControlRep1Bam,RNASEQ_ControlRep2Bam,RNASEQ_ControlENC,RNASEQ_KDRep1Bam,RNASEQ_KDRep2Bam,RNASEQ_ENCODEAccID,selection_method,RBP_ENSG
165,481,DROSHA,K562,/projects/ps-yeolab2/encode/analysis/encode_v1...,/projects/ps-yeolab2/encode/analysis/encode_v1...,/projects/ps-yeolab2/encode/analysis/encode_v1...,ENCFF490QNF.bam,ENCFF669VSB.bam,ENCSR164MUK,ENCFF772BWX.bam,ENCFF475GCB.bam,ENCSR624XHG,polyadenylated mRNA,ENSG00000113360.12


In [195]:
# Spot-check: every merged row for DROSHA.
merged.loc[merged["RBP_gID"] == "DROSHA"]

Unnamed: 0,uID,RBP_gID,CellLine,CLIP_rep1,CLIP_rep2,INPUT,RNASEQ_ControlRep1Bam,RNASEQ_ControlRep2Bam,RNASEQ_ControlENC,RNASEQ_KDRep1Bam,RNASEQ_KDRep2Bam,RNASEQ_ENCODEAccID,selection_method,RBP_ENSG
165,481,DROSHA,K562,/projects/ps-yeolab2/encode/analysis/encode_v1...,/projects/ps-yeolab2/encode/analysis/encode_v1...,/projects/ps-yeolab2/encode/analysis/encode_v1...,ENCFF490QNF.bam,ENCFF669VSB.bam,ENCSR164MUK,ENCFF772BWX.bam,ENCFF475GCB.bam,ENCSR624XHG,polyadenylated mRNA,ENSG00000113360.12
