In [None]:
# default_exp targetdata

# targetdata

> Get underlying data to calculate Rule Set 3 target scores


Get amino acid sequences and protein domain information

In [None]:
# export
import requests
import json
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import warnings
import os

In [None]:
# Load the test design table (read_table parses tab-delimited text by default);
# the example cells below pass this frame to the transcript helpers.
design_df = pd.read_table('test_data/sgrna-designs.txt')

## Get amino acid sequences

In [None]:
# export
def ensembl_post(ext, data, headers=None, params=None):
    """Send a POST request to the ensembl REST API

    :param ext: str, endpoint path appended to the REST server address
    :param data: dict, payload that is JSON-encoded into the request body
    :param headers: dict or None, HTTP headers for the request (None sends no extra headers)
    :param params: dict or None, URL query parameters (None sends none)
    :return: Response object when the request succeeded
    :raises requests.HTTPError: when the server answers with an error status
    """
    url = "https://rest.ensembl.org" + ext
    query_params = {} if params is None else params
    request_headers = {} if headers is None else headers
    payload = json.dumps(data)
    response = requests.post(url, headers=request_headers, data=payload, params=query_params)
    if response.ok:
        return response
    # Only reached for error responses; turns the bad status into an exception
    response.raise_for_status()

In [None]:
# export
def chunks(lst, n):
    """Lazily split lst into consecutive slices of at most n items.

    lst: sliceable sequence with a length (e.g. a list)
    n: int, maximum number of items per slice

    returns: generator yielding the slices in order; the final slice may be shorter
    """
    total = len(lst)
    for start in range(0, total, n):
        stop = start + n
        yield lst[start:stop]

def post_transcript_sequence_chunk(ids, params, headers):
    """POST one batch of ids to the /sequence/id/ endpoint and decode the reply

    :param ids: list, identifiers sent under the 'ids' key of the request body
    :param params: dict, URL query parameters for the request
    :param headers: dict, HTTP headers for the request
    :return: decoded JSON body of the response
    """
    response = ensembl_post("/sequence/id/", data={'ids': ids},
                            params=params, headers=headers)
    return response.json()

def post_transcript_sequence(ensembl_ids, seq_type='protein', max_queries=50,
                             n_jobs=1, **kwargs):
    """Fetch sequences for many stable identifiers through batched POST requests.
    Uses https://rest.ensembl.org/documentation/info/sequence_id_post

    :param ensembl_ids: list of str, identifiers to look up
    :param seq_type: str, one of [genomic, cds, cdna, protein]
    :param max_queries: int, maximum number of identifiers sent in a single POST
    :param n_jobs: int, number of jobs to run in parallel
    :param kwargs: additional query parameters forwarded to the endpoint
    :return: list, the entries of every batch response concatenated in batch order
    """
    request_headers = {"content-type": "application/json", "accept": "application/json"}
    # Start from the sequence type, then let any extra keyword arguments extend/override it
    query_params = {'type': seq_type}
    query_params.update(kwargs)
    batches = list(chunks(ensembl_ids, max_queries))
    fetch_batch = delayed(post_transcript_sequence_chunk)
    batch_results = Parallel(n_jobs=n_jobs)(
        fetch_batch(batch, query_params, request_headers) for batch in tqdm(batches))
    # Merge the per-batch results into one flat list
    flat_results = []
    for batch_result in batch_results:
        flat_results.extend(batch_result)
    return flat_results

In [None]:
# Smoke test against the live Ensembl REST API: the first genomic sequence
# returned for these two ids is expected to start with 'CTTC'.
assert(post_transcript_sequence(["ENSG00000157764", "ENSG00000248378"],
                                seq_type='genomic')[0]['seq'][:4] == 'CTTC')

100%|██████████| 1/1 [00:01<00:00,  1.31s/it]


In [None]:
# export
def build_transcript_aa_seq_df(design_df, transcript_id_col='Target Transcript',
                               transcript_len_col='Target Total Length', n_jobs=1):
    """Get amino acid sequence for transcripts of interest

    Transcript ids are de-duplicated, stripped of everything after the first '.',
    and queried with post_transcript_sequence. Transcripts with no returned
    sequence, or whose length does not equal (amino acid length + 1) * 3, are
    dropped and reported through warnings. If there is nothing to query or
    nothing is returned, an empty DataFrame is returned instead of raising.

    :param design_df: DataFrame
    :param transcript_id_col: str, column with ensembl transcript id
    :param transcript_len_col: str, column with length of transcript
    :param n_jobs: int, number of jobs to use to query transcripts
    :return: DataFrame, one row per retained transcript with the returned sequence
        fields plus 'Transcript Base' and 'AA len'
    """
    unique_transcripts = design_df[[transcript_id_col, transcript_len_col]].drop_duplicates()
    # Take the text before the first '.'; .str[0] (unlike expand=True followed by [0])
    # also works when the frame has no rows.
    unique_transcripts = unique_transcripts.assign(**{
        'Transcript Base': unique_transcripts[transcript_id_col].str.split('.', n=1).str[0]
    })
    print("Getting amino acid sequences")
    aa_seqs = post_transcript_sequence(unique_transcripts['Transcript Base'].to_list(),
                                       n_jobs=n_jobs)
    aa_seq_df = (pd.DataFrame(aa_seqs)
                 .rename({'query': 'Transcript Base'}, axis=1))
    if aa_seq_df.empty:
        # No records came back, so the frame has no columns at all; provide the
        # columns used below (and 'id', which callers in this file read).
        aa_seq_df = pd.DataFrame(columns=['Transcript Base', 'id', 'seq'])
    missing_seqs = (unique_transcripts['Transcript Base'][~unique_transcripts['Transcript Base'].isin(
        aa_seq_df['Transcript Base']
    )])
    if len(missing_seqs) > 0:
        warnings.warn('Unable to find translations for the following transcripts: ' + ', '.join(missing_seqs))
    aa_seq_len_df = (unique_transcripts.merge(aa_seq_df, on='Transcript Base'))
    aa_seq_len_df['AA len'] = aa_seq_len_df['seq'].str.len()
    # Keep only transcripts whose length equals three bases per amino acid plus
    # one extra codon (presumably the stop codon - TODO confirm).
    filtered_aa_seq_len_df = (aa_seq_len_df[aa_seq_len_df[transcript_len_col] ==
                                            (aa_seq_len_df['AA len'] + 1)*3 ]
                              .reset_index(drop=True))
    filtered_seqs = (aa_seq_len_df['Transcript Base'][~aa_seq_len_df['Transcript Base'].isin(
        filtered_aa_seq_len_df['Transcript Base']
    )])
    if len(filtered_seqs) > 0:
        warnings.warn('Filtered transcripts where the transcript length and amino acid ' +
                      'sequence length did not agree: ' + ', '.join(filtered_seqs))
    return filtered_aa_seq_len_df

In [None]:
# Query amino acid sequences for the transcripts in design_df with 3 parallel jobs,
# then display the resulting frame.
transcript_aa_seq_df = build_transcript_aa_seq_df(design_df, n_jobs=3)
transcript_aa_seq_df

Getting amino acid sequences


100%|██████████| 4/4 [00:00<00:00, 47.11it/s]


Unnamed: 0,Target Transcript,Target Total Length,Transcript Base,id,desc,seq,molecule,version,AA len
0,ENST00000259457.8,834,ENST00000259457,ENSP00000259457,,MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...,protein,3,277
1,ENST00000394249.8,1863,ENST00000394249,ENSP00000377793,,MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...,protein,3,620
2,ENST00000361337.3,2298,ENST00000361337,ENSP00000354522,,MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK...,protein,2,765
3,ENST00000368328.5,267,ENST00000368328,ENSP00000357311,,MALSTIVSQRKQIKRKAPRGFLKRVFKRKKPQLRLEKSGDLLVHLN...,protein,4,88
4,ENST00000610426.5,783,ENST00000610426,ENSP00000483484,,MPQNEYIELHRKRYGYRLDYHEKKRKKESREAHERSKKAKKMIGLK...,protein,1,260
...,...,...,...,...,...,...,...,...,...
195,ENST00000339374.11,1371,ENST00000339374,ENSP00000343041,,MAAALKCLLTLGRWCPGLGVAPQARALAALVPGVTQVDNKSGFLQK...,protein,6,456
196,ENST00000393047.8,882,ENST00000393047,ENSP00000376767,,MSRIPLGKVLLRNVIRHTDAHNKIQEESDMWKIRELEKQMEDAYRG...,protein,3,293
197,ENST00000454402.7,1023,ENST00000454402,ENSP00000408295,,METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK...,protein,2,340
198,ENST00000254998.3,423,ENST00000254998,ENSP00000254998,,MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...,protein,2,140


## Get protein domains

In [None]:
# export
def ensembl_get(ext, query=None, headers=None, params=None):
    """Send a GET request to the ensembl REST API

    :param ext: str, endpoint path appended to the REST server address
    :param query: str or None, text appended after ext (species, taxon, ensembl_id etc.)
    :param headers: dict or None, HTTP headers for the request (None sends no extra headers)
    :param params: dict or None, URL query parameters (None sends none)
    :return: Response object when the request succeeded
    :raises requests.HTTPError: when the server answers with an error status
    """
    suffix = '' if query is None else query
    query_params = {} if params is None else params
    request_headers = {} if headers is None else headers
    url = "https://rest.ensembl.org" + ext + suffix
    response = requests.get(url, params=query_params, headers=request_headers)
    if response.ok:
        return response
    # Only reached for error responses; turns the bad status into an exception
    response.raise_for_status()

def get_translation_overlap(ensembl_id):
    """Query the /overlap/translation/ endpoint for one identifier

    :param ensembl_id: str, identifier appended to the endpoint path
    :return: decoded JSON body of the response (features overlapping the
        translation, such as protein domains)
    """
    endpoint = '/overlap/translation/' + ensembl_id
    response = ensembl_get(endpoint, headers={'content-type': 'application/json'})
    return response.json()

In [None]:
# Smoke test against the live Ensembl REST API: the features returned for
# ENSP00000350283 are expected to include a 'BRCT domain' description.
brca1_overlap = get_translation_overlap('ENSP00000350283')
assert 'BRCT domain' in pd.DataFrame(brca1_overlap)['description'].to_list()

In [None]:
# export
def build_translation_overlap_df(protein_ids, n_jobs=1):
    """Collect translation-overlap features for many proteins into one table

    :param protein_ids: list of str, ensembl protein IDs
    :param n_jobs: int, number of jobs to run in parallel
    :return: DataFrame, one row per returned feature, with the 'Parent' column
        renamed to 'Transcript Base'
    """
    print('Getting protein domains')
    fetch_overlap = delayed(get_translation_overlap)
    per_protein_features = Parallel(n_jobs=n_jobs)(
        fetch_overlap(protein_id) for protein_id in tqdm(protein_ids))
    # Merge the per-protein feature lists into one flat list of records
    feature_records = []
    for features in per_protein_features:
        feature_records.extend(features)
    feature_df = pd.DataFrame(feature_records)
    return feature_df.rename({'Parent': 'Transcript Base'}, axis=1)

In [None]:
# Fetch overlap features for every protein id in transcript_aa_seq_df['id']
# with 3 parallel jobs, then display the resulting frame.
translation_overlap_df = build_translation_overlap_df(transcript_aa_seq_df['id'],
                                                      n_jobs=3)
translation_overlap_df

Getting protein domains


100%|██████████| 200/200 [00:35<00:00,  5.61it/s]


Unnamed: 0,feature_type,hit_end,type,cigar_string,id,hit_start,end,description,seq_region_name,hseqname,interpro,translation_id,start,Transcript Base,align_type
0,protein_feature,233,Gene3D,,3.60.20.10,1,277,"Nucleophile aminohydrolases, N-terminal",ENSP00000259457,3.60.20.10,IPR029055,976188,44,ENST00000259457,
1,protein_feature,36,Pfam,,PF12465,1,271,"Proteasome beta subunit, C-terminal",ENSP00000259457,PF12465,IPR024689,976188,235,ENST00000259457,
2,protein_feature,190,Pfam,,PF00227,2,221,"Proteasome, subunit alpha/beta",ENSP00000259457,PF00227,IPR001353,976188,41,ENST00000259457,
3,protein_feature,0,PRINTS,,PR00141,0,66,"Peptidase T1A, proteasome beta-subunit",ENSP00000259457,PR00141,IPR000243,976188,51,ENST00000259457,
4,protein_feature,0,PRINTS,,PR00141,0,182,"Peptidase T1A, proteasome beta-subunit",ENSP00000259457,PR00141,IPR000243,976188,171,ENST00000259457,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5275,protein_feature,676,PANTHER,,PTHR14927,5,682,Nucleolar protein 10/Enp2,ENSP00000371101,PTHR14927,IPR040382,975385,1,ENST00000381685,
5276,protein_feature,0,Seg,,seg,0,474,,ENSP00000371101,seg,,975385,451,ENST00000381685,
5277,protein_feature,0,Seg,,seg,0,537,,ENSP00000371101,seg,,975385,515,ENST00000381685,
5278,protein_feature,0,Seg,,seg,0,554,,ENSP00000371101,seg,,975385,543,ENST00000381685,


## Store data using parquet for later use

In [None]:
# export
def write_transcript_data(design_df, transcript_id_col='Target Transcript',
                          transcript_len_col='Target Total Length', n_jobs=1,
                          overwrite=True, filepath='./data/target_data/',
                          aa_seq_name='aa_seqs.pq',
                          protein_domain_name='protein_domains.pq'):
    """Write amino acid sequences and protein domain information to parquet files

    :param design_df: DataFrame
    :param transcript_id_col: str, column with ensembl transcript id
    :param transcript_len_col: str, column with length of transcript
    :param n_jobs: int, number of jobs used for the queries
    :param overwrite: bool, whether to overwrite existing file
    :param filepath: str, directory for output sequences (with or without a
        trailing path separator)
    :param aa_seq_name: str, name of amino acid sequence file
    :param protein_domain_name: str, name of protein domain file
    :raises ValueError: if either output file already exists and overwrite is False
    """
    # os.path.join adds the separator only when filepath lacks one, so a directory
    # given without a trailing slash still resolves to files inside that directory.
    aa_seq_path = os.path.join(filepath, aa_seq_name)
    protein_domain_path = os.path.join(filepath, protein_domain_name)
    # Check before doing any queries so an existing dataset fails fast
    if (not overwrite) and (os.path.isfile(aa_seq_path) or os.path.isfile(protein_domain_path)):
        raise ValueError('Transcript data already exits and cannot be overwritten')
    transcript_aa_seq_df = build_transcript_aa_seq_df(design_df, transcript_id_col=transcript_id_col,
                                                      transcript_len_col=transcript_len_col,
                                                      n_jobs=n_jobs)
    translation_overlap_df = build_translation_overlap_df(transcript_aa_seq_df['id'],
                                                          n_jobs=n_jobs)
    if not os.path.isdir(filepath):
        print('Creating new directory ' + filepath)
        os.makedirs(filepath)
    transcript_aa_seq_df.to_parquet(path=aa_seq_path, engine='pyarrow',
                                    index=False)
    translation_overlap_df.to_parquet(path=protein_domain_path, engine='pyarrow',
                                      index=False)

In [None]:
# Run the full pipeline with the default output location and check that the
# amino acid sequence file was written.
write_transcript_data(design_df, n_jobs=3)
assert os.path.isfile('./data/target_data/' + 'aa_seqs.pq')

Getting amino acid sequences


100%|██████████| 4/4 [00:00<00:00, 2288.22it/s]


Getting protein domains


100%|██████████| 200/200 [00:32<00:00,  6.11it/s]
