# Get Protein Sequences

Get the entire amino acid sequence for all genes targeted in endogenous datasets.

In [1]:
import requests
import sys
from tqdm import tqdm
import pandas as pd
import json
import numpy as np

from datasets import expanded_dataset_list


In [2]:
def post_sequence_ids(ensembl_ids, data_args=None):
    """Take a list of Ensembl transcript IDs and return their sequences

    Sends one POST request to the ``/sequence/id/`` endpoint of the Ensembl
    REST API and converts the decoded JSON response into a DataFrame.

    :param ensembl_ids: list of str
    :param data_args: dict, extra fields for the JSON request body. Defaults to
        ``{"type": "protein"}``. The dict is copied, so the caller's object is
        never modified.
    :return: DataFrame built from the decoded JSON response
    :raises requests.HTTPError: if the server responds with an error status
    """
    # Work on a copy: adding "ids" to the caller's dict would leak one batch of
    # IDs into the next call that reuses the same dict.
    payload = {"type": "protein"} if data_args is None else dict(data_args)
    payload["ids"] = ensembl_ids
    headers = {"content-type": "application/json", "Accept": "application/json"}
    server = "https://rest.ensembl.org"
    ext = "/sequence/id/"
    r = requests.post(server + ext, headers=headers,
                      data=json.dumps(payload))
    # raise_for_status() raises for every non-ok response, so no explicit
    # interpreter exit is needed afterwards.
    r.raise_for_status()
    decoded = r.json()
    sequence_df = pd.DataFrame(decoded)
    return sequence_df


def get_translation_overlap(ensembl_id, data_args=None):
    """Get features that overlap with translation, such as protein domains

    Sends one GET request to the ``overlap/translation/<id>`` endpoint of the
    Ensembl REST API and converts the decoded JSON response into a DataFrame.

    :param ensembl_id: str
    :param data_args: dict, optional query parameters appended to the URL
    :return: DataFrame built from the decoded JSON response
    :raises requests.HTTPError: if the server responds with an error status
    """
    if data_args is None:
        data_args = {}
    headers = {'content-type': 'application/json'}
    server = "https://rest.ensembl.org/"
    ext = 'overlap/translation/' + ensembl_id
    # Optional arguments of a GET endpoint belong in the query string
    # (params=), not in a request body (data=).
    r = requests.get(server + ext, headers=headers,
                     params=data_args)
    # raise_for_status() raises for every non-ok response, so no explicit
    # interpreter exit is needed afterwards.
    r.raise_for_status()
    decoded = r.json()
    feature_df = pd.DataFrame(decoded)
    return feature_df

In [3]:
# Keep only the datasets flagged as endogenous
data_list = [ds for ds in expanded_dataset_list if ds.endogenous]

# Load each dataset, set its sgRNAs, and collect its designs
design_list = []
for ds in tqdm(data_list):
    ds.load_data()
    ds.set_sgrnas()
    designs = ds.get_designs()
    design_list.append(designs)

100%|██████████| 9/9 [00:35<00:00,  3.92s/it]


In [4]:
# Stack the designs from every dataset and drop exact duplicate rows
design_df = pd.concat(design_list).drop_duplicates()
# Keep the part of the transcript ID before the first '.'
transcript_bases = design_df['Target Transcript'].str.split('.', expand=True)[0]
transcripts = transcript_bases.unique()
len(transcripts)

3004

In [5]:
# Split the IDs into roughly equal chunks of at most 50 and send one request per chunk
n_chunks = np.ceil(len(transcripts)/50)
transcipt_sequence_list = [post_sequence_ids(list(chunk_transcripts))
                           for chunk_transcripts in tqdm(np.array_split(transcripts, n_chunks))]

100%|██████████| 61/61 [03:46<00:00,  3.72s/it]


Based on a manual check, the missing sequences appear to be primarily outdated transcript IDs.

In [6]:
# Combine the per-chunk responses and rename 'query' so it can be matched
# against the transcript IDs that were requested
transcript_sequence_df = (pd.concat(transcipt_sequence_list, ignore_index=True)
                          .rename(columns={'query': 'Transcript Base'}))
transcript_series = pd.Series(transcripts)
was_returned = transcript_series.isin(transcript_sequence_df['Transcript Base'])
missing_sequences = transcript_series[~was_returned]
print('Missing: ' + ', '.join(missing_sequences))


Missing: ENST00000611665, ENST00000622530, ENST00000368563, ENST00000377815, ENST00000618014, ENST00000424325, ENST00000650726, ENST00000572844, ENST00000344894, ENST00000355883, ENST00000648169, ENST00000449977


We'll filter out any protein whose length does not match the designs file, i.e. where `Target Total Length` is not equal to (amino acid length + 1) × 3.

In [7]:
# One row per (transcript, total length) pair, plus the ID part before the first '.'
transcript_lens = (
    design_df[['Target Transcript', 'Target Total Length']]
    .drop_duplicates()
    .assign(**{'Transcript Base': lambda df: df['Target Transcript'].str.split('.', expand=True)[0]})
)
transcript_sequence_lens = pd.merge(transcript_sequence_df, transcript_lens,
                                    how='inner', on='Transcript Base')
transcript_sequence_lens['AA len'] = transcript_sequence_lens['seq'].str.len()
# Expected target length is three per amino acid plus three more
# (presumably the stop codon -- TODO confirm)
expected_total_len = (transcript_sequence_lens['AA len'] + 1)*3
length_matches = transcript_sequence_lens['Target Total Length'] == expected_total_len
filtered_transcript_sequence_lens = transcript_sequence_lens[length_matches].copy()
n_filtered = transcript_sequence_df.shape[0] - filtered_transcript_sequence_lens.shape[0]
print('Filtered: ' + str(n_filtered) + ' misaligned proteins')

Filtered: 20 misaligned proteins


## Get Protein Domains

Get all annotated protein domains based on protein ID. Information about annotations can be found here:
http://m.ensembl.org/Help/View?id=178

In [8]:
# One request per unique 'id' value of the length-filtered sequences
protein_ids = filtered_transcript_sequence_lens['id'].unique()
translation_overlap_list = [get_translation_overlap(protein_id)
                            for protein_id in tqdm(protein_ids)]

100%|██████████| 2972/2972 [30:11<00:00,  1.64it/s] 


In [9]:
# Combine the per-protein feature tables into one frame with a fresh index
translation_overlap_df = (pd.concat(translation_overlap_list, ignore_index=True)
                          .rename(columns={'Parent': 'Transcript Base'}))
# Number of overlapping features per 'type' value
feature_type_counts = translation_overlap_df['type'].value_counts()
feature_type_counts

sifts               18147
Seg                  9451
MobiDBLite           7842
Pfam                 6049
PANTHER              5532
Gene3D               5111
SuperFamily          3943
Smart                3825
Prosite_profiles     3144
PRINTS               2883
CDD                  2325
ncoils               2201
Prosite_patterns     1322
TMHMM                 542
TIGRfam               435
PIRSF                 427
HAMAP                 285
SignalP                56
SFLD                   46
Name: type, dtype: int64

In [10]:
# The 50 most frequent feature ids across all proteins
feature_id_counts = translation_overlap_df['id'].value_counts()
feature_id_counts.head(50)

seg            9451
mobidb-lite    7842
Coil           2201
SM00320         842
TMhelix         542
PF00400         441
PS50082         426
3.40.50.300     409
SSF52540        385
SM00355         285
2.130.10.10     230
SSF48371        229
PS50157         226
3.30.160.60     221
PS00028         216
PR00320         177
SM00028         173
SM00386         158
SSF57667        154
PS00678         154
SSF50978        147
3.30.70.330     145
PS50294         145
PS50005         141
SSF54928        136
SM00360         133
PS50102         130
PF00076         128
1.25.40.10      127
1.25.10.10      124
SSF48452        122
PF00096         108
2.40.50.140     102
SSF50249        101
SM00382          99
SM00490          96
PS51194          96
PS51192          96
SM00487          94
PF00271          92
PR00315          85
PS51450          83
PF00270          70
SSF46785         69
cd00009          68
SSF56112         67
PR00304          65
3.30.40.10       64
cd00200          64
PS50293          62


## Output

In [11]:
# Write both result tables to the interim data directory, without the index column
outputs = [
    (filtered_transcript_sequence_lens, '../data/interim/aa_seqs.csv'),
    (translation_overlap_df, '../data/interim/protein_domains.csv'),
]
for out_df, out_path in outputs:
    out_df.to_csv(out_path, index=False)