# Get Protein Sequences

Get the entire amino acid sequence for all genes targeted in endogenous datasets.

In [1]:
import requests
import sys
from datasets import dataset_list
from tqdm import tqdm
import pandas as pd
import json
import numpy as np

In [13]:
def post_sequence_ids(ensembl_ids, data_args=None, timeout=60):
    """Take a list of Ensembl IDs and return their sequences.

    Sends one POST request to the Ensembl REST ``sequence/id`` endpoint with
    the IDs in the JSON body.

    :param ensembl_ids: list of str (any iterable of str is accepted)
    :param data_args: dict of extra fields for the JSON request body;
        defaults to ``{"type": "protein"}``. The dict is copied and never
        modified.
    :param timeout: seconds passed to ``requests`` as the request timeout
    :return: DataFrame built from the decoded JSON response
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    # Work on a copy: adding "ids" to the caller's dict would leak one
    # batch's IDs into whatever the caller does with that dict afterwards.
    payload = {"type": "protein"} if data_args is None else dict(data_args)
    # list() lets callers pass a NumPy array chunk, which json.dumps cannot
    # serialise directly.
    payload["ids"] = list(ensembl_ids)
    headers = {"content-type": "application/json", "Accept": "application/json"}
    server = "https://rest.ensembl.org"
    ext = "/sequence/id/"
    r = requests.post(server + ext, headers=headers,
                      data=json.dumps(payload), timeout=timeout)
    # raise_for_status() raises for every response where r.ok is False, so no
    # separate check (or sys.exit fallback) is needed.
    r.raise_for_status()
    return pd.DataFrame(r.json())


def get_translation_overlap(ensembl_id, data_args=None, timeout=60):
    """Get the features overlapping a translation from the Ensembl REST API.

    Sends a GET request to ``overlap/translation/<ensembl_id>``.

    :param ensembl_id: str, ID appended to the endpoint path
    :param data_args: dict of query-string parameters for the request;
        defaults to no parameters
    :param timeout: seconds passed to ``requests`` as the request timeout
    :return: DataFrame built from the decoded JSON response
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    if data_args is None:
        data_args = {}
    headers = {'content-type': 'application/json'}
    server = "https://rest.ensembl.org/"
    ext = 'overlap/translation/' + ensembl_id
    # Options of a GET request belong in the query string (params=); sending
    # them as a request body (data=) does not put them in the URL.
    r = requests.get(server + ext, headers=headers,
                     params=data_args, timeout=timeout)
    # raise_for_status() raises for every response where r.ok is False, so no
    # separate check (or sys.exit fallback) is needed.
    r.raise_for_status()
    return pd.DataFrame(r.json())

In [10]:
import requests, sys

# Example request: transcript_variation features of type missense_variant
# overlapping the translation ENSP00000288602.
server = "https://rest.ensembl.org"
ext = "/overlap/translation/ENSP00000288602?feature=transcript_variation;type=missense_variant"

# A timeout keeps the cell from hanging indefinitely if the server stalls.
r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}, timeout=60)

# raise_for_status() raises HTTPError for exactly the responses where r.ok is
# False, so the former `if not r.ok` guard and its unreachable sys.exit()
# are not needed.
r.raise_for_status()

decoded = r.json()
print(repr(decoded))


[{'polyphen': 0.156, 'type': 'missense_variant', 'residues': 'H/R', 'start': 806, 'clinical_significance': [], 'feature_type': 'transcript_variation', 'allele': 'T/C', 'minor_allele_frequency': None, 'translation': 'ENSP00000288602', 'codons': 'cAc/cGc', 'seq_region_name': 'ENSP00000288602', 'id': 'rs1208461159', 'end': 806, 'Parent': 'ENST00000288602', 'sift': 0.02}, {'codons': 'Gat/Aat', 'minor_allele_frequency': None, 'translation': 'ENSP00000288602', 'allele': 'C/T', 'feature_type': 'transcript_variation', 'clinical_significance': ['uncertain significance'], 'start': 717, 'polyphen': 0.341, 'type': 'missense_variant', 'residues': 'D/N', 'sift': 0.02, 'Parent': 'ENST00000288602', 'end': 717, 'seq_region_name': 'ENSP00000288602', 'id': 'rs886041260'}, {'sift': 0, 'end': 588, 'seq_region_name': 'ENSP00000288602', 'id': 'rs1562954580', 'Parent': 'ENST00000288602', 'clinical_significance': ['uncertain significance'], 'feature_type': 'transcript_variation', 'start': 588, 'codons': 'tTt/t

In [3]:
# Keep only the datasets whose `endogenous` attribute is truthy.
data_list = [ds for ds in dataset_list if ds.endogenous]

# For each retained dataset: load it, call set_sgrnas(), then collect the
# result of get_designs().
design_list = []
for ds in tqdm(data_list):
    ds.load_data()
    ds.set_sgrnas()
    designs = ds.get_designs()
    design_list.append(designs)

100%|██████████| 8/8 [00:24<00:00,  3.09s/it]


In [4]:
# Stack the designs collected from every dataset and drop duplicated rows.
design_df = pd.concat(design_list).drop_duplicates()
# Keep only the text before the first '.' of each 'Target Transcript'
# (presumably this strips a version suffix -- confirm against the data).
transcript_bases = design_df['Target Transcript'].str.split('.', expand=True)[0]
transcripts = transcript_bases.unique()
len(transcripts)

2998

In [5]:
# Split the transcript IDs into near-equal batches of at most 50 IDs and
# request the sequences one batch at a time.
n_chunks = np.ceil(len(transcripts)/50)
transcipt_sequence_list = []
for chunk_transcripts in tqdm(np.array_split(transcripts, n_chunks)):
    chunk_ids = list(chunk_transcripts)
    transcipt_sequence_list.append(post_sequence_ids(chunk_ids))

100%|██████████| 60/60 [05:12<00:00,  5.21s/it]


The transcripts with no returned sequence appear to be primarily outdated transcript IDs.

In [6]:
# Combine the per-batch frames into one table with a fresh index; the 'query'
# column is renamed to 'Transcript Base'.
transcript_sequence_df = (
    pd.concat(transcipt_sequence_list)
    .reset_index(drop=True)
    .rename(columns={'query': 'Transcript Base'})
)
# Report every requested transcript that is absent from 'Transcript Base'.
transcript_series = pd.Series(transcripts)
has_sequence = transcript_series.isin(transcript_sequence_df['Transcript Base'])
missing_sequences = transcript_series[~has_sequence]
print('Missing: ' + ', '.join(missing_sequences))


Missing: ENST00000611665, ENST00000622530, ENST00000368563, ENST00000377815, ENST00000618014, ENST00000424325, ENST00000650726, ENST00000572844, ENST00000344894, ENST00000355883, ENST00000648169, ENST00000449977


## Get Protein Domains

Get all annotated protein domains based on protein ID. Information about the annotations can be found here: http://m.ensembl.org/Help/View?id=178

In [17]:
# Request the overlapping features once per distinct value of the 'id' column.
translation_overlap_list = []
protein_ids = transcript_sequence_df['id'].unique()
for protein_id in tqdm(protein_ids):
    overlap_df = get_translation_overlap(protein_id)
    translation_overlap_list.append(overlap_df)

100%|██████████| 2986/2986 [38:36<00:00,  1.29it/s] 


In [19]:
# Combine the per-protein frames into one table with a fresh index, then
# rename 'Parent' to 'Transcript Base'.
combined_overlaps = pd.concat(translation_overlap_list).reset_index(drop=True)
translation_overlap_df = combined_overlaps.rename(columns={'Parent': 'Transcript Base'})
# Number of rows per value of the 'type' column.
translation_overlap_df['type'].value_counts()

sifts               18687
Seg                  9515
MobiDBLite           7753
Pfam                 5967
PANTHER              5309
Gene3D               5049
SuperFamily          3898
Smart                3772
Prosite_profiles     3111
PRINTS               2855
CDD                  2321
ncoils               2178
Prosite_patterns     1308
TMHMM                 539
TIGRfam               429
PIRSF                 423
HAMAP                 285
SignalP                56
SFLD                   46
Name: type, dtype: int64

In [21]:
# Count the rows per value of the 'id' column and show the 50 most frequent.
domain_id_counts = translation_overlap_df['id'].value_counts()
domain_id_counts.head(50)

seg            9515
mobidb-lite    7753
Coil           2178
SM00320         854
TMhelix         539
PF00400         450
PS50082         435
3.40.50.300     402
SSF52540        383
SM00355         285
2.130.10.10     233
PS50157         226
SSF48371        226
3.30.160.60     220
PS00028         216
PR00320         180
SM00028         168
PS00678         157
SSF57667        153
SSF50978        149
PS50294         147
3.30.70.330     145
PS50005         139
SSF54928        136
SM00360         133
SM00386         132
PS50102         130
PF00076         128
1.25.10.10      122
1.25.40.10      117
SSF48452        114
PF00096         108
2.40.50.140     102
SM00382         101
SSF50249        100
PS51194          93
PS51192          93
SM00490          93
SM00487          91
PF00271          89
PS51450          83
PR00315          81
cd00009          71
SSF46785         69
PF00270          67
cd00200          65
SSF56112         65
3.30.40.10       64
PR00304          60
PR00301          58


## Output

In [24]:
# Write both tables as CSV files without the index column.
csv_outputs = [
    (transcript_sequence_df, '../data/interim/aa_seqs.csv'),
    (translation_overlap_df, '../data/interim/protein_domains.csv'),
]
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)