In [17]:
import pandas as pd
import numpy as np
import requests, sys
import tqdm
import pickle

In [2]:
# Load the RNA-seq expression table, indexed by its first column, and discard
# the free-text 'Gene.description' column. Rows that are zero in every sample
# are still present at this point and are filtered out in a later cell.
rna_seq = pd.read_csv(
    '../../Data/RawData/PPG_RNAseq_covariance_input_0419.tsv',
    sep='\t',
    index_col=0,
).drop(columns=['Gene.description'])

# Genes that already have a known gene-to-protein mapping. Only the first
# column of each mapping file is read.
am_mappings = pd.read_csv(
    '../../Data/ProcessedData/AM_SeqMappings.tsv', sep='\t', usecols=[0]
)
at2_mappings = pd.read_csv(
    '../../Data/ProcessedData/AT2_SeqMappings.tsv', sep='\t', usecols=[0]
)

In [3]:
# Path to the original PPG data table (GP_data.tsv).
# NOTE(review): this path is not read anywhere in the visible part of the notebook.
PPG_AM_AT2 = '../../Data/ProcessedData/GP_data.tsv'

In [4]:
# Boolean mask per gene: True when at least one expression column (every column
# except the 'Gene.names' and 'AvgChrs' annotations) holds a non-zero value.
present_index = (
    rna_seq[rna_seq.columns.difference(['Gene.names', 'AvgChrs'])]
    .ne(0)
    .any(axis=1)
)
# Keep only the genes that show a signal somewhere.
rna_seq = rna_seq.loc[present_index]
rna_seq

Unnamed: 0_level_0,Gene.names,AvgChrs,AT2_04M_F0,AT2_18M_F0,AT2_04M_F10,AT2_18M_F10,AM_04M_F0,AM_18M_F0,AM_04M_F10,AM_18M_F10
Row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSMUSG00000000001,Gnai3,2.675624,7.151432,7.134999,7.301802,7.126699,7.808586,7.735874,7.599455,7.866372
ENSMUSG00000000028,Cdc45,15.191392,1.347537,0.520289,1.752669,1.091367,2.649199,1.417394,3.991956,3.696447
ENSMUSG00000000056,Narf,10.993152,4.248040,4.163523,4.249110,4.224477,4.329426,3.781370,4.352047,3.913531
ENSMUSG00000000058,Cav2,5.115437,6.162987,5.953024,6.113755,5.937759,6.605884,7.103410,6.679218,6.755340
ENSMUSG00000000078,Klf6,12.048712,6.635019,6.649572,6.795728,6.626275,8.641117,8.631336,8.324152,8.720886
...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000104507,A430027H14Rik,0.968651,1.972572,2.223393,2.113003,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000104512,Gm37165,17.415867,0.000000,0.163523,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000104515,Gm37163,0.668781,0.000000,0.520289,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000104519,Gm37161,0.644188,0.000000,0.000000,0.654323,0.000000,0.000000,0.000000,0.000000,0.000000


In [5]:
# Gene names found in either the AT2 or the AM mapping already have a protein
# mapping, so collect the union of both name columns ...
mappend_genes = list(
    set(at2_mappings['Gene.names']).union(set(am_mappings['Gene.names']))
)
# ... and keep only the genes that are NOT in that union.
already_mapped = rna_seq['Gene.names'].isin(mappend_genes)
rna_seq = rna_seq.loc[~already_mapped]
rna_seq

Unnamed: 0_level_0,Gene.names,AvgChrs,AT2_04M_F0,AT2_18M_F0,AT2_04M_F10,AT2_18M_F10,AM_04M_F0,AM_18M_F0,AM_04M_F10,AM_18M_F10
Row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSMUSG00000000028,Cdc45,15.191392,1.347537,0.520289,1.752669,1.091367,2.649199,1.417394,3.991956,3.696447
ENSMUSG00000000056,Narf,10.993152,4.248040,4.163523,4.249110,4.224477,4.329426,3.781370,4.352047,3.913531
ENSMUSG00000000058,Cav2,5.115437,6.162987,5.953024,6.113755,5.937759,6.605884,7.103410,6.679218,6.755340
ENSMUSG00000000078,Klf6,12.048712,6.635019,6.649572,6.795728,6.626275,8.641117,8.631336,8.324152,8.720886
ENSMUSG00000000085,Scmh1,3.769722,5.245503,5.186867,5.240319,5.348280,3.119428,4.235043,2.679218,3.858996
...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000104507,A430027H14Rik,0.968651,1.972572,2.223393,2.113003,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000104512,Gm37165,17.415867,0.000000,0.163523,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000104515,Gm37163,0.668781,0.000000,0.520289,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000104519,Gm37161,0.644188,0.000000,0.000000,0.654323,0.000000,0.000000,0.000000,0.000000,0.000000


In [6]:
# Ensembl gene identifiers (the index of rna_seq) that still need protein sequences
rna_seq.index

Index(['ENSMUSG00000000028', 'ENSMUSG00000000056', 'ENSMUSG00000000058',
       'ENSMUSG00000000078', 'ENSMUSG00000000085', 'ENSMUSG00000000126',
       'ENSMUSG00000000127', 'ENSMUSG00000000131', 'ENSMUSG00000000134',
       'ENSMUSG00000000142',
       ...
       'ENSMUSG00000104444', 'ENSMUSG00000104445', 'ENSMUSG00000104448',
       'ENSMUSG00000104453', 'ENSMUSG00000104498', 'ENSMUSG00000104507',
       'ENSMUSG00000104512', 'ENSMUSG00000104515', 'ENSMUSG00000104519',
       'ENSMUSG00000104523'],
      dtype='object', name='Row.names', length=10475)

In [7]:
# Count the distinct identifiers in the index to check for duplicated gene IDs
len(set(rna_seq.index))

10475

In [8]:
# with open('gene_list.txt','w') as file:
#     for gene in rna_seq.index:
#         file.write(f'{gene}\n')

### Using the fetched data from BioMart
Get a list to retrieve protein sequences from UniProt

In [9]:
# Number of distinct 'Gene stable ID' values in the gene-mapping file (only the second column is read)
len(set(pd.read_csv('../../Data/missing_proteins/ppg_missing_protein_ensemble_genemappings.tsv',sep='\t',usecols=[1])['Gene stable ID']))

8635

In [10]:
# Read only the third column of the gene-mapping file
# (presumably the 'UniParc ID' column, judging by the displayed output — confirm against the file header)
missing_proteins = pd.read_csv('../../Data/missing_proteins/ppg_missing_protein_ensemble_genemappings.tsv',sep='\t',usecols=[2])

In [11]:
# Show only the rows of the single-column frame that actually carry a value.
missing_proteins[missing_proteins.notna().squeeze()]

Unnamed: 0,UniParc ID
0,UPI0000001A56
1,UPI00001EA23E
2,UPI00015AA08D
3,UPI00000275F5
4,UPI00000019C0
...,...
30154,UPI00000295E8
30155,UPI00053BD627
30156,UPI00053BD77E
30157,UPI0000029603


In [96]:
# with open('../../Data/missing_proteins/mising_ppg_proteins_uniparc.txt','w') as file:
#     for protein in missing_proteins[~missing_proteins.isna().squeeze()].values.squeeze():
#         file.write(f'{protein}\n')

In [12]:
# Load the full gene-mapping table, indexed by its second column (shown as 'Gene stable ID' in the output).
# drop_duplicates() compares column values only (the index is ignored), then
# rows without a UniParc ID are removed.
uniparc_mappings = (
    pd.read_csv(
        '../../Data/missing_proteins/ppg_missing_protein_ensemble_genemappings.tsv',
        sep='\t',
        index_col=[1],
    )
    .drop_duplicates()
    .dropna(subset=['UniParc ID'])
)

In [13]:
# There are still duplicated UniParc identifiers after dropping duplicate rows and removing NA
uniparc_mappings

Unnamed: 0_level_0,Gene name,UniParc ID,UniProtKB/Swiss-Prot ID,UniProtKB/TrEMBL ID
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUSG00000000028,Cdc45,UPI0000001A56,Q9Z1X9,Q3UI99
ENSMUSG00000000028,Cdc45,UPI00001EA23E,,F8WJ72
ENSMUSG00000000028,Cdc45,UPI00015AA08D,,D3Z0L5
ENSMUSG00000000056,Narf,UPI00000275F5,Q9CYQ7,
ENSMUSG00000000058,Cav2,UPI00000019C0,Q9WVC3,
...,...,...,...,...
ENSMUSG00000104063,Pcdhgb7,UPI00000295E8,,Q91XX3
ENSMUSG00000104213,Ighd,UPI00053BD627,,A0A0A6YW14
ENSMUSG00000104213,Ighd,UPI00053BD77E,,A0A0A6YXD7
ENSMUSG00000104346,Pcdhga3,UPI0000029603,,Q91XY5


In [14]:
# Number of distinct UniParc IDs among the remaining mapping rows
len(set(uniparc_mappings['UniParc ID']))

26333

In [15]:
# Keep only the first row for every UniParc ID.
# NOTE(review): de-duplicating on 'UniParc ID' alone keeps a protein only for
# the first gene that lists it; a later gene sharing the same protein loses
# that row — confirm this is intended.
uniparc_mappings = uniparc_mappings.drop_duplicates(subset='UniParc ID', keep='first')
uniparc_mappings

Unnamed: 0_level_0,Gene name,UniParc ID,UniProtKB/Swiss-Prot ID,UniProtKB/TrEMBL ID
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUSG00000000028,Cdc45,UPI0000001A56,Q9Z1X9,Q3UI99
ENSMUSG00000000028,Cdc45,UPI00001EA23E,,F8WJ72
ENSMUSG00000000028,Cdc45,UPI00015AA08D,,D3Z0L5
ENSMUSG00000000056,Narf,UPI00000275F5,Q9CYQ7,
ENSMUSG00000000058,Cav2,UPI00000019C0,Q9WVC3,
...,...,...,...,...
ENSMUSG00000104063,Pcdhgb7,UPI00000295E8,,Q91XX3
ENSMUSG00000104213,Ighd,UPI00053BD627,,A0A0A6YW14
ENSMUSG00000104213,Ighd,UPI00053BD77E,,A0A0A6YXD7
ENSMUSG00000104346,Pcdhga3,UPI0000029603,,Q91XY5


In [18]:
# Load the precomputed embedding table from disk.
# NOTE(review): pickle.load can execute arbitrary code, so only load this file if it comes from a trusted source.
# presumably a DataFrame indexed by UniParc ID (it is joined on 'UniParc ID' in a later cell) — confirm
with open('../../Data/missing_proteins/uniparc_embeddings.pkl', 'rb') as file:
    uniparc_embeddings = pickle.load(file)

In [19]:
# Number of distinct gene IDs (index values) left in uniparc_mappings
len(set(uniparc_mappings.index))

8625

In [20]:
# Attach the embedding columns by matching each row's 'UniParc ID' against the
# index of uniparc_embeddings, then record the length of every protein sequence.
uniparc_mappings = (
    uniparc_mappings
    .join(uniparc_embeddings, on='UniParc ID')
    .assign(seq_length=lambda joined: joined['protein_sequence'].str.len())
)
uniparc_mappings

Unnamed: 0_level_0,Gene name,UniParc ID,UniProtKB/Swiss-Prot ID,UniProtKB/TrEMBL ID,protein_sequence,avg_embedding,final_hidden,final_cell,seq_length
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSMUSG00000000028,Cdc45,UPI0000001A56,Q9Z1X9,Q3UI99,MFVTDFRKEFYETVHNQRVLLFVASDVDALCACKILQALFQCDHVQ...,"[-0.05150509, 0.09288624, -0.12802534, -0.9785...","[-0.043791275, 0.048134997, -0.34964657, -0.96...","[-0.15412268, 0.26008302, -0.52376455, -5.8684...",566
ENSMUSG00000000028,Cdc45,UPI00001EA23E,,F8WJ72,MFVTDFRKEFYETVHNQRVLLFVASDVDALCACKILQALFQCDHVQ...,"[-0.053644728, 0.09017045, -0.12686643, -0.980...","[-0.043771926, 0.048104938, -0.34865156, -0.96...","[-0.15414578, 0.2600205, -0.5224536, -5.86933,...",520
ENSMUSG00000000028,Cdc45,UPI00015AA08D,,D3Z0L5,MFVTDFRKEFYETVHNQRVLLFVASDVDALCACKILQALFQCDHVQ...,"[-0.040399004, 0.11343334, -0.09757666, -0.970...","[-0.02742713, 0.1427878, -0.03995195, -0.97798...","[-0.07800863, 0.5499704, -0.086833574, -8.6846...",136
ENSMUSG00000000056,Narf,UPI00000275F5,Q9CYQ7,,MKCEHCTRKECSKKSKTDDQENVSSDGAQPSDGASPAKESEEKGEF...,"[-0.075725555, 0.10475193, -0.09333034, -0.977...","[0.04253189, 0.10435928, -0.089505516, -0.9834...","[0.08490218, 0.5082729, -0.1949954, -7.1791, -...",462
ENSMUSG00000000058,Cav2,UPI00000019C0,Q9WVC3,,MGLETEKADVQLFMADDAYSHHSGVDYADPEKYVDSSHDRDPHQLN...,"[-0.047078904, 0.11453671, 0.027847176, -0.965...","[-0.0141451955, 0.08090255, 0.04514965, -0.980...","[-0.036824368, 0.3290622, 0.0523856, -3.024334...",162
...,...,...,...,...,...,...,...,...,...
ENSMUSG00000104063,Pcdhgb7,UPI00000295E8,,Q91XX3,MGGSSARRKRPGRPQVLFILLLPLFCPALGQPVRYSIPEELDRGSV...,"[0.071270004, 0.16919293, -0.15501624, -0.9290...","[-0.28734416, -0.027888788, -0.13889621, -0.90...","[-0.31934956, -0.16173685, -0.27824548, -8.316...",929
ENSMUSG00000104213,Ighd,UPI00053BD627,,A0A0A6YW14,XNEKGPDMFLLSECKAPEENEKINLGCLVIGSQPLKISWEPKKSSI...,"[-0.04842445, 0.06502486, -0.09176653, -0.9551...","[-0.2216764, 0.07647279, -0.218317, -0.9276922...","[-0.44761574, 0.4017731, -0.27147987, -6.66406...",291
ENSMUSG00000104213,Ighd,UPI00053BD77E,,A0A0A6YXD7,XNEKGPDMFLLSECKAPEENEKINLGCLVIGSQPLKISWEPKKSSI...,"[-0.05139191, 0.05688782, -0.11673052, -0.9590...","[-0.008530227, 0.074551344, -0.081188045, -0.9...","[-0.022237688, 0.24355307, -0.10157707, -4.088...",258
ENSMUSG00000104346,Pcdhga3,UPI0000029603,,Q91XY5,MIRLGQGTAGALLCALLGTLCAAGFRQIRYSVPEELDKGSFVGNIS...,"[0.06706869, 0.18330085, -0.14712377, -0.92589...","[-0.23818472, -0.0286825, -0.1442578, -0.90462...","[-0.2615678, -0.17263463, -0.2835333, -8.47174...",928


In [196]:
#uniparc_mappings.to_csv('../../Data/missing_proteins/uniparc_embedding_mapping.tsv',sep='\t')
# with open('../../Data/missing_proteins/uniparc_embedding_mapping.pkl', 'wb') as file:
#     pickle.dump(uniparc_mappings, file, protocol=pickle.HIGHEST_PROTOCOL)

### Using just the longest sequence for each gene

In [21]:
# For every gene keep the single row with the greatest seq_length:
#   1. move the gene ID out of the index so rows get a positional index,
#   2. order rows from longest to shortest sequence,
#   3. keep the first (longest) row seen for each gene,
#   4. restore the original row order and put the gene ID back as the index.
uniparc_mappings_longest = (
    uniparc_mappings
    .reset_index()
    .sort_values('seq_length', ascending=False)
    .drop_duplicates('Gene stable ID')
    .sort_index()
    .set_index('Gene stable ID')
)
print(uniparc_mappings_longest.shape)
uniparc_mappings_longest.head()

(8625, 9)


Unnamed: 0_level_0,Gene name,UniParc ID,UniProtKB/Swiss-Prot ID,UniProtKB/TrEMBL ID,protein_sequence,avg_embedding,final_hidden,final_cell,seq_length
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ENSMUSG00000000028,Cdc45,UPI0000001A56,Q9Z1X9,Q3UI99,MFVTDFRKEFYETVHNQRVLLFVASDVDALCACKILQALFQCDHVQ...,"[-0.05150509, 0.09288624, -0.12802534, -0.9785...","[-0.043791275, 0.048134997, -0.34964657, -0.96...","[-0.15412268, 0.26008302, -0.52376455, -5.8684...",566
ENSMUSG00000000056,Narf,UPI00000275F5,Q9CYQ7,,MKCEHCTRKECSKKSKTDDQENVSSDGAQPSDGASPAKESEEKGEF...,"[-0.075725555, 0.10475193, -0.09333034, -0.977...","[0.04253189, 0.10435928, -0.089505516, -0.9834...","[0.08490218, 0.5082729, -0.1949954, -7.1791, -...",462
ENSMUSG00000000058,Cav2,UPI00000019C0,Q9WVC3,,MGLETEKADVQLFMADDAYSHHSGVDYADPEKYVDSSHDRDPHQLN...,"[-0.047078904, 0.11453671, 0.027847176, -0.965...","[-0.0141451955, 0.08090255, 0.04514965, -0.980...","[-0.036824368, 0.3290622, 0.0523856, -3.024334...",162
ENSMUSG00000000078,Klf6,UPI00000EB051,,Q8BPQ2,MKLSPALPGTVSARTPDRSPPCFPDSEDCLFQPDMDVLPMCSIFQE...,"[-0.068638414, 0.087828696, -0.08027547, -0.91...","[-0.93870264, 0.6842919, -0.61148405, 0.006266...","[-1.7835155, 0.8387107, -1.1894798, 0.00626751...",318
ENSMUSG00000000085,Scmh1,UPI00000E6638,Q8K214,,MLVCYSVLACESLWDLPCSIMGSPLGHFTWDKYLKETCSVPAPVHC...,"[-0.008291963, 0.0670462, -0.09555821, -0.9706...","[-0.038280506, 0.0846687, -0.088760465, -0.979...","[-0.081338465, 0.35983515, -0.21546814, -6.956...",706


In [198]:
#uniparc_mappings_longest.to_csv('../../Data/missing_proteins/uniparc_embedding_mapping_longest.tsv',sep='\t')
# with open('../../Data/missing_proteins/uniparc_embedding_mapping_longest.pkl', 'wb') as file:
#     pickle.dump(uniparc_mappings_longest, file, protocol=pickle.HIGHEST_PROTOCOL)

In [83]:
# Somehow some genes were lost from the BioMart results -> they have no associated proteins

In [199]:
# Gene IDs present in rna_seq that have no row in uniparc_mappings_longest.
missing_genes_biomart = set(rna_seq.index).difference(set(uniparc_mappings_longest.index))

In [200]:
# How many genes in rna_seq have no entry in uniparc_mappings_longest
len(missing_genes_biomart)

1850

In [98]:
#missing_genes_biomart

{'ENSMUSG00000061331',
 'ENSMUSG00000086289',
 'ENSMUSG00000103364',
 'ENSMUSG00000045391',
 'ENSMUSG00000101013',
 'ENSMUSG00000103276',
 'ENSMUSG00000055968',
 'ENSMUSG00000085175',
 'ENSMUSG00000091479',
 'ENSMUSG00000086786',
 'ENSMUSG00000091192',
 'ENSMUSG00000097404',
 'ENSMUSG00000093553',
 'ENSMUSG00000096976',
 'ENSMUSG00000085707',
 'ENSMUSG00000071362',
 'ENSMUSG00000085438',
 'ENSMUSG00000102336',
 'ENSMUSG00000097448',
 'ENSMUSG00000097375',
 'ENSMUSG00000074292',
 'ENSMUSG00000057359',
 'ENSMUSG00000097330',
 'ENSMUSG00000100747',
 'ENSMUSG00000103009',
 'ENSMUSG00000097466',
 'ENSMUSG00000085977',
 'ENSMUSG00000103761',
 'ENSMUSG00000096947',
 'ENSMUSG00000098120',
 'ENSMUSG00000097861',
 'ENSMUSG00000086507',
 'ENSMUSG00000092564',
 'ENSMUSG00000101462',
 'ENSMUSG00000087535',
 'ENSMUSG00000054957',
 'ENSMUSG00000097383',
 'ENSMUSG00000097634',
 'ENSMUSG00000086680',
 'ENSMUSG00000103486',
 'ENSMUSG00000086347',
 'ENSMUSG00000074930',
 'ENSMUSG00000086890',
 'ENSMUSG00

In [143]:
# Expression columns of rna_seq to process; one feature table is built per entry
cell_lines = ['AT2_04M_F0', 'AT2_04M_F10', 'AT2_18M_F0', 'AT2_18M_F10', 'AM_04M_F0', 'AM_04M_F10', 'AM_18M_F0', 'AM_18M_F10']

In [233]:
# Build one feature table per cell line: expression value, protein ID, sequence
# length and the average embedding expanded to one numeric column per dimension.

# The expanded embedding frame does not depend on the cell line, so build it
# once here instead of rebuilding it on every loop iteration.
avg_embedding_columns = pd.DataFrame(
    uniparc_mappings_longest['avg_embedding'].values.tolist(),
    index=uniparc_mappings_longest.index,
)
gene_protein_info = uniparc_mappings_longest[['UniParc ID', 'seq_length']]

rna_missing_protein_u64embeddings = dict()

for cell in cell_lines:
    # Genes with a non-zero signal in this cell line, plus their annotations.
    expressed = rna_seq.loc[rna_seq[cell] != 0, ['Gene.names', 'AvgChrs', cell]]
    # Attach protein ID and sequence length by gene ID. dropna() removes every
    # row with any missing value, which discards genes without a mapped protein.
    with_protein = expressed.join(gene_protein_info).dropna()
    # The join introduces NaN (float) for unmatched genes; once those rows are
    # gone the lengths can be stored as integers again.
    with_protein['seq_length'] = with_protein['seq_length'].astype('int64')
    rna_missing_protein_u64embeddings[cell] = with_protein.join(avg_embedding_columns)

# Preview the table of the last cell line explicitly rather than relying on the
# loop variable still being set after the loop.
rna_missing_protein_u64embeddings[cell_lines[-1]]

Unnamed: 0_level_0,Gene.names,AvgChrs,AM_18M_F10,UniParc ID,seq_length,0,1,2,3,4,...,54,55,56,57,58,59,60,61,62,63
Row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000028,Cdc45,15.191392,3.696447,UPI0000001A56,566,-0.051505,0.092886,-0.128025,-0.978530,-0.048970,...,0.109792,0.017838,-0.021872,-0.024457,-0.006590,0.286516,-0.093648,-0.034798,0.342300,0.017359
ENSMUSG00000000056,Narf,10.993152,3.913531,UPI00000275F5,462,-0.075726,0.104752,-0.093330,-0.977975,-0.014816,...,0.136652,0.015657,-0.018869,-0.017616,-0.031201,0.307665,-0.099087,-0.043073,0.307467,0.001384
ENSMUSG00000000058,Cav2,5.115437,6.755340,UPI00000019C0,162,-0.047079,0.114537,0.027847,-0.965120,-0.013341,...,0.120900,0.017751,0.044818,-0.008024,-0.062441,0.394191,-0.095504,-0.033056,0.496792,0.000322
ENSMUSG00000000078,Klf6,12.048712,8.720886,UPI00000EB051,318,-0.068638,0.087829,-0.080275,-0.913204,-0.021360,...,0.195960,0.035324,0.076005,-0.015016,-0.077865,0.240104,-0.097236,-0.056610,0.237513,0.011398
ENSMUSG00000000085,Scmh1,3.769722,3.858996,UPI00000E6638,706,-0.008292,0.067046,-0.095558,-0.970626,-0.036182,...,0.155060,0.019779,0.104153,-0.067082,-0.044124,0.193846,-0.081212,-0.044682,0.305639,-0.008415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000102037,Bcl2a1a,8.713990,5.416032,UPI0000027C48,172,-0.027766,0.107253,-0.119918,-0.975637,-0.049565,...,0.090784,0.015742,-0.002449,-0.043287,-0.004694,0.395465,-0.116766,-0.038466,0.394528,0.013917
ENSMUSG00000102418,Sh2d1b1,0.871133,7.558523,UPI0000022991,132,-0.061424,0.101789,-0.106160,-0.974987,-0.033538,...,0.133532,0.014962,-0.016003,0.000221,-0.034554,0.346900,-0.062211,-0.034270,0.397579,0.004446
ENSMUSG00000102918,Pcdhgc3,17.417012,2.051641,UPI0000020FEB,934,0.100940,0.208059,-0.142799,-0.935833,-0.047764,...,0.126259,0.026332,0.083229,-0.023184,-0.034593,0.420188,-0.281215,-0.058276,0.543674,-0.031979
ENSMUSG00000103034,Gm8797,2.035935,1.111484,UPI0000195AD5,77,-0.042151,0.120810,-0.116392,-0.956629,-0.018843,...,0.083900,0.007147,-0.083582,-0.038512,-0.022944,0.606860,-0.185181,-0.062284,0.553100,0.009470


## Save missing protein genes feature tables

In [234]:
# Persist the dictionary of per-cell-line feature tables using the highest pickle protocol available
with open('../../Data/missing_proteins/rna_missing_protein_u64embeddings.pkl','wb') as file:
    pickle.dump(rna_missing_protein_u64embeddings,file,protocol=pickle.HIGHEST_PROTOCOL)

200	OK	Request was a success. Only process data from the service when you receive this code.

400	Bad Request	Occurs during exceptional circumstances, such as when the service is unable to find an ID. Check whether the response Content-Type or Accept header was JSON. If so, the JSON object is an exception hash with the message keyed under `error`.

In [6]:
# server = "https://rest.ensembl.org"
# missing_sequences = []
# rna_seq['ensemble_protein_name'] = np.nan
# rna_seq['protein_sequence'] = np.nan

# tqdm.tqdm._instances.clear()
# for gene in tqdm.tqdm(rna_seq.index):
#     ext = "/sequence/id/"+gene+"?type=protein;multiple_sequences=1"
#     r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
    
#     if not r.ok:
#         missing_sequences.append(gene+str(r.status_code))
#         continue
    
#     seqs = dict()
#     for line in r.content.decode("utf-8").split('\n'):
#         if '>' in line:
#             ens_prot = line[1::]
#             seqs[ens_prot] = ''
#         else:
#             seqs[ens_prot] += line

#     longest_protein = max(seqs, key=lambda k: len(seqs[k]))
#     rna_seq.loc[gene,['ensemble_protein_name','protein_sequence']] = [longest_protein,seqs[longest_protein]]
        
        
        
        
