In [1]:
import ensembl_rest
import numpy as np
import pandas as pd

In [11]:
ensembl_annotation = pd.read_csv('../data/custom/ensembl_annotation_trs_uniprot_20220429.csv', low_memory=False)

In [None]:
with open('../data/raw/nextprot_050222_missing_pe2_20220409.txt') as f:
    missing_ac_list = f.read().splitlines()

In [3]:
len(missing_ac_list)

1135

In [8]:
missing_df = pd.concat([ensembl_annotation[ensembl_annotation['uniprot_base']==ac] for ac in missing_ac_list])

unmapped = list(set(missing_ac_list).difference(missing_df['uniprot_base'].to_list()))
unmapped.sort()

with open('../reports/mapping/missing_ensembl_unmapped_20220411.txt', 'w') as f:
    for i in unmapped:
        f.write(f'{i}\n')

In [5]:
len(unmapped)

237

In [9]:
def get_length_aa(enst_list: list):
    length_aa = []
    enst_info = {}
    
    chunk_size = 100
    enst_chunks = (enst_list[pos:pos + chunk_size] for pos in range(0, len(enst_list), chunk_size))
    
    for i in enst_chunks:
        enst_info.update(ensembl_rest.lookup_post(species='human', params={'expand': True, 'ids': i}))
    
    for e in enst_list:
        try:
            length_aa.append(enst_info[e]['Translation']['length'])
        except TypeError:
            length_aa.append(np.nan)
        
    return length_aa

missing_df['protein_length_aa'] = get_length_aa(missing_df['ensembl_trs_id'].to_list())

In [14]:
missing_df = missing_df[['ensembl_gene_name', 'uniprot_isoform', 'protein_length_aa', 
                         'ensembl_trs_id', 'ensembl_is_canonical', 'trs_length_bp']]

missing_df.to_csv('../data/processed/missing_ensembl_mapping_all_20220411.csv', index=False)

In [13]:
missing_df

Unnamed: 0,ensembl_gene_name,uniprot_isoform,protein_length_aa,ensembl_trs_id,ensembl_is_canonical,trs_length_bp
214034,TRBV24-1,A0A075B6N3-1,115,ENST00000390397,True,381
214037,TRBV25-1,A0A075B6N4-1,114,ENST00000390398,True,381
213937,TRAV6,A0A075B6T7-1,132,ENST00000390428,True,404
213938,TRAV7,A0A075B6U4-1,112,ENST00000390429,True,337
213928,TRAV36DV7,A0A075B6V5-1,113,ENST00000390463,True,356
...,...,...,...,...,...,...
144711,OR51B4,Q9Y5P0-1,310,ENST00000380224,True,933
173939,RNF215,Q9Y6U7-1,377,ENST00000382363,True,2011
129869,MTRNR2L13,S4R3P1-1,24,ENST00000604093,True,1445
129867,MTRNR2L11,S4R3Y5-1,24,ENST00000604646,True,1552


In [2]:
line_names = ['A549', 'Caco2', 'HaCaT', 'Hek293T', 'HeLa', 'HepG2', 'Huh7', 'MCF7', 'Saos2', 'SKBR3', 'U2OS']

In [4]:
line_files = [pd.read_excel(f'../data/custom/va_lines/{i}.xlsx')
              .set_index('transcript_id', drop=True).iloc[:, [-1]] 
              for i in line_names]

In [5]:
lines_df = line_files[0].join(line_files[1:], how='outer')

In [6]:
lines_df.index.name= 'ensembl_trs_id'

In [7]:
lines_df = lines_df.fillna(0)

In [9]:
lines_df.to_csv('../data/processed/va_lines_expression_proteincoding_trs.csv')

In [13]:
lines_df = pd.read_csv('../data/processed/va_lines_expression_proteincoding_trs.csv', index_col=0)

In [14]:
lines_df

Unnamed: 0_level_0,A549,Caco2,HaCaT,HEK293T,HeLa,HepG2,Huh7,MCF7,SAOS2,SKBR3,U2OS
ensembl_trs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENST00000440075,0.256721,9.788971,0.407386,0.000000,0.000000,0.159880,0.065040,0.000000,0.000000,0.184198,0.000000
ENST00000398101,0.433563,0.367757,0.000000,0.058994,0.021996,0.000000,0.222245,0.066869,1.224826,1.930838,3.093246
ENST00000261219,0.175636,0.417713,0.674782,0.068071,0.000000,0.154386,0.116461,0.293679,0.791766,1.853371,0.643787
ENST00000537360,0.548286,0.265523,0.044893,0.038378,0.054987,0.070682,0.033041,0.000000,0.108896,0.211980,0.049342
ENST00000367067,1.626066,1.568156,0.634107,0.000000,27.164890,0.620189,0.172326,0.980916,2.233141,1.346990,1.843781
...,...,...,...,...,...,...,...,...,...,...,...
ENST00000674031,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.531759
ENST00000674194,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.039345
ENST00000675459,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.021859
ENST00000675842,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.007865


In [15]:
lines_df.columns = [f'{i.lower()}_tpm' for i in lines_df.columns]

In [16]:
lines_df

Unnamed: 0_level_0,a549_tpm,caco2_tpm,hacat_tpm,hek293t_tpm,hela_tpm,hepg2_tpm,huh7_tpm,mcf7_tpm,saos2_tpm,skbr3_tpm,u2os_tpm
ensembl_trs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ENST00000440075,0.256721,9.788971,0.407386,0.000000,0.000000,0.159880,0.065040,0.000000,0.000000,0.184198,0.000000
ENST00000398101,0.433563,0.367757,0.000000,0.058994,0.021996,0.000000,0.222245,0.066869,1.224826,1.930838,3.093246
ENST00000261219,0.175636,0.417713,0.674782,0.068071,0.000000,0.154386,0.116461,0.293679,0.791766,1.853371,0.643787
ENST00000537360,0.548286,0.265523,0.044893,0.038378,0.054987,0.070682,0.033041,0.000000,0.108896,0.211980,0.049342
ENST00000367067,1.626066,1.568156,0.634107,0.000000,27.164890,0.620189,0.172326,0.980916,2.233141,1.346990,1.843781
...,...,...,...,...,...,...,...,...,...,...,...
ENST00000674031,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.531759
ENST00000674194,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.039345
ENST00000675459,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.021859
ENST00000675842,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.007865


In [18]:
missing_df = pd.read_csv('../data/processed/missing_ensembl_mapping_all_20220411.csv')

In [19]:
missing_df = missing_df.join(lines_df, on='ensembl_trs_id', how='left').fillna(0)

In [21]:
missing_df.to_excel('../reports/mapping/missing_ensembl_expression_all_lines_20220607.xlsx', index=False)

In [2]:
hek293t_repl = [pd.read_csv(f'../data/custom/va_hek293t/HEK293T_{i}.sf', sep='\t')
                .set_index('Name', drop=True)
                .loc[:, ['TPM']]
                .rename({'TPM':f'HEK293T-{i}'}, axis=1) 
                for i in range(1,4)]

In [3]:
for i in hek293t_repl:
    i.index = i.index.map(lambda x: x[:x.index('.')])

In [4]:
hek293t_df = hek293t_repl[0].join(hek293t_repl[1:])

In [5]:
hek293t_df['TPM'] = hek293t_df.mean(axis=1)

In [6]:
hek293t_df.index.name = 'ensembl_trs_id'

In [7]:
hek293t_df.to_csv('../data/processed/va_hek293t_expression_all_trs.csv')

In [19]:
hek293t_df = pd.read_csv('../data/processed/va_hek293t_expression_all_trs.csv', index_col=0)

In [8]:
hek293t_repl[0].shape

(196722, 1)

In [9]:
hek293t_repl[1].shape

(196722, 1)

In [10]:
hek293t_repl[2].shape

(196722, 1)

In [41]:
missing_expression_good_sum = missing_df.groupby('uniprot_isoform')['hek293t_tpm'].sum() > 4
missing_expression_good_sum = missing_expression_good_sum[missing_expression_good_sum == True].index

missing_select_df = missing_df[(missing_df['protein_length_aa'] > 80) & (missing_df['uniprot_isoform'].isin(missing_expression_good_sum))]

missing_select_df = missing_select_df.sort_values(['ensembl_gene_name', 'uniprot_isoform'])

missing_select_df = missing_select_df.set_index(['ensembl_gene_name', 'uniprot_isoform', 'protein_length_aa', 'ensembl_trs_id'], drop=True)

missing_select_df = missing_select_df.loc[:, ['ensembl_is_canonical', 'trs_length_bp', 'hek293t_tpm']]

missing_select_df.to_excel('../reports/selection/missing_select_hek_20220607.xlsx')

In [42]:
missing_select_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ensembl_is_canonical,trs_length_bp,hek293t_tpm
ensembl_gene_name,uniprot_isoform,protein_length_aa,ensembl_trs_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AGAP9,Q5VTM2-2,658,ENST00000452145,True,2387,17.906937
ASAH2B,P0C7U1-1,165,ENST00000643851,False,5005,1.375991
ASAH2B,P0C7U1-1,165,ENST00000374006,False,632,3.907073
ASAH2B,P0C7U1-2,160,ENST00000374007,False,3715,3.989175
ASAH2B,P0C7U1-2,160,ENST00000647317,True,5145,2.919516
C1orf54,Q8WWF1-1,131,ENST00000369102,False,1251,0.369875
C1orf54,Q8WWF1-1,131,ENST00000369099,True,509,9.431817
CNPY1,Q3B7I2-1,92,ENST00000406197,False,560,19.87163
CNPY1,Q3B7I2-1,92,ENST00000321736,False,2378,8.023424
CTXN1,P60606-1,82,ENST00000318978,True,1237,9.340649


In [23]:
missing_df[(missing_df['protein_length_aa'] > 80) & (missing_df['hek293t_tpm'] > 4)]

Unnamed: 0,ensembl_gene_name,uniprot_isoform,protein_length_aa,ensembl_trs_id,ensembl_is_canonical,trs_length_bp,a549_tpm,caco2_tpm,hacat_tpm,hek293t_tpm,hela_tpm,hepg2_tpm,huh7_tpm,mcf7_tpm,saos2_tpm,skbr3_tpm,u2os_tpm
278,GOLGA8A,A7E2F4-1,631,ENST00000432566,False,4577,0.057092,1.461287,1.214195,5.411162,19.582925,1.519552,4.410374,0.944994,14.366107,0.995823,0.494476
279,GOLGA8A,A7E2F4-3,603,ENST00000359187,True,5777,0.0,1.638568,3.221302,5.787899,10.371453,2.729271,4.621757,0.244215,3.641828,0.0,3.331525
337,FAM72C,H0Y354-1,149,ENST00000584486,True,2395,6.043067,0.661529,5.208124,10.061995,7.204449,4.056322,0.540621,1.10115,28.592691,4.806708,9.141472
578,CTXN1,P60606-1,82,ENST00000318978,True,1237,20.65808,2.203673,6.683739,9.340649,19.286514,1.021377,5.117358,1.525792,9.222452,3.956297,57.701609
626,LRRC75B,Q2VPJ9-1,315,ENST00000318753,True,1233,3.238652,0.0,0.659198,4.676983,0.520652,2.624748,1.296385,0.069811,0.235021,0.0,1.438137
635,CNPY1,Q3B7I2-1,92,ENST00000406197,False,560,0.1013,0.0,0.0,19.87163,0.0,0.0,0.0,0.0,0.0,0.0,0.067358
636,CNPY1,Q3B7I2-1,92,ENST00000321736,False,2378,0.00615,0.067968,0.0,8.023424,0.0,0.0,0.0,0.012038,0.011481,0.0,0.062071
666,GAGE13,Q4V321-1,117,ENST00000612958,True,561,0.0,0.020393,0.0,5.47427,0.0,0.0,0.0,0.0,0.55824,0.0,6.362945
704,NBPF12,Q5TAG4-1,1457,ENST00000617931,True,7061,0.718152,0.752975,0.409628,4.27793,0.912415,1.059471,1.113211,2.444306,5.744109,3.52098,2.556022
705,NBPF12,Q5TAG4-1,1457,ENST00000617844,False,6326,0.994998,0.73618,0.774347,4.9932,0.807365,1.123835,1.519643,3.311536,8.377505,2.686668,2.402899


In [5]:
missing_select_df = pd.read_excel('../reports/selection/missing_select_hek_20220607.xlsx', index_col=[0,1,2,3])

In [9]:
missing_select_df.index.get_level_values('ensembl_gene_name').unique()

Index(['AGAP9', 'ASAH2B', 'C1orf54', 'CNPY1', 'CTXN1', 'CYB5RL', 'EOLA2',
       'FAM72C', 'GAGE13', 'GOLGA8A', 'LRRC75B', 'MOSMO', 'NBPF11', 'NBPF12',
       'NBPF15', 'NPIPA1', 'PPM1N', 'RNF215', 'TLCD5', 'TMEM81', 'TSPAN17'],
      dtype='object', name='ensembl_gene_name')

In [13]:
missing_select_df.index['ensembl_gene_name'].unique()

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
all_trs = ensembl_annotation[ensembl_annotation['ensembl_gene_name'].isin(missing_select_df.index.get_level_values('ensembl_gene_name').unique())]

In [20]:
all_trs['hek293_tpm'] = all_trs['ensembl_trs_id'].map(hek293t_df['TPM'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_trs['hek293_tpm'] = all_trs['ensembl_trs_id'].map(hek293t_df['TPM'])


In [23]:
all_trs = all_trs.drop(['ensembl_gene_id', 'trs_length_bp', 'ensembl_protein_id', 'uniprot_base'], axis=1)

In [24]:
all_trs

Unnamed: 0,ensembl_gene_name,ensembl_trs_id,ensembl_is_canonical,trs_type,uniprot_isoform,hek293_tpm
5288,AGAP9,ENST00000452145,True,protein_coding,Q5VTM2-2,17.906937
13420,ASAH2B,ENST00000643851,False,protein_coding,P0C7U1-1,1.375991
13421,ASAH2B,ENST00000374006,False,protein_coding,P0C7U1-1,3.907073
13422,ASAH2B,ENST00000374007,False,protein_coding,P0C7U1-2,3.989175
13423,ASAH2B,ENST00000647317,True,protein_coding,P0C7U1-2,2.919516
...,...,...,...,...,...,...
216480,TSPAN17,ENST00000514705,False,retained_intron,,2.486381
216481,TSPAN17,ENST00000503030,False,nonsense_mediated_decay,,0.000000
216482,TSPAN17,ENST00000504168,False,protein_coding,,0.483052
216483,TSPAN17,ENST00000503045,False,protein_coding,,1.833010


In [25]:
def get_enst_name(enst_list: list):
    length_aa = []
    enst_info = {}
    
    chunk_size = 100
    enst_chunks = (enst_list[pos:pos + chunk_size] for pos in range(0, len(enst_list), chunk_size))
    
    for i in enst_chunks:
        enst_info.update(ensembl_rest.lookup_post(species='human', params={'expand': True, 'ids': i}))
    
    for e in enst_list:
        try:
            length_aa.append(enst_info[e]['display_name'])
        except TypeError:
            length_aa.append(np.nan)
        
    return length_aa

In [26]:
all_trs['ensembl_trs_name'] = get_enst_name(all_trs['ensembl_trs_id'].to_list())

In [29]:
all_trs = all_trs[all_trs.columns[:2].to_list() + ['ensembl_trs_name'] + all_trs.columns[2:-1].to_list()]

In [36]:
all_trs = all_trs.set_index(['ensembl_gene_name', 'ensembl_trs_id'], drop=True)

KeyError: "None of ['ensembl_gene_name', 'ensembl_trs_id'] are in the columns"

In [38]:
all_trs.to_excel('../reports/selection/missing_select_hek_all_trs_20220607.xlsx')

In [35]:
all_trs

Unnamed: 0_level_0,Unnamed: 1_level_0,ensembl_trs_name,ensembl_is_canonical,trs_type,uniprot_isoform,hek293_tpm
ensembl_gene_name,ensembl_trs_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AGAP9,ENST00000452145,AGAP9-201,True,protein_coding,Q5VTM2-2,17.906937
ASAH2B,ENST00000643851,ASAH2B-205,False,protein_coding,P0C7U1-1,1.375991
ASAH2B,ENST00000374006,ASAH2B-201,False,protein_coding,P0C7U1-1,3.907073
ASAH2B,ENST00000374007,ASAH2B-202,False,protein_coding,P0C7U1-2,3.989175
ASAH2B,ENST00000647317,ASAH2B-207,True,protein_coding,P0C7U1-2,2.919516
...,...,...,...,...,...,...
TSPAN17,ENST00000514705,TSPAN17-208,False,retained_intron,,2.486381
TSPAN17,ENST00000503030,TSPAN17-203,False,nonsense_mediated_decay,,0.000000
TSPAN17,ENST00000504168,TSPAN17-205,False,protein_coding,,0.483052
TSPAN17,ENST00000503045,TSPAN17-204,False,protein_coding,,1.833010


In [39]:
class GenRange:
    def __init__(self, start, stop):
        self.start = start
        self.stop = stop

In [None]:
class GenRangeAgg:
    def __init__(self, )