In [16]:
import os

import pandas as pd

import preprocess
import uniprot_id_mapping

In [17]:
# Path of the tab-separated cache file that stores the UniProt accession ->
# gene name mapping between notebook runs.
id_mapping_file: str = 'uniprot_id_to_gene_name_mapping.tsv'

In [5]:
# Load the protein matrix; the first CSV column becomes the row index
# (these row labels are what later gets sent to the ID-mapping lookup).
df = pd.read_csv(
    filepath_or_buffer='mapped_protein_matrix_maxlfq_diann-normalised.csv',
    index_col=0,
)

In [22]:
# Cache guard for the ID mapping: reuse the saved table when it exists,
# otherwise build it from the matrix's row labels and save it for next time.
if os.path.isfile(id_mapping_file):
    # Cache hit: read back the table written by an earlier run.
    id_mapping_df = pd.read_csv(id_mapping_file, sep='\t')
else:
    # Cache miss: look up every row label via the project-local helper.
    # Author's note from the original run: 216 ids could not be mapped.
    id_mapping_df = uniprot_id_mapping.get_id_mapping(df.index.to_list())
    # Written without the index so the file round-trips through the
    # `read_csv` call in the cache-hit branch.
    id_mapping_df.to_csv(id_mapping_file, sep='\t', index=False)

In [23]:
# Preview the mapping table; its `from` / `to` columns are used below to
# relabel the protein matrix.
id_mapping_df

Unnamed: 0,from,to
0,Q9Y651,SOX21
1,P37108,SRP14
2,Q96JP5,ZFP91
3,Q9Y4H2,IRS2
4,P36578,RPL4
...,...,...
8527,Q6P2I3,FAHD2B
8528,P17098,ZNF8
8529,Q9P0U3,SENP1
8530,Q9BQB6,VKORC1


In [27]:
# Relabel the rows of the protein matrix from the mapping table's `from`
# values to its `to` values (gene names, per the cache file's name).
#
# `rename(index=<dict>)` is used instead of `df.index.map(<dict>)` because
# `Index.map` turns every label that is missing from the dict into NaN. The
# mapping step reported ids that could not be mapped, so those rows would all
# end up with the same NaN label and be written out with a blank label.
# `rename` leaves labels without a dict entry unchanged, so unmapped rows keep
# their original accession. It also makes this cell safe to re-run: labels
# that are already gene names are simply left alone instead of becoming NaN.
df = df.rename(index=id_mapping_df.set_index('from')['to'].to_dict())

In [29]:
# Apply the project's log transform to the protein matrix.
# NOTE: this rebinds `df` to its own transformed value, so running the cell a
# second time applies the transform again — run it once per kernel session.
df = df.pipe(preprocess.log_transform)

In [30]:
# Preprocessing chain on the log-transformed matrix. Each stage is bound to
# its own name so intermediate results stay available for inspection.
# NOTE(review): the helpers come from the project-local `preprocess` module;
# the per-step descriptions below are inferred from the helper names only —
# confirm against that module.
# Step 1: presumably median-centres the samples.
normalized_df = preprocess.median_centering_ms1(df)
# Step 2: presumably drops proteins observed in too few samples; the
# threshold is whatever the helper defaults to.
filtered_df = preprocess.filter_by_min_occurrence(normalized_df)
# Step 3: presumably fills remaining missing values with draws from a
# down-shifted normal distribution.
# NOTE(review): if this is random, no seed is set anywhere in this notebook,
# so reruns may not reproduce the same values — confirm.
imputed_df = preprocess.impute_normal_down_shift_distribution(filtered_df)
# Show the final table as the cell output.
imputed_df

Unnamed: 0,CTV-1,MLMA,TT,NCI-H1092,Control_HEK293T_lys,Control_HEK293T_std_H003,SU-DHL-8,NTERA-2-cl-D1,Control_HEK293T_std_H003.1,EM-2,...,BICR31.3,BICR31.4,Unnamed: 6973,Unnamed: 6974,Unnamed: 6975,BICR31.5,Unnamed: 6977,Unnamed: 6978,Unnamed: 6979,Unnamed: 6980
SRP14,2.298645,2.338599,1.721867,2.029757,1.960867,2.109911,2.151601,1.707170,2.103511,2.013924,...,1.297382,0.921296,1.452457,1.508915,1.655746,1.488786,0.841062,0.937060,0.920304,1.065251
ZFP91,1.072860,1.042830,0.203434,1.363873,1.048346,0.805044,1.034554,0.477473,0.388548,0.540722,...,0.704774,0.622666,0.893578,0.689930,0.785048,0.615969,0.899882,0.908681,1.006767,1.174233
RPL4,2.296261,2.016385,1.663299,1.627230,2.235037,2.821218,1.924171,2.052656,2.806736,2.237999,...,2.409620,2.177788,1.568922,1.718313,0.688867,2.277753,1.889455,0.795937,0.925275,1.130050
SAMD1,0.668612,0.894370,1.165999,1.358382,-0.357065,0.381408,1.394462,1.036027,0.255816,0.568356,...,0.734284,0.543014,0.814807,0.823931,0.767044,0.655698,0.892307,0.899478,1.042630,1.122388
CLPX,1.347261,1.496428,1.325176,1.285406,1.037526,1.128138,0.875044,1.350616,1.193732,1.309786,...,1.147567,1.142271,0.857510,1.222387,0.931952,1.045137,1.021856,0.955630,1.095432,1.072847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TUBB4B,2.233043,2.332621,1.404208,2.995614,2.554207,2.560884,2.551375,2.620994,2.631255,2.366648,...,1.690960,1.951911,1.800387,2.011762,1.604681,1.989926,1.611646,1.834929,0.950823,1.185434
TUBB2B,0.972872,1.252483,0.960300,2.314310,1.429504,1.955449,1.123166,2.115894,2.101597,0.629333,...,0.348273,0.759728,0.948508,1.349857,1.016428,0.766400,0.833810,0.963912,1.215517,1.023631
RAB15,1.811104,1.629472,2.360234,2.148633,1.699984,2.129533,1.382295,2.355661,2.179468,1.768763,...,2.138804,1.881134,1.778361,0.669049,0.761088,1.999713,0.833999,0.784092,0.952987,1.109433
EML6,2.039957,2.108112,0.426092,0.592975,2.064934,2.509514,0.555657,1.413265,2.487171,2.289021,...,0.395852,1.886288,0.883352,0.684148,0.733142,0.724872,1.093500,1.080747,1.085963,1.127159


In [31]:
# Persist the imputed matrix as a tab-separated file; the row index is
# written too (pandas' default), so the row labels end up in the first column.
imputed_df.to_csv(path_or_buf='protein_expression_table.tsv', sep='\t')