In [1]:
import pickle
import numpy as np
import pandas as pd
import sys

#Location of ppg data class
sys.path.append('../Utils')
from data_class import GPY_data

#Directory that holds every processed PPG input used by this notebook.
#All paths below are derived from it so the data location is set in one place.
PPG_PROCESSED_DIR = '../../Data/ProcessedData'

#Protein names (read line by line further down)
PPG_PROT_NAMES = PPG_PROCESSED_DIR + '/uniprot.txt'

#unirep 64 embeddings
PPG_EMBEDDINGS = PPG_PROCESSED_DIR + '/protein_embeddings/unirep64_ppg.tsv'

#PPG Original Data
PPG_AM_AT2 = PPG_PROCESSED_DIR + '/GP_data.tsv'

#AM and AT2 Gene to Protein Mappings
PPG_GENE2PROTEIN_AM = PPG_PROCESSED_DIR + '/AM_SeqMappings.tsv'
PPG_GENE2PROTEIN_AT2 = PPG_PROCESSED_DIR + '/AT2_SeqMappings.tsv'

#### Filtered data using the GPY_data class
* Removes zero rows, etc.; the attributes it produces are described below the next cell

In [2]:
#Column groups handed to GPY_data: positional/length features and the eight
#cell conditions (four AT2, four AM)
features = ['AvgChrs', 'NormPosition', 'Gene Length']
AT2 = ['AT2_04M_F0', 'AT2_04M_F10', 'AT2_18M_F0', 'AT2_18M_F10']
AM = ['AM_04M_F0', 'AM_04M_F10', 'AM_18M_F0', 'AM_18M_F10']
cell_lines = AT2 + AM

#Read the tab-separated table using its second column as the index, then
#discard the two annotation columns that are not used downstream
gp_data = (
    pd.read_csv(PPG_AM_AT2, sep='\t', index_col=1)
    .drop(columns=['Row.names', 'Gene.description'])
)

#Normalizes gene length (per the original author's note)
data = GPY_data(gp_data, features, cell_lines)

* self.rna = {} #Dictionary of dataframes, one per cell condition, with rows for genes with nonzero mRNA levels
* self.rna_chrm = {} #Dictionary of dataframes filtered for each chromosome; contains all genes with nonzero mRNA
* self.rna_protein = {} #Dictionary of dataframes, one per cell condition, with rows for genes with nonzero mRNA and protein levels
* self.rna_protein_chrm = {} #Dictionary of dataframes filtered for each chromosome; contains only genes with nonzero mRNA and protein

#### List of protein names

In [3]:
#Read the whole names file and split it into one list entry per line
with open(PPG_PROT_NAMES, mode='r') as names_file:
    names_text = names_file.read()
ppg_prot_names = names_text.splitlines()

#### Gene to protein mappings for AT2 and AM cell lines

In [4]:
#Both mapping files are read the same way: tab-separated, keeping only file
#columns 0, 2 and 4, with column 0 used as the index
g2p_read_options = dict(sep='\t', usecols=[0, 2, 4], index_col=0)

AM_g2p = pd.read_csv(PPG_GENE2PROTEIN_AM, **g2p_read_options)
AT2_g2p = pd.read_csv(PPG_GENE2PROTEIN_AT2, **g2p_read_options)

In [5]:
#Sanity check: dimensions and first rows of the AM gene->protein mapping
print(AM_g2p.shape)
AM_g2p.head()

(4017, 2)


Unnamed: 0_level_0,Majority.protein.IDs,ProteinLength
Gene.names,Unnamed: 1_level_1,Unnamed: 2_level_1
Rhog,P84096,191
Atp8a1,P70704,1164
Pecr,Q99MZ7,303
Pon3,Q62087,354
Aars,Q8BGQ7,968


In [6]:
#Sanity check: dimensions and first rows of the AT2 gene->protein mapping
print(AT2_g2p.shape)
AT2_g2p.head()

(3866, 2)


Unnamed: 0_level_0,Majority.protein.IDs,ProteinLength
Gene.names,Unnamed: 1_level_1,Unnamed: 2_level_1
Rhog,P84096,191
Pign,Q9R1S3,931
Atp8a1,P70704,1164
Pecr,Q99MZ7,303
Pon3,Q62087,354


#### Unirep protein embeddings64

In [7]:
#The embedding file has no header row, so pandas numbers the columns itself
unirep_embeddings64 = pd.read_csv(PPG_EMBEDDINGS,sep='\t',header=None)
#Label each embedding row with the protein name found at the same position in
#the names list; pandas raises if the two lengths differ.
#NOTE(review): this assumes the TSV rows are in the same order as uniprot.txt - confirm
unirep_embeddings64.index=ppg_prot_names
#Sanity check: dimensions and first rows of the labelled embedding table
print(unirep_embeddings64.shape)
unirep_embeddings64.head()

(4870, 64)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
Q9JKL4,0.023904,0.130526,-0.056297,-0.982801,-0.017489,-0.155955,0.057548,0.160403,0.319312,-0.08157,...,0.118616,0.012999,0.057938,0.01019,-0.039859,0.363125,-0.041358,-0.041479,0.370437,-0.015415
Q80YW5,-0.016018,0.125149,-0.093761,-0.983654,-0.068512,-0.118433,0.051009,0.113049,0.275472,-0.173638,...,0.147999,0.027418,0.142981,-0.081006,0.000104,0.333871,-0.126173,-0.015118,0.185681,0.040073
Q8JZM0,-0.034827,0.108697,-0.126441,-0.969673,-0.032481,-0.133307,0.074118,0.197836,0.384059,-0.122462,...,0.09114,0.010459,-0.019975,-0.059308,-0.03297,0.32716,-0.138838,-0.041891,0.390695,-0.004244
Q9D1J1,-0.007833,0.135641,-0.125404,-0.962004,-0.026745,-0.269026,0.055999,0.1879,0.313063,-0.320851,...,0.133229,0.007293,0.050058,-0.046334,-0.022446,0.351571,-0.224647,-0.040777,0.39051,-0.029492
P97429,-0.068235,0.112498,-0.157972,-0.96932,-0.034266,-0.064829,0.08006,0.1526,0.414578,-0.092778,...,0.080221,0.017138,-0.062185,-0.052695,0.033959,0.389016,-0.195046,-0.022244,0.384892,0.008379


Once all the reference dataframes are loaded, bringing the information together is a simple dataframe merge. The information needed is:
* Gene to protein mapping for specific cell lines
* All the different mRNA and protein AUC tables
* Unirep protein embeddings

#### Example dataframe merge

In [8]:
#Starting point of the example: the mRNA + protein table for one AT2 condition
print(data.rna_protein['AT2_04M_F0'].shape)
data.rna_protein['AT2_04M_F0'].head()

(3541, 5)


Unnamed: 0_level_0,AvgChrs,NormPosition,Gene Length,AT2_04M_F0,AT2_04M_F0_P
Gene.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gnai3,2.675624,0.675624,15.246221,7.151432,25.38975
Cox5a,8.46171,0.46171,13.445015,5.426985,29.67618
Dlat,8.406494,0.406494,14.618099,5.49682,26.98208
Gpr107,1.171243,0.171243,16.020177,6.244449,22.90228
Trim25,10.729096,0.729096,14.352388,6.358683,24.415022


In [9]:
#Example step 1: attach the AT2 gene->protein mapping; both frames are joined on their index
pd.merge(
    data.rna_protein['AT2_04M_F0'],
    AT2_g2p,
    left_index=True,
    right_index=True,
).head()

Unnamed: 0_level_0,AvgChrs,NormPosition,Gene Length,AT2_04M_F0,AT2_04M_F0_P,Majority.protein.IDs,ProteinLength
Gene.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Gnai3,2.675624,0.675624,15.246221,7.151432,25.38975,Q9DC51,354
Cox5a,8.46171,0.46171,13.445015,5.426985,29.67618,P12787,146
Dlat,8.406494,0.406494,14.618099,5.49682,26.98208,Q8BMF4,642
Gpr107,1.171243,0.171243,16.020177,6.244449,22.90228,Q8BUV8,551
Trim25,10.729096,0.729096,14.352388,6.358683,24.415022,Q61510,634


In [10]:
#Example step 2: after attaching the mapping, look up each row's protein ID
#in the index of the embedding table to append the 64 embedding columns
(
    data.rna_protein['AT2_04M_F0']
    .merge(AT2_g2p, left_index=True, right_index=True)
    .merge(unirep_embeddings64, left_on='Majority.protein.IDs', right_index=True)
    .head()
)

Unnamed: 0_level_0,AvgChrs,NormPosition,Gene Length,AT2_04M_F0,AT2_04M_F0_P,Majority.protein.IDs,ProteinLength,0,1,2,...,54,55,56,57,58,59,60,61,62,63
Gene.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gnai3,2.675624,0.675624,15.246221,7.151432,25.38975,Q9DC51,354,-0.123134,0.126303,-0.092475,...,0.137047,0.008167,-0.088351,-0.018563,-0.036717,0.443771,-0.123508,-0.03699,0.424119,0.014253
Cox5a,8.46171,0.46171,13.445015,5.426985,29.67618,P12787,146,0.01435,0.121865,-0.062778,...,0.11858,0.009217,0.136575,-0.059452,0.019777,0.42325,-0.06335,-0.025512,0.384949,0.015145
Dlat,8.406494,0.406494,14.618099,5.49682,26.98208,Q8BMF4,642,-0.026584,0.109657,-0.081645,...,0.138321,0.016578,0.122076,-0.143501,0.013228,0.302376,-0.168613,-0.036329,0.351341,-0.014426
Gpr107,1.171243,0.171243,16.020177,6.244449,22.90228,Q8BUV8,551,-0.02733,0.104179,0.021283,...,0.127384,0.011998,0.039601,-0.03098,-0.080105,0.244611,-0.092342,-0.039188,0.482943,-0.018493
Trim25,10.729096,0.729096,14.352388,6.358683,24.415022,Q61510,634,-0.071294,0.072918,-0.114657,...,0.156678,0.041194,0.020387,-0.059393,-0.025968,0.263505,-0.113366,-0.038602,0.264744,0.012638


### Actual Data Frame Merge and undo log2(x+1) transformation

In [11]:
#Build one frame per cell condition that joins the expression table with the
#gene->protein mapping of its cell type and the UniRep embedding of that protein.
#Result: dict keyed by cell condition; each frame is indexed by
#(gene name, protein ID, cell condition) and holds AvgChrs, mRNA_TMM,
#ProteinAUC, ProteinLength and the embedding columns.
rna_protein_embeddings = {}
for cell in cell_lines:
    #The merge logic is identical for both cell types; only the mapping table differs
    g2p = AT2_g2p if cell.startswith('AT2') else AM_g2p

    #Both merges use the pandas default (inner join), so a gene without a
    #mapping entry or without an embedding is dropped from the result
    merged = (
        data.rna_protein[cell]
        .merge(g2p, left_index=True, right_index=True)
        .merge(unirep_embeddings64, left_on='Majority.protein.IDs', right_index=True)
        #Drop unused columns, keep AvgChrs
        .drop(columns=['NormPosition', 'Gene Length'])
    )

    #Undo the log2(x+1) transformation on the mRNA and protein columns
    quant_cols = [cell, cell + '_P']
    merged.loc[:, quant_cols] = 2 ** merged.loc[:, quant_cols] - 1

    #Tag every row with its cell condition, then move the protein ID and the
    #condition into the index next to the gene name
    merged['cell'] = cell
    merged = merged.set_index(['Majority.protein.IDs', 'cell'], append=True)

    #Rename mRNA and Protein quantification columns so that every condition
    #ends up with the same column names
    rna_protein_embeddings[cell] = merged.rename(
        columns={cell: 'mRNA_TMM', cell + '_P': 'ProteinAUC'}
    )

In [12]:
#Sanity check: dimensions and first rows of the merged frame for one AT2 condition
print(rna_protein_embeddings['AT2_04M_F0'].shape)
rna_protein_embeddings['AT2_04M_F0'].head()

(3541, 68)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AvgChrs,mRNA_TMM,ProteinAUC,ProteinLength,0,1,2,3,4,5,...,54,55,56,57,58,59,60,61,62,63
Gene.names,Majority.protein.IDs,cell,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Gnai3,Q9DC51,AT2_04M_F0,2.675624,141.165966,43961890.0,354,-0.123134,0.126303,-0.092475,-0.969833,-0.00582,-0.057672,...,0.137047,0.008167,-0.088351,-0.018563,-0.036717,0.443771,-0.123508,-0.03699,0.424119,0.014253
Cox5a,P12787,AT2_04M_F0,8.46171,42.02146,857867900.0,146,0.01435,0.121865,-0.062778,-0.984908,-0.017123,-0.065598,...,0.11858,0.009217,0.136575,-0.059452,0.019777,0.42325,-0.06335,-0.025512,0.384949,0.015145
Dlat,Q8BMF4,AT2_04M_F0,8.406494,44.155177,132560900.0,642,-0.026584,0.109657,-0.081645,-0.963428,-0.030461,-0.162972,...,0.138321,0.016578,0.122076,-0.143501,0.013228,0.302376,-0.168613,-0.036329,0.351341,-0.014426
Gpr107,Q8BUV8,AT2_04M_F0,1.171243,74.816978,7839224.0,551,-0.02733,0.104179,0.021283,-0.953025,-0.020908,-0.239551,...,0.127384,0.011998,0.039601,-0.03098,-0.080105,0.244611,-0.092342,-0.039188,0.482943,-0.018493
Trim25,Q61510,AT2_04M_F0,10.729096,81.064321,22369380.0,634,-0.071294,0.072918,-0.114657,-0.975195,-0.041038,-0.088154,...,0.156678,0.041194,0.020387,-0.059393,-0.025968,0.263505,-0.113366,-0.038602,0.264744,0.012638


### Python 3 format

In [35]:
# with open('../../Data/ProcessedData/protein_embeddings/rna_protein_u64embeddings.pkl','wb') as file:
#     pickle.dump(rna_protein_embeddings,file,protocol=pickle.HIGHEST_PROTOCOL)

### Python 2 format

In [14]:
#Write the per-condition dictionary with pickle protocol 2, the newest
#protocol a Python 2 interpreter can read
python2_pickle_path = '../../Data/ProcessedData/protein_embeddings/rna_protein_u64embeddings_python2.pkl'
with open(python2_pickle_path, mode='wb') as file:
    pickle.dump(rna_protein_embeddings, file, 2)

#### Merge all the data into a single dataframe to create train/test splits

In [36]:
#Stack the per-condition frames into one table. After the renaming done when
#they were built, all frames share the same columns and index levels, so one
#pd.concat is enough. This replaces the DataFrame.append loop: append was
#removed in pandas 2.0 and re-copied the growing table on every iteration.
#It also no longer relies on the loop variable `cell` left over from an earlier cell.
#NOTE(review): this rebinds `data`, which until here was the GPY_data object, so
#the cell that builds rna_protein_embeddings cannot be re-run afterwards without
#recreating it. The name is kept because the train/test split below reads `data`.
data = pd.concat([rna_protein_embeddings[cell] for cell in cell_lines])
print(data.shape)

(28784, 68)


In [15]:
from sklearn.model_selection import train_test_split

#Shuffled 80/20 row split with a fixed seed; both parts come back as pandas dataframes.
#NOTE(review): rows are split individually, so the same protein measured under
#different cell conditions can presumably land in both train and test - confirm
#this is intended
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

In [16]:
# with open('ppg_train.pkl','wb') as file:
#     pickle.dump(train,file,protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('ppg_test.pkl','wb') as file:
#     pickle.dump(test,file,protocol=pickle.HIGHEST_PROTOCOL)

#### Example Merge

In [17]:
#Toy left-hand frame: six rows K0..K5 / A0..A5 with a deliberately unordered integer index
df = pd.DataFrame(
    data={
        'key': ['K{}'.format(i) for i in range(6)],
        'A': ['A{}'.format(i) for i in range(6)],
    },
    index=[4, 5, 6, 3, 2, 1],
)

In [18]:
#Toy right-hand frame: three rows K0..K2 / B0..B2 whose index overlaps part of df's index
other = pd.DataFrame(
    data={
        'key': ['K{}'.format(i) for i in range(3)],
        'B': ['B{}'.format(i) for i in range(3)],
    },
    index=[2, 3, 1],
)

In [19]:
#Display the six-row example frame
df

Unnamed: 0,key,A
4,K0,A0
5,K1,A1
6,K2,A2
3,K3,A3
2,K4,A4
1,K5,A5


In [20]:
#Display the three-row example frame
other

Unnamed: 0,key,B
2,K0,B0
3,K1,B1
1,K2,B2


In [21]:
#Join the two toy frames on their index: only index labels present in both
#survive, and the shared 'key' column is split into key_x / key_y
pd.merge(other, df, left_index=True, right_index=True)

Unnamed: 0,key_x,B,key_y,A
2,K0,B0,K4,A4
3,K1,B1,K3,A3
1,K2,B2,K5,A5
