In [4]:
import numpy as np
import pandas as pd
import Bio.SeqIO as SeqIO
from tqdm import tqdm

In [21]:
data = pd.read_excel("/home/anwer/Desktop/PPI_prediction/Dataset_ppi/Denovo/AS1.xlsx")

In [22]:
data = data[['Protein A','Protein B']]
data

Unnamed: 0,Protein A,Protein B
0,P29991,O00571
1,P29991,O00584
2,Q99IB8,O00584
3,P03070,O00629
4,P49116,P10221
...,...,...
5748,Q93034,P69721
5749,Q9HC16,P69721
5750,Q9HC16,P69720
5751,Q9HC16,P69722


## 1. Data Preprocessing

### 1.1 Protein name to sequence

In [23]:

dct_1 = {}
for rec in tqdm(SeqIO.parse("/home/anwer/Desktop/PPI_prediction/Dataset_ppi/Denovo/AS2.fasta","fasta")):
    dct_1[rec.id.split("|")[1]] = str(rec.seq)
for rec in tqdm(SeqIO.parse("/home/anwer/Desktop/PPI_prediction/Dataset_ppi/Denovo/AS3.fasta","fasta")):
    dct_1[rec.id.split("|")[1]] = str(rec.seq)    

445it [00:00, 95053.23it/s]
2340it [00:00, 121853.27it/s]


In [24]:
def convert_to_seq(seq_name):
    try:
        return dct_1[seq_name]
    except:
        return None

In [25]:
data["Protein_A_sequence"] = data['Protein A'].apply(convert_to_seq)
data["Protein_B_sequence"] = data['Protein B'].apply(convert_to_seq)
data.dropna(inplace = True)

In [26]:
data = data.apply(lambda x: x.str.strip())
data.to_csv("/home/anwer/Desktop/PPI_prediction/csv_files/Denovo_ppi_with_sequences.csv",index = False)

### 1.2 Exploratory Data Analysis

In [9]:
data = pd.read_csv('../csv_files/Denovo_ppi_with_sequences.csv')

In [27]:
data.head()


Unnamed: 0,Protein A,Protein B,Protein_A_sequence,Protein_B_sequence
0,P29991,O00571,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MSHVAVENALGLDQQFAGLDLNSSDNQSGGSTASKGRYIPPHLRNR...
1,P29991,O00584,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
2,Q99IB8,O00584,MSTNPKPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLGV...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
3,P03070,O00629,MDKVLNREESLQLMDLLGLERSAWGNIPLMRKAYLKKCKEFHPDKG...,MADNEKLDNQRLKNFKNKGRDLETMRRQRNEVVVELRKNKRDEHLL...
4,P49116,P10221,MTSPSPRIQIISTDSAVASPQRIQIVTDQQTGQKIQIVTAVDASGS...,MADRGLPSEAPVVTTSPAGPPSDGPMQRLLASLAGLRQPPTPTAET...


In [28]:
data = data.apply(lambda x: x.str.strip())

In [29]:
print("Number of positive pairs: ", len(data))

Number of positive pairs:  5718


In [30]:
print("Number of unique Proteins: ", len(set(data['Protein A']).union(set(data['Protein B']))) )

Number of unique Proteins:  2781


In [31]:
print("Maximum length proteins: ", max(set(data['Protein_A_sequence']).union(set(data['Protein_B_sequence']))))
print('\n' )
print("Maximum length of Proteins sequences: ", len(max(set(data['Protein_A_sequence']).union(set(data['Protein_B_sequence'])))))

Maximum length proteins:  YITPVNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRSISLRYEGRVYHYRINTASDGKLYVSSESRFNTLAELVHHHSTVADGLITTLHYPAPKRNKPTIYGVSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQEVSAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDEVEKELGKRGTRGGAGSMLQAPELPTKTRTCRRAAEQKASPPSLTPKLLRRQVTASPSSGLSHKKEATKGSASGMGTPATAEPAPPSNKVGLSKASSEEMRVRRHKHSSESPGRDKGRLAKLKPAPPPPPACTGKAGKPAQSPSQEAGEAGGPTKTKCTSLAMDAVNTDPTKAGPPGEGLRKPVPPSVPKPQSTAKPPGTPTSPVSTPSTAPAPSPLAGDQQPSSAAFIPLISTRVSLRKTRQPPERIASGTITKGVVLDSTEALCLAISRNSEQMASHSAVLEAGKNLYTFCVSYVDSIQQMRNKFAFREAINKLESNLRELQICPATASSGPAATQDFSKLLSSVKEISDIVRR


Maximum length of Proteins sequences:  746


### 1.3 Creating pickle file for all unique proteins embedded vectors

In [2]:
data = pd.read_csv('../csv_files/Denovo_ppi_with_sequences.csv')
data.head()

Unnamed: 0,Protein A,Protein B,Protein_A_sequence,Protein_B_sequence
0,P29991,O00571,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MSHVAVENALGLDQQFAGLDLNSSDNQSGGSTASKGRYIPPHLRNR...
1,P29991,O00584,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
2,Q99IB8,O00584,MSTNPKPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLGV...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
3,P03070,O00629,MDKVLNREESLQLMDLLGLERSAWGNIPLMRKAYLKKCKEFHPDKG...,MADNEKLDNQRLKNFKNKGRDLETMRRQRNEVVVELRKNKRDEHLL...
4,P49116,P10221,MTSPSPRIQIISTDSAVASPQRIQIVTDQQTGQKIQIVTAVDASGS...,MADRGLPSEAPVVTTSPAGPPSDGPMQRLLASLAGLRQPPTPTAET...


In [10]:
unique_proteins_with_sequence = {}
for index,row in tqdm(data.iterrows()):
    unique_proteins_with_sequence[row['Protein A']] = ' '.join(row['Protein_A_sequence'])
    unique_proteins_with_sequence[row['Protein B']] = ' '.join(row['Protein_B_sequence'])

5718it [00:00, 18790.93it/s]


In [7]:
len(unique_proteins_with_sequence)

2781

In [17]:
from embedded_vectors_extract.extract_embedding_vectors import return_embedding_vectors_from_dict

In [19]:
prot_to_vec = return_embedding_vectors_from_dict(unique_proteins_with_sequence,"prot_bert_bfd")

### 1.4 Creating independent test set

In [82]:
data = pd.read_csv('../csv_files/Denovo_ppi_with_sequences.csv')
data

Unnamed: 0,Protein A,Protein B,Protein_A_sequence,Protein_B_sequence
0,P29991,O00571,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MSHVAVENALGLDQQFAGLDLNSSDNQSGGSTASKGRYIPPHLRNR...
1,P29991,O00584,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
2,Q99IB8,O00584,MSTNPKPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLGV...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
3,P03070,O00629,MDKVLNREESLQLMDLLGLERSAWGNIPLMRKAYLKKCKEFHPDKG...,MADNEKLDNQRLKNFKNKGRDLETMRRQRNEVVVELRKNKRDEHLL...
4,P49116,P10221,MTSPSPRIQIISTDSAVASPQRIQIVTDQQTGQKIQIVTAVDASGS...,MADRGLPSEAPVVTTSPAGPPSDGPMQRLLASLAGLRQPPTPTAET...
...,...,...,...,...
5713,Q93034,P69721,MATSNLLKNKGSLQFEDKWDFMRPIVLKLLRQESVTKQQWFDLFSD...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...
5714,Q9HC16,P69721,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...
5715,Q9HC16,P69720,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...
5716,Q9HC16,P69722,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...


In [16]:
# data = data.sample(frac=1,random_state = 42).reset_index(drop=True)

In [19]:
# data = pd.read_csv('../csv_files/Denovo_ppi_with_sequences.csv')
# data = data.iloc[0:50]


# def check_hash_table(data):
#     starting_point_data = data[0:1]
#     data_left = data[1:]
#     protein_unique_initial_train = set(starting_point_data['Protein A']).union(set(starting_point_data['Protein B']))
    
#     sum=0
#     protein_unique_initial_test = set()
#     for index,row in data_left.iterrows():
        
#         if row['Protein A'] in protein_unique_initial_train or row['Protein B'] in protein_unique_initial_train:
            
#             protein_unique_initial_train.add(row['Protein A'])
#             protein_unique_initial_train.add(row['Protein B'])
            
#             if row['Protein B'] in protein_unique_initial_test:
#                 protein_unique_initial_test.remove(row['Protein B'])
#                 sum = sum + 1
#             if row['Protein A'] in protein_unique_initial_test:
#                 protein_unique_initial_test.remove(row['Protein A'])
#                 sum = sum +1
            
#         else:
            
#             protein_unique_initial_test.add(row['Protein A'])
#             protein_unique_initial_test.add(row['Protein B'])
    
#     return protein_unique_initial_train,protein_unique_initial_test  

# protein_unique_initial_train = check_hash_table()



In [76]:
#  li = []
# for index,row in data.iterrows():
   
#     li.append([row['Protein A'],row['Protein B']])

# li_test = []
# for i in li:
    
#     if i not in li_train:
#        li_test.append(i)

In [16]:
# len(li_test) + len(li_train)
# train_data = pd.DataFrame(li_train, columns =['Protein A', 'Protein B'])
# test_data = pd.DataFrame(li_test, columns =['Protein A', 'Protein B'])
        
        

In [14]:
# train_data_proteins = set(train_data['Protein A']).union(set(train_data['Protein B']))
# test_data_proteins = set(test_data['Protein A']).union(set(test_data['Protein B']))
# common_proteins = train_data_proteins.intersection(test_data_proteins)
# print("Number of common proteins between test set and train set: ", len(common_proteins))

In [13]:
# def is_check(row):
#     if row in un:
#         return 1
#     else:
#         return 0
    
# data['is_train_1']= data['Protein A'].apply(is_check)
# data['is_train_2']= data['Protein B'].apply(is_check)
# data['is_train'] = data['is_train_1'] + data['is_train_2']
# test = data[data.is_train == 0]
# train = data[data.is_train != 0]
# train

In [12]:


# def check_hash_table():
#     starting_point_data = data[0:1]
#     data_left = data[1:]
#     li_1 = []
#     li_2 = []
    
#     li_1.extend(starting_point_data['Protein A'].tolist())
#     li_1.extend(starting_point_data['Protein B'].tolist())
#     print(li_1)
#     sum=0
    
#     for index,row in data_left.iterrows():
#         sum = sum +1
#         if str(row['Protein A']) in li_1 or str(row['Protein B']) in li_1:
          
#             li_2.append([str(row['Protein A']),str(row['Protein B'])])
            
#             li_1.append(str(row['Protein A']))
#             li_1.append(str(row['Protein B']))
           
            
            
                
  
#     return li_2
# protein_unique_initial_train = check_hash_table()
# print(len(protein_unique_initial_train))


In [11]:
#  li = []
# for index,row in data.iterrows():
   
#     li.append([row['Protein A'],row['Protein B']])

# li_test = []
# for i in li:
    
#     if i not in protein_unique_initial_train:
#        li_test.append(i)

In [10]:
# len(li_test) + len(li_train)
# train_data = pd.DataFrame(li_train, columns =['Protein A', 'Protein B'])
# test_data = pd.DataFrame(li_test, columns =['Protein A', 'Protein B'])

In [9]:
# train_data_proteins = train_data['Protein A'].append(train_data['Protein B'])
# test_data_proteins = test_data['Protein A'].append(test_data['Protein B'])
# # test_data_proteins = set(test_data['Protein A']).union(set(test_data['Protein B']))
# common_proteins = train_data_proteins.intersection(test_data_proteins)
# print("Number of common proteins between test set and train set: ", len(common_proteins))

In [8]:
# for i in train_data_proteins:
#     if i in test_data_proteins:
#         print(i)


In [7]:
# len(set(train_data_proteins))+len(set(test_data_proteins))

In [6]:
# train_data = data[data.is_train == 1].reset_index(drop = True)
# train_data
 

In [5]:
# test_data = data[data.is_train == 0].reset_index(drop = True)
# test_data

In [4]:
# ## checking the overlapping

# train_data_proteins = set(train_data['Protein A']).union(set(train_data['Protein B']))
# test_data_proteins = set(test_data['Protein A']).union(set(test_data['Protein B']))
# common_proteins = train_data_proteins.intersection(test_data_proteins)
# print("Number of common proteins between test set and train set: ", len(common_proteins))

In [3]:
# data['is_train'].value_counts()

In [1]:
# sum = 0
# li_3 = []
# for index,row in train_data.iterrows():
#     if row['Protein A'] in common_proteins or row['Protein B'] in common_proteins:
# #          print(f"{row['Protein A']}, {row['Protein B']}")
#             li_3.append(row['Protein A'])
#             li_3.append(row['Protein B'])
# li_3             

In [2]:
# li_4= []
# for index,row in test_data.iterrows():
#     if row['Protein A'] in common_proteins or row['Protein B'] in common_proteins:
# #         print(f"{row['Protein A']}, {row['Protein B']}")
#             li_4.append(row['Protein A'])
#             li_4.append(row['Protein B'])
# li_4

In [63]:
# test_data

In [312]:
# set(li_3).union(li_4)-set(li_3).intersection(li_4)