In [173]:
import numpy as np
import pandas as pd
import Bio.SeqIO as SeqIO

In [174]:
data = pd.read_excel("/home/anwer/Desktop/PPI_prediction/Dataset_ppi/Denovo/AS1.xlsx")

In [176]:
data = data[['Protein A','Protein B']]
data

Unnamed: 0,Protein A,Protein B
0,P29991,O00571
1,P29991,O00584
2,Q99IB8,O00584
3,P03070,O00629
4,P49116,P10221
...,...,...
5748,Q93034,P69721
5749,Q9HC16,P69721
5750,Q9HC16,P69720
5751,Q9HC16,P69722


## 1. Data Preprocessing

### 1.1 Protein name to sequence

In [177]:
from tqdm import tqdm
dct_1 = {}
for rec in tqdm(SeqIO.parse("/home/anwer/Desktop/PPI_prediction/Dataset_ppi/Denovo/AS2.fasta","fasta")):
    dct_1[rec.id.split("|")[1]] = str(rec.seq)
for rec in tqdm(SeqIO.parse("/home/anwer/Desktop/PPI_prediction/Dataset_ppi/Denovo/AS3.fasta","fasta")):
    dct_1[rec.id.split("|")[1]] = str(rec.seq)    

445it [00:00, 51416.36it/s]
2340it [00:00, 103193.93it/s]


In [178]:
def convert_to_seq(seq_name):
    try:
        return dct_1[seq_name]
    except:
        return None

In [179]:
data["Protein_A_sequence"] = data['Protein A'].apply(convert_to_seq)
data["Protein_B_sequence"] = data['Protein B'].apply(convert_to_seq)
data.dropna(inplace = True)

In [180]:
data = data.apply(lambda x: x.str.strip())
data.to_csv("/home/anwer/Desktop/PPI_prediction/csv_files/Denovo_ppi_with_sequences.csv",index = False)

### 1.2 Exploratory Data Analysis

In [181]:
data = pd.read_csv('/home/anwer/Desktop/PPI_prediction/csv_files/Denovo_ppi_with_sequences.csv')

In [182]:
data.head()

Unnamed: 0,Protein A,Protein B,Protein_A_sequence,Protein_B_sequence
0,P29991,O00571,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MSHVAVENALGLDQQFAGLDLNSSDNQSGGSTASKGRYIPPHLRNR...
1,P29991,O00584,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
2,Q99IB8,O00584,MSTNPKPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLGV...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
3,P03070,O00629,MDKVLNREESLQLMDLLGLERSAWGNIPLMRKAYLKKCKEFHPDKG...,MADNEKLDNQRLKNFKNKGRDLETMRRQRNEVVVELRKNKRDEHLL...
4,P49116,P10221,MTSPSPRIQIISTDSAVASPQRIQIVTDQQTGQKIQIVTAVDASGS...,MADRGLPSEAPVVTTSPAGPPSDGPMQRLLASLAGLRQPPTPTAET...


In [183]:
data = data.apply(lambda x: x.str.strip())

In [184]:
print("Number of positive pairs: ", len(data))

Number of positive pairs:  5718


In [139]:
print("Number of unique Proteins: ", len(set(data['Protein A']).union(set(data['Protein B']))) )

Number of unique Proteins:  2781


In [268]:
print("Maximum length proteins: ", max(set(data['Protein_A_sequence']).union(set(data['Protein_B_sequence']))))
print('\n' )
print("Maximum length of Proteins sequences: ", len(max(set(data['Protein_A_sequence']).union(set(data['Protein_B_sequence'])))))

Maximum length proteins:  YITPVNSLEKHSWYHGPVSRNAAEYLLSSGINGSFLVRESESSPGQRSISLRYEGRVYHYRINTASDGKLYVSSESRFNTLAELVHHHSTVADGLITTLHYPAPKRNKPTIYGVSPNYDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIITEFMTYGNLLDYLRECNRQEVSAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDEVEKELGKRGTRGGAGSMLQAPELPTKTRTCRRAAEQKASPPSLTPKLLRRQVTASPSSGLSHKKEATKGSASGMGTPATAEPAPPSNKVGLSKASSEEMRVRRHKHSSESPGRDKGRLAKLKPAPPPPPACTGKAGKPAQSPSQEAGEAGGPTKTKCTSLAMDAVNTDPTKAGPPGEGLRKPVPPSVPKPQSTAKPPGTPTSPVSTPSTAPAPSPLAGDQQPSSAAFIPLISTRVSLRKTRQPPERIASGTITKGVVLDSTEALCLAISRNSEQMASHSAVLEAGKNLYTFCVSYVDSIQQMRNKFAFREAINKLESNLRELQICPATASSGPAATQDFSKLLSSVKEISDIVRR


Maximum length of Proteins sequences:  746


### 1.3 Creating independent test set

In [269]:
data

Unnamed: 0,Protein A,Protein B,Protein_A_sequence,Protein_B_sequence
0,P29991,O00571,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MSHVAVENALGLDQQFAGLDLNSSDNQSGGSTASKGRYIPPHLRNR...
1,P29991,O00584,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
2,Q99IB8,O00584,MSTNPKPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLGV...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...
3,P03070,O00629,MDKVLNREESLQLMDLLGLERSAWGNIPLMRKAYLKKCKEFHPDKG...,MADNEKLDNQRLKNFKNKGRDLETMRRQRNEVVVELRKNKRDEHLL...
4,P49116,P10221,MTSPSPRIQIISTDSAVASPQRIQIVTDQQTGQKIQIVTAVDASGS...,MADRGLPSEAPVVTTSPAGPPSDGPMQRLLASLAGLRQPPTPTAET...
...,...,...,...,...
5713,Q93034,P69721,MATSNLLKNKGSLQFEDKWDFMRPIVLKLLRQESVTKQQWFDLFSD...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...
5714,Q9HC16,P69721,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...
5715,Q9HC16,P69720,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...
5716,Q9HC16,P69722,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...


In [270]:
data = data.sample(frac=1,random_state = 42).reset_index(drop=True)

In [361]:
hash_table_train_proteins = {}
hash_table_test_proteins = {}
percent_10 = len(data)*0.1
percent_10

571.8000000000001

In [185]:
## Using set

def check_hash_table():
    starting_point_data = data[0:1]
    data_left = data[1:]
    protein_unique_initial_train = set(starting_point_data['Protein A']).union(set(starting_point_data['Protein B']))
    sum=0
    protein_unique_initial_test = set()
    for index,row in data_left.iterrows():
        if row['Protein A'] in protein_unique_initial_train or row['Protein B'] in protein_unique_initial_train:
            
            protein_unique_initial_train.add(row['Protein A'])
            protein_unique_initial_train.add(row['Protein B'])
            
            if row['Protein B'] in protein_unique_initial_test:
                protein_unique_initial_test.remove(row['Protein B'])
                sum = sum + 1
            if row['Protein A'] in protein_unique_initial_test:
                protein_unique_initial_test.remove(row['Protein A'])
                sum = sum +1
            
        else:
            
            protein_unique_initial_test.add(row['Protein A'])
            protein_unique_initial_test.add(row['Protein B'])
    print(sum)
    return protein_unique_initial_train,protein_unique_initial_test            

In [206]:
li_1 = []
li_2 = []

def check_hash_table():
    starting_point_data = data[0:1]
    data_left = data[1:]
    li_1.extend(starting_point_data['Protein A'].tolist())
    li_1.extend(starting_point_data['Protein B'].tolist())
    print(li_1)
    sum=0
    
    for index,row in data_left.iterrows():
        if row['Protein A'] in li_1 or row['Protein B'] in li_1:
            li_1.append(row['Protein A'])
            li_1.append(row['Protein B'])
            
            if row['Protein B'] in li_2:
                li_2.remove(row['Protein B'])
                sum = sum + 1
            if row['Protein A'] in li_2:
                li_2.remove(row['Protein A'])
                sum = sum +1
            
        else:
            
            li_2.append(row['Protein A'])
            li_2.append(row['Protein B'])
    print(sum)
    return li_1,li_2

In [207]:
protein_unique_initial_train,protein_unique_initial_test = check_hash_table()

['P29991', 'O00571']
878


In [218]:
li_1

['P29991',
 'O00571',
 'P29991',
 'O00584',
 'Q99IB8',
 'O00584',
 'P29991',
 'A5YKK6',
 'P29991',
 'A6NCC3',
 'P63279',
 'P29991',
 'P68466',
 'O00571',
 'P68466',
 'O14654',
 'P68466',
 'O15020',
 'P68466',
 'P04908',
 'P68466',
 'P05166',
 'P68466',
 'P07437',
 'P68466',
 'P0C0S8',
 'P68466',
 'P11021',
 'P68466',
 'P13051',
 'P68466',
 'P20671',
 'P68466',
 'P62805',
 'Q00610',
 'P68466',
 'Q01082',
 'P68466',
 'Q08AE8',
 'P68466',
 'Q13085',
 'P68466',
 'Q16777',
 'P68466',
 'Q53SF7',
 'P68466',
 'Q5EBL4',
 'P68466',
 'Q6FI13',
 'P68466',
 'Q7L7L0',
 'P68466',
 'Q93077',
 'P68466',
 'Q96KK5',
 'P68466',
 'Q99878',
 'P68466',
 'Q9BTM1',
 'P68466',
 'Q9BYP7',
 'P68466',
 'Q9H4A3',
 'P68466',
 'Q9HCC0',
 'P68466',
 'Q9Y3S1',
 'P68466',
 'P69284',
 'P11021',
 'P69284',
 'P22528',
 'P69284',
 'P30153',
 'P69284',
 'P31151',
 'P69284',
 'P35321',
 'Q06830',
 'P69284',
 'Q71U36',
 'P69284',
 'Q9BQE3',
 'P69284',
 'P63279',
 'F5HCV3',
 'Q06330',
 'F5HCV3',
 'P29991',
 'O14763',
 'P29991',

In [217]:
set(li_1).intersection(set(li_2))

{'A0MPS7',
 'F5HCV3',
 'O14503',
 'O14641',
 'O14920',
 'O15111',
 'O15350',
 'O15460',
 'O60885',
 'O75376',
 'O75475',
 'O75489',
 'P01730',
 'P03105',
 'P03186',
 'P03203',
 'P03225',
 'P03407',
 'P03428',
 'P03431',
 'P03433',
 'P03519',
 'P04012',
 'P04492',
 'P08392',
 'P21673',
 'P25686',
 'P29590',
 'P46379',
 'P49736',
 'P50750',
 'P51116',
 'P52292',
 'P61758',
 'P61956',
 'P62877',
 'P63165',
 'P63208',
 'P68336',
 'P68467',
 'P69258',
 'P69713',
 'P69720',
 'P69721',
 'P69722',
 'P69723',
 'Q00653',
 'Q00987',
 'Q07812',
 'Q12933',
 'Q16659',
 'Q2I360',
 'Q5JY77',
 'Q69139',
 'Q6TVJ7',
 'Q7M6R1',
 'Q86VP6',
 'Q8V1H6',
 'Q96J02',
 'Q99615',
 'Q9BZK7',
 'Q9C0C7',
 'Q9NR30',
 'Q9NSC5',
 'Q9UMS4',
 'Q9Y4B6'}

In [208]:
# protein_unique_initial_test.intersection(protein_unique_initial_train)

In [203]:
2781 - len(protein_unique_initial_test)

2366

In [191]:
len(protein_unique_initial_train.union(protein_unique_initial_test))

2781

In [209]:
li = []
for index,row in data.iterrows():
    if row['Protein A'] in li_1 or row['Protein B'] in li_1:
        li.append(1)
    else:
        li.append(0)
        
 

In [210]:
sum(li)

TypeError: 'int' object is not callable

In [211]:
data["is_train"] = li

In [212]:
train_data = data[data.is_train == 1].reset_index(drop = True)
train_data

Unnamed: 0,Protein A,Protein B,Protein_A_sequence,Protein_B_sequence,is_train
0,P29991,O00571,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MSHVAVENALGLDQQFAGLDLNSSDNQSGGSTASKGRYIPPHLRNR...,1
1,P29991,O00584,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...,1
2,Q99IB8,O00584,MSTNPKPQRKTKRNTNRRPEDVKFPGGGQIVGGVYLLPRRGPRLGV...,MRPAALRGALLGCLCLALLCLGGADKRLRDNHEWKKLIMVQHWPET...,1
3,P03070,O00629,MDKVLNREESLQLMDLLGLERSAWGNIPLMRKAYLKKCKEFHPDKG...,MADNEKLDNQRLKNFKNKGRDLETMRRQRNEVVVELRKNKRDEHLL...,1
4,P49116,P10221,MTSPSPRIQIISTDSAVASPQRIQIVTDQQTGQKIQIVTAVDASGS...,MADRGLPSEAPVVTTSPAGPPSDGPMQRLLASLAGLRQPPTPTAET...,1
...,...,...,...,...,...
5583,Q93034,P69721,MATSNLLKNKGSLQFEDKWDFMRPIVLKLLRQESVTKQQWFDLFSD...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...,1
5584,Q9HC16,P69721,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...,1
5585,Q9HC16,P69720,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...,1
5586,Q9HC16,P69722,MKPHFRNTVERMYRDTFSYNFYNRPILSRRNTVWLCYEVKTKGPSR...,MENRWQVMIVWQVDRMRIRTWKSLVKHHMYVSGKARGWFYRHHYES...,1


In [213]:
test_data = data[data.is_train == 0].reset_index(drop = True)
test_data

Unnamed: 0,Protein A,Protein B,Protein_A_sequence,Protein_B_sequence,is_train
0,P63231,O43186,MSLLTEVETPIRNEWGCRCNDSSDPLVVAASIIGILHLILWILDRL...,MMAYMNPGPHYSVNALALSGPSVDLMHQAVPYPSAPRKQRRERTTF...,0
1,P63233,P50222,MSLLTEVETYVLSIVPSGPLKAEIAQRLEDVFAGKNTDLEALMEWL...,MEHPLFGCLRSPHATAQGLHPFSQSSLALHGRSDHMSYPELSTSSS...,0
2,Q13794,P68451,MPGKKARKNAQPSPARAPAELEVECATQLRRFGDKLNFRQKLLNLI...,MLSMFMCNNIVDYVDDIDNGIVQDIEDEASNNVDHDYVYPLPENMV...,0
3,Q9BXH1,P68451,MARARQEGSSPEPVEGLARDGPRPFPLGRLVPSAVSCGLCEPGLAA...,MLSMFMCNNIVDYVDDIDNGIVQDIEDEASNNVDHDYVYPLPENMV...,0
4,P36406,F5HAM0,MATLVVNKLGAGVDSGRQGSRGTAVVKVLECGVCEDVFSLQGDKVP...,MKPLVMLICFGVILLQLGVTKVCQHNEVQLGNECCPPCGLGQRVTK...,0
...,...,...,...,...,...
125,Q9UHY1,P14340,MSEGESQTVLSSGSDPKVESSSSAPGLTSVSPPVTSTTSAASPEEE...,MNNQRKKARNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...,0
126,Q9IH62,P52799,MPAENKKVRFENTTSDKGKIPSKVIKSYYGTMDIKKINEGLLDSKI...,MAVRRDSVWKYCWGVLMVLCRTAISKSIVLEPIYWNSSNSKFLPGQ...,0
127,Q96SB4,A7Y3Z3,MERKVLALQARKKRTKAKKDKAQRKSETQHRGSAPHSESDLPEQEE...,MADPAAATKYPLLKLLGSTWPTTPPRPIPKPSPWAPKKHRRLSSDQ...,0
128,Q9UPY3,P0C205,MKSPALQPLSMAGLQLMTPASSPMGPFFGLPWQQEAIHDNIYTPRK...,MPKTRRRPRRSQRKRPPTPWPTSQGLDRVFFSDTQSTCLETVYKAT...,0


In [216]:
## checking the overlapping

train_data_proteins = set(train_data['Protein A']).union(set(train_data['Protein B']))
test_data_proteins = set(test_data['Protein A']).union(set(test_data['Protein B']))
common_proteins = train_data_proteins.intersection(test_data_proteins)
print("Number of common proteins between test set and train set: ", len(common_proteins))

Number of common proteins between test set and train set:  32


In [215]:
for i in range(10):
    try:
        asdasd
        print(2)
    except:
        pass
    try:
        print(1)
    except: 
        print("2nd pass")

1
1
1
1
1
1
1
1
1
1


In [199]:
data['is_train'].value_counts()

1    5588
0     130
Name: is_train, dtype: int64

In [200]:
x = {1,2,3}
x.add(5)
x

{1, 2, 3, 5}

In [113]:
sum = 0
for index,row in train_data.iterrows():
    if row['Protein A'] in common_proteins or row['Protein B'] in common_proteins:
#         print(f"{row['Protein A']}, {row['Protein B']}")
        sum = sum +1
sum        

64

In [114]:
sum = 0
for index,row in test_data.iterrows():
    if row['Protein A'] in common_proteins or row['Protein B'] in common_proteins:
        print(f"{row['Protein A']}, {row['Protein B']}")
        sum = sum +1
sum 

P63231, O43186
P63233, P50222
Q13794, P68451
Q9BXH1, P68451
P36406, F5HAM0
O15294, F5HE12
Q15020, F5HE12
P30260, F5HH39
Q9UJX2, F5HH39
P50222, P03468
P51610, P30020
Q4G0F5, A8E1H1
Q96QK1, A8E1H1
Q9H171, A8E1H1
Q9Y572, A8E1H1
P89654, P15498
Q14457, P89884
Q6UXK2, Q03463
P03452, O43889
P03468, O43889
P63231, O43889
Q03463, O95466
P31749, P03165
P03185, O75094
P78317, P03209
P10144, P03265
Q8WXE1, P03265
P03418, O95786
Q9Y5P4, P03452
Q96D09, Q3KSQ2
Q96SI9, Q03463
Q9BXM8, Q03463
Q9H2W1, Q03463
Q9UHF1, A8CSJ8
Q9NRW3, P89903
Q9Y618, P24768
Q9Y3Q8, Q3KSQ2
Q3KSQ2, Q16206
Q69559, P28482
Q3KSN9, Q15723
Q16206, P03206
Q6VMQ6, P03209
Q00577, P03072
P98179, P68318
P51610, P04293
P51610, P10236
P68451, O43521
Q9J0X9, P25963
P29846, O43889


49

In [63]:
# test_data