In [1]:

import json
import numpy as np
import pandas as pd
import Bio.SeqIO as SeqIO
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt


## 1. ZIKA - Human
### 1.1 Data Preprocessing

In [23]:
data_zika = pd.read_csv("../Dataset_ppi/Transfer_Learning_multi_scale_convolutional_neural_layers_human_virus/ZIKV/protein_pair_label.txt",sep = "\t")
data_zika  = data_zika[['PRO1','PRO2']]
data_zika.head()

Unnamed: 0,PRO1,PRO2
0,Q86VU5,A0A024B7W1-PRO_0000443029
1,Q9H5H4,Q32ZE1-PRO_0000435829
2,O96011,A0A024B7W1-PRO_0000443029
3,Q6PCB5,Q32ZE1-PRO_0000435829
4,P41223,A0A024B7W1-PRO_0000443032


In [11]:
data_zika_seq = pd.read_csv("../Dataset_ppi/Transfer_Learning_multi_scale_convolutional_neural_layers_human_virus/ZIKV/pro_seq.txt",sep = "\t")
data_zika_seq.head()

Unnamed: 0,ID-NUM,SEQ
0,Q9BPY8,MSAETASGPTEDQVEILEYNFNKVDKHPDSTTLCLIAAEAGLSEEE...
1,Q9HBA9,MGGSAPPDSSWRGSLKVSYNVGPGFTGNFSTQKVKMHIHSTNEVTR...
2,Q96S38,MTSYRERSADLARFYTVTEPQRHPRGYTVYKVTARVVSRRNPEDVQ...
3,Q9BYW2,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...
4,P27986,MSAEGYQYRALYDYKKEREEDIDLHLGDILTVNKGSLVALGFSDGQ...


In [16]:
data_zika_seq_dct = {}
for index,row in data_zika_seq.iterrows():
    
    if len(row.SEQ) <= 1024:
        data_zika_seq_dct[row['ID-NUM']] = row.SEQ
    else:
        continue

In [17]:
def return_seq(id):
    try:
        return data_zika_seq_dct[id]
    except:
        return np.nan

In [28]:
data_zika["protein1_seq"] = data_zika['PRO1'].apply(return_seq)
data_zika["protein2_seq"] = data_zika['PRO2'].apply(return_seq)

In [29]:
data_zika

Unnamed: 0,PRO1,PRO2,protein1_seq,protein2_seq
0,Q86VU5,A0A024B7W1-PRO_0000443029,MTQPVPRLSVPAALALGSAALGAAFATGLFLGRRCPPWRGRREQCL...,GAAFGVMEALGTLPGHMTERFQEAIDNLAVLMRAETGSRPYKAAAA...
1,Q9H5H4,Q32ZE1-PRO_0000435829,MEREALPWGLEPQDVQSSDEMRSPEGYLRGNMSENEEEEISQQEGS...,MKNPKEEIRRIRIVNMLKRGVARVNPLGGLKRLPAGLLLGHGPIRM...
2,O96011,A0A024B7W1-PRO_0000443029,MDAWVRFSAQSQARERLCRAAQYACSLLGHALQRHGASPELQKQIR...,GAAFGVMEALGTLPGHMTERFQEAIDNLAVLMRAETGSRPYKAAAA...
3,Q6PCB5,Q32ZE1-PRO_0000435829,MAEPPSPVHCVAAAAPTATVSEKEPFGKLQLSSRDPPGSLSAKKVR...,MKNPKEEIRRIRIVNMLKRGVARVNPLGGLKRLPAGLLLGHGPIRM...
4,P41223,A0A024B7W1-PRO_0000443032,MPKVKRSRKAPPDGWELIEPTLDELDQKMREAETEPHEGKRKVESL...,GGGTGETLGEKWKARLNQMSALEFYSYKKSGITEVCREEARRALKD...
...,...,...,...,...
7794,Q9C0J8,Q32ZE1-PRO_0000435838,,SGALWDVPAPKEVKKGETTDGVYRVMTRRLLGSTQVGVGVMQEGVF...
7795,Q6IS24,A0A024B7W1-PRO_0000443019,MASLRRVKVLLVLNLIAVAGFVLFLAKCRPIAVRSGDAFHEIRPRA...,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...
7796,Q96K31,Q32ZE1-PRO_0000435829,MDSGCWLFGGEFEDSVFEERPERRSGPPASYCAKLCEPQWFYEETE...,MKNPKEEIRRIRIVNMLKRGVARVNPLGGLKRLPAGLLLGHGPIRM...
7797,Q12946,Q32ZE1-PRO_0000435839,MSSAPEKQQPPHGGGGGGGGGGGAAMDPASSGPSKAKKTNAGIRRP...,GAALGVMEALGTLPGHMTERFQEAIDNLAVLMRAETGSRPYKAAAA...


In [32]:
data_zika.drop_duplicates(subset=None, keep='first', inplace=True, ignore_index=True)
data_zika.dropna(inplace = True)

In [33]:
data_zika

Unnamed: 0,PRO1,PRO2,protein1_seq,protein2_seq
0,Q86VU5,A0A024B7W1-PRO_0000443029,MTQPVPRLSVPAALALGSAALGAAFATGLFLGRRCPPWRGRREQCL...,GAAFGVMEALGTLPGHMTERFQEAIDNLAVLMRAETGSRPYKAAAA...
1,Q9H5H4,Q32ZE1-PRO_0000435829,MEREALPWGLEPQDVQSSDEMRSPEGYLRGNMSENEEEEISQQEGS...,MKNPKEEIRRIRIVNMLKRGVARVNPLGGLKRLPAGLLLGHGPIRM...
2,O96011,A0A024B7W1-PRO_0000443029,MDAWVRFSAQSQARERLCRAAQYACSLLGHALQRHGASPELQKQIR...,GAAFGVMEALGTLPGHMTERFQEAIDNLAVLMRAETGSRPYKAAAA...
3,Q6PCB5,Q32ZE1-PRO_0000435829,MAEPPSPVHCVAAAAPTATVSEKEPFGKLQLSSRDPPGSLSAKKVR...,MKNPKEEIRRIRIVNMLKRGVARVNPLGGLKRLPAGLLLGHGPIRM...
4,P41223,A0A024B7W1-PRO_0000443032,MPKVKRSRKAPPDGWELIEPTLDELDQKMREAETEPHEGKRKVESL...,GGGTGETLGEKWKARLNQMSALEFYSYKKSGITEVCREEARRALKD...
...,...,...,...,...
7793,Q8NB37,Q32ZE1-PRO_0000435838,MASERLPNRPACLLVASGAAEGVSAQSFLHCFTMASTAFNLQVATP...,SGALWDVPAPKEVKKGETTDGVYRVMTRRLLGSTQVGVGVMQEGVF...
7795,Q6IS24,A0A024B7W1-PRO_0000443019,MASLRRVKVLLVLNLIAVAGFVLFLAKCRPIAVRSGDAFHEIRPRA...,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...
7796,Q96K31,Q32ZE1-PRO_0000435829,MDSGCWLFGGEFEDSVFEERPERRSGPPASYCAKLCEPQWFYEETE...,MKNPKEEIRRIRIVNMLKRGVARVNPLGGLKRLPAGLLLGHGPIRM...
7797,Q12946,Q32ZE1-PRO_0000435839,MSSAPEKQQPPHGGGGGGGGGGGAAMDPASSGPSKAKKTNAGIRRP...,GAALGVMEALGTLPGHMTERFQEAIDNLAVLMRAETGSRPYKAAAA...


### 1.2 Exploratory Data Analysis for Zika-Human

In [38]:
print("Number of unique Zika-Human Pairs: ", len(data_zika)) 
print("Number of unique Proteins: ", len(set(data_zika['PRO1']).union(set(data_zika['PRO2']))) )

Number of unique Zika-Human Pairs:  6882
Number of unique Proteins:  5740


## 2. Dengue - Human
### 2.1 Data Preprocessing

In [48]:
data_dengue = pd.read_csv("../Dataset_ppi/Transfer_Learning_multi_scale_convolutional_neural_layers_human_virus/DENV/protein_pair_label.txt",sep = "\t")
data_dengue  = data_dengue[['PRO1','PRO2']]
data_dengue

Unnamed: 0,PRO1,PRO2
0,Q8NBQ5,P29990-PRO_0000037929
1,Q6DD87,P29991-PRO_0000037946
2,P35222,P29991-PRO_0000037941
3,P17858,P29990-PRO_0000037929
4,P51398,P29991-PRO_0000037936
...,...,...
10192,Q9BV79,P29990-PRO_0000037934
10193,P52565,P29991-PRO_0000037940
10194,Q15147,P29991-PRO_0000037946
10195,Q8WTU2,P14340-PRO_0000037966


In [42]:
data_dengue_seq = pd.read_csv("../Dataset_ppi/Transfer_Learning_multi_scale_convolutional_neural_layers_human_virus/DENV/pro_seq.txt",sep = "\t")
data_dengue_seq.head()

Unnamed: 0,ID-NUM,SEQ
0,P59036,MRQGCKFRGSSQKIRWSRSPPSSLLHTLRPRLLSAEITLQTNLPLQ...
1,Q6X4T0,MAQHPCQDQEQKVEMTSKQQRSTSIEETMRPQEKQVTITETLWDQV...
2,Q9HBA9,MGGSAPPDSSWRGSLKVSYNVGPGFTGNFSTQKVKMHIHSTNEVTR...
3,P00533,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
4,Q9BPY8,MSAETASGPTEDQVEILEYNFNKVDKHPDSTTLCLIAAEAGLSEEE...


In [43]:
data_dengue_seq_dct = {}
for index,row in data_dengue_seq.iterrows():
    
    if len(row.SEQ) <= 1024:
        data_dengue_seq_dct[row['ID-NUM']] = row.SEQ
    else:
        continue

In [44]:
def return_seq(id):
    try:
        return data_dengue_seq_dct[id]
    except:
        return np.nan

In [47]:
data_dengue["protein1_seq"] = data_dengue['PRO1'].apply(return_seq)
data_dengue["protein2_seq"] = data_dengue['PRO2'].apply(return_seq)
data_dengue.drop_duplicates(subset=None, keep='first', inplace=True, ignore_index=True)
data_dengue.dropna(inplace = True)
data_dengue

Unnamed: 0,PRO1,PRO2,protein1_seq,protein2_seq
0,Q8NBQ5,P29990-PRO_0000037929,MKFLLDILLLLPLLIVCSLESFVKLFIPKRRKSVTGEIVLITGAGH...,DSGCVVSWKNKELKCGSGIFITDNVHTWTEQYKFQPESPSKLASAI...
1,Q6DD87,P29991-PRO_0000037946,MELREEAWSPGPLDSEDQQMASHENPVDILIMDDDDVPSWPPTKLS...,VTGNIGETLGEKWKSRLNALGKSEFQIYKKSGIQEVDRTLAKEGIK...
2,P35222,P29991-PRO_0000037941,MATQADLMELDMAMEPDRKAAVSHWQQQSYLDSGIHSGATTTAPSL...,GHGQVDNFSLGVLGMALFLEEMLRTRVGTKHAILLVAVSFVTLIIG...
3,P17858,P29990-PRO_0000037929,MAAVDLEKLRASGAGKAIGVLTSGGDAQGMNAAVRAVTRMGIYVGA...,DSGCVVSWKNKELKCGSGIFITDNVHTWTEQYKFQPESPSKLASAI...
4,P51398,P29991-PRO_0000037936,MMLKGITRLISRIHKLDPGRFLHMGTQARQSIAAHLDNQVPVESPR...,MNDQRKEAKNTPFNMLKRERNRVSTVQQLTKRFSLGMLQGRGPLKL...
...,...,...,...,...
10191,A0A1B0GTU1,Q58HT7-PRO_0000268103,MPNQGEDCYFFFYSTCTKGDSCPFRHCEAALGNETVCTLWQEGRCF...,MNQRKKVVRPPFNMLKRERNRVSTPQGLVKRFSTGLFSGKGPLRMV...
10192,Q9BV79,P29990-PRO_0000037934,MWVCSTLWRVRTPARQWRGLLPASGCHGPAASSYSASAEPARVRAL...,NEMGFLEKTKKDLGLGSIATQQPESNILDIDLRPASAWTLYAVATT...
10193,P52565,P29991-PRO_0000037940,MAEQEPTAEQLAQIAAENEEDEHSVNYKPPAQKSIQEIQELDKDDE...,DSGCVVSWKNKELKCGSGIFITDNVHTWTEQYKFQPESPSKLASAI...
10195,Q8WTU2,P14340-PRO_0000037966,MHKEAEMLIGPQLDEKRWGWRLGDGSAAPPFLPQALSFLLLLPLAS...,SLTLNLITEMGRLPTFMTQKARDALDNLAVLHTAEAGGRAYNHALS...


### 2.2 Exploratory Data Analysis for Dengue-Human

In [51]:
print("Number of unique Prairs: ", len(data_dengue)) 
print("Number of unique Proteins: ", len(set(data_dengue['PRO1']).union(set(data_dengue['PRO2']))) )

Number of unique Prairs:  10197
Number of unique Proteins:  8028
