### Importing Packages

In [1]:
import itertools
import numpy as np
from Bio import Align
from Bio import SeqIO
from Bio import pairwise2
from Bio.SubsMat.MatrixInfo import blosum62
import csv
import pandas as pd


### Profiling and meaurment of execution time

In [2]:
import time
from functools import wraps

PROF_DATA = {}

def profile(fn):
    @wraps(fn)
    def with_profiling(*args, **kwargs):
        start_time = time.time()

        ret = fn(*args, **kwargs)

        elapsed_time = time.time() - start_time

        if fn.__name__ not in PROF_DATA:
            PROF_DATA[fn.__name__] = [0, []]
        PROF_DATA[fn.__name__][0] += 1
        PROF_DATA[fn.__name__][1].append(elapsed_time)

        return ret

    return with_profiling

def print_prof_data():
    for fname, data in PROF_DATA.items():
        max_time = max(data[1])
        avg_time = sum(data[1]) / len(data[1])
        print("Function %s called %d times. " % (fname, data[0]))
        print('Execution time max: %.3f, average: %.3f' % (max_time, avg_time))

def clear_prof_data():
    global PROF_DATA
    PROF_DATA = {}

In [3]:
@profile
def PrintName():
    print("name")
    
PrintName()
print_prof_data()

name
Function PrintName called 1 times. 
Execution time max: 0.000, average: 0.000


### Reading fast files of protein sequence

In [3]:

query_seq = list(SeqIO.parse("NP_pSeq_multiple_Interactions2.fasta", "fasta"))
target_seq = list(SeqIO.parse("uniprot-filtered-organism__Homo+sapiens+(Human)+[9606]_+AND+review--.fasta", "fasta"))



In [4]:
print(len(target_seq))
print(target_seq[0])

20431
ID: sp|A0PJZ0|A20A5_HUMAN
Name: sp|A0PJZ0|A20A5_HUMAN
Description: sp|A0PJZ0|A20A5_HUMAN Putative ankyrin repeat domain-containing protein 20A5 OS=Homo sapiens OX=9606 GN=ANKRD20A5P PE=5 SV=1
Number of features: 0
Seq('MKLFGFRSRRGQTVLGSIDHLYTGSGYRIRYSELQKIHKAAVKGDAAEMERCLA...DKV', SingleLetterAlphabet())


### Creating Empty CSV file to store the similarity results

#### Calculating similarity score and saving in csv

In [6]:
@profile
def calculate_similarity(query_seq,target_seq):
    print("started task...")
    with open("similar_proteins2.csv", "w") as csvFile:
        fieldnames = ['Query_Seq_ID','Query_Seq_Name', 'Query_Seq_Description', 'Query_Seq_Number_of_features', \
                      'Query_Seq','target_Seq_ID', 'target_Seq_Name', 'target_Seq_Description', 'target_Seq_Number_of_features',\
                      'target_Seq', 'Align_Score']
        writer = csv.DictWriter(csvFile, fieldnames=fieldnames)
        writer.writeheader()
        for seq_q in query_seq:
            try:
                
                for seq_t in target_seq:
                    try:

                        align_score = pairwise2.align.localds(seq_q.seq, seq_t.seq, blosum62, -10, -0.5,score_only=True,one_alignment_only=True)
                        writer.writerow({'Query_Seq_ID': seq_q.id.split('|')[1], 'Query_Seq_Name':seq_q.name, \
                                         'Query_Seq_Description':seq_q.description, 'Query_Seq_Number_of_features':len(seq_q.features),\
                                         'Query_Seq':seq_q.seq,\
                                         'target_Seq_ID': seq_t.id.split('|')[1], 'target_Seq_Name':seq_t.name, \
                                         'target_Seq_Description':seq_t.description, 'target_Seq_Number_of_features':len(seq_t.features),\
                                         'target_Seq':seq_t.seq, 'Align_Score': align_score})
                    except Exception as e:
                        print(e)
                        continue # doing nothing on exception
            except Exception as e:
                    print(e)
                    continue # doing nothing on exception

    print("Task Completed.....")






In [7]:
print("started....")
calculate_similarity(query_seq,target_seq)
print("completed.....")
print_prof_data()

started....
started task...
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in fu

<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fa

<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fa

<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fa

<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fa

<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fa

<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fa

<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fast> returned a result with an error set
<built-in function _make_score_matrix_fa

In [None]:
#aligner = Align.PairwiseAligner()
#aligner.substitution_matrix = blosum62
#aligner.mode = 'local'
#aligner.open_gap_score = -10
#aligner.extend_gap_score = -0.5
#alignments = alignelr.align(query_seq[0].seq, target_seq[0].seq)

#print(alignments[0].score)
#print(aligner)

In [2]:
df=pd.read_csv('similar_proteins2.csv')
df.shape

(612180, 11)

In [3]:
df.nunique()

Query_Seq_ID                        30
Query_Seq_Name                      30
Query_Seq_Description               30
Query_Seq_Number_of_features         1
Query_Seq                           30
target_Seq_ID                    20406
target_Seq_Name                  20406
target_Seq_Description           20406
target_Seq_Number_of_features        1
target_Seq                       20343
Align_Score                        780
dtype: int64

In [4]:
df2=pd.read_csv('similar_proteins.csv', encoding='utf8',engine='python')
df2.shape

(618848, 11)

In [5]:
df2.nunique()

Query_Seq_ID                        31
Query_Seq_Name                      31
Query_Seq_Description               31
Query_Seq_Number_of_features         1
Query_Seq                           31
target_Seq_ID                    20406
target_Seq_Name                  20406
target_Seq_Description           20406
target_Seq_Number_of_features        1
target_Seq                       20343
Align_Score                       1091
dtype: int64

In [6]:
df2.to_csv('similar_proteins1.csv')

In [9]:
df3=pd.read_csv('similar_proteins1.csv')
df3.shape

(618848, 12)

In [10]:
df=df[df['Align_Score']>=30]

df.shape

(549630, 11)

In [11]:
df2=df2[df2['Align_Score']>=30]

df2.shape

(576476, 11)

In [12]:
df.nunique()

Query_Seq_ID                        30
Query_Seq_Name                      30
Query_Seq_Description               30
Query_Seq_Number_of_features         1
Query_Seq                           30
target_Seq_ID                    20372
target_Seq_Name                  20372
target_Seq_Description           20372
target_Seq_Number_of_features        1
target_Seq                       20309
Align_Score                        740
dtype: int64

In [13]:
df2.nunique()

Query_Seq_ID                        31
Query_Seq_Name                      31
Query_Seq_Description               31
Query_Seq_Number_of_features         1
Query_Seq                           31
target_Seq_ID                    20376
target_Seq_Name                  20376
target_Seq_Description           20376
target_Seq_Number_of_features        1
target_Seq                       20313
Align_Score                       1051
dtype: int64

In [14]:
df_pr_sim = pd.concat([df2,df],ignore_index=True)
df_pr_sim.head()

Unnamed: 0,Query_Seq_ID,Query_Seq_Name,Query_Seq_Description,Query_Seq_Number_of_features,Query_Seq,target_Seq_ID,target_Seq_Name,target_Seq_Description,target_Seq_Number_of_features,target_Seq,Align_Score
0,P17735,sp|P17735|ATTY_HUMAN,sp|P17735|ATTY_HUMAN Tyrosine aminotransferase...,0,MDPYMIQMSSKGNLPSILDVHVNVGGRSSVPGKMKGRKARWSVRPS...,A0PJZ0,sp|A0PJZ0|A20A5_HUMAN,sp|A0PJZ0|A20A5_HUMAN Putative ankyrin repeat ...,0,MKLFGFRSRRGQTVLGSIDHLYTGSGYRIRYSELQKIHKAAVKGDA...,40.0
1,P17735,sp|P17735|ATTY_HUMAN,sp|P17735|ATTY_HUMAN Tyrosine aminotransferase...,0,MDPYMIQMSSKGNLPSILDVHVNVGGRSSVPGKMKGRKARWSVRPS...,P18825,sp|P18825|ADA2C_HUMAN,sp|P18825|ADA2C_HUMAN Alpha-2C adrenergic rece...,0,MASPALAAALAVAAAAGPNASGAGERGSGGVANASGASWGPPRGQY...,36.5
2,P17735,sp|P17735|ATTY_HUMAN,sp|P17735|ATTY_HUMAN Tyrosine aminotransferase...,0,MDPYMIQMSSKGNLPSILDVHVNVGGRSSVPGKMKGRKARWSVRPS...,Q9NZK5,sp|Q9NZK5|ADA2_HUMAN,sp|Q9NZK5|ADA2_HUMAN Adenosine deaminase 2 OS=...,0,MLVDGPSERPALCFLLLAVAMSFFGSALSIDETRAHLLLKEKMMRL...,44.5
3,P17735,sp|P17735|ATTY_HUMAN,sp|P17735|ATTY_HUMAN Tyrosine aminotransferase...,0,MDPYMIQMSSKGNLPSILDVHVNVGGRSSVPGKMKGRKARWSVRPS...,Q99424,sp|Q99424|ACOX2_HUMAN,sp|Q99424|ACOX2_HUMAN Peroxisomal acyl-coenzym...,0,MGSPVHRVSLGDTWSRQMHPDIESERYMQSFDVERLTNILDGGAQN...,49.0
4,P17735,sp|P17735|ATTY_HUMAN,sp|P17735|ATTY_HUMAN Tyrosine aminotransferase...,0,MDPYMIQMSSKGNLPSILDVHVNVGGRSSVPGKMKGRKARWSVRPS...,Q8WW27,sp|Q8WW27|ABEC4_HUMAN,sp|Q8WW27|ABEC4_HUMAN Putative C->U-editing en...,0,MEPIYEEYLANHGTIVKPYYWLSFSLDCSNCPYHIRTGEEARVSLT...,54.0


In [15]:
df_pr_sim.nunique()

Query_Seq_ID                        61
Query_Seq_Name                      61
Query_Seq_Description               61
Query_Seq_Number_of_features         1
Query_Seq                           61
target_Seq_ID                    20380
target_Seq_Name                  20380
target_Seq_Description           20380
target_Seq_Number_of_features        1
target_Seq                       20317
Align_Score                       1152
dtype: int64

In [2]:
#df_pr_sim.to_csv("protiens_similiarity_clean.csv")

df_pro = pd.read_csv("protiens_similiarity_clean.csv")


In [3]:
df_pro.shape

(1126106, 12)

In [4]:
df_pro['Align_Score'].max()

10502.0