# Comparing proteomes

In [30]:
import numpy as np
import pandas as pd
import string
import editdistance

In [124]:
n1 = 3000
n2 = 3000
protein_min = 20
protein_max = 100
name_len = 10

# make up some fake protein names and thrir sequences
proteome1 = np.array([''.join(np.random.choice(list('ATCG'), n)) for n in np.random.randint(protein_min, protein_max+1, n1)])
proteome2 = np.array([''.join(np.random.choice(list('ATCG'), n)) for n in np.random.randint(protein_min, protein_max+1, n2)])
names1 = [''.join(np.random.choice(list(string.ascii_lowercase), name_len)) for n in range(n1)]
names2 = [''.join(np.random.choice(list(string.ascii_lowercase), name_len)) for n in range(n2)]

# threshold for differnces
threshold = 4
# make 2/3 of proteins to be identical or almost identical in two proteomes
idx = np.random.randint(0, 3, n1)[:n2]
proteome2[idx<=1] = proteome1[idx<=1]
for i in np.nonzero(idx==2)[0]:
    s = proteome1[i]
    # either delete, insert or replace first k letters
    k = np.random.randint(1, threshold)
    r = np.random.rand()
    if r < 0.2: # deletion
        s = s[k:]
    elif r < 0.4: # insertion
        s = 'X'*k + s
    else: # mutation
        for j in range(k):
            t = list(s)
            t[j] = 'X'
            s = ''.join(t)
    proteome2[i] = s

d1 = dict(zip(names1, proteome1))
d2 = dict(zip(names2, proteome2))

In [125]:
%%time
pdist = []
for p1 in d1:
    for p2 in d2:
        pdist.append((p1, p2, editdistance.eval(d1[p1], d2[p2])))

CPU times: user 17.4 s, sys: 150 ms, total: 17.6 s
Wall time: 17.6 s


In [126]:
df = pd.DataFrame(pdist, columns=['p1', 'p2', 'd'])

In [127]:
df.shape

(9000000, 3)

In [128]:
df = df.sort_values('d')

In [129]:
threshold

4

In [130]:
for i in range(threshold):
    print(i, (df.d == i).sum())

0 2026
1 297
2 349
3 325


In [132]:
summary = df.groupby('d').count()
summary[:10]

Unnamed: 0_level_0,p1,p2
d,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2026,2026
1,297,297
2,349,349
3,325,325
4,3,3
6,2,2
7,8,8
8,48,48
9,209,209
10,860,860
