In [1]:
from collections import defaultdict

from utils import *

In [9]:
# Peptide length passed to compute_number_of_linear_subpeptides in a later cell.
n = 47538

In [10]:
def compute_number_of_linear_subpeptides(n):
    """Return the number of subpeptides of a linear peptide with ``n`` residues.

    A linear peptide of length ``n`` has ``n - k + 1`` contiguous fragments of
    each length ``k`` in ``1..n``, which sums to ``n * (n + 1) / 2``; the empty
    subpeptide adds one more.

    Parameters
    ----------
    n : int
        Number of residues in the peptide (expected to be >= 0).

    Returns
    -------
    int
        ``n * (n + 1) // 2 + 1``. For ``n == 0`` this is 1 (only the empty
        subpeptide).
    """
    # n * (n + 1) is always even, so the floor division is exact.
    return n * (n + 1) // 2 + 1

In [11]:
# Evaluate the count for the length `n` set in an earlier cell; the value is the cell output.
compute_number_of_linear_subpeptides(n)

1129954492

In [2]:
# load_mass is not defined in this notebook (presumably it comes from the
# `from utils import *` star import). The way AA2MASS is used below
# (.values(), .items(), AA2MASS["Q"]) shows it is a mapping from residue
# letter to mass.
AA2MASS = load_mass()
# Distinct masses in ascending order; set() drops any mass that appears under
# more than one key.
MASSES = sorted(set(AA2MASS.values()))
print(MASSES)

In [83]:
def cyclic_spectrum_from_aa_mass(peptide_masses):
    """Return the sorted theoretical spectrum of a cyclic peptide.

    Parameters
    ----------
    peptide_masses : sequence of numbers
        Residue masses in peptide order.

    Returns
    -------
    list
        Ascending list holding 0, the mass of every contiguous fragment, and
        the mass of every fragment that wraps around the end of the sequence.
    """
    size = len(peptide_masses)

    # running[k] is the summed mass of the first k residues.
    running = [0]
    for mass in peptide_masses:
        running.append(running[-1] + mass)
    total = running[-1]

    masses = [0]
    for start in range(size):
        for stop in range(start + 1, size + 1):
            fragment = running[stop] - running[start]
            masses.append(fragment)
            # A fragment strictly inside the sequence has a complementary
            # fragment that wraps around the ends; prefixes and suffixes do
            # not, because their complements are already enumerated as
            # ordinary fragments.
            is_interior = start > 0 and stop < size
            if is_interior:
                masses.append(total - fragment)

    masses.sort()
    return masses

def linear_spectrum_from_aa_mass(peptide_masses):
    """Return the sorted theoretical spectrum of a linear peptide.

    Parameters
    ----------
    peptide_masses : sequence of numbers
        Residue masses in peptide order.

    Returns
    -------
    list
        Ascending list holding 0 and the mass of every contiguous fragment.
    """
    size = len(peptide_masses)

    # cumulative[k] is the summed mass of the first k residues.
    cumulative = [0] * (size + 1)
    for idx, mass in enumerate(peptide_masses):
        cumulative[idx + 1] = cumulative[idx] + mass

    # Each fragment mass is the difference of two prefix sums.
    fragments = [
        cumulative[hi] - cumulative[lo]
        for lo in range(size)
        for hi in range(lo + 1, size + 1)
    ]
    return sorted([0] + fragments)

In [84]:
def expand_candidates(candidates, masses=MASSES):
    """Extend every candidate peptide by one residue in every possible way.

    Parameters
    ----------
    candidates : iterable of list
        Peptides given as lists of residue masses.
    masses : iterable
        Masses that may be appended; defaults to the notebook-level MASSES.

    Returns
    -------
    list of list
        New lists (inputs are not modified), ordered candidate-major and
        mass-minor.
    """
    return [peptide + [mass] for peptide in candidates for mass in masses]
    
def is_consistent(candidate, spectrum):
    """Tell whether a linear candidate fits inside an experimental spectrum.

    Parameters
    ----------
    candidate : list
        Peptide given as a list of residue masses.
    spectrum : dict
        Mapping from mass to the number of times it occurs in the spectrum.

    Returns
    -------
    bool
        True when every mass of the candidate's linear spectrum occurs in
        ``spectrum`` at least as many times as the candidate needs it.
    """
    used = {}
    for mass in linear_spectrum_from_aa_mass(candidate):
        used[mass] = used.get(mass, 0) + 1
        # Membership is tested first so a defaultdict spectrum is never
        # grown by the lookup on the right-hand side.
        if mass not in spectrum or used[mass] > spectrum[mass]:
            return False
    return True

def cyclopeptide_sequencing(spectrum, masses=MASSES):
    """Find every cyclic peptide whose theoretical spectrum equals ``spectrum``.

    Branch-and-bound search: linear candidates are grown one residue at a
    time. A candidate whose total mass equals the parent mass is reported only
    if its cyclic spectrum matches the input exactly; any other candidate
    survives to the next round only while its linear spectrum is consistent
    with the input.

    Parameters
    ----------
    spectrum : sequence of int
        Experimental spectrum. It is sorted internally, so the order of the
        input does not matter.
    masses : iterable of int
        Residue masses used to extend candidates; defaults to the
        notebook-level MASSES.

    Returns
    -------
    set of str
        Matching peptides, each encoded as its residue masses joined by "-".
        Empty when ``spectrum`` is empty or nothing matches.
    """
    if not spectrum:
        return set()

    target_spectrum = sorted(spectrum)
    # The largest mass in the spectrum is the mass of the whole peptide.
    parent_mass = target_spectrum[-1]

    spectrum_counts = defaultdict(int)
    for mass in target_spectrum:
        spectrum_counts[mass] += 1

    candidate_peptides = [[]]
    final_peptides = set()
    while candidate_peptides:
        remaining_candidates = []
        # Forward `masses` so a caller-supplied alphabet is actually used.
        for candidate_peptide in expand_candidates(candidate_peptides, masses):
            if sum(candidate_peptide) == parent_mass:
                # Reaching the parent mass is not sufficient: the candidate's
                # cyclic spectrum must coincide with the experimental one.
                if cyclic_spectrum_from_aa_mass(candidate_peptide) == target_spectrum:
                    final_peptides.add("-".join(str(m) for m in candidate_peptide))
            elif is_consistent(candidate_peptide, spectrum_counts):
                remaining_candidates.append(candidate_peptide)
        candidate_peptides = remaining_candidates
    return final_peptides

In [85]:
# Small worked example: a spectrum and the set of "-"-joined mass strings that
# cyclopeptide_sequencing is expected to return for it.
spectrum = [0, 113, 128, 186, 241, 299, 314, 427]
expected = {"186-128-113", "186-113-128", "128-186-113", "128-113-186", "113-186-128", "113-128-186"}

In [86]:
# Run the search on the current `spectrum` with the default mass alphabet.
final_peptides = cyclopeptide_sequencing(spectrum)

In [87]:
# Set equality: the result must contain exactly the expected peptides, in any order.
assert expected == final_peptides

In [92]:
# Larger example: a spectrum together with the "-"-joined mass strings that
# cyclopeptide_sequencing is expected to return for it.
spectrum = [0, 71, 97, 99, 103, 113, 113, 114, 115, 131, 137, 196, 200, 202, 208, 214, 226, 227, 228, 240, 245, 299, 311, 311, 316, 327, 337, 339, 340, 341, 358, 408, 414, 424, 429, 436, 440, 442, 453, 455, 471, 507, 527, 537, 539, 542, 551, 554, 556, 566, 586, 622, 638, 640, 651, 653, 657, 664, 669, 679, 685, 735, 752, 753, 754, 756, 766, 777, 782, 782, 794, 848, 853, 865, 866, 867, 879, 885, 891, 893, 897, 956, 962, 978, 979, 980, 980, 990, 994, 996, 1022, 1093]
expected = set([
    "103-137-71-131-114-113-113-115-99-97", 
    "103-97-99-115-113-113-114-131-71-137", 
    "113-113-114-131-71-137-103-97-99-115", 
    "113-113-115-99-97-103-137-71-131-114", 
    "113-114-131-71-137-103-97-99-115-113", 
    "113-115-99-97-103-137-71-131-114-113", 
    "114-113-113-115-99-97-103-137-71-131", 
    "114-131-71-137-103-97-99-115-113-113", 
    "115-113-113-114-131-71-137-103-97-99", 
    "115-99-97-103-137-71-131-114-113-113", 
    "131-114-113-113-115-99-97-103-137-71", 
    "131-71-137-103-97-99-115-113-113-114", 
    "137-103-97-99-115-113-113-114-131-71", 
    "137-71-131-114-113-113-115-99-97-103", 
    "71-131-114-113-113-115-99-97-103-137", 
    "71-137-103-97-99-115-113-113-114-131", 
    "97-103-137-71-131-114-113-113-115-99", 
    "97-99-115-113-113-114-131-71-137-103", 
    "99-115-113-113-114-131-71-137-103-97", 
    "99-97-103-137-71-131-114-113-113-115"])
# Run the search on the spectrum above with the default mass alphabet.
final_peptides = cyclopeptide_sequencing(spectrum)

In [93]:
# Set equality: the result must contain exactly the expected peptides, in any order.
assert final_peptides == expected

In [96]:
# Read the integer spectrum from the dataset file, then sequence it and print
# the matching peptides in sorted order, separated by single spaces.
with open("../data/dataset_100_6-2.txt") as fin:
    # split() with no argument splits on any run of whitespace, so newlines,
    # repeated blanks and a trailing newline never produce tokens that int()
    # would reject.
    spectrum = [int(token) for token in fin.read().split()]

# The file is only needed for reading; the search runs after it is closed.
final_peptides = cyclopeptide_sequencing(spectrum)
print(" ".join(sorted(final_peptides)))

114-129-147-97-131-186-115-156 114-156-115-186-131-97-147-129 115-156-114-129-147-97-131-186 115-186-131-97-147-129-114-156 129-114-156-115-186-131-97-147 129-147-97-131-186-115-156-114 131-186-115-156-114-129-147-97 131-97-147-129-114-156-115-186 147-129-114-156-115-186-131-97 147-97-131-186-115-156-114-129 156-114-129-147-97-131-186-115 156-115-186-131-97-147-129-114 186-115-156-114-129-147-97-131 186-131-97-147-129-114-156-115 97-131-186-115-156-114-129-147 97-147-129-114-156-115-186-131


In [98]:
### quiz

In [97]:
# Look up the mass stored for residue Q; the value is the cell output.
AA2MASS["Q"]

128

In [103]:
# Reverse lookup from mass to residue letter. Keys are strings so they match
# the pieces obtained by splitting a "-"-joined peptide string. When several
# residues share a mass, the one that comes last in AA2MASS wins.
mass2aa = dict((str(mass), residue) for residue, mass in AA2MASS.items())
mass2aa

{'57': 'G',
 '71': 'A',
 '87': 'S',
 '97': 'P',
 '99': 'V',
 '101': 'T',
 '103': 'C',
 '113': 'L',
 '114': 'N',
 '115': 'D',
 '128': 'Q',
 '129': 'E',
 '131': 'M',
 '137': 'H',
 '147': 'F',
 '156': 'R',
 '163': 'Y',
 '186': 'W'}

In [105]:
# Quiz: sequence this spectrum and print each resulting peptide as letters.
spectrum = [0, 71, 101, 113, 131, 184, 202, 214, 232, 285, 303, 315, 345, 416]
for p in cyclopeptide_sequencing(spectrum):
    # p is a "-"-joined mass string; each piece is a key of mass2aa.
    print("".join([mass2aa[m] for m in p.split("-")]))

LTMA
ALTM
AMTL
TLAM
MALT
MTLA
TMAL
LAMT


In [117]:
# Quiz spectrum, tallied into a mass -> multiplicity table, which is the form
# is_consistent takes as its second argument.
spectrum = [0, 71, 99, 101, 103, 128, 129, 199, 200, 204, 227, 230, 231, 298, 303, 328, 330, 332, 333]
spectrum_counts = defaultdict(int)
for s in spectrum:
    spectrum_counts[s] += 1

In [108]:
def peptide2mass(s, aa2mass=None):
    """Convert a peptide string into the list of its residue masses.

    Parameters
    ----------
    s : str
        Peptide written as one letter per residue.
    aa2mass : mapping, optional
        Residue letter -> mass table. When omitted, the notebook-level
        AA2MASS is used, so one-argument calls behave as before; accepting
        the table explicitly also supports the two-argument call form
        ``peptide2mass("NQEL", AA2MASS)`` used later in this notebook.

    Returns
    -------
    list
        Masses in the same order as the residues of ``s``.

    Raises
    ------
    KeyError
        If a residue letter is missing from the table.
    """
    if aa2mass is None:
        aa2mass = AA2MASS
    return [aa2mass[aa] for aa in s]

In [120]:
# Quiz: for each listed peptide, report whether its linear spectrum is
# consistent with the tallied quiz spectrum in spectrum_counts.
for p in ["QCV","VAQ","CTV","TCE","TCQ","AQV"]:
    print(p, is_consistent(peptide2mass(p),spectrum_counts))

QCV False
VAQ False
CTV True
TCE False
TCQ True
AQV True


In [3]:
# Re-running the star import rebinds every name that utils exports. The call
# below passes the mass table explicitly, so it presumably targets a
# peptide2mass exported by utils — TODO confirm against utils.
from utils import *
peptide = peptide2mass("NQEL",AA2MASS)

In [4]:
# get_spectrum is not defined in this notebook (presumably provided by utils).
# With cyclic=False the cell output equals the linear spectrum of the NQEL masses.
get_spectrum(peptide, cyclic=False)

[0, 113, 114, 128, 129, 242, 242, 257, 370, 371, 484]