In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 28 17:26:09 2019
@author: jasonmoggridge

BA4_G - Leaderboard cyclopeptide sequencing

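    *** Adapts the earlier cyclopeptide-sequencing algorithms so that they
    tolerate imperfect matches against the spectrum, while still keeping a
    strict bound on the number of candidate peptides carried forward after
    each expansion step.
    
    -> Leaderboard: keep only the N best-scoring candidates, plus every
    candidate tied with the N-th best (e.g. for N = 20: the top 19, plus all
    candidates tied for 20th place).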
    
    
"""
#   # Pseudocode for LeaderboardSeqng(Spectrum, N):
#
#
#LEADERBOARDCYCLOPEPTIDESEQUENCING(Spectrum, N)
#    Leaderboard ← {0-peptide}
#    LeaderPeptide ← 0-peptide
#    while Leaderboard is non-empty
#        Leaderboard ← Expand(Leaderboard)
#        for each Peptide in Leaderboard
#            if Mass(Peptide) = ParentMass(Spectrum)
#                if Score(Peptide, Spectrum) > Score(LeaderPeptide, Spectrum)
#                    LeaderPeptide ← Peptide
#            else if Mass(Peptide) > ParentMass(Spectrum)
#                remove Peptide from Leaderboard
#        Leaderboard ← Cut(Leaderboard, Spectrum, N)
#    output LeaderPeptide



def Leaderboard_Cyclopeptide_Sequencing(Spectrum, N, masses=None):
    """
    Find a cyclic peptide whose theoretical spectrum best matches Spectrum.

    Leaderboard search: starting from the empty peptide, every candidate is
    extended by each residue of the mass table; candidates heavier than the
    parent mass are discarded, the best-scoring candidate whose mass equals
    the parent mass is remembered, and the board is trimmed to the N best
    scores (ties with the N-th score are kept).

    Parameters
    ----------
    Spectrum : iterable of int
        Experimental spectrum (may contain noise / missing masses).  It does
        not have to be sorted; the parent mass is taken as its maximum.
    N : int
        Leaderboard size, must be >= 1.
    masses : dict, optional
        Mapping residue -> positive integer mass.  Defaults to the
        module-level ``mass`` table.  Residues are tried in the mapping's
        iteration order, which also decides ties between equal scores.

    Returns
    -------
    str
        The first peptide found with the highest score among peptides whose
        mass equals the parent mass, or '' if there is none (also returned
        for an empty Spectrum).

    Raises
    ------
    ValueError
        If N < 1 (a non-positive N cannot define a cut-off rank).
    """
    from collections import Counter

    if N < 1:
        raise ValueError("N must be a positive integer")
    if masses is None:
        masses = mass  # module-level residue mass table
    Spectrum = list(Spectrum)
    if not Spectrum:
        return ''

    observed = Counter(Spectrum)          # experimental peaks as a multiset
    Parent_Mass = max(Spectrum)           # do not rely on the input being sorted
    alphabet = list(masses.items())       # fixed expansion order

    def Cyclo_Spectrum(weights):
        """Masses of all cyclic sub-peptides of a peptide given as residue masses."""
        size = len(weights)
        if size == 0:
            return [0]
        # Prefix sums over the doubled sequence give every wrap-around
        # sub-peptide mass as a difference of two entries.
        prefix = [0]
        for w in weights * 2:
            prefix.append(prefix[-1] + w)
        spectrum = [0, prefix[size]]
        for start in range(size):
            for length in range(1, size):
                spectrum.append(prefix[start + length] - prefix[start])
        return spectrum

    def Score(weights):
        """Number of masses shared (with multiplicity) with the experimental spectrum."""
        shared = Counter(Cyclo_Spectrum(weights)) & observed
        return sum(shared.values())

    LeaderPeptide = ''
    leader_score = Score(())              # computed once, updated with the leader

    # Each board entry is (peptide string, tuple of residue masses, total mass).
    Leaderboard = [('', (), 0)]

    while Leaderboard:

        survivors = []                    # (peptide, weights, total, score)
        for parent, parent_weights, parent_total in Leaderboard:
            for aa, aa_mass in alphabet:
                total = parent_total + aa_mass
                if total > Parent_Mass:
                    # Too heavy: neither it nor any extension can match.
                    continue
                peptide = parent + aa
                weights = parent_weights + (aa_mass,)
                score = Score(weights)
                if total == Parent_Mass and score > leader_score:
                    LeaderPeptide, leader_score = peptide, score
                survivors.append((peptide, weights, total, score))

        if len(survivors) > N:
            # Keep the N best scores plus ties; zero-score candidates are
            # dropped so that a board of all-zero ties cannot grow unchecked.
            cut_off = sorted(entry[3] for entry in survivors)[-N]
            survivors = [entry for entry in survivors
                         if entry[3] >= cut_off and entry[3] > 0]

        Leaderboard = [entry[:3] for entry in survivors]

    return LeaderPeptide

#######

# Integer residue masses keyed by one-letter amino-acid code.
# The insertion order is significant: Leaderboard_Cyclopeptide_Sequencing
# extends candidates by iterating over this dict, so the order decides which
# of several equally scoring peptides is reported.
# NOTE(review): 'L' and 'Q' are absent -- presumably because they share the
# integer masses of 'I' (113) and 'K' (128); confirm before adding them.
mass = {
    'G': 57,
    'A': 71,
    'S': 87,
    'P': 97,
    'V': 99,
    'T': 101,
    'C': 103,
    'I': 113,
    'N': 114,
    'D': 115,
    'E': 129,
    'K': 128,
    'M': 131,
    'H': 137,
    'F': 147,
    'R': 156,
    'Y': 163,
    'W': 186,
}
#######

# Unit Test - Leaderboard Cyclopeptide Sequencing Problem
# Expected Output (113-147-71-129).  The peptide is cyclic, so any rotation
# or reversal of that mass sequence (e.g. 71-147-113-129) is the same answer.
spectrum = "0 71 113 129 147 200 218 260 313 331 347 389 460"
n = 10

# Parse the whitespace-separated masses into integers and run the search.
spectrum = list(map(int, spectrum.split()))
test_result = Leaderboard_Cyclopeptide_Sequencing(spectrum, n)
print('Leaderboard Result =', test_result)

# Convert to Mass Values
masses = list(map(str, (mass[aa] for aa in test_result)))
print(*masses, sep='-')


Leaderboard Result = AFIE
71-147-113-129


In [3]:
# Stepik Test - Leaderboard Cyclopeptide Sequencing Problem
# The data is written as two adjacent string literals inside parentheses:
# a double-quoted literal may not span physical lines, and the two pieces
# concatenate into one whitespace-separated list of masses.
spectrum = (
    "0 71 87 97 103 103 103 113 113 113 113 114 114 128 128 129 129 129 129 131 131 137 147 163 174 200 200 206 215 216 218 226 234 241 242 242 243 243 245 250 250 258 260 260 260 271 276 291 319 321 328 337 344 347 355 356 363 363 363 363 371 372 374 374 378 379 388 389 407 418 434 441 447 450 450 450 458 466 475 476 478 484 484 491 491 492 492 493 503 503 503 521 547 561 562 563 578 578 578 579 581 581 587 588 592 595 597 604 605 606 606 621 632 634 649 650 678 690 691 691 692 692 695 700 701 707 708 709 709 710 719 721 734 734 737 741 762 763 779 792 796 803 804 806 820 821 821 822 824 836 837 838 838 840 841 847 847 850 854 866 875 909 910 921 925 934 934 934 940 941 950 950 950 950 951 953 953 955 960 969 969 981 983 1012 1013 1022 1038 1047 1050 1053 1054 1056 1062 1063 1063 1064 1071 1079 1080 1081 1082 1083 1084 1084 1084 1097 1116 1125 1150 1151 1159 1166 1169 1170 1175 1181 1183 1184 1187 1187 1192 1193 1194 1195 1200 1210 1212 1213 1225 1229 1253 1254 1272 1278 1282 1283 1284 1288 1290 1297 1297 1299 1300 1306 1313 1322 1324 1324 1329 1338 1341 1344 1357 1368 1381 1384 1387 1396 1401 1401 1403 1412 1419 1425 1426 1428 1428 1435 1437 1441 1442 1443 1447 1453 1471 1472 1496 1500 1512 1513 1515 1525 1530 1531 1532 1533 1538 1538 1541 1542 1544 1550 1555 1556 1559 1566 1574 1575 1600 1609 1628 1641 1641 1641 1642 1643 1644 1645 1646 1654 1661 1662 1662 1663 1669 1671 1672 1675 1678 1687 1703 1712 1713 1742 1744 1756 1756 1765 1770 1772 1772 1774 1775 1775 1775 1775 1784 1785 1791 1791 1791 1800 1804 1815 1816 1850 1859 1871 1875 1878 1878 1884 1885 1887 1887 1888 1889 1901 1903 1904 1904 1905 1919 1921 1922 1929 1933 1946 1962 1963 1984 1988 1991 1991 2004 2006 2015 2016 2016 2017 2018 2024 2025 2030 2033 2033 2034 2034 2035 2047 2075 2076 2091 2093 2104 2119 2119 2120 2121 2128 2130 2133 2137 2138 2144 2144 2146 2147 2147 2147 2162 2163 2164 2178 2204 2222 2222 2222 2232 2233 2233 2234 2234 2241 2241 2247 2249 2250 2259 2267 2275 2275 2275 2278 2284 "
    "2291 2307 2318 2336 2337 2346 2347 2351 2351 2353 2354 2362 2362 2362 2362 2369 2370 2378 2381 2388 2397 2404 2406 2412 2434 2449 2454 2465 2465 2465 2467 2475 2475 2480 2482 2482 2483 2483 2484 2491 2499 2507 2509 2510 2519 2525 2525 2551 2562 2578 2588 2594 2594 2596 2596 2596 2596 2597 2597 2611 2611 2612 2612 2612 2612 2622 2622 2622 2628 2638 2654 2725"
)
n = 217

# Parse the whitespace-separated masses into integers and run the search.
spectrum = list(map(int, spectrum.split()))
test_result = Leaderboard_Cyclopeptide_Sequencing(spectrum, n)
print('Leaderboard Result =', test_result)

# Convert to Mass Values
masses = list(map(str, (mass[aa] for aa in test_result)))
print(*masses, sep='-')

Leaderboard Result = ICCMEENMEFACPYKSIIHIENK
113-103-103-131-129-129-114-131-129-147-71-103-97-163-128-87-113-113-137-113-129-114-128
