## Python Imports

In [60]:
import numpy as np
import math
from pprint import pprint
import argparse

%matplotlib inline

## Adding Argparse

In [61]:
def main(argv=None):
    """Parse the command-line options for PSSM generation.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behaviour of a plain ``main()`` call.

    Returns:
        Tuple ``(peptides_file, beta, output_file, weighting)``:
        the ``-p`` path, the ``-b`` integer (default 50), the ``-o`` path
        and the ``-w`` flag as a bool.
    """
    parser = argparse.ArgumentParser(description='PSSM generation')

    parser.add_argument('-p', action='store', type=str, help='Path to the input peptides file', required=True)
    parser.add_argument('-w', action='store_true', help='Sequence weighting')
    parser.add_argument('-b', action='store', type=int, help='Weight on pseduo frequency', default=50)
    parser.add_argument('-o', action='store', type=str, help='Path to the output PSSM file', required=True)

    args = parser.parse_args(argv)

    # store_true already yields a bool, so no if/else is needed for -w.
    # Returning the values makes the parsed options usable by the caller
    # instead of being dropped when the function exits.
    return args.p, args.b, args.o, args.w

## Parameters used for testing

In [62]:
# Hard-coded inputs for interactive runs; they mirror the four options
# parsed by main() (-p, -o, -b, -w). beta matches the -b default of 50.
peptides_file = '../data/data/PSSM/A0201.large_lig'
# Alternative input kept for reference:
# peptides_file = './evalutation_pepetides.txt'
# NOTE: output_file is reassigned in the output cells further down.
output_file = './test_out'
beta = 50
weighting = True

## Information needed for the script

In [63]:
# Amino-acid alphabet as one string of 20 letters. The loading cells below
# index the background-frequency vector and the BLOSUM matrix by position in
# this string, so the order presumably has to match those files - TODO confirm.
amino_acids = 'ARNDCQEGHILKMFPSTWYV'

## Creating dictionary containing background frequencies of amino acids

In [64]:
# Background frequencies are read as a flat float vector and keyed by the
# letter found at the same index in `amino_acids`.
background_freq_file = "../data/data/Matrices/bg.freq.fmt"

bg = np.loadtxt(background_freq_file, dtype=float)

background_freq = {
    residue: bg[index]
    for index, residue in enumerate(amino_acids)
}

## Load BLOSUM matrix

In [65]:
blosum62_file = "../data/data/Matrices/blosum62.freq_rownorm"

# The matrix is transposed right after loading, so _blosum62[i, j] holds the
# value stored at row j, column i of the file.
_blosum62 = np.loadtxt(blosum62_file, dtype=float).T

# Nested lookup blosum62[first][second], both keyed by amino-acid letter in
# `amino_acids` order.
blosum62 = {
    row_residue: {
        col_residue: _blosum62[row_index, col_index]
        for col_index, col_residue in enumerate(amino_acids)
    }
    for row_index, row_residue in enumerate(amino_acids)
}

blosum62

{'A': {'A': 0.2901,
  'R': 0.0446,
  'N': 0.0427,
  'D': 0.041,
  'C': 0.065,
  'Q': 0.0559,
  'E': 0.0552,
  'G': 0.0783,
  'H': 0.042,
  'I': 0.0471,
  'L': 0.0445,
  'K': 0.057,
  'M': 0.0522,
  'F': 0.0338,
  'P': 0.0568,
  'S': 0.1099,
  'T': 0.073,
  'W': 0.0303,
  'Y': 0.0405,
  'V': 0.07},
 'R': {'A': 0.031,
  'R': 0.345,
  'N': 0.0449,
  'D': 0.0299,
  'C': 0.0163,
  'Q': 0.0735,
  'E': 0.0497,
  'G': 0.0229,
  'H': 0.0458,
  'I': 0.0177,
  'L': 0.0243,
  'K': 0.1071,
  'M': 0.0321,
  'F': 0.019,
  'P': 0.0258,
  'S': 0.0401,
  'T': 0.0355,
  'W': 0.0227,
  'Y': 0.028,
  'V': 0.0219},
 'N': {'A': 0.0256,
  'R': 0.0388,
  'N': 0.3169,
  'D': 0.069,
  'C': 0.0163,
  'Q': 0.0441,
  'E': 0.0405,
  'G': 0.0391,
  'H': 0.0534,
  'I': 0.0147,
  'L': 0.0142,
  'K': 0.0415,
  'M': 0.0201,
  'F': 0.0169,
  'P': 0.0233,
  'S': 0.0541,
  'T': 0.0434,
  'W': 0.0152,
  'Y': 0.0218,
  'V': 0.0165},
 'D': {'A': 0.0297,
  'R': 0.031,
  'N': 0.0831,
  'D': 0.3974,
  'C': 0.0163,
  'Q': 0.0471,


## Load peptides

In [66]:
# ndmin=1 keeps the result one-dimensional even when the file holds a single
# peptide; without it .tolist() returns a bare string and every character
# would be treated as its own peptide.
peptides = np.loadtxt(peptides_file, dtype=str, ndmin=1).tolist()

if not peptides:
    raise ValueError("No peptides found in " + str(peptides_file))

peptide_length = len(peptides[0])

# Every peptide must have the same length as the first one, because the
# matrices built below have exactly peptide_length positions.
for peptide in peptides:
    if len(peptide) != peptide_length:
        raise AssertionError("Peptides are not of the same length")


## Initialize Matrix

In [67]:
# Create a matrix of zeros with dimensions peptide_length x 20

def initialize_matrix(peptide_length, amino_acids):
    """Build a zero-filled position-by-residue matrix.

    Args:
        peptide_length: Number of positions (rows) to create.
        amino_acids: Iterable of residue letters used as keys in each row.

    Returns:
        A list of ``peptide_length`` independent dicts, each mapping every
        letter in ``amino_acids`` to ``0.0``.
    """
    return [
        {residue: 0.0 for residue in amino_acids}
        for _ in range(peptide_length)
    ]

## Create count matrix of amino acids 

In [68]:
def count_aa(peptides, peptide_length, amino_acids):
    """Count how often each amino acid occurs at each peptide position.

    Args:
        peptides: Iterable of peptide strings.
        peptide_length: Number of positions in the returned matrix.
        amino_acids: Residue letters that get a counter at every position.

    Returns:
        List of dicts (one per position) mapping residue letter to its
        count as a float.

    Raises:
        KeyError: if a peptide contains a letter not in ``amino_acids``.
        IndexError: if a peptide is longer than ``peptide_length``.
    """
    counts = initialize_matrix(peptide_length, amino_acids)

    for sequence in peptides:
        for position, residue in enumerate(sequence):
            column = counts[position]
            column[residue] += 1.0

    return counts

# Per-position residue counts over all loaded peptides.
c_matrix = count_aa(peptides, peptide_length, amino_acids)

# Show only the first position as a quick sanity check.
print(c_matrix[0])

{'A': 27.0, 'R': 13.0, 'N': 2.0, 'D': 4.0, 'C': 3.0, 'Q': 4.0, 'E': 5.0, 'G': 19.0, 'H': 3.0, 'I': 17.0, 'L': 24.0, 'K': 14.0, 'M': 9.0, 'F': 18.0, 'P': 3.0, 'S': 26.0, 'T': 7.0, 'W': 1.0, 'Y': 12.0, 'V': 18.0}


## Construct weight matrix

In [69]:
def construct_weight_matrix(peptides, amino_acids, weighting, c_matrix=None):
    """Compute one weight per peptide and the effective number of sequences.

    With ``weighting`` enabled, each peptide gets

        w = sum over positions of 1 / (r * s)

    where
        r = number of different amino acids observed at the position
        s = number of peptides carrying this peptide's amino acid at the
            position

    so peptides made of residues that are rare at their positions receive
    larger weights. ``neff`` is the average of ``r`` over all positions.

    With ``weighting`` disabled, every peptide gets weight 1.0 and ``neff``
    is the number of peptides.

    Args:
        peptides: Sequence of equal-length peptide strings.
        amino_acids: Iterable of the residue letters to consider.
        weighting: Truthy to apply sequence weighting.
        c_matrix: Optional per-position count matrix (list of dicts mapping
            residue letter to count). When omitted, the counts are derived
            from ``peptides`` inside this function.

    Returns:
        Tuple ``(weight_matrix, neff)`` where ``weight_matrix`` maps each
        peptide string to its weight. For an empty ``peptides`` the result
        is ``({}, 0)``.

    Raises:
        KeyError: if a peptide contains a letter not in ``amino_acids``.
    """
    weight_matrix = {}

    if not weighting:
        for peptide in peptides:
            weight_matrix[peptide] = 1.0
        return weight_matrix, len(peptides)

    if len(peptides) == 0:
        # Nothing to weight, and no positions to average r over.
        return weight_matrix, 0

    if c_matrix is None:
        # Count residues per position locally so the result depends only on
        # the arguments, not on a variable defined in another cell.
        c_matrix = [
            {amino_acid: 0.0 for amino_acid in amino_acids}
            for _ in range(len(peptides[0]))
        ]
        for peptide in peptides:
            for pos, amino_acid in enumerate(peptide):
                c_matrix[pos][amino_acid] += 1.0

    # r depends only on the position, so compute it once instead of once per
    # peptide.
    distinct = [
        sum(1 for amino_acid in amino_acids if column[amino_acid] != 0)
        for column in c_matrix
    ]

    for peptide in peptides:
        w = 0
        for pos in range(len(peptide)):
            s = c_matrix[pos][peptide[pos]]
            w += 1 / (distinct[pos] * s)
        weight_matrix[peptide] = w

    # Effective number of sequences: average number of different amino acids
    # per position.
    neff = sum(distinct) / len(c_matrix)

    return weight_matrix, neff

# Per-peptide weights plus the neff value used later as alpha = neff - 1.
weight_matrix, neff = construct_weight_matrix(peptides, amino_acids, weighting) 

# Print the weights, then neff (shown under the "Nseq:" label).
pprint( "W:")
pprint( weight_matrix )
pprint( "Nseq:")
pprint( neff )

'W:'
{'AAGIGIIQI': 0.024596541281685157,
 'AAGIGILTV': 0.017794374930590016,
 'ACDPHSGHF': 0.11606701877986708,
 'AFHHVAREL': 0.16762478444633447,
 'AIMDKNIIL': 0.03675033603899132,
 'ALCRWGLLL': 0.038316106033513334,
 'ALFPQLVIL': 0.0247039614348667,
 'ALGLGLLPV': 0.01930023126571007,
 'ALIDFALYL': 0.025263578835188177,
 'ALIVGANDD': 0.100611436744236,
 'ALLPPINIL': 0.022592294366303084,
 'ALNELLQHV': 0.03213666839007289,
 'ALQPGTALL': 0.025789304539119713,
 'ALSDHHIYL': 0.03531896462586808,
 'ALSNLEVKL': 0.037730526017692625,
 'ALWGFFPVL': 0.024805842986585454,
 'ALYVDSLFF': 0.042787268993597666,
 'AQYTSRMIA': 0.10924718135995681,
 'ASKKFDQSQ': 0.09142827452618475,
 'AVFDRKSDA': 0.05167458625640195,
 'AVGIGIAVV': 0.02107323439396396,
 'AVPDEIPPL': 0.026045581915119548,
 'CINGVCWTV': 0.049096156193671595,
 'CLGGLLTMV': 0.04386581500312044,
 'CLTSTVQLV': 0.040561312437441883,
 'DLERKVESL': 0.05248292990322345,
 'DLMGYIPLV': 0.03314102718433929,
 'DTVLEEMNL': 0.05099854032009033,
 'DVKQ

## Observed frequency matrix $f_a$

In [70]:
def frequency_matrix(peptide_length, amino_acids, peptides, weight_matrix):
    """Compute the weighted observed amino-acid frequency at each position.

    Args:
        peptide_length: Number of positions.
        amino_acids: Residue letters forming the columns.
        peptides: Iterable of peptide strings.
        weight_matrix: Mapping from peptide string to its weight.

    Returns:
        List of dicts (one per position) mapping residue letter to the
        summed weight of peptides carrying it there, divided by the total
        weight seen at that position.
    """
    observed = initialize_matrix(peptide_length, amino_acids)

    for position in range(peptide_length):
        column = observed[position]
        total_weight = 0  # sum of the weights of all peptides at this position

        for sequence in peptides:
            column[sequence[position]] += weight_matrix[sequence]
            total_weight += weight_matrix[sequence]

        # Scale the accumulated weights so they become frequencies.
        for residue in amino_acids:
            column[residue] /= total_weight

    return observed

# Weighted observed frequencies f for every position.
freq_matrix = frequency_matrix(peptide_length, amino_acids, peptides, weight_matrix)

# Show only the first position.
pprint( freq_matrix[0] )

{'A': 0.1311811080905332,
 'C': 0.014835920403803767,
 'D': 0.020983592174332863,
 'E': 0.027946724738341398,
 'F': 0.06424458205659711,
 'G': 0.0627520000996758,
 'H': 0.015299356996921818,
 'I': 0.06970270278930987,
 'K': 0.06294576946390068,
 'L': 0.09425526545382713,
 'M': 0.0376436000084843,
 'N': 0.013196906736333046,
 'P': 0.02139872688992593,
 'Q': 0.020192724448129442,
 'R': 0.05470935070055924,
 'S': 0.09446933607455794,
 'T': 0.03891306967870277,
 'V': 0.09865737399939828,
 'W': 0.008014705876770572,
 'Y': 0.048657183319894895}


## Pseudo frequency matrix $g_a$

$g(b) = \sum_a f(a) \cdot q(b|a)$, where $\mathrm{blosum}[a,b] = q(a|b)$


In [71]:
def pseduo_freq_matrix(peptide_length, amino_acids, freq_matrix, blosum62):
    """Compute the pseudo-frequency matrix g from the observed frequencies.

    For every position, g[b] is the sum over all residues a of
    ``freq_matrix[pos][a] * blosum62[b][a]``.

    Args:
        peptide_length: Number of positions.
        amino_acids: Residue letters to iterate over.
        freq_matrix: List of dicts with the observed frequency per residue.
        blosum62: Nested dict of substitution values, indexed ``[b][a]``.

    Returns:
        List of dicts (one per position) mapping residue letter to its
        pseudo frequency.
    """
    pseudo = initialize_matrix(peptide_length, amino_acids)

    for position in range(peptide_length):
        pseudo_column = pseudo[position]
        observed_column = freq_matrix[position]

        for source in amino_acids:
            source_freq = observed_column[source]
            for target in amino_acids:
                pseudo_column[target] += source_freq * blosum62[target][source]

    return pseudo


# Pseudo frequencies g derived from the observed frequencies and BLOSUM62.
pseduo_matrix = pseduo_freq_matrix(peptide_length, amino_acids, freq_matrix, blosum62)

# Show only the first position.
pprint(pseduo_matrix[0])

{'A': 0.08987082139366394,
 'C': 0.021349452415678354,
 'D': 0.03809148889713515,
 'E': 0.04474699977471958,
 'F': 0.05630852882264416,
 'G': 0.07026655815709919,
 'H': 0.02212768473312947,
 'I': 0.07445249100074502,
 'K': 0.0572838265712342,
 'L': 0.10508273163109474,
 'M': 0.027841477841683355,
 'N': 0.0335719618182939,
 'P': 0.030884895328302287,
 'Q': 0.030191793024707318,
 'R': 0.05153525745245067,
 'S': 0.06340023795275433,
 'T': 0.05062316759683184,
 'V': 0.0827831198777054,
 'W': 0.011374105618283127,
 'Y': 0.03820804536044906}


## Combined frequency matrix $p_a$

$$\begin{align*}
p_a = \frac{\alpha \cdot f_a + \beta \cdot g_a}{\alpha + \beta}
\end{align*}$$

where

$\alpha = n_{eff} - 1$

In [72]:
def combined_freq_matrix(peptide_length, amino_acids, freq_matrix, pseduo_matrix, neff, beta):
    """Blend observed and pseudo frequencies into the combined matrix p.

    Each entry is ``(alpha * f + beta * g) / (alpha + beta)`` with
    ``alpha = neff - 1``.

    Args:
        peptide_length: Number of positions.
        amino_acids: Residue letters to iterate over.
        freq_matrix: Observed frequencies f, one dict per position.
        pseduo_matrix: Pseudo frequencies g, one dict per position.
        neff: Effective number of sequences.
        beta: Weight given to the pseudo frequencies.

    Returns:
        List of dicts (one per position) mapping residue letter to the
        combined frequency.
    """
    combined = initialize_matrix(peptide_length, amino_acids)

    alpha = neff - 1
    weight_sum = alpha + beta

    for position in range(peptide_length):
        observed = freq_matrix[position]
        pseudo = pseduo_matrix[position]

        for residue in amino_acids:
            blended = alpha * observed[residue] + beta * pseudo[residue]
            combined[position][residue] = blended / weight_sum

    return combined

# Combined frequencies p from f, g, neff and beta.
p_matrix = combined_freq_matrix(peptide_length, amino_acids, freq_matrix, pseduo_matrix, neff, beta)

# Show only the first position.
pprint(p_matrix[0])

{'A': 0.10080589728401168,
 'C': 0.019625282177240966,
 'D': 0.033562927999922776,
 'E': 0.040299868147443005,
 'F': 0.05840924879633759,
 'G': 0.06827741043601654,
 'H': 0.02032018621472156,
 'I': 0.07319519412124749,
 'K': 0.058782576160469446,
 'L': 0.1022166376429945,
 'M': 0.03043615723877772,
 'N': 0.028178564884833678,
 'P': 0.028373850741673253,
 'Q': 0.027544980754436704,
 'R': 0.05237545860636176,
 'S': 0.07162441098499647,
 'T': 0.047523435794974145,
 'V': 0.08698512832168293,
 'W': 0.010484852745529804,
 'Y': 0.04097399364383178}


## Log odds weight ratio $W_a$

$$
\begin{align*}
W_{i,a} = 2 \cdot \frac{\log{\frac{p_{i,a}}{q_a}}}{\log{2}}
\end{align*}
$$

In [73]:
def log_odds_weight(peptide_length, amino_acids, p_matrix, background_freq):
    """Convert combined frequencies into log-odds scores.

    Each entry is ``2 * log(p / q) / log(2)``, where ``p`` is the combined
    frequency and ``q`` the background frequency of the residue. Entries
    whose ``p`` is not greater than zero get the sentinel ``-999.9``.

    Args:
        peptide_length: Number of positions.
        amino_acids: Residue letters to iterate over.
        p_matrix: Combined frequencies, one dict per position.
        background_freq: Mapping from residue letter to background frequency.

    Returns:
        List of dicts (one per position) mapping residue letter to its
        log-odds score.
    """
    log_odds = initialize_matrix(peptide_length, amino_acids)

    for position in range(peptide_length):
        for residue in amino_acids:
            probability = p_matrix[position][residue]

            if not probability > 0:
                # The logarithm is undefined here, so store the sentinel.
                log_odds[position][residue] = -999.9
                continue

            ratio = probability / background_freq[residue]
            log_odds[position][residue] = 2 * math.log(ratio) / math.log(2)

    return log_odds

# Final PSSM: log-odds score for every residue at every position.
W_matrix = log_odds_weight(peptide_length, amino_acids, p_matrix, background_freq)

# Show only the first position.
pprint(W_matrix[0])

{'A': 0.8919657302038987,
 'C': -0.6984293946039517,
 'D': -1.3721816531573625,
 'E': -0.8443685774100588,
 'F': 0.627072147100672,
 'G': -0.23223385714680514,
 'H': -0.7111960003754546,
 'I': 0.2124283604386675,
 'K': 0.03867137317999952,
 'L': 0.09225922061967673,
 'M': 0.5677162517897559,
 'N': -1.3506537268042555,
 'P': -0.917824331009928,
 'Q': -0.6074905829292724,
 'R': 0.020758696079847507,
 'S': 0.6589789008068409,
 'T': -0.20371620971979315,
 'V': 0.5057446065687232,
 'W': -0.6204100472027255,
 'Y': 0.7132732216574992}


## Scoring peptides

In [74]:
def scoring_peptide(peptides, matrix):
    """Score each peptide by summing its per-position matrix entries.

    Args:
        peptides: Iterable of peptide strings.
        matrix: List of dicts (one per position) mapping residue letter to
            a score.

    Returns:
        Dict mapping each peptide to the sum of ``matrix[i][residue]`` over
        its positions. A peptide that occurs more than once keeps a single
        entry.
    """
    scores = {}

    for sequence in peptides:
        total = 0
        for position in range(len(sequence)):
            total += matrix[position][sequence[position]]
        scores[sequence] = total

    return scores

# Score the training peptides with the PSSM built from them.
peptide_scores = scoring_peptide(peptides, W_matrix)

## Write to outfile

In [75]:
output_file = './training_peptide_PSSM_scores.txt'

# One "<peptide> <score>" line per scored peptide. The with-block closes the
# file even if a write fails part-way through.
with open(output_file, 'w') as output:
    for peptide, score in peptide_scores.items():
        output.write(peptide + " " + str(score) + "\n")

In [76]:
output_file = './evaluation_peptide_PSSM_scores.txt'
infile_path = '../data/data/PSSM/A0201.eval'

# ndmin=2 guarantees one list per input line, so x[0] is always the first
# column (the peptide) even when the file has a single row or a single column.
evaluation_data = np.loadtxt(infile_path, dtype=str, ndmin=2).tolist()
evaluation_peptides = [x[0] for x in evaluation_data]

evaluation_peptide_scores = scoring_peptide(evaluation_peptides, W_matrix)

# One "<peptide> <score>" line per scored peptide. The with-block closes the
# file even if a write fails part-way through.
with open(output_file, 'w') as output:
    for peptide, score in evaluation_peptide_scores.items():
        output.write(peptide + " " + str(score) + "\n")