**TO DO**

-> Implement the Viterbi algorithm in the profileHMM class (`mostLikely`) to get the most likely sequence

-> Implement tiny (pseudocount-style) probabilities for protein codes that are absent from a state?

In [104]:
import pandas as pd
import numpy as np

In [105]:
# Important Global Values
# One-letter codes for the 20 standard amino acids, followed by "-",
# the symbol used for a gap in the aligned sequences. The order here
# fixes the row order of every state's emission table.
proteinCodes = list("ARNDCEQGHILKMFPSTWYV-")

In [106]:
# This class stores a profile HMM.
# It holds an ordered list of match states plus a single, global
# insertion state that is shared by every position in the model.
# Each state only needs a forward() method returning [code, transition].
# We can generate samples from it and (eventually) find the most likely sequence.
class profileHMM:
  # Initializes an empty HMM
  def __init__(self):
    self.mStates = [] # List of match states, in model order
    self.iState = None # The global insertion state
    self.iSizes = None # Observed insertion sizes (sampled from directly)

  # Adds a match state to the end of the existing HMM
  def add_mState(self, mHMMState):
    self.mStates.append(mHMMState)

  # ----- For Now -----
  # Sets the global insertion state of the HMM together with the
  # collection of insertion sizes that sample() draws from.
  # We use a global insertion state because it
  # simplifies insertions - temporary?
  def set_insertionInfo(self, iHMMState, iSizes):
    self.iState = iHMMState
    self.iSizes = iSizes

  # Runs one forward() step on a state and unwraps the result.
  # forward() hands back size-1 containers (np.random.choice with size=1),
  # so we reduce each of them to a plain Python string here.
  @staticmethod
  def _step(state):
    code, transition = state.forward()
    return str(np.asarray(code).item()), str(np.asarray(transition).item())

  # Randomly generates a sequence from the existing HMM.
  # Every match state emits one code. Whenever the last step asks for
  # an "Insert" transition, an insertion size is drawn from iSizes and
  # that many codes are emitted from the global insertion state; the
  # transition of the final inserted code decides whether another
  # insertion follows.
  # Returns the emitted codes joined into one string ("" for an HMM
  # without match states). Whatever code a state emits is kept as-is.
  # Raises ValueError if an insertion is requested but no insertion
  # state / insertion sizes have been set.
  def sample(self):
    emitted = []
    for mState in self.mStates:
      code, transition = self._step(mState)
      emitted.append(code)
      while transition == "Insert":
        if self.iState is None or self.iSizes is None or len(self.iSizes) == 0:
          raise ValueError("Insert transition sampled but no insertion state/sizes are set")
        insertionAmount = np.random.choice(self.iSizes)
        while insertionAmount > 0:
          code, transition = self._step(self.iState)
          emitted.append(code)
          insertionAmount = insertionAmount - 1
    # Joining a list avoids relying on str + ndarray broadcasting and
    # also works when nothing was emitted.
    return "".join(emitted)

  # Gets the most likely sequence - uses viterbi algorithm
  # Not implemented yet: currently does nothing and returns None.
  def mostLikely(self):
    ...


In [107]:
# A single state of the HMM.
# It carries two small tables: one with a probability per protein code
# and one with the probability of moving to a "Match" or "Insert" state.
class HMMState:
  # sProbs: one probability per entry of proteinCodes (same order)
  # tProbs: two probabilities, for "Match" and "Insert" (in that order)
  def __init__(self, sProbs, tProbs):
    # The state matrix, pairs every code with its probability
    emissionColumns = {"Code": proteinCodes, "Prob": sProbs}
    self.sMat = pd.DataFrame(emissionColumns)

    # The transition matrix, pairs every next-state type with its probability
    transitionColumns = {"State": ["Match", "Insert"], "Prob": tProbs}
    self.tMat = pd.DataFrame(transitionColumns)

  # Does a forward step through this state:
  # first draws a code according to sProbs, then draws the type of the
  # next state according to tProbs.
  # Both draws use size = 1, so each element of the returned
  # [code, transition] list is a length-1 array rather than a bare value.
  def forward(self):
    emissionTable = self.sMat
    transitionTable = self.tMat
    code = np.random.choice(a = emissionTable["Code"], size = 1, p = emissionTable["Prob"])
    transition = np.random.choice(a = transitionTable["State"], size = 1, p = transitionTable["Prob"])
    return [code, transition]

In [108]:
# Relative frequency of every protein code within `values`.
# Returns one number per entry of proteinCodes: count / total.
# When total is 0 there is no data, so every frequency is reported as 0.0
# instead of dividing by zero.
def _codeProbs(values, total):
  if total == 0:
    return [0.0] * len(proteinCodes)
  return [values.count(code) / total for code in proteinCodes]


# Builds a profileHMM from an alignment.
# seqDF: DataFrame with one aligned sequence per row and one
#        single-character cell per alignment column ("-" marks a gap).
# Returns a profileHMM with one match state per match column and a
# global insertion state estimated from all insertion columns.
# NOTE: characters that are not listed in proteinCodes are not counted,
# so a column containing them yields probabilities that sum to < 1.
def buildHMM(seqDF):

  # The HMM we output
  HMM = profileHMM()

  nseq, ncol = seqDF.shape

  # Pull every column out by position once; this keeps the function
  # independent of how the DataFrame's rows/columns are labelled.
  columns = [seqDF.iloc[:, col].tolist() for col in range(ncol)]

  # ----- Column Labelling -----

  # If more than half of the sequences contain a gap in a column we
  # assume the sequences without a gap had an insertion ("I").
  # Otherwise the column is a regular match column ("M") and the
  # sequences with a gap had a deletion event.
  colStates = ["I" if column.count("-") > (nseq / 2) else "M" for column in columns]

  # ----- Calculating Global Insertion Matrix -----

  # We look at the length of each insertion, because one insertion of
  # length 2 is more likely than two back to back insertions.
  insertionSizes = [] # Stores the sizes of each insertion

  # We need to look at each sequence individually
  for seq in range(nseq):
    curSeq = seqDF.iloc[seq, :].tolist() # The current sequence, by position
    curInsSize = 0 # Size of the insertion we are currently inside of

    for colState, value in zip(colStates, curSeq):
      if colState == "I" and value != "-":
        # Still inside an insertion: extend it
        curInsSize = curInsSize + 1
      elif curInsSize != 0:
        # The insertion just ended: record its size and reset
        insertionSizes.append(curInsSize)
        curInsSize = 0

    # An insertion that runs up to the last column never reaches the
    # branch above, so it has to be recorded here.
    if curInsSize != 0:
      insertionSizes.append(curInsSize)

  # We calculate the default insertion probability as the
  # number of insertions divided by the number of places
  # an insertion could occur (match states across all seq).
  # Without any such place the probability is defined as 0.
  numInsertions = len(insertionSizes)
  insertionSites = colStates.count("M")
  possibleSites = insertionSites * nseq
  defaultInsertionProb = numInsertions / possibleSites if possibleSites else 0.0

  # Now we need to calculate insertion probabilities
  # for each protein within the insertions (gaps are ignored)
  insertionValues = []
  for colState, column in zip(colStates, columns):
    if colState == "I":
      insertionValues.extend(value for value in column if value != "-")
  insertion_sProbs = _codeProbs(insertionValues, len(insertionValues))

  # Insertion State Transition Probabilities, use the default
  # Insertion probability because there is no location specific data
  insertion_tProbs = [1-defaultInsertionProb, defaultInsertionProb]

  # Creating the Global Insertion State
  insertionState = HMMState(insertion_sProbs, insertion_tProbs)

  # Creating the HMM's Insertion Info
  HMM.set_insertionInfo(insertionState, insertionSizes)

  # ----- Calculating the Match States -----
  for col, column in enumerate(columns):
    # Insertion columns do not get a state of their own
    if colStates[col] != "M":
      continue

    # Probabilities for each protein in this column (including deletion)
    sProbs = _codeProbs(column, nseq)

    # If the next column is an insertion column, the chance of moving
    # into an insertion is the fraction of sequences that have a
    # residue there; otherwise we fall back to the default.
    tProbs = [1-defaultInsertionProb, defaultInsertionProb]
    if col < (ncol - 1) and colStates[col + 1] == "I":
      nextCol = columns[col + 1]
      iProb = sum(1 for value in nextCol if value != "-") / nseq
      tProbs = [1 - iProb, iProb]

    # Creates an mState for the current match column and adds it to the model
    HMM.add_mState(HMMState(sProbs, tProbs))

  return HMM

In [109]:
# Sample Sequences: six aligned sequences, "-" marks a gap
kinAseq = "VAVKTLKDELG--LFRIVN---FPNVGDLKPQKLSDFGLAIEAAK"
kinBseq = "VVIRTLKDELG--VFKLVH---YPNVDDLKPQKLSDFGLSVESVK"
kinCseq = "IAVKALKGDLG--LFRVTDG-LHPGVDLKPQKLSDFGFAIEAVR-"
kinDseq = "VVIKALKDELG--LYKVVN---HPNVDVLKPQKLSDFGLAIEAIR"
kinEseq = "LALKVLKDELGNVLFRVVNEE-YPNVNDLKPQKLSDFGLAIEAAR"
kinFseq = "-VLRNLRDDLG--LYKIVN---HPNVEELRPQKLSDFGIAIDAVK"

# One row per sequence, one single-character cell per alignment column
seqs = [
  kinAseq,
  kinBseq,
  kinCseq,
  kinDseq,
  kinEseq,
  kinFseq,
]
seqDF = pd.DataFrame([list(seq) for seq in seqs])

# Fit the profile HMM to the alignment
myHMM = buildHMM(seqDF)

# Draw one random sequence from it (shown as the cell output)
myHMM.sample()

'VVIRALRDELGLYRIVDYPNVNDLKPKKSSDFGLAEAVAK'