In [1]:
import numpy as np

In [2]:
def value_with_prob(a,p):
    """Draw one element of ``a`` at random, weighted by ``p``.

    input: a = np.array(['A','B','C','D']) and p = np.array([0.4,0.5,0.05,0.05])
    output: B or A (likely), C or D (unlikely)

    Parameters
    ----------
    a : sequence of candidate values (indexed by position).
    p : non-negative weights, one per element of ``a``. They do not need to
        sum to 1 and may be integer counts. If every weight is zero, each
        element of ``a`` is equally likely.

    Returns
    -------
    The selected element ``a[ia]``.

    Notes
    -----
    ``p`` is never modified; normalization happens on a private float copy.
    Exactly one number is drawn from ``np.random.rand()`` per call.
    """
    # Private float copy: keeps the caller's array intact and lets integer
    # counts be normalized (in-place true division fails on integer arrays).
    weights = np.array(p, dtype=float)

    total = weights.sum()
    if total == 0:
        # no specific probabilities --> uniform distribution
        weights[:] = 1.0 / len(a)
    else:
        weights /= total  # normalize

    cdf = weights.cumsum()

    # np.random.rand() lies in [0, 1). side='right' counts the entries with
    # cdf <= u, so a leading zero-weight entry is skipped even when u == 0.
    ia = int(np.searchsorted(cdf, np.random.rand(), side='right'))

    # Rounding can leave cdf[-1] marginally below 1; keep the index valid.
    ia = min(ia, len(a) - 1)

    return a[ia]

In [3]:
def find_and_replace(s,z,a):
    """Replace every occurrence of ``z`` in ``s`` by a value sampled from ``a``.

    For each position of ``s`` that equals ``z``, the replacement is drawn with
    ``value_with_prob``; the weight of each candidate in ``a`` is the number of
    times it currently appears in that position's column. When no candidate
    appears in the column all weights are zero, which ``value_with_prob``
    treats as a uniform draw.

    input: s = np.array([['A','Q','A'],['A','E','C'],['Z','Q','A'],['A','Z','-']])
           z = 'Z' , a = np.array(['Q','E'])
    output (one possible result):
           s = np.array([['A','Q','A'],['A','E','C'],['E','Q','A'],['A','Q','-']])

    Parameters
    ----------
    s : 2-D array; modified in place.
    z : value to search for.
    a : 1-D collection of candidate replacement values.

    Returns
    -------
    The same array ``s`` (after in-place replacement).

    Notes
    -----
    Replacements are applied one at a time in ``np.argwhere`` order, so a value
    written earlier is counted when weighting later replacements in the same
    column.
    """
    a = np.asarray(a)

    for t, i in np.argwhere(s == z):
        column = s[:, i]
        # Occurrences of each candidate in this column, via broadcasting
        # (rows, 1) == (candidates,). Kept as float because the sampler
        # normalizes the weights.
        p = (column[:, None] == a).sum(axis=0).astype(float)

        s[t, i] = value_with_prob(a, p)
    return s

In [4]:
# Small example array containing the placeholder symbols ('X', 'B', 'Z', '-')
# that the following cells replace one kind at a time.
s = np.array([
    ['X', 'Q', 'A'],
    ['B', 'E', 'C'],
    ['Z', 'Q', 'A'],
    ['A', 'Z', '-'],
])
print(s)

[['X' 'Q' 'A']
 ['B' 'E' 'C']
 ['Z' 'Q' 'A']
 ['A' 'Z' '-']]


In [5]:
# Replace each 'Z' by 'Q' or 'E', weighted by how often each appears in its column
s = find_and_replace(s=s, z='Z', a=np.array(['Q', 'E']))
print(s)

[['X' 'Q' 'A']
 ['B' 'E' 'C']
 ['E' 'Q' 'A']
 ['A' 'Q' '-']]


In [6]:
# Replace each 'B' by asparagine (N) or aspartic acid (D), weighted by column counts
s = find_and_replace(s=s, z='B', a=np.array(['N', 'D']))
print(s)

[['X' 'Q' 'A']
 ['D' 'E' 'C']
 ['E' 'Q' 'A']
 ['A' 'Q' '-']]


In [7]:
# Replace each 'X' by one of the amino-acid letters below, weighted by column counts
amino_acids = np.array(list('ACDEFGHIKLMNPQRSTVWY'))
s = find_and_replace(s=s, z='X', a=amino_acids)
print(s)

[['E' 'Q' 'A']
 ['D' 'E' 'C']
 ['E' 'Q' 'A']
 ['A' 'Q' '-']]


In [8]:
# Replace each gap ('-') by one of the amino-acid letters below, weighted by column counts
amino_acids = np.array(list('ACDEFGHIKLMNPQRSTVWY'))
s = find_and_replace(s=s, z='-', a=amino_acids)
print(s)

[['E' 'Q' 'A']
 ['D' 'E' 'C']
 ['E' 'Q' 'A']
 ['A' 'Q' 'A']]
