In [1]:
import numpy as np
from scipy.stats import itemfreq

In [2]:
# Toy 1-D column: 12 occurs 4x, 11 occurs 3x, 5 occurs 2x and 4 occurs once.
s = np.array([11, 12, 12, 11, 4, 11, 12, 5, 5, 12])
print(s)

[11 12 12 11  4 11 12  5  5 12]


In [3]:
#2018.12.24: replace value at a column with probability of elements in that column
def value_with_prob(a,p):
    """Draw one element of `a` at random, weighted by `p`.

    input: a = np.array(['A','B','C','D']) and p = np.array([0.4,0.5,0.05,0.05])
    output: B or A (likely), C or D (unlikely)

    `p` holds one non-negative weight per element of `a`; it does not need to
    be normalized. If all weights are zero, every element is equally likely.
    `p` is never modified (integer weight arrays are accepted as well).
    `a` must be non-empty. Exactly one number is taken from np.random.rand().
    """
    # work on a float view/copy so the caller's array is left untouched
    weights = np.asarray(p, dtype=float)
    total = weights.sum()

    # if no-specific prob --> set as uniform distribution
    if total == 0:
        weights = np.full(weights.shape, 1./a.shape[0]) # uniform
    else:
        weights = weights/total # normalize (new array, not in place)

    # inverse-CDF sampling: count how many cumulative bins lie below the draw
    ia = int((weights.cumsum() < np.random.rand()).sum()) # coordinate

    # rounding can leave cumsum()[-1] slightly below 1; keep the index valid
    ia = min(ia, a.shape[0] - 1)

    return a[ia]

In [4]:
def replace_lower_by_higher_prob(s,p0=0.3):
    """Replace rare values of `s` by frequent ones, drawn according to frequency.

    input: numpy array, 1 dimension s ; threshold p0
    output: s in which elements having p < p0 were replaced by elements with
            p > p0, chosen at random with probability proportional to p

    Here p is the relative frequency of a value within `s`. Values whose
    frequency equals p0 exactly are neither replaced nor used as replacements.
    If no value has p > p0, or no value has p < p0, `s` is returned unchanged.

    When `s` is a numpy array it is modified in place and the same object is
    returned. Other sequences are converted to a new array first.
    Replacements are drawn with np.random.choice (global numpy random state).
    """
    s = np.asarray(s)  # no copy for ndarray input, so the update stays in place
    if s.size == 0:
        return s

    # element and number of occurrence
    # (np.unique replaces scipy.stats.itemfreq, which was removed in SciPy 1.3)
    values, counts = np.unique(s, return_counts=True)

    # probabilities
    freq = counts/float(counts.sum())

    dominant = freq > p0   # candidates used as replacement
    rare = freq < p0       # values to be replaced

    # nothing to draw from, or nothing to replace
    if not dominant.any() or not rare.any():
        return s

    apmax = values[dominant]
    pmax = freq[dominant]/freq[dominant].sum()   # renormalize over candidates

    # one vectorized draw for every position holding a rare value
    to_replace = np.isin(s, values[rare])
    s[to_replace] = np.random.choice(apmax, size=int(to_replace.sum()), p=pmax)

    return s

In [5]:
# Apply the replacement to the sample column with threshold p0=0.3 and print it.
# The result is bound back to `s`, so re-running this cell operates on the
# already-processed values rather than on the original sample.
s = replace_lower_by_higher_prob(s,p0=0.3)
print(s)

[11 12 12 11 12 11 12 12 12 12]
