In [9]:
"""Dataset generation module."""
import random


def random_string(length, alphabet_list):
    """Build a string of ``length`` symbols picked at random from ``alphabet_list``.

    Symbols are drawn independently (with replacement) via ``random.choice``,
    so the result depends on the state of the global ``random`` generator.
    """
    picks = []
    for _ in range(length):
        picks.append(random.choice(alphabet_list))
    return ''.join(picks)


def perturb(seed, alphabet_list, p=0.5):
    """Return a copy of ``seed`` in which each symbol is resampled with probability ``p``.

    For every position one uniform draw decides whether the symbol is replaced;
    a replaced symbol is chosen at random from ``alphabet_list`` (and may
    therefore coincide with the original symbol).
    """
    mutated = []
    for symbol in seed:
        # Draw first, then (only if needed) pick the replacement, so the
        # global generator is consumed in a fixed order.
        replace = random.random() < p
        mutated.append(random.choice(alphabet_list) if replace else symbol)
    return ''.join(mutated)

import numpy as np

def inflate_normalize(pwms=None, noise=0.2):
    """Sharpen a stack of weight matrices by repeated squaring and re-normalizing.

    One round squares every entry element-wise ("inflate") and then rescales
    the entries along axis 1 so that they sum to 1 again ("normalize"). For the
    ``(num, letters, length)`` stacks built by ``get_pwms`` this means every
    column of every matrix sums to 1 after each round.

    Args:
        pwms: array-like with at least two dimensions, indexed as
            ``(matrix, letter, position)``. Required despite the ``None``
            default kept for interface compatibility.
        noise: controls the number of rounds, ``int(noise * 4 + 1)``; this is
            1..5 for ``noise`` in [0, 1]. A value that makes the count
            non-positive results in no rounds at all.
            NOTE(review): each round concentrates mass on the largest entry of
            a column, so a larger ``noise`` gives *more* peaked matrices --
            confirm this is the intended direction.

    Returns:
        A new float ``numpy.ndarray`` of the same shape. The input is not
        modified.

    Raises:
        ValueError: if ``pwms`` is ``None`` or has fewer than two dimensions.
    """
    if pwms is None:
        raise ValueError("pwms must be an array of weight matrices, got None")

    # Float copy: keeps the caller's array intact and prevents integer inputs
    # from truncating the normalized values back to 0.
    result = np.array(pwms, dtype=float)
    if result.ndim < 2:
        raise ValueError("pwms must have at least 2 dimensions, got %d" % result.ndim)

    iters = int((noise * 4) + 1)    # lies in [1, 5] for noise in [0, 1]
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("taking power to:  %d" % iters)

    for _ in range(iters):
        result = result * result                              # inflate
        result = result / result.sum(axis=1, keepdims=True)   # normalize
    return result

def get_pwms(alphabet='ACGT', num=2, length=6, noise=0.2, verbose=True):
    """Create ``num`` random position weight matrices (PWMs).

    Every matrix starts as uniform random numbers of shape
    ``(len(alphabet), length)``, is normalized so each column sums to 1, and
    the whole stack is then passed through ``inflate_normalize``.

    Args:
        alphabet: sequence of symbols; only its length is used here.
        num: number of matrices to generate.
        length: number of positions (columns) per matrix.
        noise: forwarded unchanged to ``inflate_normalize``.
        verbose: when true (the default, matching the historical behavior),
            print every intermediate matrix and the final stack. This flag
            does not affect anything printed by ``inflate_normalize`` itself.

    Returns:
        The array returned by ``inflate_normalize`` for a
        ``(num, len(alphabet), length)`` input stack.
    """
    letters = len(alphabet)
    # Preallocate so the stack keeps its 3-D shape even when num == 0.
    pwms = np.empty((num, letters, length))
    for i in range(num):
        i_pwm = np.random.random_sample((letters, length))
        if verbose:
            # Single-argument print() works on both Python 2 and Python 3.
            print("%d original random:" % i)
            print(i_pwm)
        i_pwm = i_pwm / i_pwm.sum(axis=0)  # normalize each column to sum to 1
        if verbose:
            print("\n%d after 1st normalization:" % i)
            print(i_pwm)
        pwms[i] = i_pwm
    pwms = inflate_normalize(pwms=pwms, noise=noise)

    if verbose:
        print("\nfinal pwms:")
        print(pwms)
    return pwms

def motif_from_pwm(alphabet_list, pwm):
    """Sample one motif string from a position weight matrix.

    Column ``k`` of ``pwm`` is used as the probability vector for choosing the
    ``k``-th symbol from ``alphabet_list``; columns are sampled left to right
    with ``np.random.choice``.
    """
    n_positions = pwm.shape[1]
    sampled = [
        np.random.choice(a=alphabet_list, p=pwm[:, position])
        for position in range(n_positions)
    ]
    return "".join(sampled)
    

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6,
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                            random_state=1):
    """Generate an artificial dataset of sequences with embedded motifs.

    ``n_motives`` position weight matrices are drawn with ``get_pwms``. Every
    sequence is the concatenation of ``n_motives`` segments, each made of a
    random left flank, a motif sampled from the corresponding matrix, and a
    random right flank.

    Args:
        alphabet: symbols used for flanks and motifs.
        motives: returned unchanged as the first element of the result.
            NOTE(review): explicit motifs are currently NOT used for
            generation (the code path that consumed them is disabled), so the
            default call returns ``None`` here -- confirm whether callers
            expect the sampled motifs instead.
        motif_length: number of positions in each motif.
        sequence_length: target total length of a sequence; it is split evenly
            (integer division) between the ``n_motives`` segments.
        n_sequences: number of sequences to generate.
        n_motives: number of motif segments per sequence.
        p: forwarded to ``get_pwms`` as its ``noise`` argument.
        random_state: seed for both the ``random`` and ``numpy.random`` global
            generators; ``None`` leaves the result non-deterministic.

    Returns: motives - the ``motives`` argument, unchanged
             seq - dataset as list of ``(id, sequence)`` tuples
             binary_seq - a string of 0's & 1's (1 at motif positions) which
                          can be used for computing ROC score.
    """
    random.seed(random_state)
    # The matrices and motifs are sampled with numpy, so its generator has to
    # be seeded as well. The numpy seed is taken from a separate Random
    # instance so the global ``random`` stream (used for the flanks) is left
    # exactly as it was, and so any seed accepted by ``random.seed`` is mapped
    # into numpy's valid 32-bit range.
    np.random.seed(random.Random(random_state).getrandbits(32))
    alphabet_list = list(alphabet)

    pwms = get_pwms(alphabet=alphabet, num=n_motives, length=motif_length, noise=p)

    # Floor division keeps these lengths integral on Python 3 as well; they
    # are used with range() and string repetition below.
    segment_length = sequence_length // n_motives
    flanking_length = (segment_length - motif_length) // 2

    seqs = []
    for seq_index in range(n_sequences):
        segments = []
        for j in range(n_motives):
            left_flanking = random_string(flanking_length, alphabet_list)
            right_flanking = random_string(flanking_length, alphabet_list)
            noisy_motif = motif_from_pwm(alphabet_list, pwms[j])
            segments.append(left_flanking + noisy_motif + right_flanking)
        seqs.append(('ID%d' % seq_index, ''.join(segments)))

    binary_skeleton = '0' * flanking_length + \
        '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq


In [11]:
# Demo: a single PWM over the default 'ACGT' alphabet, 3 positions wide.
# noise=0.3 gives int(0.3 * 4 + 1) == 2 inflate/normalize rounds.
pwms = get_pwms(num=1, length=3, noise=0.3)

0 original random:
[[ 0.9071515   0.75593842  0.41953667]
 [ 0.11727092  0.42874449  0.70510321]
 [ 0.73177672  0.07082941  0.19653261]
 [ 0.49664665  0.38965011  0.55462832]]

0 after 1st normalization:
[[ 0.40266915  0.45949166  0.22365737]
 [ 0.05205457  0.26060922  0.3758945 ]
 [ 0.32482326  0.04305314  0.10477264]
 [ 0.22045302  0.23684598  0.29567549]]
taking power to:  2

final pwms:
[[[  6.60692866e-01   8.51682203e-01   8.27728582e-02]
  [  1.84519219e-04   8.81304749e-02   6.60418164e-01]
  [  2.79765830e-01   6.56427023e-05   3.98608630e-03]
  [  5.93567853e-02   6.01216790e-02   2.52822891e-01]]]


In [10]:
# Demo: ten sequences of 12 symbols, each carrying one 6-symbol motif
# surrounded by 3-symbol flanks.
m, s, b = make_artificial_dataset(motif_length=6, n_motives=1, n_sequences=10, p=0.3, sequence_length=12)
# print("") emits a blank line on both Python 2 and Python 3; a bare `print`
# is a silent no-op on Python 3.
print("")
for i in s:
    print(i[1])  # each entry is an (id, sequence) pair; show the sequence
print("")
print(b)

0 original random:
[[ 0.29181386  0.48441822  0.6909489   0.81258829  0.52505557  0.51672961]
 [ 0.48755623  0.8636452   0.9351612   0.78200748  0.53117174  0.07857608]
 [ 0.66030207  0.51307143  0.37174135  0.71211814  0.15390129  0.57209111]
 [ 0.43745153  0.32168884  0.32558801  0.06116526  0.71043728  0.03519718]]

0 after 1st normalization:
[[ 0.15545798  0.22192274  0.29738193  0.34317135  0.27338587  0.42967919]
 [ 0.25973581  0.39565504  0.40249002  0.3302565   0.27657043  0.06533883]
 [ 0.35176268  0.23504942  0.15999614  0.30074091  0.0801333   0.47571426]
 [ 0.23304353  0.1473728   0.14013191  0.02583124  0.3699104   0.02926772]]
taking power to:  2

final pwms:
[[[  2.49642052e-02   7.96424398e-02   2.22785088e-01   4.08561453e-01
     1.84958238e-01   3.99515519e-01]
  [  1.94532857e-01   8.04644633e-01   7.47563892e-01   3.50444130e-01
     1.93728024e-01   2.13619893e-04]
  [  6.54432323e-01   1.00224524e-01   1.86666283e-02   2.40981302e-01
     1.36527962e-03   6.00262