In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import numpy as np
from sklearn.cluster import KMeans

from smod_wrapper import SMoDWrapper

In [2]:
import random
def random_string(length,alphabet_list):
    """Return a string of `length` symbols drawn from `alphabet_list`.

    Every symbol is chosen independently with `random.choice`, so the result
    depends on the current state of the global `random` generator.
    """
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet_list))
    return ''.join(picked)

def perturb(seed,alphabet_list,p=0.5):
    """Return a noisy copy of `seed`.

    Each character of `seed` is, with probability `p`, replaced by a symbol
    drawn with `random.choice` from `alphabet_list` (the draw may return the
    same symbol again); otherwise the character is kept unchanged.
    """
    noisy = []
    for symbol in seed:
        # The probability draw happens first; a replacement is drawn only on a hit.
        noisy.append(random.choice(alphabet_list) if random.random() < p else symbol)
    return ''.join(noisy)

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6,
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                            random_state=1):
    """Generate random sequences that each embed one noisy copy of every motif.

    Every sequence is the concatenation of one segment per motif, where a
    segment is ``left flank + perturbed motif + right flank`` and both flanks
    are random strings of equal length.

    Parameters
    ----------
    alphabet : str
        Symbols used for motifs, flanks and perturbations.
    motives : list of str or None
        Motifs to embed. When None, `n_motives` random motifs of length
        `motif_length` are generated. When given, `motif_length` is taken from
        the first motif and `n_motives` from the list length.
    motif_length : int
        Length of the generated motifs (ignored when `motives` is given).
    sequence_length : int
        Requested total sequence length. It is split evenly between motifs
        with floor division, so the actual length can be slightly smaller.
    n_sequences : int
        Number of sequences to generate.
    n_motives : int
        Number of motifs to generate (ignored when `motives` is given).
    p : float
        Per-position probability handed to `perturb` for each embedded motif.
    random_state : hashable
        Seed passed to `random.seed`; note that this reseeds the global
        `random` generator.

    Returns
    -------
    motives : list of str
        The motifs that were embedded.
    seqs : list of (str, str)
        ``('ID<k>', sequence)`` pairs, numbered from 0.
    binary_seq : str
        String of '0'/'1' characters with '1' at the motif positions; the
        same mask applies to every generated sequence.
    """
    random.seed(random_state)
    alphabet_list = list(alphabet)

    if motives is None:
        motives = [random_string(motif_length, alphabet_list)
                   for _ in range(n_motives)]
    else:
        # The mask below assumes all supplied motifs share the length of the first.
        motif_length = len(motives[0])
        n_motives = len(motives)

    # Floor division keeps both lengths integral: with true division
    # (Python 3) they would be floats and break range() and str repetition.
    segment_length = sequence_length // len(motives)
    flanking_length = (segment_length - motif_length) // 2

    seqs = []
    for i in range(n_sequences):
        segments = []
        for motif in motives:
            # Draw order (left, right, motif noise) is kept stable so a given
            # random_state keeps producing the same dataset.
            left_flanking = random_string(flanking_length, alphabet_list)
            right_flanking = random_string(flanking_length, alphabet_list)
            noisy_motif = perturb(motif, alphabet_list, p)
            segments.append(left_flanking + noisy_motif + right_flanking)
        seqs.append(('ID%d' % i, ''.join(segments)))

    binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq

In [10]:
def score_seqs(seqs, n_motives, tool):
    """Score every sequence with `tool`, averaging over all of its motifs.

    Parameters
    ----------
    seqs : iterable of sequence records
        Each record is indexed with ``[1]`` to obtain the sequence passed to
        the tool (e.g. ``(id, sequence)`` pairs).
    n_motives : int
        Unused; kept for interface compatibility. The number of motifs is
        read from ``tool.nmotifs``.
    tool : object or None
        Must expose ``nmotifs`` and ``score(motif_num=..., seq=...)``, with
        motif numbers starting at 1. When None, an empty list is returned.

    Returns
    -------
    list
        One entry per sequence: a list holding the element-wise mean of the
        per-motif scores when the tool has several motifs, or a numpy array
        of the single motif's scores when it has exactly one.

    Raises
    ------
    ValueError
        If the tool reports no motifs while there are sequences to score.
    """
    scores = []
    if tool is None:
        return scores

    n_tool_motifs = tool.nmotifs
    for record in seqs:
        per_motif = [tool.score(motif_num=k + 1, seq=record[1])
                     for k in range(n_tool_motifs)]

        if len(per_motif) > 1:
            # Stack all motif score vectors once and average them column-wise.
            stacked = np.vstack(per_motif)
            scores.append(list(np.mean(stacked, axis=0)))
        elif len(per_motif) == 1:
            scores.append(np.array(per_motif[0]))
        else:
            raise ValueError("no sequence score")
    return scores

In [8]:
# Settings for the synthetic positive set: 300 sequences of length 200, each
# carrying 2 motifs of length 10; p is forwarded to make_artificial_dataset
# as the per-position motif perturbation probability.
sequence_length, n_sequences = 200, 300
motif_length, n_motives = 10, 2
p = 0.2

# binary_seq marks the motif positions shared by all generated sequences.
motives, pos_seqs, binary_seq = make_artificial_dataset(
    alphabet='ACGT',
    motif_length=motif_length,
    n_motives=n_motives,
    sequence_length=sequence_length,
    n_sequences=n_sequences,
    p=p,
)

In [11]:
from eden.modifier.seq import seq_to_seq, shuffle_modifier
# Negative set: derived from the positive sequences with shuffle_modifier
# (presumably a shuffle that destroys the motifs; confirm against the eden docs).
neg_seqs = list(seq_to_seq(pos_seqs, modifier=shuffle_modifier, times=1, order=2))

# Floor division keeps the sizes integral: with true division (Python 3) they
# would be floats, which are not valid slice bounds.
block_size = n_sequences // 8

# First half of each set is used for training, second half for testing.
pos_size = len(pos_seqs)
train_pos_seqs = pos_seqs[:pos_size // 2]
test_pos_seqs = pos_seqs[pos_size // 2:]

neg_size = len(neg_seqs)
train_neg_seqs = neg_seqs[:neg_size // 2]
test_neg_seqs = neg_seqs[neg_size // 2:]

# Per-position ground truth: 1.0 inside a motif, 0.0 elsewhere.
true_score = [float(int(bit)) for bit in binary_seq]

In [43]:
# Candidate grid 0.01, 0.02, ..., 0.99 shared by every tunable parameter.
# NOTE(review): min_score and min_cluster_size are set to 4 and 10 in a later
# cell, which lies outside this (0, 1) grid; confirm the intended ranges.
values = [hundredth / 100.0 for hundredth in range(1, 100)]

# Each parameter gets its own copy of the grid so the lists can be edited
# independently.
parameters = {
    name: list(values)
    for name in (
        'p_value',
        'similarity_th',
        'min_score',
        'min_freq',
        'min_cluster_size',
        'regex_th',
        'freq_th',
        'std_th',
    )
}

In [None]:
# Reference values for the tool's settings (presumably the SMoDWrapper
# defaults; confirm against smod_wrapper). Settings that are not tuned in
# this notebook are kept as comments for reference only.
#complexity=5
#n_clusters=10
#min_subarray_size=4
#max_subarray_size=10
#estimator=SGDClassifier(warm_start=True)
#clusterer=MiniBatchKMeans()
#pos_block_size=300
#neg_block_size=300
#n_jobs=-1
#sample_size=200

# No trailing commas here: `name = value,` would bind a 1-tuple, not the value.
p_value = 0.05
similarity_th = 0.5
min_score = 4
min_freq = 0.5
min_cluster_size = 10
regex_th = 0.3
freq_th = None
std_th = None

In [48]:
def param_config():
    """Draw one random configuration from the module-level `parameters` grid.

    Returns a dict that maps every key of `parameters` to a single value
    picked with `random.choice` from that key's candidate list.
    """
    return {
        name: random.choice(candidates)
        for name, candidates in parameters.items()
    }

In [50]:
# Sample one configuration as a quick check; the values shown depend on the
# current state of the global `random` generator.
param_config()

{'freq_th': 0.82,
 'min_cluster_size': 0.71,
 'min_freq': 0.74,
 'min_score': 0.6,
 'p_value': 0.6,
 'regex_th': 0.13,
 'similarity_th': 0.32,
 'std_th': 0.99}