In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
from smod_wrapper import SMoDWrapper
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import roc_auc_score

In [15]:
import random
def random_string(length,alphabet_list):
    rand_str = ''.join(random.choice(alphabet_list) for i in range(length))
    return rand_str

def perturb(seed,alphabet_list,p=0.5):
    seq=''
    for c in seed:
        if random.random() < p: c = random.choice(alphabet_list)
        seq += c
    return seq

def make_artificial_dataset(alphabet='ACGT', motives=None, motif_length=6, 
                            sequence_length=100, n_sequences=1000, n_motives=2, p=0.2,
                           random_state=1):
    random.seed(random_state)
    alphabet_list=[c for c in alphabet]
    
    if motives is None:
        motives=[]
        for i in range(n_motives):
            motives.append(random_string(motif_length,alphabet_list))
    else:
        motif_length = len(motives[0])
        n_motives = len(motives)
    
    sequence_length = sequence_length / len(motives)
    flanking_length = (sequence_length - motif_length ) / 2
    n_seq_per_motif = n_sequences

    counter=0
    seqs=[]
    for i in range(n_seq_per_motif):
        total_seq = ''
        total_binary_seq=''
        for j in range(n_motives):
            left_flanking = random_string(flanking_length,alphabet_list)
            right_flanking = random_string(flanking_length,alphabet_list)
            noisy_motif = perturb(motives[j],alphabet_list,p)
            seq = left_flanking + noisy_motif + right_flanking
            total_seq += seq
        seqs.append(('ID%d'%counter,total_seq))
        counter += 1
    binary_skeleton = '0' * flanking_length + '1' * motif_length + '0' * flanking_length
    binary_seq = binary_skeleton * n_motives
    return motives, seqs, binary_seq

In [16]:
def score_seqs(seqs, n_motives, tool):
    scores = []
    if tool is None:
        return scores
    
    for j in range(len(seqs)):
        seq_scr = []
        iters = tool.nmotifs
        for k in range(iters):
            scr=tool.score(motif_num=k+1, seq=seqs[j][1])
            seq_scr.append(scr)

        # taking average over all motives for a sequence
        if len(seq_scr) > 1:
            x = np.array(seq_scr[0])
            for l in range(1, iters):
                x = np.vstack((x, seq_scr[l]))
            seq_scr = list(np.mean(x, axis=0))
            scores.append(seq_scr)
        elif len(seq_scr) == 1:
            scores.append(np.array(seq_scr[0]))
        else:
            raise ValueError("no sequence score")
    return scores

In [17]:
def random_setting():
    values = [x/100.0 for x in range(1,101)]

    # possible values for every parameter
    parameters = {'min_freq': values[:],
                  'min_cluster_size': range(1,11),
                  'p_value': [x/10.0 for x in range(1,11)],
                  'similarity_th': values[:],
                  'min_score': range(1,11),
                  'regex_th': values[:],
                  'freq_th': values[:],
                  'std_th': values[:]}
    setting = {}
    
    for i in parameters.keys():
        x = random.choice(parameters[i])
        setting[i] = x
    return setting

In [27]:
def get_dataset(sequence_length=200,
                n_sequences=200,
                motif_length=10,
                n_motives=2, 
                p=0.2):
    
    motives, pos_seqs, binary_seq = make_artificial_dataset(alphabet='ACGT',
                                                            sequence_length=sequence_length,
                                                            n_sequences=n_sequences,
                                                            motif_length=motif_length,
                                                            n_motives=n_motives,
                                                            p=p)

    from eden.modifier.seq import seq_to_seq, shuffle_modifier
    neg_seqs = seq_to_seq(pos_seqs, modifier=shuffle_modifier, times=1, order=2)
    neg_seqs = list(neg_seqs)

    block_size=n_sequences/8

    pos_size = len(pos_seqs)
    train_pos_seqs = pos_seqs[:pos_size/2]
    test_pos_seqs = pos_seqs[pos_size/2:]

    neg_size = len(neg_seqs)
    train_neg_seqs = neg_seqs[:neg_size/2]
    test_neg_seqs = neg_seqs[neg_size/2:]

    true_score = [float(int(i)) for i in binary_seq]
    return block_size, pos_seqs, neg_seqs, test_pos_seqs, n_motives, true_score

In [30]:

# Initial Setting for every run
max_roc = 0.5
best_config = {'min_score':4,
               'min_freq':0.5,
               'min_cluster_size':10,
               'p_value':0.05,
               'similarity_th':0.5,
               'regex_th':0.3,
               'freq_th':None,
               'std_th':None}
param = [x/10.0 for x in range(1, 10)]
values = [best_config for v in range(len(param))]

results_dic = dict(zip(param, values))

reps = 20

for i in param:
    for j in range(reps):
        max_roc = 0.5
        # Randomize Parameter setting
        param_setting = random_setting()
        
        dataset_score = []
        
        n_sets = 5    # try each setting on 5 different data sets
        
        for k in range(n_sets):
            # Generate data set
            block_size, pos_seqs, neg_seqs, test_pos_seqs, n_motives, true_score = get_dataset(sequence_length=300,
                                     n_sequences=600,                                                                                   
                                     motif_length=10,                                  
                                     n_motives=2,                                              
                                     p=i)

            smod = SMoDWrapper(alphabet = 'dna',
                               scoring_criteria = 'pwm',

                               complexity = 5,
                               n_clusters = 10,
                               min_subarray_size = 8,
                               max_subarray_size = 12,
                               clusterer = KMeans(),
                               pos_block_size = block_size,
                               neg_block_size = block_size,
                               sample_size = 300,
                               p_value = param_setting['p_value'],
                               similarity_th = param_setting['similarity_th'],
                               min_score = param_setting['min_score'],
                               min_freq = param_setting['min_freq'],
                               min_cluster_size = param_setting['min_cluster_size'],
                               regex_th = param_setting['regex_th'],
                               freq_th = param_setting['freq_th'],
                               std_th = param_setting['std_th']) 

            smod.fit(pos_seqs, neg_seqs)

            try:
                scores = score_seqs(seqs = test_pos_seqs,
                                    n_motives = n_motives,
                                    tool = smod)
            except:
                continue
                
            mean_score = np.mean(scores, axis=0)
            roc_score = roc_auc_score(true_score, mean_score)
            dataset_score.append(roc_score)
        setting_avg = np.mean(dataset_score)
    if setting_avg > max_roc:
        print "Better Configuration found at perturbation prob = ", i
        print "ROC: ", setting_avg
        print "Parameter Configuration: ", param_setting
        print
        results_dic[i] = (param_setting, setting_avg)
print
print
print "***Final Result***"
print results_dic

  return 1 / (1 + np.exp(-(x - a) / b))


Better Configuration found at perturbation prob =  0.1
ROC:  0.793035714286
Parameter Configuration:  {'min_freq': 0.89, 'min_score': 10, 'p_value': 0.6, 'min_cluster_size': 3, 'std_th': 0.59, 'regex_th': 0.92, 'similarity_th': 0.72, 'freq_th': 0.28}





Better Configuration found at perturbation prob =  0.2
ROC:  0.742857142857
Parameter Configuration:  {'min_freq': 0.88, 'min_score': 8, 'p_value': 0.4, 'min_cluster_size': 6, 'std_th': 0.36, 'regex_th': 0.68, 'similarity_th': 0.07, 'freq_th': 0.9}





ValueError: no sequence score