-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #979 from rbharath/dragonn
Adding Dragonn Example to contrib/
- Loading branch information
Showing
26 changed files
with
2,185 additions
and
1 deletion.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,367 @@ | ||
from __future__ import absolute_import, division, print_function | ||
from collections import OrderedDict | ||
import numpy as np | ||
import simdna | ||
from simdna.synthetic import ( | ||
RepeatedEmbedder, SubstringEmbedder, ReverseComplementWrapper, | ||
UniformPositionGenerator, InsideCentralBp, | ||
LoadedEncodeMotifs, PwmSamplerFromLoadedMotifs, | ||
UniformIntegerGenerator, ZeroOrderBackgroundGenerator, | ||
EmbedInABackground, GenerateSequenceNTimes, | ||
RandomSubsetOfEmbedders, IsInTraceLabelGenerator, | ||
EmbeddableEmbedder, PairEmbeddableGenerator, | ||
) | ||
from simdna.util import DiscreteDistribution | ||
|
||
# Module-level cache of the ENCODE PWM collection shipped with simdna.
# Loaded once and shared by every simulation function below.
# pseudocountProb smooths zero-probability bases in the PWMs so sampling
# never assigns a base probability of exactly zero.
loaded_motifs = LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                   pseudocountProb=0.001)
|
||
|
||
def get_distribution(GC_fraction):
    """Return a simdna DiscreteDistribution over A/C/G/T with the given GC content.

    Parameters
    ----------
    GC_fraction : float
        Desired total probability mass on G and C combined.
    """
    gc_half = GC_fraction / 2
    at_half = (1 - GC_fraction) / 2
    return DiscreteDistribution(
        {'A': at_half, 'C': gc_half, 'G': gc_half, 'T': at_half})
|
||
|
||
def simple_motif_embedding(motif_name, seq_length, num_seqs, GC_fraction):
    """
    Simulates sequences with a motif embedded anywhere in the sequence.

    Parameters
    ----------
    motif_name : str
        encode motif name; if None, no motif is embedded (pure background).
    seq_length : int
        length of sequence
    num_seqs : int
        number of sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    embedding_arr : list
        List of embedding objects, one entry per sequence.
    """
    if motif_name is None:
        # No motif requested: generate background-only sequences.
        embedders = []
    else:
        pwm_sampler = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
        # ReverseComplementWrapper embeds the motif on either strand.
        embedders = [SubstringEmbedder(ReverseComplementWrapper(pwm_sampler))]
    background = ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    sequence_source = GenerateSequenceNTimes(
        EmbedInABackground(background, embedders), num_seqs)
    generated = tuple(sequence_source.generateSequences())
    sequence_arr = np.array([g.seq for g in generated])
    embedding_arr = [g.embeddings for g in generated]
    return sequence_arr, embedding_arr
|
||
|
||
def motif_density(motif_name, seq_length, num_seqs,
                  min_counts, max_counts, GC_fraction,
                  central_bp=None):
    """
    Return sequences containing between min_counts and max_counts motif
    instances, along with the matching embeddings list.

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of each sequence
    num_seqs : int
        number of sequences
    min_counts, max_counts : int
        inclusive bounds on the number of motif instances per sequence
    GC_fraction : float
        GC fraction in background sequence
    central_bp : int, optional
        if given, motifs are restricted to the central `central_bp` bases

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings.
    embedding_arr : list
        List of embedding objects, one entry per sequence.
    """
    pwm_sampler = PwmSamplerFromLoadedMotifs(loaded_motifs, motif_name)
    if central_bp is None:
        position_generator = UniformPositionGenerator()
    else:
        # Constrain embedding positions to the central window.
        position_generator = InsideCentralBp(central_bp)
    count_sampler = UniformIntegerGenerator(min_counts, max_counts)
    motif_embedder = SubstringEmbedder(
        ReverseComplementWrapper(pwm_sampler), position_generator)
    embedders = [RepeatedEmbedder(motif_embedder, count_sampler)]
    background = ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    sequence_source = GenerateSequenceNTimes(
        EmbedInABackground(background, embedders), num_seqs)
    generated = tuple(sequence_source.generateSequences())
    sequence_arr = np.array([g.seq for g in generated])
    embedding_arr = [g.embeddings for g in generated]
    return sequence_arr, embedding_arr
|
||
|
||
def simulate_single_motif_detection(motif_name, seq_length,
                                    num_pos, num_neg, GC_fraction):
    """
    Simulates two classes of sequences:
        - Positive class: sequences with the motif embedded anywhere.
        - Negative class: background sequences without the motif.

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings (positives first, then negatives).
    y : 2darray
        Column vector of positive/negative class labels.
    embedding_arr : list
        List of embedding objects, aligned with sequence_arr.
    """
    pos_seqs, pos_embeddings = simple_motif_embedding(
        motif_name, seq_length, num_pos, GC_fraction)
    # Passing motif_name=None yields pure background sequences.
    neg_seqs, neg_embeddings = simple_motif_embedding(
        None, seq_length, num_neg, GC_fraction)
    sequence_arr = np.concatenate((pos_seqs, neg_seqs))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = pos_embeddings + neg_embeddings
    return sequence_arr, y, embedding_arr
|
||
|
||
def simulate_motif_counting(motif_name, seq_length, pos_counts, neg_counts,
                            num_pos, num_neg, GC_fraction):
    """
    Generates data for motif counting task.

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    pos_counts : list
        (min_counts, max_counts) for positive set.
    neg_counts : list
        (min_counts, max_counts) for negative set.
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings (positives first, then negatives).
    y : 2darray
        Column vector of labels.
    embedding_arr : list
        List of embedding objects, aligned with sequence_arr.
    """
    pos_count_sequence_array, positive_embedding_arr = motif_density(
        motif_name, seq_length, num_pos,
        pos_counts[0], pos_counts[1], GC_fraction)
    # BUG FIX: the negative set was previously generated with num_pos
    # sequences but labeled with num_neg False labels, making
    # sequence_arr and y mismatched whenever num_pos != num_neg.
    neg_count_sequence_array, negative_embedding_arr = motif_density(
        motif_name, seq_length, num_neg,
        neg_counts[0], neg_counts[1], GC_fraction)
    sequence_arr = np.concatenate(
        (pos_count_sequence_array, neg_count_sequence_array))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = positive_embedding_arr + negative_embedding_arr
    return sequence_arr, y, embedding_arr
|
||
|
||
def simulate_motif_density_localization(
        motif_name, seq_length, center_size, min_motif_counts,
        max_motif_counts, num_pos, num_neg, GC_fraction):
    """
    Simulates two classes of sequences:
        - Positive class: multiple motif instances confined to the center
          of the sequence.
        - Negative class: the same number of motif instances placed
          anywhere in the sequence.
    The number of motif instances is sampled uniformly between the minimum
    and maximum motif counts.

    Parameters
    ----------
    motif_name : str
        encode motif name
    seq_length : int
        length of sequence
    center_size : int
        length of the central window in which positive-class motifs land
    min_motif_counts : int
        minimum number of motif instances
    max_motif_counts : int
        maximum number of motif instances
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings (positives first, then negatives).
    y : 2darray
        Column vector of labels.
    embedding_arr : list
        List of embedding objects, aligned with sequence_arr.
    """
    # Positives: restrict motif placement to the central window.
    localized_seqs, pos_embeddings = motif_density(
        motif_name, seq_length, num_pos,
        min_motif_counts, max_motif_counts, GC_fraction, center_size)
    # Negatives: identical density, unconstrained position.
    unlocalized_seqs, neg_embeddings = motif_density(
        motif_name, seq_length, num_neg,
        min_motif_counts, max_motif_counts, GC_fraction)
    sequence_arr = np.concatenate((localized_seqs, unlocalized_seqs))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = pos_embeddings + neg_embeddings
    return sequence_arr, y, embedding_arr
|
||
|
||
def simulate_multi_motif_embedding(motif_names, seq_length, min_num_motifs,
                                   max_num_motifs, num_seqs, GC_fraction):
    """
    Generates data for the multi-motif recognition task: each sequence
    contains a random subset of the given motifs, and labels record which
    motifs were embedded.

    Parameters
    ----------
    motif_names : list
        List of encode motif name strings.
    seq_length : int
        length of sequence
    min_num_motifs : int
        minimum number of motifs embedded per sequence
    max_num_motifs : int
        maximum number of motifs embedded per sequence
    num_seqs : int
        number of sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings.
    y : ndarray
        Boolean matrix, one column per motif, marking which motifs are
        present in each sequence.
    embedding_arr : list
        List of embedding objects, one entry per sequence.
    """
    # One named embedder per motif; the name lets the label generator
    # detect which motifs ended up in each sequence.
    embedders = [
        SubstringEmbedder(
            ReverseComplementWrapper(
                PwmSamplerFromLoadedMotifs(loaded_motifs, name)),
            name=name)
        for name in motif_names]
    subset_size_sampler = UniformIntegerGenerator(
        min_num_motifs, max_num_motifs)
    combined_embedder = [
        RandomSubsetOfEmbedders(subset_size_sampler, embedders)]
    background = ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    sequence_source = GenerateSequenceNTimes(
        EmbedInABackground(background, combined_embedder), num_seqs)
    generated = tuple(sequence_source.generateSequences())
    sequence_arr = np.array([g.seq for g in generated])
    label_generator = IsInTraceLabelGenerator(np.asarray(motif_names))
    y = np.array([label_generator.generateLabels(g) for g in generated],
                 dtype=bool)
    embedding_arr = [g.embeddings for g in generated]
    return sequence_arr, y, embedding_arr
|
||
|
||
def simulate_differential_accessibility(
        pos_motif_names, neg_motif_names, seq_length,
        min_num_motifs, max_num_motifs, num_pos, num_neg, GC_fraction):
    """
    Generates data for the differential accessibility task: positive
    sequences embed motifs from one set, negatives from another.

    Parameters
    ----------
    pos_motif_names : list
        Motif names used in positive class sequences.
    neg_motif_names : list
        Motif names used in negative class sequences.
    seq_length : int
        length of sequence
    min_num_motifs : int
        minimum number of motifs embedded per sequence
    max_num_motifs : int
        maximum number of motifs embedded per sequence
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Contains sequence strings (positives first, then negatives).
    y : 2darray
        Column vector of labels.
    embedding_arr : list
        List of embedding objects, aligned with sequence_arr.
    """
    # The per-motif label matrix from the helper is discarded; only the
    # binary class label matters here.
    pos_seqs, _, pos_embeddings = simulate_multi_motif_embedding(
        pos_motif_names, seq_length,
        min_num_motifs, max_num_motifs, num_pos, GC_fraction)
    neg_seqs, _, neg_embeddings = simulate_multi_motif_embedding(
        neg_motif_names, seq_length,
        min_num_motifs, max_num_motifs, num_neg, GC_fraction)
    sequence_arr = np.concatenate((pos_seqs, neg_seqs))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = pos_embeddings + neg_embeddings
    return sequence_arr, y, embedding_arr
|
||
|
||
def simulate_heterodimer_grammar(
        motif1, motif2, seq_length,
        min_spacing, max_spacing, num_pos, num_neg, GC_fraction):
    """
    Simulates two classes of sequences containing motif1 and motif2:
        - Positive class: motif1 and motif2 embedded as a pair with an
          inter-motif spacing between min_spacing and max_spacing.
        - Negative class: motif1 and motif2 embedded independently,
          anywhere in the sequence (no grammar).

    Parameters
    ----------
    motif1 : str
        encode motif name
    motif2 : str
        encode motif name
    seq_length : int
        length of sequence
    min_spacing : int
        minimum inter-motif spacing
    max_spacing : int
        maximum inter-motif spacing
    num_pos : int
        number of positive class sequences
    num_neg : int
        number of negative class sequences
    GC_fraction : float
        GC fraction in background sequence

    Returns
    -------
    sequence_arr : 1darray
        Array with sequence strings (positives first, then negatives).
    y : 2darray
        Column vector of positive/negative class labels.
    embedding_arr : list
        List of embedding objects, aligned with sequence_arr.
    """
    motif1_sampler = ReverseComplementWrapper(
        PwmSamplerFromLoadedMotifs(loaded_motifs, motif1))
    motif2_sampler = ReverseComplementWrapper(
        PwmSamplerFromLoadedMotifs(loaded_motifs, motif2))
    spacing_sampler = UniformIntegerGenerator(min_spacing, max_spacing)
    # Pair embedder places the two motifs jointly with sampled spacing.
    pair_embedder = EmbeddableEmbedder(PairEmbeddableGenerator(
        motif1_sampler, motif2_sampler, spacing_sampler))
    background = ZeroOrderBackgroundGenerator(
        seq_length, discreteDistribution=get_distribution(GC_fraction))
    sequence_source = GenerateSequenceNTimes(
        EmbedInABackground(background, [pair_embedder]), num_pos)
    generated = tuple(sequence_source.generateSequences())
    grammar_seqs = np.array([g.seq for g in generated])
    pos_embeddings = [g.embeddings for g in generated]
    # Negatives: both motifs embedded independently (exactly 2 motifs).
    nongrammar_seqs, _, neg_embeddings = simulate_multi_motif_embedding(
        [motif1, motif2], seq_length, 2, 2, num_neg, GC_fraction)
    sequence_arr = np.concatenate((grammar_seqs, nongrammar_seqs))
    y = np.array([[True]] * num_pos + [[False]] * num_neg)
    embedding_arr = pos_embeddings + neg_embeddings
    return sequence_arr, y, embedding_arr
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+71.7 KB
contrib/dragonn/tutorial_images/homotypic_motif_density_localization.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+39.2 KB
contrib/dragonn/tutorial_images/homotypic_motif_density_localization_task.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.