## Task 1: generate simulated data

We are going to generate sequences containing two motifs: ATATTCA (motif A) and GTACTGC (motif B).

The two motifs will either appear together, with motif A placed before motif B and separated by a gap that is a multiple of 10 nucleotides (10, 20, 30, ...).

Or only one of them will appear (a small fraction of the sequences contains neither motif).

All other parts of each sequence will be randomly generated.

Afterwards, we will add "-" tokens to represent deletions.

In [2]:
import random
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

# Nucleotide alphabet from which the random background is drawn.
DNA = list("ACGT")

# The two motifs planted into the simulated sequences.
MOTIF_A = "ATATTCA"
MOTIF_B = "GTACTGC"


def rand_dna(n: int, rng: random.Random) -> str:
    """Return a random nucleotide string of length ``n``.

    Each base is drawn independently from ``DNA`` with ``rng.choice``, so the
    result is reproducible for a given state of ``rng``.
    """
    bases = [rng.choice(DNA) for _ in range(n)]
    return "".join(bases)


@dataclass
class Sequence:
    """One simulated DNA sequence together with its ground-truth annotation."""

    # The nucleotide string itself.
    seq: str
    # Class label: "both", "A_only", "B_only" or "no_motif".
    label: str
    # Start index of motif A within ``seq``, or None when motif A was not planted.
    pos_a: Optional[int]
    # Start index of motif B within ``seq``, or None when motif B was not planted.
    pos_b: Optional[int]
    # Number of nucleotides between the two motifs when both are planted, else None.
    gap: Optional[int]


def make_example(
    length: int,
    mode: str,                     # "both" | "A_only" | "B_only" | "no_motif"
    gaps: List[int],               # e.g. [10, 20, 30, ...]
    rng: random.Random
) -> Sequence:
    """Build one simulated sequence of ``length`` nucleotides for ``mode``.

    Modes:
        "both"     -- MOTIF_A, then a random spacer whose length is drawn from
                      ``gaps``, then MOTIF_B, placed at a random offset.
        "A_only"   -- only MOTIF_A, at a random offset.
        "B_only"   -- only MOTIF_B, at a random offset.
        "no_motif" -- purely random background.

    All positions not covered by a planted motif are filled by ``rand_dna``.
    NOTE: the random filler is not screened for chance occurrences of either
    motif, so a motif can occasionally appear where the label says it is absent.

    Args:
        length: Total length of the returned sequence.
        mode: One of the four modes listed above.
        gaps: Candidate spacer lengths; every value must be a non-negative
            multiple of 10. Only used by mode "both", but always validated.
        rng: Source of randomness.

    Returns:
        A ``Sequence`` holding the string, its label, the start indices of the
        planted motifs and the chosen gap (``None`` where not applicable).

    Raises:
        ValueError: If ``length`` is too short, ``gaps`` contains an invalid
            value, ``gaps`` is empty in mode "both", or ``mode`` is unknown.
    """
    if length < max(len(MOTIF_A), len(MOTIF_B)) + 1:
        raise ValueError("Sequence length too short for motifs.")

    # Gaps that are not divisible by 10 are rejected up front, for every mode.
    invalid = [g for g in gaps if g % 10 != 0]
    if invalid:
        raise ValueError(
            f"Invalid gaps detected: {invalid}. "
            "All gaps must be divisible by 10 (e.g. 10, 20, 30, ...)."
        )

    # A negative multiple of 10 passes the check above but would yield an empty
    # spacer together with a wrong total length and a wrong pos_b.
    negative = [g for g in gaps if g < 0]
    if negative:
        raise ValueError(f"Invalid gaps detected: {negative}. Gaps must not be negative.")

    if mode == "both":
        if not gaps:
            # rng.choice would otherwise fail with an opaque IndexError.
            raise ValueError("gaps must contain at least one value when mode is 'both'.")

        gap = rng.choice(gaps)
        total_motif_len = len(MOTIF_A) + gap + len(MOTIF_B)
        if total_motif_len > length:
            raise ValueError(
                f"Length {length} too short for both motifs with gap {gap} "
                f"(need >= {total_motif_len})."
            )

        # Random offset of the whole "A + spacer + B" unit inside the sequence.
        start = rng.randint(0, length - total_motif_len)
        suffix_len = length - start - total_motif_len

        # Draw order (prefix, spacer, suffix) is kept fixed so that a given
        # seed always reproduces the same sequences.
        prefix = rand_dna(start, rng)
        between = rand_dna(gap, rng)
        suffix = rand_dna(suffix_len, rng)

        seq = prefix + MOTIF_A + between + MOTIF_B + suffix
        return Sequence(
            seq=seq,
            label="both",
            pos_a=start,
            pos_b=start + len(MOTIF_A) + gap,
            gap=gap,
        )

    elif mode == "A_only":
        total = len(MOTIF_A)
        start = rng.randint(0, length - total)
        seq = rand_dna(start, rng) + MOTIF_A + rand_dna(length - start - total, rng)
        return Sequence(seq=seq, label="A_only", pos_a=start, pos_b=None, gap=None)

    elif mode == "B_only":
        total = len(MOTIF_B)
        start = rng.randint(0, length - total)
        seq = rand_dna(start, rng) + MOTIF_B + rand_dna(length - start - total, rng)
        return Sequence(seq=seq, label="B_only", pos_a=None, pos_b=start, gap=None)

    elif mode == "no_motif":
        seq = rand_dna(length, rng)
        return Sequence(seq=seq, label="no_motif", pos_a=None, pos_b=None, gap=None)

    else:
        raise ValueError("mode must be: 'both', 'A_only', 'B_only', or 'no_motif'")


def generate_dataset(
    n: int,  # number of sequences to generate
    length: int = 120,  # length of each sequence; identical for all sequences
    gaps: Optional[List[int]] = None,  # possible gaps between motif A and motif B; None means 10, 20, ..., 100
    p_both: float = 0.4,  # probability that a sequence contains both motifs
    p_a_only: float = 0.25,  # probability that a sequence contains only motif A
    p_b_only: float = 0.25,  # probability that a sequence contains only motif B
    p_no_motif: float = 0.1,  # probability that a sequence contains no motif
    seed: int = 727  # random seed
) -> List[Sequence]:
    """Generate ``n`` simulated sequences with randomly assigned classes.

    For every sequence one class ("both", "A_only", "B_only" or "no_motif") is
    drawn according to the given probabilities and the sequence is then built
    by ``make_example``. A private ``random.Random(seed)`` drives all random
    choices, so the same arguments always give the same dataset.

    Returns:
        The generated ``Sequence`` objects, in generation order.

    Raises:
        ValueError: If any probability is negative or the four probabilities
            do not sum to 1 (within 1e-9). Errors raised by ``make_example``
            propagate unchanged.
    """
    if gaps is None:
        gaps = list(range(10, 101, 10))  # 10,20,...,100

    probabilities = {
        "p_both": p_both,
        "p_a_only": p_a_only,
        "p_b_only": p_b_only,
        "p_no_motif": p_no_motif,
    }
    # A negative value can still sum to 1 with the others but would silently
    # distort the class proportions, so it is rejected explicitly.
    negative = {name: p for name, p in probabilities.items() if p < 0}
    if negative:
        raise ValueError(f"Probabilities must be non-negative, got: {negative}.")

    if abs((p_both + p_a_only + p_b_only + p_no_motif) - 1.0) > 1e-9:
        raise ValueError(
            "Probabilities must sum to 1 (p_both + p_a_only + p_b_only + p_no_motif)."
        )

    # Cumulative thresholds, computed once instead of on every iteration.
    cut_a_only = p_both + p_a_only
    cut_b_only = cut_a_only + p_b_only

    rng = random.Random(seed)

    data = []
    for _ in range(n):
        # r lies in [0, 1); the interval it falls into selects the mode.
        r = rng.random()
        if r < p_both:
            mode = "both"
        elif r < cut_a_only:
            mode = "A_only"
        elif r < cut_b_only:
            mode = "B_only"
        else:
            mode = "no_motif"

        data.append(make_example(length=length, mode=mode, gaps=gaps, rng=rng))

    return data


# Demo run: a small dataset using the default gaps, class probabilities and seed.
n, length = 5, 120
dataset = generate_dataset(n=n, length=length)

# output the sequences in FASTA format
def write_fasta(dataset, filepath):
    """Write ``dataset`` to ``filepath`` in FASTA format.

    Every example becomes two lines: a header of the form
    ``>seqNNNN|label=...|posAmotif=...|posBmotif=...|gaplength=...`` (records
    are numbered from 1) followed by the sequence itself.

    Args:
        dataset: Iterable of objects exposing ``seq``, ``label``, ``pos_a``,
            ``pos_b`` and ``gap`` attributes.
        filepath: Destination path (``str`` or ``os.PathLike``). Missing
            parent directories are created; an existing file is overwritten.
    """
    out_path = Path(filepath)
    # Create the target directory on demand; otherwise open() fails with
    # FileNotFoundError when the output folder does not exist yet.
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", encoding="utf-8") as f:
        for i, ex in enumerate(dataset, start=1):
            header = (
                f">seq{i:04d}"
                f"|label={ex.label}"
                f"|posAmotif={ex.pos_a}"
                f"|posBmotif={ex.pos_b}"
                f"|gaplength={ex.gap}"
            )
            f.write(f"{header}\n{ex.seq}\n")
            

# Save the demo dataset; the file name records the dataset size and the sequence length.
fasta_path = f"simulated_sequences/sequence_size{n}_length{length}.fasta"
write_fasta(dataset, fasta_path)

## Data augmentation
