In [8]:
%load_ext autoreload
%autoreload 2

In [9]:
#from genomic_benchmarks.seq2loc import fasta2loc
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import yaml
import gzip
from Bio import SeqIO

def _fastagz2dict(fasta_path, fasta_total=None, stop_id=None, region_name_transform=lambda x: x):
    """Read a gzip-compressed FASTA file into a ``{name: sequence}`` dict.

    Args:
        fasta_path: Path of the gzipped FASTA file.
        fasta_total: Optional number of records, used only as the
            progress-bar total.
        stop_id: Optional record id. Reading stops right after the record
            with this id has been stored (e.g. to skip small contigs).
        region_name_transform: Callable applied to each record id to
            produce the dictionary key (identity by default).

    Returns:
        dict mapping transformed record ids to sequences as plain strings.
    """
    sequences = {}

    with gzip.open(fasta_path, "rt") as gz_handle:
        records = SeqIO.parse(gz_handle, "fasta")
        for rec in tqdm(records, total=fasta_total):
            key = region_name_transform(rec.id)
            sequences[key] = str(rec.seq)
            # stop_id is compared with the raw (untransformed) record id
            reached_stop = stop_id and (rec.id == stop_id)
            if reached_stop:
                break
    return sequences

In [10]:
from tqdm.autonotebook import tqdm as tdm

def fasta2loc(fasta_path, ref_dict, use_seq_ids=True):
    """Locate the sequences of a FASTA file inside reference sequences.

    Every query sequence and its reverse complement are inserted into a
    character trie; each reference sequence is then scanned once while all
    partial matches are advanced in parallel.

    Args:
        fasta_path: Path to an uncompressed FASTA file with the queries.
        ref_dict: Mapping of reference name to reference sequence string.
        use_seq_ids: If True, results are keyed by the FASTA record name;
            otherwise by the forward sequence string itself.

    Returns:
        dict mapping each found query key to a tuple
        ``(reference name, start, end, direction)``, where ``start`` is the
        0-based index of the first matched character, ``end`` is one past
        the last matched character, and ``direction`` is "+" for a forward
        match or "-" for a reverse-complement match. When a key matches
        more than once, the last match encountered is kept.
    """
    tree = {}
    nseqs = 0

    # building tree for seq searching; the context manager guarantees the
    # FASTA handle is closed once all records have been read
    with open(fasta_path, "r") as fasta_handle:
        for seq in SeqIO.parse(fasta_handle, "fasta"):
            s = str(seq.seq)
            rev = str(seq.seq.reverse_complement())
            if use_seq_ids:
                sname = seq.name
            else:
                sname = s
            nseqs += 1

            _update_tree(tree, s, sname, "+")
            _update_tree(tree, rev, sname, "-")

    print(f"{nseqs} sequences read and parsed.")

    results = {}

    for chrom in tdm(ref_dict):
        ref_seq = ref_dict[chrom]
        # trie nodes reached by the partial matches still alive
        curr_positions = []

        for i, c in tdm(enumerate(ref_seq), total=len(ref_seq), leave=False):

            # every live partial match may be extended by `c`; adding the
            # root lets a new match start at this character
            prev_positions = curr_positions + [tree]
            curr_positions = []

            for pos in prev_positions:
                if c in pos:
                    pos = pos[c]
                    curr_positions.append(pos)

                    if "terminal" in pos:
                        # terminal = (name, direction, query length)
                        name, direction, length = pos["terminal"]
                        results[name] = (chrom, i - length + 1, i + 1, direction)

    print(f"{len(results.keys())} sequences found in the reference.")

    return results


def _update_tree(root, seq_str, seq_name, direction):
    # updates tree in `root` with a sequence `seq_str`
    position = root

    for c in seq_str:
        if c in position:
            position = position[c]
        else:
            position[c] = {}
            position = position[c]
    position["terminal"] = (seq_name, direction, len(seq_str))

  from tqdm.autonotebook import tqdm as tdm


## Load genomic reference and download data from GitHub

In [11]:
# Load the gzipped reference into a {record id: sequence} dict; reading stops
# right after the record with id 'Chr5', so later records are not loaded.
genome = _fastagz2dict("./Athaliana_167_TAIR9.fa.gz",
                      stop_id='Chr5')
# Show which records were loaded
genome.keys()

0it [00:00, ?it/s]

dict_keys(['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5'])

## Get exons

### A few checks

In [12]:
def gff3_to_dict(gff3_path):
    """Collect the exon features of a GFF3 file.

    Lines starting with "#" and lines with fewer than nine tab-separated
    fields are skipped. Only features whose type is exactly "exon" are kept.

    Args:
        gff3_path: Path to an uncompressed GFF3 file.

    Returns:
        dict mapping each exon's ``ID`` attribute (or
        ``"{chrom}:{start}-{end}_exon"`` when the attribute is absent) to
        ``(chrom, start, end, strand)``, with start/end converted to int and
        otherwise exactly as written in the file. A repeated key keeps the
        last record seen.
    """
    exons = {}
    with open(gff3_path, "r") as handle:
        for raw_line in handle:
            # comment / directive lines carry no feature record
            if raw_line.startswith("#"):
                continue

            fields = raw_line.strip().split("\t")
            if len(fields) < 9:
                continue

            chrom, _source, feature_type, start, end, _score, strand, _phase, attributes = fields

            # everything except exons is ignored
            if feature_type != "exon":
                continue

            # "key=value" pairs separated by ";" (pieces without "=" are dropped)
            attr_dict = dict(
                attr.split("=", 1) for attr in attributes.split(";") if "=" in attr
            )
            fallback_id = f"{chrom}:{start}-{end}_{feature_type}"
            feature_id = attr_dict.get("ID", fallback_id)
            exons[feature_id] = (chrom, int(start), int(end), strand)

    print(f"{len(exons)} exons parsed from the GFF3 file.")
    return exons

# Example usage: parse the exon annotation and preview the first five
# entries (dict insertion order, i.e. file order).
gff_dict = gff3_to_dict("Athaliana_167_TAIR10.gene_exons.gff3")
print(dict(list(gff_dict.items())[0: 5]))

207465 exons parsed from the GFF3 file.
{'AT1G01010.1.TAIR10.exon.1': ('Chr1', 3631, 3913, '+'), 'AT1G01010.1.TAIR10.exon.2': ('Chr1', 3996, 4276, '+'), 'AT1G01010.1.TAIR10.exon.3': ('Chr1', 4486, 4605, '+'), 'AT1G01010.1.TAIR10.exon.4': ('Chr1', 4706, 5095, '+'), 'AT1G01010.1.TAIR10.exon.5': ('Chr1', 5174, 5326, '+')}


In [13]:
import itertools
# Take the first 3 keys in dict insertion order (islice does not sample
# randomly) to use in the spot checks below.
test_keys = list(itertools.islice(gff_dict, 3))
print(test_keys)

['AT1G01010.1.TAIR10.exon.1', 'AT1G01010.1.TAIR10.exon.2', 'AT1G01010.1.TAIR10.exon.3']


In [14]:
# Coordinates stored for the second of the selected exon ids
gff_dict[test_keys[1]]

('Chr1', 3996, 4276, '+')

In [15]:
# Slice the reference at the start/end values shown above as a manual check.
# NOTE(review): GFF3 coordinates are 1-based and end-inclusive, whereas this
# slice is 0-based and end-exclusive, so it yields 280 bases for what would be
# a 281-base feature and omits its first base — confirm whether `start` should
# be shifted by -1 before these coordinates are exported.
genome['Chr1'][3996:4276]

'CCAGTCAAAGTACAAATCGAGAGATGCTATGTGGTACTTCTTCTCTCGTAGAGAAAACAACAAAGGGAATCGACAGAGCAGGACAACGGTTTCTGGTAAATGGAAGCTTACCGGAGAATCTGTTGAGGTCAAGGACCAGTGGGGATTTTGTAGTGAGGGCTTTCGTGGTAAGATTGGTCATAAAAGGGTTTTGGTGTTCCTCGATGGAAGATACCCTGACAAAACCAAATCTGATTGGGTTATCCACGAGTTCCACTACGACCTCTTACCAGAACATCAG'

In [16]:
from Bio.Seq import Seq

def _rev(seq, strand):
    # reverse complement
    if strand == '-':
        return str(Seq(seq).reverse_complement())
    else:
        return seq

In [21]:
# Reverse complement of the same reference slice, as a check of _rev
_rev(genome['Chr1'][3996:4276], "-")

'CTGATGTTCTGGTAAGAGGTCGTAGTGGAACTCGTGGATAACCCAATCAGATTTGGTTTTGTCAGGGTATCTTCCATCGAGGAACACCAAAACCCTTTTATGACCAATCTTACCACGAAAGCCCTCACTACAAAATCCCCACTGGTCCTTGACCTCAACAGATTCTCCGGTAAGCTTCCATTTACCAGAAACCGTTGTCCTGCTCTGTCGATTCCCTTTGTTGTTTTCTCTACGAGAGAAGAAGTACCACATAGCATCTCTCGATTTGTACTTTGACTGG'

In [22]:
# Turn the {exon id: (region, start, end, strand)} dict into a table indexed
# by exon id and persist it as the positive set.
exon_columns = ['region', 'start', 'end', 'strand']
exon_df = pd.DataFrame.from_dict(gff_dict, orient='index', columns=exon_columns)
exon_df = exon_df.rename_axis('id')
exon_df.to_csv("positive.csv")
exon_df.head()

Unnamed: 0_level_0,region,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AT1G01010.1.TAIR10.exon.1,Chr1,3631,3913,+
AT1G01010.1.TAIR10.exon.2,Chr1,3996,4276,+
AT1G01010.1.TAIR10.exon.3,Chr1,4486,4605,+
AT1G01010.1.TAIR10.exon.4,Chr1,4706,5095,+
AT1G01010.1.TAIR10.exon.5,Chr1,5174,5326,+


## Get non-exons

In [None]:
import numpy as np
def get_chr_names_and_lengths():
    """Return ``{chromosome name: sequence length}`` for the global `genome`.

    Raises:
        AssertionError: if any sequence in `genome` is empty.
    """
    chr_lengths = {name: len(genome[name]) for name in genome.keys()}

    # every chromosome must have a non-empty sequence
    assert not any(length == 0 for length in chr_lengths.values())

    return chr_lengths

def get_random_chr(chr_names_and_lengths):
    """Draw one chromosome name with probability proportional to its length.

    Args:
        chr_names_and_lengths: Mapping of chromosome name to length.

    Returns:
        One key of `chr_names_and_lengths`.
    """
    names = list(chr_names_and_lengths.keys())
    lengths = pd.Series(chr_names_and_lengths.values())
    weights = lengths / lengths.sum()
    # a single multinomial trial gives a one-hot vector marking the pick
    one_hot = np.random.multinomial(1, weights)
    picked = np.argwhere(one_hot)[0][0]
    return names[picked]


def is_intersecting(c, pos, df_forbidden):
    """Tell whether position `pos` on region `c` lies inside a forbidden interval.

    Args:
        c: Region (chromosome) name.
        pos: Position to test.
        df_forbidden: DataFrame with 'region', 'start' and 'end' columns;
            start/end are cast to int and both bounds count as inside.

    Returns:
        True-like if at least one row of `df_forbidden` on region `c`
        satisfies start <= pos <= end.
    """
    same_region = df_forbidden['region'] == c
    starts = df_forbidden['start'].astype(int)
    ends = df_forbidden['end'].astype(int)
    covered = same_region & (starts <= pos) & (ends >= pos)
    return covered.any()


def get_random_pos(df_forbidden: pd.DataFrame, chr_names_and_lengths, offset_from_end):
    """Pick a random chromosome and a position on it outside forbidden intervals.

    The chromosome is drawn once; the position is redrawn on that same
    chromosome until it does not fall inside any interval of `df_forbidden`.

    Args:
        df_forbidden: DataFrame with 'region', 'start' and 'end' columns.
        chr_names_and_lengths: Mapping of chromosome name to length.
        offset_from_end: Number of positions at the chromosome end that must
            stay free after the returned position.

    Returns:
        Tuple ``(chromosome name, position)`` with the position in
        ``1 .. length - offset_from_end`` (inclusive).

    Raises:
        ValueError: from ``np.random.randint`` when
            ``length - offset_from_end`` is not positive.
    """
    c = get_random_chr(chr_names_and_lengths)
    c_len = chr_names_and_lengths[c]
    upper = c_len - offset_from_end
    pos = np.random.randint(upper) + 1

    # Redraw with the SAME upper bound: the previous retry used the full
    # chromosome length, so a redrawn position could violate offset_from_end.
    while is_intersecting(c, pos, df_forbidden):
        pos = np.random.randint(upper) + 1

    return c, pos

# Quick check of the helpers defined in this cell: chromosome lengths, then
# one random position outside the exons (no margin kept at the chromosome end).
# Note: the following cell defines these helper names again, with
# get_chr_names_and_lengths taking the genome as an argument.
chr_names_and_lengths = get_chr_names_and_lengths()
print(chr_names_and_lengths)
get_random_pos(exon_df, chr_names_and_lengths, 0)

{'Chr1': 30427671, 'Chr2': 19698289, 'Chr3': 23459830, 'Chr4': 18585056, 'Chr5': 26975502}


('Chr1', 26722193)

In [30]:
import numpy as np
import pandas as pd

def get_chr_names_and_lengths(genome):
    """Map every chromosome name in `genome` to the length of its sequence.

    Args:
        genome: Mapping of chromosome name to sequence.

    Returns:
        dict of chromosome name to sequence length.

    Raises:
        ValueError: if any sequence is empty.
    """
    chr_lengths = dict((name, len(seq)) for name, seq in genome.items())

    # a length of 0 means an empty sequence, which is rejected
    if 0 in chr_lengths.values():
        raise ValueError("All chromosome lengths must be greater than 0.")

    return chr_lengths

def get_random_chr(chr_names_and_lengths):
    """Draw one chromosome name with probability proportional to its length.

    Args:
        chr_names_and_lengths: Mapping of chromosome name to length.

    Returns:
        The drawn name, as returned by ``np.random.choice`` over a NumPy
        array of the mapping's keys.
    """
    labels = np.array(list(chr_names_and_lengths.keys()))
    sizes = np.array(list(chr_names_and_lengths.values()))
    # longer chromosomes are proportionally more likely to be picked
    weights = sizes / sizes.sum()
    return np.random.choice(labels, p=weights)

def is_intersecting(c, pos, df_forbidden):
    """Tell whether position `pos` on region `c` lies inside a forbidden interval.

    Args:
        c: Region (chromosome) name.
        pos: Position to test.
        df_forbidden: DataFrame with 'region', 'start' and 'end' columns;
            both interval bounds count as inside.

    Returns:
        True-like if at least one row on region `c` satisfies
        start <= pos <= end.
    """
    on_region = df_forbidden['region'] == c
    starts_at_or_before = df_forbidden['start'] <= pos
    ends_at_or_after = df_forbidden['end'] >= pos
    return (on_region & starts_at_or_before & ends_at_or_after).any()

def get_random_pos(df_forbidden: pd.DataFrame, chr_names_and_lengths, offset_from_end):
    """Rejection-sample a (chromosome, position) pair outside forbidden intervals.

    Each attempt draws a new chromosome and a new position in
    ``1 .. length - offset_from_end`` (inclusive); attempts repeat until the
    position is not inside any interval of `df_forbidden`.

    Args:
        df_forbidden: DataFrame with 'region', 'start' and 'end' columns.
        chr_names_and_lengths: Mapping of chromosome name to length.
        offset_from_end: Number of positions at the chromosome end that must
            stay free after the returned position.

    Returns:
        Tuple ``(chromosome name, position)``.
    """
    accepted = None
    while accepted is None:
        chrom = get_random_chr(chr_names_and_lengths)
        last_allowed = chr_names_and_lengths[chrom] - offset_from_end

        # randint's upper bound is exclusive, hence the + 1
        candidate = np.random.randint(1, last_allowed + 1)

        if is_intersecting(chrom, candidate, df_forbidden):
            continue
        accepted = (chrom, candidate)
    return accepted

# Quick check of the helpers: chromosome lengths, then one random position
# outside the exons (no margin kept at the chromosome end).
chr_names_and_lengths = get_chr_names_and_lengths(genome)
print(chr_names_and_lengths)
random_chromosome, random_position = get_random_pos(exon_df, chr_names_and_lengths, 0)
print(random_chromosome, random_position)

{'Chr1': 30427671, 'Chr2': 19698289, 'Chr3': 23459830, 'Chr4': 18585056, 'Chr5': 26975502}
Chr5 3280448


In [31]:
# Sample one negative region per exon, with the same length as that exon.
# Seed the global NumPy RNG so the sampled regions (and negative.csv) are
# reproducible across runs.
np.random.seed(42)

num_seqs = len(exon_df)
# Exon lengths computed once, vectorised, instead of rebuilding a row with
# exon_df.iloc[i] on every retry of the inner loop.
seq_lengths = (exon_df['end'] - exon_df['start']).tolist()
negative_samples = [None] * num_seqs
for i in range(num_seqs):
    seq_length = seq_lengths[i]
    while True:
        # NOTE(review): get_random_pos only checks that `start` itself is
        # outside the exons; the window start..end may still reach into an
        # exon — confirm this is acceptable for the negative class.
        chrom, start = get_random_pos(exon_df, chr_names_and_lengths, seq_length)
        end = start + seq_length
        seq = genome[chrom][start:end]
        # reject windows containing unknown bases
        if 'N' not in seq.upper():
            negative_samples[i] = [chrom, start, end, '+']
            break
    if i % 3000 == 0:
        print(f"Generated {i} negative samples")
neg_df = pd.DataFrame(negative_samples, columns=['region', 'start', 'end', 'strand'])
neg_df

Generated 0 negative samples
Generated 3000 negative samples
Generated 6000 negative samples
Generated 9000 negative samples
Generated 12000 negative samples
Generated 15000 negative samples
Generated 18000 negative samples
Generated 21000 negative samples
Generated 24000 negative samples
Generated 27000 negative samples
Generated 30000 negative samples
Generated 33000 negative samples
Generated 36000 negative samples
Generated 39000 negative samples
Generated 42000 negative samples
Generated 45000 negative samples
Generated 48000 negative samples
Generated 51000 negative samples
Generated 54000 negative samples
Generated 57000 negative samples
Generated 60000 negative samples
Generated 63000 negative samples
Generated 66000 negative samples
Generated 69000 negative samples
Generated 72000 negative samples
Generated 75000 negative samples
Generated 78000 negative samples
Generated 81000 negative samples
Generated 84000 negative samples
Generated 87000 negative samples
Generated 90000 n

Unnamed: 0,region,start,end,strand
0,Chr4,9698114,9698396,+
1,Chr2,13818447,13818727,+
2,Chr1,23336566,23336685,+
3,Chr4,6120321,6120710,+
4,Chr2,1588104,1588256,+
...,...,...,...,...
207460,Chr2,589686,590123,+
207461,Chr1,15297434,15299017,+
207462,Chr1,583489,583824,+
207463,Chr1,57257,57574,+


In [32]:
# Name the integer index "id" so negative.csv has the same column layout as
# positive.csv (id, region, start, end, strand).
neg_df.index.name = "id"
neg_df.to_csv("negative.csv", index=True)

In [None]:
# Reload the coordinate tables written earlier; read_csv turns the saved
# "id" index into a regular column.
exons = pd.read_csv("positive.csv")
non_exons = pd.read_csv("negative.csv")

# Prefix every region name with "chr" (e.g. 'Chr1' becomes 'chrChr1').
# NOTE(review): presumably this matches how the dataset loader renames the
# reference sequences — verify against the loader before publishing.
exons['region'] = "chr" + exons['region']
non_exons['region'] = "chr" + non_exons['region']

## Train/test split

In [None]:
# Shuffled train/test split of the positive set; the fixed random_state keeps
# the split reproducible.
train_exons, test_exons = train_test_split(exons, shuffle=True, random_state=42)
train_exons.shape, test_exons.shape

((155598, 5), (51867, 5))

In [None]:
# Same split settings for the negative set
train_nonexons, test_nonexons = train_test_split(non_exons, shuffle=True, random_state=42)
train_nonexons.shape, test_nonexons.shape

((155598, 5), (51867, 5))

## YAML file

In [37]:
# Root folder of the dataset being generated, with one subfolder per split
BASE_FILE_PATH = Path("../../datasets/demo_arabidopsis_exons/")

# parents=True creates the dataset folder itself when it is missing;
# exist_ok=True lets the cell be re-run without raising FileExistsError.
(BASE_FILE_PATH / 'train').mkdir(parents=True, exist_ok=True)
(BASE_FILE_PATH / 'test').mkdir(parents=True, exist_ok=True)

In [39]:
# Dataset description: both classes point at the same reference FASTA
reference_url = 'https://raw.githubusercontent.com/framazan/files/master/Athaliana_167_TAIR9.fa.gz'
desc = {
    'version': 0,
    'classes': {
        # a separate dict per class, so the YAML contains no aliases
        label: {'type': 'fa.gz', 'url': reference_url}
        for label in ('positive', 'negative')
    },
}

with open(BASE_FILE_PATH / 'metadata.yaml', 'w') as fw:
    yaml.dump(desc, fw)

desc

{'version': 0,
 'classes': {'positive': {'type': 'fa.gz',
   'url': 'https://raw.githubusercontent.com/framazan/files/master/Athaliana_167_TAIR9.fa.gz'},
  'negative': {'type': 'fa.gz',
   'url': 'https://raw.githubusercontent.com/framazan/files/master/Athaliana_167_TAIR9.fa.gz'}}}

## CSV files

In [None]:
# Write one gzipped CSV per (split, class) pair, without the DataFrame index
split_frames = {
    ('train', 'positive'): train_exons,
    ('train', 'negative'): train_nonexons,
    ('test', 'positive'): test_exons,
    ('test', 'negative'): test_nonexons,
}
for (split_name, label), frame in split_frames.items():
    frame.to_csv(BASE_FILE_PATH / split_name / f'{label}.csv.gz', index=False, compression='gzip')

## Cleaning

In [None]:
!rm positive.csv negative.csv