In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#from genomic_benchmarks.seq2loc import fasta2loc
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import yaml
import gzip
from Bio import SeqIO

def _fastagz2dict(fasta_path, fasta_total=None, stop_id=None, region_name_transform=lambda x: x):
    # load gzipped fasta into dictionary
    fasta = {}

    with gzip.open(fasta_path, "rt") as handle:
        for record in tqdm(SeqIO.parse(handle, "fasta"), total=fasta_total):
            fasta[region_name_transform(record.id)] = str(record.seq)
            if stop_id and (record.id == stop_id):
                # stop, do not read small contigs
                break
    return fasta

In [3]:
from tqdm.autonotebook import tqdm as tdm

def fasta2loc(fasta_path, ref_dict, use_seq_ids=True):

    tree = {}
    nseqs = 0

    # building tree for seq searching
    for seq in SeqIO.parse(open(fasta_path, "r"), "fasta"):
        s = str(seq.seq)
        rev = str(seq.seq.reverse_complement())
        if use_seq_ids:
            sname = seq.name
        else:
            sname = s
        nseqs += 1

        _update_tree(tree, s, sname, "+")
        _update_tree(tree, rev, sname, "-")

    print(f"{nseqs} sequences read and parsed.")

    results = {}

    for chrom in tdm(ref_dict):
        curr_positions = []
        # print(f"Processing chrom {chrom}.")

        for i, c in tdm(enumerate(ref_dict[chrom]), total=len(ref_dict[chrom]), leave=False):

            prev_positions = curr_positions + [tree]
            curr_positions = []

            for pos in prev_positions:
                if c in pos:
                    pos = pos[c]
                    curr_positions.append(pos)

                    if "terminal" in pos:
                        results[pos["terminal"][0]] = (chrom, i - pos["terminal"][2] + 1, i + 1, pos["terminal"][1])

    print(f"{len(results.keys())} sequences found in the reference.")

    return results


def _update_tree(root, seq_str, seq_name, direction):
    # updates tree in `root` with a sequence `seq_str`
    position = root

    for c in seq_str:
        if c in position:
            position = position[c]
        else:
            position[c] = {}
            position = position[c]
    position["terminal"] = (seq_name, direction, len(seq_str))

  from tqdm.autonotebook import tqdm as tdm


## Load genomic reference and download data from GitHub

In [4]:
genome = _fastagz2dict("./Athaliana_167_TAIR9.fa.gz",
                      stop_id='Chr5')
genome.keys()

0it [00:00, ?it/s]

dict_keys(['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5'])

## Get promoters

### A few checks

In [6]:
def gff3_to_dict(gff3_path):
    gff_dict = {}
    with open(gff3_path, "r") as gff3_file:
        for line in gff3_file:
            # Skip comments and metadata lines
            if line.startswith("#"):
                continue
            
            # Parse the line into fields
            parts = line.strip().split("\t")
            if len(parts) < 9:
                continue
            
            chrom, source, feature_type, start, end, score, strand, phase, attributes = parts

            if int(end) - int(start) < 5:
                continue
            
            # Parse the attributes field into a dictionary
            attr_dict = {}
            for attr in attributes.split(";"):
                if "=" in attr:
                    key, value = attr.split("=", 1)
                    attr_dict[key] = value
            
            # Use the 'ID' attribute as the key if it exists
            feature_id = attr_dict.get("ID", f"{chrom}:{start}-{end}_{feature_type}")
            
            if feature_type == 'three_prime_UTR' or feature_type == 'five_prime_UTR':
                # Store the feature's details in the dictionary
                gff_dict[feature_id] = (chrom, int(start), int(end),strand)
    
    print(f"{len(gff_dict)} UTRS parsed from the GFF3 file.")
    return gff_dict

# Example usage:
gff_dict = gff3_to_dict("Athaliana_167_TAIR10.gene_exons.gff3")
print(dict(list(gff_dict.items())[0: 5]))

64216 UTRS parsed from the GFF3 file.
{'AT1G01010.1.TAIR10.five_prime_UTR.1': ('Chr1', 3631, 3759, '+'), 'AT1G01010.1.TAIR10.three_prime_UTR.1': ('Chr1', 5631, 5899, '+'), 'AT1G01020.1.TAIR10.five_prime_UTR.1': ('Chr1', 8667, 8737, '-'), 'AT1G01020.1.TAIR10.three_prime_UTR.1': ('Chr1', 6437, 6914, '-'), 'AT1G01020.1.TAIR10.three_prime_UTR.2': ('Chr1', 5928, 6263, '-')}


In [7]:
import itertools
#print 3 randomly selected keys
test_keys = list(itertools.islice(gff_dict, 3))
print(test_keys)

['AT1G01010.1.TAIR10.five_prime_UTR.1', 'AT1G01010.1.TAIR10.three_prime_UTR.1', 'AT1G01020.1.TAIR10.five_prime_UTR.1']


In [8]:
gff_dict[test_keys[1]]

('Chr1', 5631, 5899, '+')

In [9]:
seq1 = genome['Chr1'][5631:5899]
seq1

'AGGTCAAATCGGATTCTTGCTCAAAATTTGTATTTCTTAGAATGTGTGTTTTTTTTTGTTTTTTTTTCTTTGCTCTGTTTTCTCGCTCCGGAAAAGTTTGAAGTTATATTTTATTAGTATGTAAAGAAGAGAAAAAGGGGGAAAGAAGAGAGAAGAAAAATGCAGAAAATCATATATATGAATTGGAAAAAAGTATATGTAATAATAATTAGTGCATCGTTTTGTGGTGTAGTTTATATAAATAAAGTGATATATAGTCTTGTATAAG'

In [10]:
from Bio.Seq import Seq

def _rev(seq, strand):
    # reverse complement
    if strand == '-':
        return str(Seq(seq).reverse_complement())
    else:
        return seq

In [11]:
_rev(seq1, "-")

'CTTATACAAGACTATATATCACTTTATTTATATAAACTACACCACAAAACGATGCACTAATTATTATTACATATACTTTTTTCCAATTCATATATATGATTTTCTGCATTTTTCTTCTCTCTTCTTTCCCCCTTTTTCTCTTCTTTACATACTAATAAAATATAACTTCAAACTTTTCCGGAGCGAGAAAACAGAGCAAAGAAAAAAAAACAAAAAAAAACACACATTCTAAGAAATACAAATTTTGAGCAAGAATCCGATTTGACCT'

In [12]:
utr_df = pd.DataFrame.from_dict(gff_dict, orient='index', columns=['region','start','end','strand']).rename_axis('id')
utr_df.to_csv("positive.csv")
utr_df.head()

Unnamed: 0_level_0,region,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AT1G01010.1.TAIR10.five_prime_UTR.1,Chr1,3631,3759,+
AT1G01010.1.TAIR10.three_prime_UTR.1,Chr1,5631,5899,+
AT1G01020.1.TAIR10.five_prime_UTR.1,Chr1,8667,8737,-
AT1G01020.1.TAIR10.three_prime_UTR.1,Chr1,6437,6914,-
AT1G01020.1.TAIR10.three_prime_UTR.2,Chr1,5928,6263,-


## Get non-UTRS

In [13]:
import numpy as np
import pandas as pd

def get_chr_names_and_lengths(genome):
    # Use a dictionary comprehension for faster execution
    chr_lengths = {chromosome: len(seq) for chromosome, seq in genome.items()}

    # Ensure all lengths are non-zero
    if not all(chr_lengths.values()):
        raise ValueError("All chromosome lengths must be greater than 0.")

    return chr_lengths

def get_random_chr(chr_names_and_lengths):
    # Pre-compute probabilities and use NumPy for sampling
    chr_names = np.array(list(chr_names_and_lengths.keys()))
    chr_lengths = np.array(list(chr_names_and_lengths.values()))
    chr_probs = chr_lengths / chr_lengths.sum()
    
    # Use np.random.choice for efficient sampling
    return np.random.choice(chr_names, p=chr_probs)

def is_intersecting(c, pos, df_forbidden):
    # Use vectorized operations with NumPy and Pandas
    mask = (df_forbidden['region'] == c) & (df_forbidden['start'] <= pos) & (df_forbidden['end'] >= pos)
    return mask.any()

def get_random_pos(df_forbidden: pd.DataFrame, chr_names_and_lengths, offset_from_end):
    while True:
        # Select a random chromosome
        c = get_random_chr(chr_names_and_lengths)
        c_len = chr_names_and_lengths[c]

        # Generate a random position within the valid range
        pos = np.random.randint(1, c_len - offset_from_end + 1)

        # Check if the position intersects with forbidden regions
        if not is_intersecting(c, pos, df_forbidden):
            return c, pos

chr_names_and_lengths = get_chr_names_and_lengths(genome)
print(chr_names_and_lengths)
random_chromosome, random_position = get_random_pos(utr_df, chr_names_and_lengths, 0)
print(random_chromosome, random_position)

{'Chr1': 30427671, 'Chr2': 19698289, 'Chr3': 23459830, 'Chr4': 18585056, 'Chr5': 26975502}
Chr4 15014359


In [14]:
num_seqs = len(utr_df)
negative_samples = [None] * num_seqs
for i in tqdm(range(num_seqs), desc="Generating negative samples", mininterval=6):
    while True:
        seq_length = utr_df.iloc[i]['end'] - utr_df.iloc[i]['start']
        chrom, start = get_random_pos(utr_df, chr_names_and_lengths, seq_length)
        end = start + seq_length
        seq = genome[chrom][start:end]
        if 'N' not in seq.upper():
            negative_samples[i] = [chrom, start, end, '+']
            break
neg_df = pd.DataFrame(negative_samples, columns=['region', 'start', 'end', 'strand'])
neg_df

Generating negative samples:   0%|          | 0/64216 [00:00<?, ?it/s]

Unnamed: 0,region,start,end,strand
0,Chr4,3436630,3436758,+
1,Chr1,7832231,7832499,+
2,Chr3,12565705,12565775,+
3,Chr1,15143773,15144250,+
4,Chr4,17970124,17970459,+
...,...,...,...,...
64211,Chr4,7513821,7513913,+
64212,Chr5,18161672,18161834,+
64213,Chr3,721643,722291,+
64214,Chr2,18746219,18746464,+


In [16]:
neg_df.index.name = "id"
neg_df.to_csv("negative.csv", index=True)

In [17]:
utrs_csv = pd.read_csv("positive.csv")
nonutrs_csv = pd.read_csv("negative.csv")

## Train/test split

In [18]:
train_utrs, train_utrs = train_test_split(utrs_csv, shuffle=True, random_state=42)
train_utrs.shape, train_utrs.shape

((16054, 5), (16054, 5))

In [19]:
train_nonutrs, test_nonutrs = train_test_split(nonutrs_csv, shuffle=True, random_state=42)
train_nonutrs.shape, test_nonutrs.shape

((48162, 5), (16054, 5))

## YAML file

In [20]:
BASE_FILE_PATH = Path("../../datasets/demo_arabidopsis_utrs/")

(BASE_FILE_PATH / 'train').mkdir()
(BASE_FILE_PATH / 'test').mkdir()

In [21]:
with open(BASE_FILE_PATH / 'metadata.yaml', 'w') as fw:
    desc = {
        'version': 0,
        'classes': {
            'positive': {
                'type': 'fa.gz',
                'url': 'https://raw.githubusercontent.com/framazan/files/master/Athaliana_167_TAIR9.fa.gz',
            },    
            'negative': {
                'type': 'fa.gz',
                'url': 'https://raw.githubusercontent.com/framazan/files/master/Athaliana_167_TAIR9.fa.gz',
            }
        }
    }
    
    yaml.dump(desc, fw)

desc

{'version': 0,
 'classes': {'positive': {'type': 'fa.gz',
   'url': 'https://raw.githubusercontent.com/framazan/files/master/Athaliana_167_TAIR9.fa.gz'},
  'negative': {'type': 'fa.gz',
   'url': 'https://raw.githubusercontent.com/framazan/files/master/Athaliana_167_TAIR9.fa.gz'}}}

## CSV files

In [22]:
train_utrs.to_csv(BASE_FILE_PATH / 'train' / 'positive.csv.gz', index=False, compression='gzip')
train_nonutrs.to_csv(BASE_FILE_PATH / 'train' / 'negative.csv.gz', index=False, compression='gzip')
train_utrs.to_csv(BASE_FILE_PATH / 'test' / 'positive.csv.gz', index=False, compression='gzip')
test_nonutrs.to_csv(BASE_FILE_PATH / 'test' / 'negative.csv.gz', index=False, compression='gzip')

## Cleaning

In [23]:
!rm positive.csv negative.csv