In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#from genomic_benchmarks.seq2loc import fasta2loc
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import yaml
import gzip
from Bio import SeqIO

def _fastagz2dict(fasta_path, fasta_total=None, stop_id=None, region_name_transform=lambda x: x):
    """Read a gzip-compressed FASTA file into a ``{region name: sequence}`` dict.

    Parameters
    ----------
    fasta_path : path of the gzipped FASTA file (opened in text mode).
    fasta_total : optional number of records, forwarded to tqdm as ``total``.
    stop_id : optional record id; reading stops right after the record with
        this (untransformed) id has been stored.
    region_name_transform : callable mapping a record id to the dict key.

    Returns
    -------
    dict whose keys are the transformed record ids and whose values are the
    sequences as plain ``str``.
    """
    sequences = {}

    with gzip.open(fasta_path, "rt") as fasta_file:
        records = SeqIO.parse(fasta_file, "fasta")
        for rec in tqdm(records, total=fasta_total):
            seq_text = str(rec.seq)
            sequences[region_name_transform(rec.id)] = seq_text
            if not stop_id:
                continue
            if rec.id == stop_id:
                # stop, do not read small contigs
                break

    return sequences

In [3]:
from tqdm.autonotebook import tqdm as tdm

def fasta2loc(fasta_path, ref_dict, use_seq_ids=True):
    """Find the genomic location of every sequence of a FASTA file.

    All query sequences and their reverse complements are inserted into one
    character trie (nested dicts, see `_update_tree`). Every reference sequence
    is then scanned once, character by character, while keeping the list of
    trie nodes that are still matching; reaching a node with a "terminal"
    entry means a complete query sequence ends at the current position.

    Parameters
    ----------
    fasta_path : path of an uncompressed FASTA file with the query sequences.
    ref_dict : dict mapping a region (chromosome) name to its sequence string.
    use_seq_ids : if True, results are keyed by the FASTA record name;
        otherwise by the (forward) sequence string itself.

    Returns
    -------
    dict mapping each found key to ``(region, start, end, strand)``.
    Coordinates are 0-based and end-exclusive, i.e. ``ref_dict[region][start:end]``
    is the matched text (the query itself for "+", its reverse complement
    for "-"). When a key matches more than once, the last match scanned wins.
    """
    tree = {}
    nseqs = 0

    # building tree for seq searching
    # (the handle is closed as soon as all records are read)
    with open(fasta_path, "r") as handle:
        for seq in SeqIO.parse(handle, "fasta"):
            s = str(seq.seq)
            rev = str(seq.seq.reverse_complement())
            sname = seq.name if use_seq_ids else s
            nseqs += 1

            _update_tree(tree, s, sname, "+")
            _update_tree(tree, rev, sname, "-")

    print(f"{nseqs} sequences read and parsed.")

    results = {}

    for chrom in tdm(ref_dict):
        chrom_seq = ref_dict[chrom]
        # trie nodes of all partial matches that end at the previous character
        curr_positions = []

        for i, c in tdm(enumerate(chrom_seq), total=len(chrom_seq), leave=False):

            # every partial match may be extended; a new match may start at the root
            prev_positions = curr_positions + [tree]
            curr_positions = []

            for pos in prev_positions:
                if c in pos:
                    pos = pos[c]
                    curr_positions.append(pos)

                    if "terminal" in pos:
                        seq_name, strand, seq_len = pos["terminal"]
                        # i is the index of the last matched character
                        results[seq_name] = (chrom, i - seq_len + 1, i + 1, strand)

    print(f"{len(results)} sequences found in the reference.")

    return results


def _update_tree(root, seq_str, seq_name, direction):
    # updates tree in `root` with a sequence `seq_str`
    position = root

    for c in seq_str:
        if c in position:
            position = position[c]
        else:
            position[c] = {}
            position = position[c]
    position["terminal"] = (seq_name, direction, len(seq_str))

  from tqdm.autonotebook import tqdm as tdm


## Load genomic reference and download data from GitHub

In [5]:
# 7 = number of records in the reference FASTA; it only sizes the progress bar
genome = _fastagz2dict("./Athaliana_167_TAIR9.fa.gz", fasta_total=7)
genome.keys()

  0%|          | 0/7 [00:00<?, ?it/s]

dict_keys(['Chr1', 'Chr2', 'Chr3', 'Chr4', 'Chr5', 'ChrM', 'ChrC'])

## Get coding sequences

In [7]:
# slow!
import shutil

# Write an uncompressed copy of the CDS FASTA next to the archive
# (the sequence search below opens a plain-text file).
with gzip.open('./Athaliana_167_TAIR10.cds.fa.gz', 'rb') as f_in, \
        open('./Athaliana_167_TAIR10.cds.fa', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [9]:
# Locate every CDS in the genome; with the default use_seq_ids=True the result is
# keyed by FASTA record name -> (region, start, end, strand).
# NOTE(review): in the recorded run only 6954 of the 35386 sequences were found -
# presumably because a CDS spliced from several exons is not contiguous in the
# genome; confirm this is the intended subset.
coding_seqs = fasta2loc("./Athaliana_167_TAIR10.cds.fa", genome)

35386 sequences read and parsed.


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/30427671 [00:00<?, ?it/s]

  0%|          | 0/19698289 [00:00<?, ?it/s]

  0%|          | 0/23459830 [00:00<?, ?it/s]

  0%|          | 0/18585056 [00:00<?, ?it/s]

  0%|          | 0/26975502 [00:00<?, ?it/s]

  0%|          | 0/366924 [00:00<?, ?it/s]

  0%|          | 0/154478 [00:00<?, ?it/s]

6954 sequences found in the reference.


### A few checks

In [10]:
len(coding_seqs.keys())

6954

In [14]:
import itertools
# print the first 3 keys (dict insertion order) - islice does not sample randomly
test_keys = list(itertools.islice(coding_seqs.keys(), 3))
print(test_keys)

['AT1G01030.1', 'AT1G01073.1', 'AT1G01115.1']


In [20]:
coding_seqs[test_keys[1]]

('Chr1', 44676, 44787, '+')

In [21]:
genome['Chr1'][44676:44787]

'ATGCTTATCAGTATCAGCCCACTGATATTTGTGATACCAGTATCATCTGACGTGGCTTCTTCTGATTGGTTACATTTGACAAAAGCAAAAAATATTATATATATTTATTAA'

In [19]:
coding_seqs['AT1G01115.1']

('Chr1', 56623, 56740, '+')

In [10]:
from Bio.Seq import Seq

def _rev(seq, strand):
    # reverse complement
    if strand == '-':
        return str(Seq(seq).reverse_complement())
    else:
        return seq

In [11]:
# NOTE(review): the genome loaded above is keyed 'Chr1' ... 'ChrC', so genome['1']
# looks like a leftover from a different reference - confirm and update or remove.
_rev(genome['1'][959200:959451], "-")

'GCATCTGGGCCCCACCGGGGCTGCCCGCACCGAGCACGCGAACGCGCCCTCCCGCCCTGAGGCCGCCGGCGTTGCGGTCGGAGAACCATAGAGCCACTCGGCTGGGCGTGGCGCGGCGGGGCGGGGAAAGGGGCGGGGCCTGGGCGGCGGAAGTGCGCAGCCGCGCGGCATTCTGGGGCCGGAAGTGGGGTGCACGCTTCGGGTTGGTGTCATGGCAGCTGCGGGGAGCCGCAAGAGGTAAGCCGCGGGTC'

In [12]:
# Tabulate the located sequences (id -> region, start, end, strand) and save them
# as the positive class.
# NOTE(review): `promoters` is not assigned in any visible cell of this notebook
# (the sequences located above are stored in `coding_seqs`) - confirm which was meant.
promoters_df = pd.DataFrame.from_dict(promoters, orient='index', columns=['region','start','end','strand']).rename_axis('id')
promoters_df.to_csv("positive.csv")
promoters_df.head()

Unnamed: 0_level_0,region,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FP000001,1,925542,925793,+
FP000002,1,939072,939323,+
FP000003,1,959200,959451,-
FP000004,1,960383,960634,+
FP000005,1,966281,966532,+


## Get non-promoters

In [13]:
# slow!
# use_seq_ids=False keys the result by the sequence string itself instead of the
# FASTA record name.
# NOTE(review): ./human_nonprom_big.fa is not created in the visible part of this
# notebook, and it is searched against the genome loaded above - confirm that this
# cell still applies to the current reference.
nonpromoters = fasta2loc("./human_nonprom_big.fa", genome, use_seq_ids=False)

27731 sequences read and parsed.


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/248956422 [00:00<?, ?it/s]

  0%|          | 0/242193529 [00:00<?, ?it/s]

  0%|          | 0/198295559 [00:00<?, ?it/s]

  0%|          | 0/190214555 [00:00<?, ?it/s]

  0%|          | 0/181538259 [00:00<?, ?it/s]

  0%|          | 0/170805979 [00:00<?, ?it/s]

  0%|          | 0/159345973 [00:00<?, ?it/s]

  0%|          | 0/145138636 [00:00<?, ?it/s]

  0%|          | 0/138394717 [00:00<?, ?it/s]

  0%|          | 0/133797422 [00:00<?, ?it/s]

  0%|          | 0/135086622 [00:00<?, ?it/s]

  0%|          | 0/133275309 [00:00<?, ?it/s]

  0%|          | 0/114364328 [00:00<?, ?it/s]

  0%|          | 0/107043718 [00:00<?, ?it/s]

  0%|          | 0/101991189 [00:00<?, ?it/s]

  0%|          | 0/90338345 [00:00<?, ?it/s]

  0%|          | 0/83257441 [00:00<?, ?it/s]

  0%|          | 0/80373285 [00:00<?, ?it/s]

  0%|          | 0/58617616 [00:00<?, ?it/s]

  0%|          | 0/64444167 [00:00<?, ?it/s]

  0%|          | 0/46709983 [00:00<?, ?it/s]

  0%|          | 0/50818468 [00:00<?, ?it/s]

  0%|          | 0/156040895 [00:00<?, ?it/s]

  0%|          | 0/57227415 [00:00<?, ?it/s]

  0%|          | 0/16569 [00:00<?, ?it/s]

16474 sequences found in the reference.


In [14]:
nonpromoters_df = pd.DataFrame.from_dict(nonpromoters, orient='index', columns=['region','start','end','strand']).rename_axis('id')
nonpromoters_df.head()

Unnamed: 0_level_0,region,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AGAGCAGAAGACCGAAAGGTGAGTCGGCCTGCGGACTCTTCCGGCCCGAACTTCTCTTACCTACCCCGCGCTCCCCGGTGCAGCCGGGCTGTGGAAGGCTTGCAGGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCG,1,67685520,67685771,+
TCGGCCTGCGGACTCTTCCGGCCCGAACTTCTCTTACCTACCCCGCGCTCCCCGGTGCAGCCGGGCTGTGGAAGGCTTGCAGGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCGCTGCCCGTGCGGCTCCCGTGGCT,1,67685543,67685794,+
CGGACTCTTCCGGCCCGAACTTCTCTTACCTACCCCGCGCTCCCCGGTGCAGCCGGGCTGTGGAAGGCTTGCAGGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCGCTGCCCGTGCGGCTCCCGTGGCTGGCAGCCT,1,67685551,67685802,+
CCCCGGTGCAGCCGGGCTGTGGAAGGCTTGCAGGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCGCTGCCCGTGCGGCTCCCGTGGCTGGCAGCCTGTGGCAGGGGCACTCTCGGGACTTCTCACGGGACGCCCGGT,1,67685592,67685843,+
GGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCGCTGCCCGTGCGGCTCCCGTGGCTGGCAGCCTGTGGCAGGGGCACTCTCGGGACTTCTCACGGGACGCCCGGTCCTTGGGCGTGCAGGGGTCATGGGGGGTGACG,1,67685624,67685875,+


In [15]:
# check one sequence
genome['1'][67685592:67685843] == nonpromoters_df.index[3]

True

In [16]:
# Replace the sequence-string index by a running integer `id` column.
# Written without inplace mutation so the cell can be re-run safely: the old index
# is discarded, any `id` column left by a previous run is dropped, and a fresh
# 0..n-1 `id` is inserted as the first column.
nonpromoters_df = (
    nonpromoters_df
    .reset_index(drop=True)
    .drop(columns='id', errors='ignore')
)
nonpromoters_df.insert(0, 'id', list(range(nonpromoters_df.shape[0])))
nonpromoters_df.head()

Unnamed: 0,id,region,start,end,strand
0,0,1,67685520,67685771,+
1,1,1,67685543,67685794,+
2,2,1,67685551,67685802,+
3,3,1,67685592,67685843,+
4,4,1,67685624,67685875,+


In [17]:
nonpromoters_df.to_csv("negative.csv", index=False)
len(nonpromoters.keys())

16474

In [18]:
# Reload both tables from the intermediate CSV files and prefix the region names
# with "chr".
# NOTE(review): regions returned by fasta2loc are the keys of the genome dict
# ('Chr1', ... for the reference loaded above), which would become 'chrChr1' here -
# confirm the prefix is still wanted.
promoters_df = pd.read_csv("positive.csv")
nonpromoters_df = pd.read_csv("negative.csv")

promoters_df['region'] = "chr" + promoters_df['region']
nonpromoters_df['region'] = "chr" + nonpromoters_df['region']

## Train/test split

In [19]:
# Shuffle and split the positive class. No test_size is given, so scikit-learn's
# default split applies (25 % test, matching the recorded shapes); random_state=42
# makes the split reproducible.
train_promoters, test_promoters = train_test_split(promoters_df, shuffle=True, random_state=42)
train_promoters.shape, test_promoters.shape

((14742, 5), (4915, 5))

In [20]:
# Same shuffle/split for the negative class (default 25 % test, fixed seed).
train_nonpromoters, test_nonpromoters = train_test_split(nonpromoters_df, shuffle=True, random_state=42)
train_nonpromoters.shape, test_nonpromoters.shape

((12355, 5), (4119, 5))

## YAML file

In [23]:
BASE_FILE_PATH = Path("../../datasets/human_nontata_promoters/")

# copied from https://stackoverflow.com/a/57892171
def rm_tree(pth: Path):
    """Recursively delete the directory `pth` together with everything inside it.

    Only real sub-directories are descended into. Every other entry - regular
    files and symbolic links, including links to directories and dangling
    links - is unlinked, so nothing outside `pth` is ever removed.

    Errors from the underlying pathlib calls (e.g. `pth` is not a directory,
    missing permissions) propagate to the caller.
    """
    for child in pth.iterdir():
        # is_dir() follows symlinks, so links must be excluded explicitly:
        # recursing into a linked directory would delete the link target's
        # contents and then fail on rmdir() of the link itself.
        if child.is_dir() and not child.is_symlink():
            rm_tree(child)
        else:
            child.unlink()
    pth.rmdir()

# Start from an empty dataset directory: wipe any previous copy, then recreate the
# root with its train/ and test/ sub-directories.
# mkdir() is called without parents=True, so the parent directory must already exist.
# NOTE(review): BASE_FILE_PATH is the human_nontata_promoters directory; since this
# cell deletes it, confirm that it is the intended target for the data built in
# this notebook.
if BASE_FILE_PATH.exists():
    rm_tree(BASE_FILE_PATH)
    
BASE_FILE_PATH.mkdir()
(BASE_FILE_PATH / 'train').mkdir()
(BASE_FILE_PATH / 'test').mkdir()

In [24]:
# Write the dataset descriptor to metadata.yaml: a version number plus, for each
# class, the file type, download URL and an 'extra_processing' tag.
# NOTE(review): both classes still point at the Ensembl human GRCh38 genome, while
# the reference loaded at the top of this notebook is ./Athaliana_167_TAIR9.fa.gz -
# confirm the URL and the 'extra_processing' value before publishing.
with open(BASE_FILE_PATH / 'metadata.yaml', 'w') as fw:
    desc = {
        'version': 0,
        'classes': {
            'positive': {
                'type': 'fa.gz',
                'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
                'extra_processing': 'ENSEMBL_HUMAN_GENOME' 
            },    
            'negative': {
                'type': 'fa.gz',
                'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
                'extra_processing': 'ENSEMBL_HUMAN_GENOME' 
            }
        }
    }
    
    yaml.dump(desc, fw)

# show the descriptor that was written as the cell output
desc

{'version': 0,
 'classes': {'positive': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'},
  'negative': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'}}}

## CSV files

In [25]:
# One gzip-compressed CSV per split and class: <split>/positive.csv.gz and
# <split>/negative.csv.gz, written without the DataFrame index.
for split_name, positive_df, negative_df in (
    ('train', train_promoters, train_nonpromoters),
    ('test', test_promoters, test_nonpromoters),
):
    split_dir = BASE_FILE_PATH / split_name
    positive_df.to_csv(split_dir / 'positive.csv.gz', index=False, compression='gzip')
    negative_df.to_csv(split_dir / 'negative.csv.gz', index=False, compression='gzip')

## Cleaning

In [26]:
!rm human_non_tata.fa human_nonprom_big.fa
!rm positive.csv negative.csv