In [None]:
# Installs
# One-time setup: installs the third-party packages imported by the next cell.
# NOTE(review): versions are not pinned, so a fresh environment may resolve to
# releases with different APIs than the ones this notebook was written against.
%pip install biopython
%pip install pandas
%pip install numpy
%pip install catboost

In [2]:
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
import os
import multiprocessing
from catboost import CatBoostClassifier

In [3]:
# => Utility functions

# Function to read text files of sequences
def read_seq_file(in_path):
    """Read a text file and return its lines as a list of strings.

    Parameters
    ----------
    in_path : str
        Path to the file; ``~`` is expanded and the path is made absolute.

    Returns
    -------
    list of str
        One entry per line of the file, without line terminators. Blank lines
        are kept as empty strings.
    """
    resolved_path = os.path.abspath(os.path.expanduser(in_path))
    with open(resolved_path, "r") as handle:
        return handle.read().splitlines()


# Function to vectorize each sequence, produces 1d flattened vectors
def vectorize_sequence(seq):
    """One-hot encode a nucleotide sequence into a flat 1-D integer vector.

    Each character of ``seq`` becomes four consecutive values in the fixed
    column order A, C, G, T. Matching is case-sensitive; any character other
    than these four produces an all-zero group of four.

    Parameters
    ----------
    seq : str
        Sequence to encode.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``4 * len(seq)``.
    """
    alphabet = ["A", "C", "G", "T"]
    one_hot = pd.get_dummies(list(seq), dtype=int)
    # reindex drops columns for unexpected characters and adds zero-filled
    # columns for any base that never occurs in this sequence.
    one_hot = one_hot.reindex(columns=alphabet, fill_value=0)
    return one_hot.to_numpy().flatten()


# Function to generate an array of all vectorized sequences
def generate_sequences_vector_arr(sequences, threads):
    """Vectorize many sequences in parallel and stack them into one array.

    Parameters
    ----------
    sequences : list of str
        Sequences to encode with ``vectorize_sequence``.
        NOTE(review): presumably all sequences have the same length so the
        result is a regular 2-D array -- confirm against the input files.
    threads : int
        Number of worker processes in the pool.

    Returns
    -------
    numpy.ndarray
        ``int32`` array with one row per input sequence.
    """
    # The context manager shuts the workers down when the block exits; the
    # original never closed the pool, leaking `threads` processes per call.
    # pool.map blocks until every result is available, so nothing is lost.
    with multiprocessing.Pool(threads) as pool:
        vectors = pool.map(vectorize_sequence, sequences)
    return np.array(vectors, dtype=np.int32)


# Function to convert classification prediction into genomic intervals
def generate_genomic_locations(classes, proba, window_size):
    """Turn per-window class predictions into interval records.

    Parameters
    ----------
    classes : numpy.ndarray
        Predicted class per window; ``0`` means "no hit" and is skipped.
    proba : numpy.ndarray
        Per-window probabilities; row ``i`` is appended to the record for
        window ``i``.
    window_size : int
        Window length, used to compute the interval end.

    Returns
    -------
    list of list
        One ``[start, end, strand, *probabilities]`` record per non-zero
        prediction, where ``start = index + 1``, ``end = index + window_size``
        and strand is ``"+"`` for class ``1`` and ``"-"`` for any other
        non-zero class.
    """
    intervals = []
    for offset in range(classes.shape[0]):
        label = classes[offset]
        if label != 0:
            strand = "+" if label == 1 else "-"
            intervals.append([offset + 1, offset + window_size, strand, *proba[offset].tolist()])
    return intervals


# Function to convert locations into GFF format
def generate_gff_df(locations, seqid, score_filter=0.001):
    """Build a GFF-style table from predicted promoter intervals.

    Parameters
    ----------
    locations : list of list
        Records of ``[start, end, strand, p0, p1, p2]``. The three trailing
        values are labelled ``true_proba``, ``true_rc_proba`` and
        ``false_proba``, in that order.
    seqid : str
        Sequence identifier written to the ``seqid`` column and used in the
        generated feature ids.
    score_filter : float, optional
        Threshold: ``"+"`` records are kept only when
        ``true_proba <= score_filter`` and ``"-"`` records only when
        ``true_rc_proba <= score_filter``.

    Returns
    -------
    pandas.DataFrame
        Columns ``seqid, source, type, start, end, score, strand, phase,
        attribute``; kept ``"+"`` records come first, then ``"-"`` records,
        with a fresh 0-based index that is also used in the feature ids.
    """
    column_names = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attribute"]
    proba_names = ["true_proba", "true_rc_proba", "false_proba"]
    # NOTE(review): these labels assume the probability columns arrive in the
    # order (class 1, class -1, class 0). CatBoost orders predict_proba columns
    # by model.classes_ -- confirm, because the `<=` thresholds below depend on it.
    gff_df = pd.DataFrame(locations, columns=["start", "end", "strand"] + proba_names)

    keep_forward = (gff_df["strand"] == "+") & (gff_df["true_proba"] <= score_filter)
    keep_reverse = (gff_df["strand"] == "-") & (gff_df["true_rc_proba"] <= score_filter)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    gff_df = pd.concat([gff_df[keep_forward], gff_df[keep_reverse]], ignore_index=True)

    # Build every attribute string in one pass instead of writing cell by cell.
    attributes = []
    for idx, strand, true_p, true_rc_p, false_p in zip(
        gff_df.index, gff_df["strand"], gff_df["true_proba"], gff_df["true_rc_proba"], gff_df["false_proba"]
    ):
        feature_id = f"{seqid}_{'F' if strand == '+' else 'R'}_prom_{idx}"
        attributes.append(
            f"id={feature_id}"
            f";name={feature_id}"
            f";true_proba={true_p}"
            f";true_rc_proba={true_rc_p}"
            f";false_proba={false_p}"
        )

    gff_df["seqid"] = seqid
    gff_df["source"] = "ML_promoters_predictor"
    gff_df["type"] = "predicted_promoter"
    gff_df["score"] = "."
    gff_df["phase"] = "."
    gff_df["attribute"] = attributes
    # reindex both orders the GFF columns and discards the probability columns.
    return gff_df.reindex(columns=column_names)

In [4]:
# Inputs

# Genome to scan; parsed as FASTA in the loading cell
genome_file = "./GCF_000005845.2_ASM584v2_genomic.fa"
# Positive training examples, read as one sequence per line
true_bs_file = "./sigE_binding_sites.txt"
# Negative training examples, read as one sequence per line
false_bs_file = "./dummy_seq.txt"
# Directory where the output GFF is written
save_dir = "./"
# Passed to CatBoostClassifier(iterations=...)
iterations = 500
# Length (in bases) of the sliding window scanned across the genome
# NOTE(review): presumably equal to the length of every training sequence -- confirm
window_size = 29
# Worker processes for vectorization and thread_count for CatBoost prediction
threads = 40
# Threshold passed to generate_gff_df
score_filter = 0.001

In [5]:
# Load input files
save_dir = os.path.abspath(save_dir)
# SeqIO.parse returns a one-shot iterator. Materialize the records so the
# prediction cell can be re-run without silently looping over an exhausted
# parser and overwriting the GFF with an empty result.
genome_file_parsed = list(SeqIO.parse(os.path.abspath(genome_file), "fasta"))
true_sequences = read_seq_file(true_bs_file)
# Reverse complements of the positive examples are trained as their own class
true_rc_sequences = [str(Seq(s).reverse_complement()) for s in true_sequences]
false_sequences = read_seq_file(false_bs_file)

In [6]:
# Vectorize the three training sets in parallel (takes time!); each becomes an
# array with one one-hot encoded row per sequence
true_dataset_arr, true_rc_dataset_arr, false_dataset_arr = (
    generate_sequences_vector_arr(training_set, threads)
    for training_set in (true_sequences, true_rc_sequences, false_sequences)
)

In [7]:
# Concatenate training datasets
# Class labels: 1 = true site, -1 = reverse complement of a true site, 0 = false site
class_blocks = [(1, true_dataset_arr), (-1, true_rc_dataset_arr), (0, false_dataset_arr)]
full_arr = np.concatenate([rows for _, rows in class_blocks], axis=0)
# One label per row, in the same order the blocks were stacked
labels = [class_label for class_label, rows in class_blocks for _ in range(rows.shape[0])]

In [8]:
# Train
# Fit a CatBoost classifier on the stacked training vectors and their labels;
# `iterations` comes from the inputs cell and silent=True is passed to fit().
model = CatBoostClassifier(iterations=iterations)
model.fit(full_arr, labels, silent=True)
# Optional: uncomment to save the trained model next to the outputs
#model.save_model(os.path.abspath(f"{save_dir}/{os.path.basename(true_bs_file)}2.cbm"))

<catboost.core.CatBoostClassifier at 0x7f68c6d206d0>

In [9]:
# Predict
# Collect one GFF frame per record and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame each time.
gff_frames = []
for seq_rec in genome_file_parsed:
    print(f"==> Processing: {seq_rec.id}")
    # sliding_window_view raises ValueError when the window is longer than the
    # input, so records shorter than one window cannot be scanned
    if len(seq_rec.seq) < window_size:
        continue
    # Vectorize the genomic sequence
    chrom_vector = vectorize_sequence(str(seq_rec.seq))
    # Convert vectorized genome to array of sliding windows; each base occupies
    # 4 values, so stepping by 4 advances the window one base at a time
    dataset_arr = sliding_window_view(chrom_vector, window_size * 4)[::4]
    del chrom_vector
    # Classify
    predict = model.predict(dataset_arr, thread_count=threads)
    # Get classification probabilities
    proba = model.predict_proba(dataset_arr, thread_count=threads)
    # Make genomic coordinates from the classification result
    locations = generate_genomic_locations(predict, proba, window_size)
    # Convert to GFF format
    gff = generate_gff_df(locations, seq_rec.id, score_filter)
    if not gff.empty:
        gff_frames.append(gff)
if gff_frames:
    save_df = pd.concat(gff_frames, ignore_index=True)
    save_df = save_df.sort_values(by=["seqid", "start", "end"])
else:
    # Nothing passed the filter (or no records were read): write an empty file
    save_df = pd.DataFrame()
print("Saving GFF")
save_df.to_csv(os.path.abspath(f"{save_dir}/predicted_promoters_catboost.gff"), sep="\t", header=False, index=False)

==> Processing: NC_000913.3
Saving GFF
