In [None]:
# Installs
%pip install biopython
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install matplotlib
%pip install seaborn

In [24]:
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view
import numpy as np
from Bio import SeqIO
from Bio.Seq import Seq
import os
import random
import time
#import matplotlib.pyplot as plt
#import seaborn as sns
#from sklearn.metrics import classification_report
#from sklearn.metrics import accuracy_score
#from sklearn.model_selection import train_test_split
#from astropy.stats import bayesian_blocks
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [25]:
# => Utility functions
def read_seq_file(in_path):
    """Read a plain-text file that holds one sequence per line.

    Parameters
    ----------
    in_path : str or os.PathLike
        Path to the file. A leading ``~`` is expanded and the path is made
        absolute before opening.

    Returns
    -------
    list of str
        The non-blank lines of the file in file order, with leading and
        trailing whitespace removed.
    """
    full_path = os.path.abspath(os.path.expanduser(in_path))
    with open(full_path, "r") as handle:
        raw_lines = handle.read().splitlines()
    # Blank / whitespace-only lines would otherwise be returned as empty
    # "sequences", and padded lines would carry characters that are not bases.
    stripped = (line.strip() for line in raw_lines)
    return [line for line in stripped if line]

def generate_false_sequences(true_seq, rng=None):
    """Build a negative set by shuffling the letters of every true sequence.

    Each output string is a random permutation of the corresponding input
    string, so length and base composition are preserved.

    Parameters
    ----------
    true_seq : iterable of str
        Sequences to shuffle.
    rng : random.Random, optional
        Random source used for shuffling. When omitted, the module-level
        ``random`` generator is used (the original behaviour). Pass
        ``random.Random(seed)`` to make the negative set reproducible.

    Returns
    -------
    list of str
        One shuffled sequence per input sequence, in input order.
    """
    shuffler = random if rng is None else rng
    false_seq = []
    for seq in true_seq:
        letters = list(seq)
        shuffler.shuffle(letters)
        false_seq.append("".join(letters))
    return false_seq

def one_hot_encoder(seq):
    """One-hot encode a DNA sequence into a flat integer vector.

    Every base becomes four consecutive values in the order A, C, G, T
    (``A -> 1 0 0 0``, ``C -> 0 1 0 0``, ...), so a sequence of length ``n``
    yields a 1-D array of length ``4 * n``.

    Implemented with a NumPy lookup table instead of scikit-learn encoders:
    the result is the same, but it does not depend on the ``sparse`` keyword
    of ``OneHotEncoder`` and avoids refitting two estimators on every call.

    Parameters
    ----------
    seq : str or iterable of single-character str
        Sequence made of the upper-case letters ``A``, ``C``, ``G``, ``T``.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``int`` and length ``4 * len(seq)``. An empty
        sequence gives an empty array.

    Raises
    ------
    ValueError
        If the sequence contains any symbol other than ``A``, ``C``, ``G``, ``T``.
    """
    alphabet = "ACGT"
    text = "".join(seq)
    if not text:
        return np.zeros(0, dtype=int)

    # Byte value -> column index; -1 marks bytes outside the alphabet.
    lookup = np.full(256, -1, dtype=np.intp)
    lookup[np.frombuffer(alphabet.encode("ascii"), dtype=np.uint8)] = np.arange(len(alphabet))

    try:
        raw = np.frombuffer(text.encode("ascii"), dtype=np.uint8)
    except UnicodeEncodeError:
        raw = None  # non-ASCII symbols can never be valid bases
    codes = None if raw is None else lookup[raw]
    if codes is None or (codes < 0).any():
        unknown = sorted(set(text) - set(alphabet))
        raise ValueError(f"Sequence contains symbols outside {alphabet!r}: {unknown}")

    encoded = np.zeros((codes.size, len(alphabet)), dtype=int)
    encoded[np.arange(codes.size), codes] = 1
    return encoded.flatten()


def generate_genomic_locations(classes, proba, window_size):
    """Turn per-window class predictions into interval records.

    Window ``i`` (0-based) is reported as the 1-based interval
    ``[i + 1, i + window_size]``. Windows predicted as class ``0`` are
    skipped; class ``1`` is reported on strand ``"+"`` and every other
    non-zero class on strand ``"-"``.

    Parameters
    ----------
    classes : numpy.ndarray
        1-D array with the predicted class of every window.
    proba : numpy.ndarray
        Array whose row ``i`` holds the class probabilities of window ``i``.
    window_size : int
        Window length in bases.

    Returns
    -------
    list of list
        One ``[start, end, strand, *probabilities]`` record per kept window.
    """
    intervals = []
    for idx in range(classes.shape[0]):
        label = classes[idx]
        if label == 0:
            continue
        strand = "+" if label == 1 else "-"
        intervals.append([idx + 1, idx + window_size, strand, *proba[idx].tolist()])
    return intervals


def generate_gff_df(locations, seqid, score_filter=0.001):
    """Convert interval records into a GFF-style DataFrame.

    Parameters
    ----------
    locations : list of list
        Records of the form
        ``[start, end, strand, true_proba, true_rc_proba, false_proba]``.
        The three probabilities are interpreted strictly by position, so the
        caller must supply them in exactly this order.
    seqid : str
        Value written to the ``seqid`` column and embedded in every
        ``id``/``name`` attribute.
    score_filter : float, optional
        Cut-off applied per strand: ``"+"`` rows are kept when
        ``true_proba <= score_filter`` and ``"-"`` rows when
        ``true_rc_proba <= score_filter``.
        NOTE(review): keeping rows whose probability is *at most* the cut-off
        is the behaviour of the original code and is preserved here; confirm
        that ``<=`` (rather than ``>=``) is the intended direction.

    Returns
    -------
    pandas.DataFrame
        Columns ``seqid, source, type, start, end, score, strand, phase,
        attribute``; forward-strand rows first, then reverse-strand rows.
        The frame is empty (but has all columns) when nothing passes the filter.
    """
    column_names = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attribute"]

    gff_df = pd.DataFrame(locations, columns=["start", "end", "strand", "true_proba", "true_rc_proba", "false_proba"])
    forward_rows = gff_df[(gff_df["strand"] == "+") & (gff_df["true_proba"] <= score_filter)]
    reverse_rows = gff_df[(gff_df["strand"] == "-") & (gff_df["true_rc_proba"] <= score_filter)]
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    gff_df = pd.concat([forward_rows, reverse_rows], ignore_index=True)

    gff_df["seqid"] = seqid
    gff_df["source"] = "ML_promoters_predictor"
    gff_df["type"] = "predicted_promoter"
    gff_df["score"] = "."
    gff_df["phase"] = "."

    # Build every attribute string in one pass instead of a per-row .loc/.at
    # round trip; the row label (after re-indexing) is part of the feature id.
    strand_tags = ["F" if strand == "+" else "R" for strand in gff_df["strand"]]
    gff_df["attribute"] = [
        f"id={seqid}_{tag}_prom_{row_label}"
        f";name={seqid}_{tag}_prom_{row_label}"
        f";true_proba={true_p}"
        f";true_rc_proba={true_rc_p}"
        f";false_proba={false_p}"
        for row_label, tag, true_p, true_rc_p, false_p in zip(
            gff_df.index,
            strand_tags,
            gff_df["true_proba"],
            gff_df["true_rc_proba"],
            gff_df["false_proba"],
        )
    ]

    # reindex keeps only the GFF columns (dropping the probability columns)
    # and puts them in the standard order.
    return gff_df.reindex(columns=column_names)

In [26]:
# Inputs
# User-editable settings read by the cells below.

# Genome to scan; parsed as FASTA by the loading cell.
genome_file = "./GCF_000005845.2_ASM584v2_genomic.fa"
# Text file of known binding sites, read one sequence per line (positive class).
true_bs_file = "./sigE_binding_sites.txt"
# NOTE(review): not referenced in the visible cells -- the negative set is
# generated by shuffling the true sequences instead.
false_bs_file = "./dummy_seq.txt"
# Directory the predicted GFF files are written to.
save_dir = "./"
# Passed to MLPClassifier as max_iter.
iterations = 500
# Window length in bases used when sliding over the genome.
# Presumably must equal the length of the training sequences -- TODO confirm.
window_size = 29
# Passed as n_jobs to the RandomForest and KNearestNeighbors classifiers.
threads = 40
# NOTE(review): not referenced in the visible cells -- the prediction loop
# passes a per-model value from its `thresholds` dict instead.
score_filter = 0.001

In [27]:
# Load input files
save_dir = os.path.abspath(save_dir)
# SeqIO.parse returns a one-shot iterator; keeping the records in a list lets
# the preparation cell be re-run on its own without silently seeing no records.
# expanduser mirrors the path handling in read_seq_file.
genome_file_parsed = list(SeqIO.parse(os.path.abspath(os.path.expanduser(genome_file)), "fasta"))
# Positive class: the known binding sites as given.
true_sequences = read_seq_file(true_bs_file)
# Second class: the same sites as they read on the opposite strand.
true_rc_sequences = [str(Seq(s).reverse_complement()) for s in true_sequences]
# Negative class: each true site with its letters shuffled.
false_sequences = generate_false_sequences(true_sequences)

In [28]:
# Vectorize the three training sets: every sequence becomes one flattened
# one-hot row, and each set is stacked into an int32 array (processed serially).

true_dataset_arr = np.array([one_hot_encoder(s) for s in true_sequences], dtype=np.int32)
true_rc_dataset_arr = np.array([one_hot_encoder(s) for s in true_rc_sequences], dtype=np.int32)
false_dataset_arr = np.array([one_hot_encoder(s) for s in false_sequences], dtype=np.int32)

In [29]:
# Stack the three training matrices row-wise and build the matching label list:
#   1 -> true site, -1 -> reverse-complemented true site, 0 -> shuffled (false) site
full_arr = np.concatenate((true_dataset_arr, true_rc_dataset_arr, false_dataset_arr), axis=0)
labels = (
    [1] * len(true_dataset_arr)
    + [-1] * len(true_rc_dataset_arr)
    + [0] * len(false_dataset_arr)
)

In [30]:
# Prepare the data for prediction
# Builds {record id -> 2-D array of windows}, one row per genomic start position.
# NOTE(review): if genome_file_parsed is a one-shot iterator (as returned by
# SeqIO.parse), re-running this cell without re-running the loading cell
# produces an empty dict.
data_to_predict = {}
for seq_rec in genome_file_parsed:
    print(f"==> Preparing: {seq_rec.id}")
    # Vectorize the genomic sequence (4 values per base, flattened)
    chrom_vector = one_hot_encoder(str(seq_rec.seq))
    # Convert vectorized genome to array of sliding windows:
    # each window spans window_size bases (window_size * 4 values) and the
    # [::4] step advances one whole base at a time.
    data_to_predict[seq_rec.id] = sliding_window_view(chrom_vector, window_size * 4)[::4]
    # Drops the local name only: the stored window view still references the
    # same underlying buffer, so the memory itself stays allocated.
    del chrom_vector

==> Preparing: NC_000913.3


In [31]:
# Candidate classifiers, keyed by the name used in log messages and output file names.
models = {"RandomForest": RandomForestClassifier(n_jobs=threads),
          "GradientBoosting": GradientBoostingClassifier(),
          "AdaBoost": AdaBoostClassifier(),
          "MultiLayerPerceptron": MLPClassifier(max_iter=iterations),
          "GaussianNaiveBayes": GaussianNB(),
          "DecisionTree": DecisionTreeClassifier(),
          "KNearestNeighbors": KNeighborsClassifier(n_jobs=threads)}
# Per-model cut-off handed to generate_gff_df as its score_filter.
thresholds = {"RandomForest": 0.05,
              "GradientBoosting": 0,
              "AdaBoost": 0,
              "MultiLayerPerceptron": 0,
              "GaussianNaiveBayes": 0,
              "DecisionTree": 0,
              "KNearestNeighbors": 0}
# Column layout of the frames returned by generate_gff_df; used only to build
# an empty result when there is nothing to predict on.
gff_columns = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attribute"]

for model_name, model in models.items():
    t = time.time()
    # Train
    print(f"==> Training {model_name} model")
    model.fit(full_arr, labels)
    gff_frames = []
    probas = np.empty((0, 3), float)
    for seqid in data_to_predict.keys():
        print(f"===> Predicting for: {seqid} using {model_name} model")
        # Get classification probabilities
        proba = model.predict_proba(data_to_predict[seqid])
        probas = np.vstack((probas, proba))
        # Get classes from probabilities (array indexing instead of a Python
        # loop over every window).
        predict_classes = model.classes_[np.argmax(proba, axis=1)]
        # Make genomic coordinates from the classification result.
        # NOTE(review): predict_proba columns follow model.classes_ (sorted
        # labels, i.e. -1, 0, 1 here), while generate_gff_df names the three
        # values true/true_rc/false by position. The probability names in the
        # GFF attributes, and the columns its filter reads, therefore do not
        # line up with the classes. Left unchanged because re-ordering alters
        # which rows pass the filter -- confirm the intended semantics.
        locations = generate_genomic_locations(predict_classes, proba, window_size)
        # Convert to GFF format. Use the id of the sequence being predicted;
        # the previous `seq_rec.id` was a stale variable leaked from the
        # preparation cell and always named the last record parsed.
        gff = generate_gff_df(locations, seqid, thresholds[model_name])
        gff_frames.append(gff)
    # DataFrame.append was removed in pandas 2.0: collect the frames, concat once.
    if gff_frames:
        save_df = pd.concat(gff_frames, ignore_index=True)
    else:
        save_df = pd.DataFrame(columns=gff_columns)
    print(f"Time elapsed for {model_name} model: {round((time.time() - t) / 60, 2)} minutes")

    print(f"Predicted motifs count: {save_df.shape[0]}")

    save_df = save_df.sort_values(by=["seqid", "start", "end"])
    print("Saving GFF")
    save_df.to_csv(os.path.abspath(f"{save_dir}/predicted_promoters_sk_{model_name}.gff"),
                   sep="\t", header=False, index=False)
    # NOTE(review): this stops after the first model (RandomForest), so the
    # other entries in `models` are never trained. Kept as in the original;
    # remove the break to run every model.
    break

==> Training RandomForest model
===> Predicting for: NC_000913.3 using RandomForest model
Time elapsed for RandomForest model: 0.59 minutes
Predicted motifs count: 72071
Saving GFF
