In [None]:
import numpy as np
import pandas as pd

# Read the tab-separated, header-less feature and label files into NumPy arrays
def _read_tsv(path):
    """Return the contents of a header-less, tab-separated file as a NumPy array."""
    frame = pd.read_csv(path, delimiter="\t", header=None)
    return np.array(frame)


X_train_pca = _read_tsv("data/X_train_pca.txt")
X_test_pca = _read_tsv("data/X_test_pca.txt")
Y_train = _read_tsv("data/Y_train.txt")
Y_test = _read_tsv("data/Y_test.txt")

In [None]:
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from joblib import dump
import os
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress scikit-learn ConvergenceWarning messages from here on
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

# Search space for the randomized search: the regularisation parameter C is
# sampled from scipy's uniform distribution with loc=0.0001 and scale=10,
# i.e. from the interval [0.0001, 10.0001].
params = dict(C=uniform(loc=0.0001, scale=10))


def train(X, Y):
    """Recursively fit one classifier per parent node of the label hierarchy.

    Column 0 of ``Y`` holds the current parent label (it must be the same for
    every row) and column 1 holds that parent's child labels. Any further
    columns are deeper levels, handled by recursive calls on the rows of each
    child with the leading column dropped.

    For the current parent:

    * more than one distinct child: a ``LogisticRegression`` is tuned with
      ``RandomizedSearchCV`` over the module-level ``params`` and the best
      estimator is saved to ``{model}/{parent}.joblib``; the elapsed time and
      the parent label are printed;
    * exactly one child: no classifier is fitted; the child label itself is
      saved to ``{model}/dictionary/{parent}``.

    The output directory is read from the module-level ``model`` variable,
    which must be assigned before this function is called.

    Args:
        X: Feature array with one row per sample.
        Y: 2-D label array with at least two columns, row-aligned with ``X``.

    Raises:
        ValueError: If ``Y`` has fewer than two columns, or if column 0 does
            not contain exactly one distinct label.
    """
    if Y.shape[1] < 2:
        raise ValueError("Invalid shape")
    parents = set(Y[:, 0])
    if len(parents) != 1:
        raise ValueError("Must be a parent node")

    parent = parents.pop()
    child_labels = Y[:, 1]
    children = set(child_labels)

    if len(children) > 1:
        # Train multi-class classifier
        begin = datetime.datetime.now()

        # Random search for hyper-parameters
        classifier = LogisticRegression()
        random_search = RandomizedSearchCV(
            classifier, params, scoring="accuracy", random_state=42
        )
        random_search.fit(X, child_labels)

        # Save classifier with best hyper-parameters
        dump(random_search.best_estimator_, f"{model}/{parent}.joblib")

        end = datetime.datetime.now()
        print(end - begin, parent)
    else:
        # Exactly one child (at least one row exists, since column 0 held a
        # label): nothing to learn, so store the child for direct lookup.
        child = next(iter(children))
        dump(child, f"{model}/dictionary/{parent}")

    # Base case: column 1 was the deepest level
    if Y.shape[1] == 2:
        return

    # Train each child node on its own rows, dropping the current parent column.
    # A plain equality mask replaces the deprecated np.in1d membership test.
    for child in children:
        mask = child_labels == child
        train(X[mask], Y[mask, 1:])


# Root directory for the saved classifiers; train() reads this global
model = "lcpn"

# Create the output tree. makedirs(exist_ok=True) keeps the cell re-runnable
# and still creates "dictionary" when the model directory already exists;
# real failures (e.g. permissions) surface here instead of later at dump().
os.makedirs(f"{model}/dictionary", exist_ok=True)

train(X_train_pca, Y_train)

In [None]:
from joblib import load
import os


# Mandatory leaf node prediction
def predict(x, model, classifier):
    """Predict the child label of one hierarchy node for a single sample.

    Args:
        x: Feature rows passed to ``predict_proba``. Only the first row's
            probabilities are used, so pass exactly one sample.
        model: Directory holding the saved classifiers.
        classifier: Name of the node whose classifier should be applied.

    Returns:
        The most probable class label when ``{model}/{classifier}.joblib``
        exists. Otherwise a one-element list holding the node's entry in the
        module-level ``dictionary``, which is ``[None]`` when the node has no
        entry.

    Errors raised while applying a loaded classifier are propagated rather
    than being mistaken for "no classifier saved".
    """
    try:
        estimator = load(f"{model}/{classifier}.joblib")
    except FileNotFoundError:
        # No saved classifier: the node has a single child (looked up in
        # dictionary) or no children at all (lookup yields None)
        return [dictionary.get(classifier)]

    prob = estimator.predict_proba(x)
    return estimator.classes_[np.argmax(prob[0])]

    
model = "lcpn"

# Load the lookup table for nodes that only have a single class:
# node name -> its only child label
dictionary = {
    name: load(f"{model}/dictionary/{name}")
    for name in os.listdir(f"{model}/dictionary")
}

n_ranks = Y_test.shape[1]
rows = []

for x in X_test_pca:
    # All samples included at root
    prediction = ["Insecta"]

    # Predict each taxonomic rank for sample. The walk is capped at the number
    # of label columns so it terminates even if a single-child entry maps a
    # name back to itself.
    while len(prediction) < n_ranks:
        p = predict([x], model, prediction[-1])
        # predict() returns a bare label or a one-element list
        child = p[0] if isinstance(p, list) else p
        if child is None:
            break
        prediction.append(child)

    # Add sample prediction
    rows.append(prediction)

# Build the result once instead of re-stacking the whole array per sample
Y_pred = np.array(rows, dtype=str).reshape(-1, n_ranks)

np.savetxt(f"{model}/Y_pred.txt", Y_pred, delimiter="\t", fmt="%s")


In [None]:
from joblib import load
import os


# Optional leaf node prediction
def predict(x, model, classifier, threshold):
    """Predict the child label of one node, blocking low-confidence results.

    Args:
        x: Feature rows passed to ``predict_proba``. Only the first row's
            probabilities are used, so pass exactly one sample.
        model: Directory holding the saved classifiers.
        classifier: Name of the node whose classifier should be applied.
        threshold: The prediction is blocked unless its probability is
            strictly greater than this value.

    Returns:
        * the most probable class label when ``{model}/{classifier}.joblib``
          exists and its probability exceeds ``threshold``;
        * ``""`` when that probability is not greater than ``threshold``;
        * otherwise a one-element list holding the node's entry in the
          module-level ``dictionary`` (``[None]`` when there is no entry);
          the threshold is not applied on this path.

    Errors raised while applying a loaded classifier are propagated rather
    than being mistaken for "no classifier saved".
    """
    try:
        estimator = load(f"{model}/{classifier}.joblib")
    except FileNotFoundError:
        # No saved classifier: the node has a single child (looked up in
        # dictionary) or no children at all (lookup yields None)
        return [dictionary.get(classifier)]

    prob = estimator.predict_proba(x)[0]
    best = np.argmax(prob)

    # Block if probability is not greater than threshold
    if prob[best] <= threshold:
        return ""

    return estimator.classes_[best]


model = "lcpn"

# Load the lookup table for nodes that only have a single class:
# node name -> its only child label
dictionary = {
    name: load(f"{model}/dictionary/{name}")
    for name in os.listdir(f"{model}/dictionary")
}

n_ranks = Y_test.shape[1]

# Thresholds 0.61, 0.62, ..., 0.69. They are built from integers because the
# element count of np.arange with a fractional step depends on floating-point
# rounding; this yields the same nine two-decimal values explicitly.
for threshold in np.arange(61, 70) / 100:
    rows = []

    for x in X_test_pca:
        # All samples included at root
        prediction = ["Insecta"]

        # Predict each taxonomic rank for sample. The walk is capped at the
        # number of label columns so it always terminates.
        while len(prediction) < n_ranks:
            p = predict([x], model, prediction[-1], threshold)
            # predict() returns a bare label, "" when blocked, or a
            # one-element list
            child = p[0] if isinstance(p, list) else p
            if child is None or child == "":
                break
            prediction.append(child)

        # Fill in any remaining predictions for sample if blocked
        prediction.extend([""] * (n_ranks - len(prediction)))

        # Add sample prediction
        rows.append(prediction)

    # Build the result once instead of re-stacking the whole array per sample
    Y_pred = np.array(rows, dtype=str).reshape(-1, n_ranks)

    np.savetxt(f"{model}/Y_pred_{threshold:.2f}.txt", Y_pred, delimiter="\t", fmt="%s")
    