Make the Model

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier


data = pd.read_csv("sampledata.csv")
predictors = ["demographics - age", "demographics - is_white", "demographics - is_veteran"]
data = data.dropna()

X = data[predictors]
y = data.drop(columns=predictors)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)


pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    MultiOutputClassifier(DecisionTreeClassifier())
)


model = pipeline.fit(X_train, y_train)


def predict_top_pathogens(features_df, top_n=6):
    probabilities = []
    for i, estimator in enumerate(model.named_steps['multioutputclassifier'].estimators_):
        prob = estimator.predict_proba(model.named_steps['onehotencoder'].transform(features_df))[:, 1]
        probabilities.append(prob[0])
    
    result_df = pd.DataFrame({
        'Pathogen': y,
        'Probability': probabilities
    })
    
    result_df = result_df.sort_values(by='Probability', ascending=False)
    y_pred = model.predict(features_df)
    binary_predictions = y_pred[0] if len(y_pred.shape) > 1 else y_pred
    result_df['Predicted_Present'] = [binary_predictions[y.index(p)] for p in result_df['Pathogen']]
    return result_df.head(min(top_n, len(y)))


def calculate_hamming_loss(y_true, y_pred):
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values

    incorrect_predictions = 0
    total_predictions = y_true.shape[0] * y_true.shape[1]
    
    for i in range(y_true.shape[0]):
        for j in range(y_true.shape[1]):
            if y_true[i, j] != y_pred[i, j]:
                incorrect_predictions += 1
    
    return incorrect_predictions / total_predictions


def evaluate_model():

    y_pred = model.predict(X_test)
    
    for i, pathogen in enumerate(y):
        y_test_pathogen = y_test[pathogen].values
        y_pred_pathogen = y_pred[:, i]
        

        accuracy = accuracy_score(y_test_pathogen, y_pred_pathogen)
        print(f"{pathogen} Accuracy: {accuracy:.4f}")
    
    hl = calculate_hamming_loss(y_test, y_pred)
    print(f"Overall Hamming Loss: {hl:.4f}")


test_sample = X_test.iloc[[16]]
top_pathogens = predict_top_pathogens(test_sample)
print("Top pathogens predicted for test sample:")
print(top_pathogens)


print("\nModel Evaluation:")
evaluate_model()

ValueError: Unknown label type: continuous-multioutput. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.