This file will be used to see the output for a classifier chain version of the model and used to compare to the binary relevance. It is currently being worked on since, well you can look at the error message.

In [31]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import hamming_loss


# Loading the data
data = pd.read_csv("fake_patient_data.csv")

# Data Preprocessing
data['Age_Range'] = data['Age_Range'].astype('category')

for col in data:
    if col == "Age_Range":
        data[col] = data[col].astype('category')
    elif data[col].dtype == 'object':
        data[col] = data[col].map({'Y': True, 'N': False})
        data[col] = data[col].astype('bool')

predictors = ["Age_Range","Diabetic","Catheter"]
data = data.dropna()
X = data[predictors]
y = data.drop(columns=predictors)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
print(X_test.iloc[[14]])

# Model Training
pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    ClassifierChain(DecisionTreeClassifier())
)
model = pipeline.fit(X_train, y_train)


# Model Evaluation
def predict_top_pathogens(features_df, top_n=6):
    probabilities = []
    for i, estimator in enumerate(model.named_steps['classifierchain'].estimators_):
        prob = estimator.predict_proba(model.named_steps['onehotencoder'].transform(features_df))[:, 1]
        probabilities.append(prob[0])

    result_df = pd.DataFrame({
        'Pathogen': y.columns,
        'Probability': probabilities
    })

    result_df = result_df.sort_values(by='Probability', ascending=False)
    y_pred = model.predict(features_df)
    binary_predictions = y_pred[0] if len(y_pred.shape) > 1 else y_pred
    result_df['Predicted_Present'] = [
        binary_predictions[y.columns.get_loc(p)] for p in result_df['Pathogen']
    ]
    result_df['Predicted_Present'] = 1
    result_df['Predicted_Present'] = result_df['Predicted_Present'].astype(bool)
    return result_df.head(min(top_n, len(y.columns)))

# Hamming loss calculation
# A measure that quantifies the fraction of incorrect predictions on a per-label basis across a set of predictions
def calculate_hamming_loss(y_true, y_pred):
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values

    incorrect_predictions = 0
    total_predictions = y_true.shape[0] * y_true.shape[1]
    
    for i in range(y_true.shape[0]):
        for j in range(y_true.shape[1]):
            if y_true[i, j] != y_pred[i, j]:
                incorrect_predictions += 1
    
    return incorrect_predictions / total_predictions

# Model Evaluation
def evaluate_model():
    y_pred = model.predict(X_test)
    
    for i, pathogen in enumerate(y):
        y_test_pathogen = y_test[pathogen].values
        y_pred_pathogen = y_pred[:, i]
        

        accuracy = accuracy_score(y_test_pathogen, y_pred_pathogen)
        print(f"{pathogen} Accuracy: {accuracy:.4f}")
    
    hl = hamming_loss(y_test, y_pred)
    print(f"Overall Hamming Loss: {hl:.4f}")


# Example usage
test_sample = X_test.iloc[[14]]
print(test_sample)
top_pathogens = predict_top_pathogens(test_sample)
print("Top pathogens predicted for test sample:")
print(top_pathogens)


print("\nModel Evaluation:")
evaluate_model()

   Age_Range  Diabetic  Catheter
41       81+     False      True
   Age_Range  Diabetic  Catheter
41       81+     False      True


ValueError: X has 10 features, but DecisionTreeClassifier is expecting 11 features as input.