Make the Model

In [None]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier


# Loading the data
data = pd.read_csv("fake_patient_data.csv")

# Data Preprocessing
predictors = ["Age_Range","Diabetic","Catheter"]
data = data.dropna()
X = data[predictors]
y = data.drop(columns=predictors)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Model Training
pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    MultiOutputClassifier(DecisionTreeClassifier())
)
model = pipeline.fit(X_train, y_train)


# Model Evaluation
def predict_top_pathogens(features_df, top_n=6):
    probabilities = []
    for i, estimator in enumerate(model.named_steps['multioutputclassifier'].estimators_):
        prob = estimator.predict_proba(model.named_steps['onehotencoder'].transform(features_df))[:, 1]
        probabilities.append(prob[0])

    result_df = pd.DataFrame({
        'Pathogen': y.columns,
        'Probability': probabilities
    })

    result_df = result_df.sort_values(by='Probability', ascending=False)
    y_pred = model.predict(features_df)
    binary_predictions = y_pred[0] if len(y_pred.shape) > 1 else y_pred
    result_df['Predicted_Present'] = [
        binary_predictions[y.columns.get_loc(p)] for p in result_df['Pathogen']
    ]
    result_df['Predicted_Present'] = 1
    result_df['Predicted_Present'] = result_df['Predicted_Present'].astype(bool)
    return result_df.head(min(top_n, len(y.columns)))

# Hamming loss calculation
# A measure that quantifies the fraction of incorrect predictions on a per-label basis across a set of predictions
def calculate_hamming_loss(y_true, y_pred):
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values

    incorrect_predictions = 0
    total_predictions = y_true.shape[0] * y_true.shape[1]
    
    for i in range(y_true.shape[0]):
        for j in range(y_true.shape[1]):
            if y_true[i, j] != y_pred[i, j]:
                incorrect_predictions += 1
    
    return incorrect_predictions / total_predictions

# Model Evaluation
def evaluate_model():
    y_pred = model.predict(X_test)
    
    for i, pathogen in enumerate(y):
        y_test_pathogen = y_test[pathogen].values
        y_pred_pathogen = y_pred[:, i]
        

        accuracy = accuracy_score(y_test_pathogen, y_pred_pathogen)
        print(f"{pathogen} Accuracy: {accuracy:.4f}")
    
    hl = calculate_hamming_loss(y_test, y_pred)
    print(f"Overall Hamming Loss: {hl:.4f}")


# Example usage
test_sample = X_test.iloc[[14]]
top_pathogens = predict_top_pathogens(test_sample)
print("Top pathogens predicted for test sample:")
print(top_pathogens)


print("\nModel Evaluation:")
evaluate_model()

       Age_Range Diabetic Catheter E_coli K_pneumoniae P_mirabilis E_faecalis  \
count        100      100      100    100          100         100        100   
unique         6        2        2      2            2           2          2   
top        51-65        N        Y      Y            N           N          N   
freq          22       53       54     52           56          51         55   

       S_saprophyticus P_aeruginosa C_albicans  
count              100          100        100  
unique               2            2          2  
top                  Y            N          N  
freq                50           54         57  
Top pathogens predicted for test sample:
          Pathogen  Probability  Predicted_Present
1     K_pneumoniae          1.0               True
0           E_coli          0.5               True
3       E_faecalis          0.5               True
4  S_saprophyticus          0.5               True
2      P_mirabilis          0.0               True
5 

In [15]:
# imports 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier


# Loading the data
data = pd.read_csv("sampledata.csv")
data['demographics - age'] = pd.cut(data['demographics - age'], bins=[0, 20, 40, 60, 80, 100], labels=['0-20', '21-40', '41-60', '61-80', '81+'])

# Data Preprocessing
predictors = ["demographics - age", "demographics - is_white", "demographics - is_veteran"]
data = data.dropna()
X = data[predictors]
pathogens = data.filter(like="micro - prev organism", axis=1)
# pathogens = pathogens.groupby(lambda x: x.split("  "), axis=1)
y = data[pathogens.columns]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

# Model Training
pipeline = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    MultiOutputClassifier(DecisionTreeClassifier())
)
model = pipeline.fit(X_train, y_train)


# Model Evaluation
def predict_top_pathogens(features_df, top_n=6):
    probabilities = []
    for i, estimator in enumerate(model.named_steps['multioutputclassifier'].estimators_):
        prob = estimator.predict_proba(model.named_steps['onehotencoder'].transform(features_df))[:, 1]
        probabilities.append(prob[0])

    result_df = pd.DataFrame({
        'Pathogen': y.columns,
        'Probability': probabilities
    })

    result_df = result_df.sort_values(by='Probability', ascending=False)
    y_pred = model.predict(features_df)
    binary_predictions = y_pred[0] if len(y_pred.shape) > 1 else y_pred
    result_df['Predicted_Present'] = [
        binary_predictions[y.columns.get_loc(p)] for p in result_df['Pathogen']
    ]
    result_df['Predicted_Present'] = 1
    result_df['Predicted_Present'] = result_df['Predicted_Present'].astype(bool)
    return result_df.head(min(top_n, len(y.columns)))

# Hamming loss calculation
# A measure that quantifies the fraction of incorrect predictions on a per-label basis across a set of predictions
def calculate_hamming_loss(y_true, y_pred):
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values

    incorrect_predictions = 0
    total_predictions = y_true.shape[0] * y_true.shape[1]
    
    for i in range(y_true.shape[0]):
        for j in range(y_true.shape[1]):
            if y_true[i, j] != y_pred[i, j]:
                incorrect_predictions += 1
    
    return incorrect_predictions / total_predictions

# Model Evaluation
def evaluate_model():
    y_pred = model.predict(X_test)
    
    for i, pathogen in enumerate(y):
        y_test_pathogen = y_test[pathogen].values
        y_pred_pathogen = y_pred[:, i]
        

        accuracy = accuracy_score(y_test_pathogen, y_pred_pathogen)
        print(f"{pathogen} Accuracy: {accuracy:.4f}")
    
    hl = calculate_hamming_loss(y_test, y_pred)
    print(f"Overall Hamming Loss: {hl:.4f}")


# Example usage
test_sample = X_test.iloc[[14]]
top_pathogens = predict_top_pathogens(test_sample)
print("Top pathogens predicted for test sample:")
print(top_pathogens)


print("\nModel Evaluation:")
evaluate_model()

Top pathogens predicted for test sample:
                                  Pathogen  Probability  Predicted_Present
23   micro - prev organism Escherichia 180     0.150327               True
13    micro - prev organism Escherichia 90     0.124183               True
3     micro - prev organism Escherichia 30     0.058824               True
25  micro - prev organism Enterococcus 180     0.045752               True
18   micro - prev organism Enterococcus 90     0.045752               True
32    micro - prev organism Klebsiella 180     0.039216               True

Model Evaluation:
micro - prev organism Escherichia 14 Accuracy: 0.9942
micro - prev organism Klebsiella 14 Accuracy: 1.0000
micro - prev organism Enterococcus 30 Accuracy: 0.9922
micro - prev organism Escherichia 30 Accuracy: 0.9690
micro - prev organism Staphylococcus 30 Accuracy: 0.9884
micro - prev organism Klebsiella 30 Accuracy: 0.9961
micro - prev organism Proteus 30 Accuracy: 0.9981
micro - prev organism Staph_coag_neg 30