This notebook is for working on the project together to build the decision trees.

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Train one decision tree per pathogen column, using every non-pathogen column
# as a feature, then print the test-set confusion matrix and accuracy and plot
# the tree's feature importances.
data = pd.read_csv("fake_patient_data.csv")
pathogens = ['E_coli', 'K_pneumoniae', 'P_mirabilis', 'E_faecalis', 'S_saprophyticus', 'P_aeruginosa', 'C_albicans']

# Features are all columns that are not pathogen labels.
X = data.drop(columns=pathogens)
for pathogen in pathogens:
    y = data[pathogen]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    # Every feature column is one-hot encoded; handle_unknown='ignore' keeps
    # prediction from failing on categories that only occur in the test split.
    # The tree is seeded (same seed as the split) because scikit-learn permutes
    # the features at each split, so an unseeded tree can give different
    # metrics and importances from one run to the next.
    pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore'),
        DecisionTreeClassifier(random_state=13),
    )

    model = pipeline.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    print(f"{pathogen} Confusion Matrix: \n{conf_matrix}")
    print(f"{pathogen} Accuracy: {accuracy}")

    # Pair each one-hot output column with the importance the fitted tree
    # assigned to it, most important first.
    feature_importances = model.named_steps['decisiontreeclassifier'].feature_importances_
    feature_names = model.named_steps['onehotencoder'].get_feature_names_out(X_train.columns)
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Plot feature importances on a fresh, explicitly sized figure per pathogen.
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title(f'Feature Importances for {pathogen}')
    # barh draws the first row at the bottom; invert so the top feature is on top.
    plt.gca().invert_yaxis()
    plt.show()

In [None]:
# Train one decision tree per metadata column ('Diabetic', 'Catheter'), then
# print the test-set confusion matrix and accuracy and plot the tree's feature
# importances.
data = pd.read_csv("fake_patient_data.csv")
# 'Age_Range' is removed from the features as well, but is not used as a target.
column = ['Age_Range', 'Diabetic', 'Catheter']
metadata = ['Diabetic', 'Catheter']
X = data.drop(columns=column)
for col in metadata:
    y = data[col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    # Every feature column is one-hot encoded; handle_unknown='ignore' keeps
    # prediction from failing on categories that only occur in the test split.
    # The tree is seeded (same seed as the split) because scikit-learn permutes
    # the features at each split, so an unseeded tree can give different
    # metrics and importances from one run to the next.
    pipeline = make_pipeline(
        OneHotEncoder(handle_unknown='ignore'),
        DecisionTreeClassifier(random_state=13),
    )
    model = pipeline.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"{col} Confusion Matrix: \n{conf_matrix}")
    print(f"{col} Accuracy: {accuracy}")

    # Pair each one-hot output column with the importance the fitted tree
    # assigned to it, most important first.
    feature_importances = model.named_steps['decisiontreeclassifier'].feature_importances_
    feature_names = model.named_steps['onehotencoder'].get_feature_names_out(X_train.columns)
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Plot feature importances
    plt.figure(figsize=(10, 6))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title(f'Feature Importances for {col}')
    # barh draws the first row at the bottom; invert so the top feature is on top.
    plt.gca().invert_yaxis()
    plt.show()