## Import Packages and datasets


In [1]:

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer


In [25]:
import joblib

import pickle

In [6]:

 
data = pd.read_csv('amr_fungi_ml.csv')

In [7]:
data

Unnamed: 0,Species,Country,Speciality,Source,Year,Antifungals,MIC_Interpretation
0,Candida albicans,Ireland,Unknown,Sputum,2010,Fluconazole,Susceptible
1,Candida albicans,Ireland,Unknown,Sputum,2010,Anidulafungin,Susceptible
2,Candida albicans,Ireland,Unknown,Sputum,2010,Micafungin,Susceptible
3,Candida albicans,Ireland,Unknown,Unknown,2010,Caspofungin,Susceptible
4,Candida albicans,Ireland,Unknown,Unknown,2010,Voriconazole,Susceptible
...,...,...,...,...,...,...,...
24693,Candida albicans,USA,Internal Medicine,Blood culture,2020,Caspofungin,Susceptible
24694,Candida albicans,USA,Internal Medicine,Blood culture,2020,Voriconazole,Susceptible
24695,Candida tropicalis,USA,Internal Medicine,Blood culture,2020,Fluconazole,Susceptible
24696,Candida albicans,USA,Internal Medicine,Blood culture,2020,Caspofungin,Susceptible


In [8]:
# Initialize SimpleImputer with 'most_frequent' strategy
imputer = SimpleImputer(strategy='most_frequent')

In [9]:
# Perform Simple Imputation
imputed_data = imputer.fit_transform(data)

In [10]:
# Convert the imputed array back to DataFrame
data = pd.DataFrame(imputed_data, columns=data.columns)


In [11]:
# Convert categorical variables to numerical using one-hot encoding
data = pd.get_dummies(data)

  data = pd.get_dummies(data)


In [12]:
data

Unnamed: 0,Species_Aspergillus fumigatus,Species_Aspergillus hortai,Species_Candida albicans,Species_Candida glabrata,Species_Candida guilliermondii,Species_Candida krusei,Species_Candida parapsilosis,Species_Candida sojae,Species_Candida tropicalis,Species_Cryptococcus gattii species complex,...,Year_2019,Year_2020,Antifungals_Anidulafungin,Antifungals_Caspofungin,Antifungals_Fluconazole,Antifungals_Micafungin,Antifungals_Voriconazole,MIC_Interpretation_Intermediate,MIC_Interpretation_Resistant,MIC_Interpretation_Susceptible
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24693,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
24694,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,1
24695,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1
24696,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1


In [13]:
# Separate the features (X) and the target variable (y)
X = data.drop(['MIC_Interpretation_Intermediate', 'MIC_Interpretation_Resistant', 'MIC_Interpretation_Susceptible'] , axis=1)
y = data['MIC_Interpretation_Susceptible']

In [14]:
X

Unnamed: 0,Species_Aspergillus fumigatus,Species_Aspergillus hortai,Species_Candida albicans,Species_Candida glabrata,Species_Candida guilliermondii,Species_Candida krusei,Species_Candida parapsilosis,Species_Candida sojae,Species_Candida tropicalis,Species_Cryptococcus gattii species complex,...,Year_2016,Year_2017,Year_2018,Year_2019,Year_2020,Antifungals_Anidulafungin,Antifungals_Caspofungin,Antifungals_Fluconazole,Antifungals_Micafungin,Antifungals_Voriconazole
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24693,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
24694,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
24695,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
24696,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


## Create the Train and Test datasets

In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
for column in data.columns:
    print(f"Column: {column}")
    print(data[column].unique())
    print()

Column: Species_Aspergillus fumigatus
[0 1]

Column: Species_Aspergillus hortai
[0 1]

Column: Species_Candida albicans
[1 0]

Column: Species_Candida glabrata
[0 1]

Column: Species_Candida guilliermondii
[0 1]

Column: Species_Candida krusei
[0 1]

Column: Species_Candida parapsilosis
[0 1]

Column: Species_Candida sojae
[0 1]

Column: Species_Candida tropicalis
[0 1]

Column: Species_Cryptococcus gattii species complex
[0 1]

Column: Species_Mucor indicus
[0 1]

Column: Species_Penicillium citrinum
[0 1]

Column: Species_Unspeciated Corpinellus
[0 1]

Column: Species_Unspeciated Cunninghamella
[0 1]

Column: Species_Unspeciated Scopulariopsis
[0 1]

Column: Country_Argentina
[0 1]

Column: Country_Australia
[0 1]

Column: Country_Belgium
[0 1]

Column: Country_Brazil
[0 1]

Column: Country_Canada
[0 1]

Column: Country_Chile
[0 1]

Column: Country_China
[0 1]

Column: Country_Colombia
[0 1]

Column: Country_Costa Rica
[0 1]

Column: Country_Croatia
[0 1]

Column: Country_Czech Repub

In [17]:

# Train the XGBoost classifier
model = xgb.XGBClassifier()
model.fit(X_train, y_train)


In [18]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [19]:
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")



Accuracy: 0.9834008097165992


In [20]:
# Example prediction
example_data = [
    ['Candida albicans', 'Ireland', 'Intensive Care Unit', 'Sputum', 2010, 'Fluconazole']
]


In [21]:
example_df = pd.DataFrame(example_data, columns=['Species', 'Country', 'Speciality', 'Source', 'Year', 'Antifungal'])
example_df = pd.get_dummies(example_df)

In [22]:

# Ensure the example DataFrame has the same columns as the training data
missing_cols = set(X.columns) - set(example_df.columns)
for col in missing_cols:
    example_df[col] = 0
example_df = example_df[X.columns]


  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0


In [23]:
# Perform the prediction
prediction = model.predict(example_df)
print(f"Prediction: {prediction}")

Prediction: [1]


In [26]:
filename = 'xgb_model_fungi.pkl'
pickle.dump(model, open(filename, 'wb'))