## Import Packages and datasets


In [1]:

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer


In [2]:
import joblib

In [3]:

 
data = pd.read_csv('amr_without_genes_ml.csv')

In [4]:
# Initialize SimpleImputer with 'most_frequent' strategy
imputer = SimpleImputer(strategy='most_frequent')

In [5]:
# Perform Simple Imputation
imputed_data = imputer.fit_transform(data)

In [6]:
# Convert the imputed array back to DataFrame
data = pd.DataFrame(imputed_data, columns=data.columns)


In [7]:
# Convert categorical variables to numerical using one-hot encoding
data = pd.get_dummies(data)

  data = pd.get_dummies(data)


In [9]:
data

Unnamed: 0,Species_Acinetobacter anitratus,Species_Acinetobacter baumannii,Species_Acinetobacter baylyi,Species_Acinetobacter bereziniae,Species_Acinetobacter calcoaceticus,Species_Acinetobacter dijkshoorniae,Species_Acinetobacter guillouiae,Species_Acinetobacter haemolyticus,Species_Acinetobacter johnsonii,Species_Acinetobacter junii,...,Antibiotics_Penicillin,Antibiotics_Piperacillin.tazobactam,Antibiotics_Quinupristin.dalfopristin,Antibiotics_Teicoplanin,Antibiotics_Tetracycline,Antibiotics_Tigecycline,Antibiotics_Vancomycin,MIC_Interpretation_Intermediate,MIC_Interpretation_Resistant,MIC_Interpretation_Susceptible
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1048571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1048572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1048573,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [14]:
# Separate the features (X) and the target variable (y)
X = data.drop(['MIC_Interpretation_Intermediate', 'MIC_Interpretation_Resistant', 'MIC_Interpretation_Susceptible'] , axis=1)
y = data['MIC_Interpretation_Susceptible']

In [15]:
X

Unnamed: 0,Species_Acinetobacter anitratus,Species_Acinetobacter baumannii,Species_Acinetobacter baylyi,Species_Acinetobacter bereziniae,Species_Acinetobacter calcoaceticus,Species_Acinetobacter dijkshoorniae,Species_Acinetobacter guillouiae,Species_Acinetobacter haemolyticus,Species_Acinetobacter johnsonii,Species_Acinetobacter junii,...,Antibiotics_Minocycline,Antibiotics_Moxifloxacin,Antibiotics_Oxacillin,Antibiotics_Penicillin,Antibiotics_Piperacillin.tazobactam,Antibiotics_Quinupristin.dalfopristin,Antibiotics_Teicoplanin,Antibiotics_Tetracycline,Antibiotics_Tigecycline,Antibiotics_Vancomycin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1048571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1048572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1048573,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Create the Train and Test datasets

In [18]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
for column in data.columns:
    print(f"Column: {column}")
    print(data[column].unique())
    print()

Column: Species_Acinetobacter anitratus
[0 1]

Column: Species_Acinetobacter baumannii
[0 1]

Column: Species_Acinetobacter baylyi
[0 1]

Column: Species_Acinetobacter bereziniae
[0 1]

Column: Species_Acinetobacter calcoaceticus
[0 1]

Column: Species_Acinetobacter dijkshoorniae
[0 1]

Column: Species_Acinetobacter guillouiae
[0 1]

Column: Species_Acinetobacter haemolyticus
[0 1]

Column: Species_Acinetobacter johnsonii
[0 1]

Column: Species_Acinetobacter junii
[0 1]

Column: Species_Acinetobacter lwoffii
[0 1]

Column: Species_Acinetobacter nosocomialis
[0 1]

Column: Species_Acinetobacter parvus
[0 1]

Column: Species_Acinetobacter pitii
[0 1]

Column: Species_Acinetobacter radioresistens
[0 1]

Column: Species_Acinetobacter schindleri
[0 1]

Column: Species_Acinetobacter tjernbergiae
[0 1]

Column: Species_Acinetobacter towneri
[0 1]

Column: Species_Acinetobacter ursingii
[0 1]

Column: Species_Acinetobacter, non-speciated
[0 1]

Column: Species_Aeromonas caviae
[0 1]

Column: S

[0 1]

Column: Species_Serratia rubidaea
[0 1]

Column: Species_Serratia ureilytica
[0 1]

Column: Species_Serratia, non-speciated
[0 1]

Column: Species_Staphylococcus Coagulase Negative
[0 1]

Column: Species_Staphylococcus aureus
[0 1]

Column: Species_Staphylococcus auricularis
[0 1]

Column: Species_Staphylococcus capitis
[0 1]

Column: Species_Staphylococcus caprae
[0 1]

Column: Species_Staphylococcus cohnii
[0 1]

Column: Species_Staphylococcus epidermidis
[0 1]

Column: Species_Staphylococcus haemolyticus
[0 1]

Column: Species_Staphylococcus hominis
[0 1]

Column: Species_Staphylococcus intermedius
[0 1]

Column: Species_Staphylococcus lugdunensis
[0 1]

Column: Species_Staphylococcus pasteuri
[0 1]

Column: Species_Staphylococcus pettenkoferi
[0 1]

Column: Species_Staphylococcus pseudointermedius
[0 1]

Column: Species_Staphylococcus saprophyticus
[0 1]

Column: Species_Staphylococcus schleiferi
[0 1]

Column: Species_Staphylococcus sciuri
[0 1]

Column: Species_Staphylococ

[0 1]

Column: Antibiotics_Oxacillin
[0 1]

Column: Antibiotics_Penicillin
[0 1]

Column: Antibiotics_Piperacillin.tazobactam
[0 1]

Column: Antibiotics_Quinupristin.dalfopristin
[0 1]

Column: Antibiotics_Teicoplanin
[0 1]

Column: Antibiotics_Tetracycline
[0 1]

Column: Antibiotics_Tigecycline
[0 1]

Column: Antibiotics_Vancomycin
[0 1]

Column: MIC_Interpretation_Intermediate
[0 1]

Column: MIC_Interpretation_Resistant
[1 0]

Column: MIC_Interpretation_Susceptible
[0 1]



In [19]:

# Train the XGBoost classifier
model = xgb.XGBClassifier()
model.fit(X_train, y_train)


In [21]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [22]:
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")



Accuracy: 1.0


In [23]:
# Example prediction
example_data = [
    ['Pseudomonas aeruginosa', 'France', 'Emergency Room', 'Urine', 2013, '', 'Ciprofloxacin']
]


In [24]:
example_df = pd.DataFrame(example_data, columns=['Species', 'Country', 'Speciality', 'Source', 'Year', 'Phenotype', 'Antibiotics'])
example_df = pd.get_dummies(example_df)

In [25]:

# Ensure the example DataFrame has the same columns as the training data
missing_cols = set(X.columns) - set(example_df.columns)
for col in missing_cols:
    example_df[col] = 0
example_df = example_df[X.columns]


  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_df[col] = 0
  example_

In [26]:
# Perform the prediction
prediction = model.predict(example_df)
print(f"Prediction: {prediction}")

Prediction: [1]


In [29]:
filename = 'xgb_model_no_genes.pkl'
pickle.dump(model, open(filename, 'wb'))

In [31]:
# Get the feature names from the XGBoost model
feature_names = model.get_booster().feature_names

# Filter the feature names to include only the dummy variables for the target variable
target_dummies = [name for name in feature_names if name.startswith('MIC_Interpretation')]

# Extract the interpretation labels from the dummy variable names
interpretation_labels = [name.split('_')[-1] for name in target_dummies]

# Print the interpretation labels
print("Interpretation Labels:", interpretation_labels)


Interpretation Labels: []


In [32]:
feature_names

['Species_Acinetobacter anitratus',
 'Species_Acinetobacter baumannii',
 'Species_Acinetobacter baylyi',
 'Species_Acinetobacter bereziniae',
 'Species_Acinetobacter calcoaceticus',
 'Species_Acinetobacter dijkshoorniae',
 'Species_Acinetobacter guillouiae',
 'Species_Acinetobacter haemolyticus',
 'Species_Acinetobacter johnsonii',
 'Species_Acinetobacter junii',
 'Species_Acinetobacter lwoffii',
 'Species_Acinetobacter nosocomialis',
 'Species_Acinetobacter parvus',
 'Species_Acinetobacter pitii',
 'Species_Acinetobacter radioresistens',
 'Species_Acinetobacter schindleri',
 'Species_Acinetobacter tjernbergiae',
 'Species_Acinetobacter towneri',
 'Species_Acinetobacter ursingii',
 'Species_Acinetobacter, non-speciated',
 'Species_Aeromonas caviae',
 'Species_Aeromonas hydrophila',
 'Species_Aeromonas spp',
 'Species_Aeromonas veronii',
 'Species_Anaerococcus hydrogenalis',
 'Species_Anaerococcus lactolyticus',
 'Species_Anaerococcus murdochii',
 'Species_Anaerococcus octavius',
 'Spec