## Import Packages and datasets


In [10]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline


from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier


from sklearn.tree import export_graphviz
import graphviz
import numpy as np



In [11]:
import joblib

In [12]:

 
data = pd.read_csv('amr_without_genes_ml.csv')

In [14]:
data

Unnamed: 0,Species,Country,Speciality,Source,Year,Phenotype,Antibiotics,MIC_Interpretation
0,Pseudomonas aeruginosa,France,Emergency Room,Urine,2013,,Levofloxacin,Resistant
1,Pseudomonas aeruginosa,France,Emergency Room,Urine,2013,,Meropenem,Susceptible
2,Pseudomonas aeruginosa,France,Emergency Room,Urine,2013,,Piperacillin.tazobactam,Resistant
3,Pseudomonas aeruginosa,France,Emergency Room,Ear,2013,,Cefepime,Intermediate
4,Pseudomonas aeruginosa,France,Emergency Room,Ear,2013,,Levofloxacin,Resistant
...,...,...,...,...,...,...,...,...
1048570,Enterococcus faecalis,United States,Medicine General,Blood,2018,,Ampicillin,Susceptible
1048571,Enterococcus faecalis,United States,Medicine General,Blood,2018,,Vancomycin,Susceptible
1048572,Enterococcus faecalis,United States,Medicine General,Blood,2018,,Teicoplanin,Susceptible
1048573,Staphylococcus aureus,United States,None Given,Cellulitis,2018,MSSA,Vancomycin,Susceptible


In [15]:
# Initialize SimpleImputer with 'most_frequent' strategy
imputer = SimpleImputer(strategy='most_frequent')

In [16]:
# Perform Simple Imputation
imputed_data = imputer.fit_transform(data)

In [17]:
# Convert the imputed array back to DataFrame
data = pd.DataFrame(imputed_data, columns=data.columns)


In [18]:

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

In [19]:
data_encoded

Unnamed: 0,Species,Country,Speciality,Source,Year,Phenotype,Antibiotics,MIC_Interpretation
0,190,15,1,88,8,2,23,1
1,190,15,1,88,8,2,25,2
2,190,15,1,88,8,2,30,1
3,190,15,1,27,8,2,6,0
4,190,15,1,27,8,2,23,1
...,...,...,...,...,...,...,...,...
1048570,110,55,3,7,13,2,2,2
1048571,110,55,3,7,13,2,35,2
1048572,110,55,3,7,13,2,32,2
1048573,215,55,5,20,13,4,35,2


In [20]:
for column in data.columns:
    print(f"Column: {column}")
    print(data[column].unique())
    print()

Column: Species
['Pseudomonas aeruginosa' 'Serratia marcescens' 'Acinetobacter pitii'
 'Acinetobacter baumannii' 'Enterobacter cloacae' 'Escherichia coli'
 'Haemophilus influenzae' 'Staphylococcus aureus' 'Enterococcus faecium'
 'Enterococcus faecalis' 'Streptococcus agalactiae'
 'Klebsiella pneumoniae' 'Klebsiella aerogenes' 'Acinetobacter junii'
 'Klebsiella oxytoca' 'Enterobacter kobei' 'Streptococcus pneumoniae'
 'Acinetobacter, non-speciated' 'Acinetobacter lwoffii'
 'Serratia liquefaciens' 'Enterobacter asburiae' 'Citrobacter freundii'
 'Serratia fonticola' 'Serratia rubidaea' 'Acinetobacter schindleri'
 'Acinetobacter guillouiae' 'Clostridium perfringens'
 'Clostridioides difficile' 'Clostridium tertium' 'Clostridium butyricum'
 'Clostridium hathewayi' 'Clostridium barati' 'Bacteroides fragilis'
 'Parabacteroides distasonis' 'Bacteroides nordii' 'Prevotella denticola'
 'Bacteroides vulgatus' 'Bacteroides thetaiotaomicron'
 'Bacteroides uniformis' 'Prevotella buccae' 'Prevotella 

['Resistant' 'Susceptible' 'Intermediate']



In [21]:
for column in data_encoded.columns:
    print(f"Column: {column}")
    print(data_encoded[column].unique())
    print()


Column: Species
[190 208  13   1  98 117 121 215 111 110 236 130 125   9 127 102 250  19
  10 207  96  56 205 211  15   6  83  63  93  69  78  65  37 140  40 162
  48  46  47 159 173 158 150 134   4  11 116 154  14   8 107 221  18   7
 115 220 108 113 209 112 222 224 231 185  58 135 188 238 251 181 228 241
 123 242 237 243 253 233  20  53 235 103  12  16 132 213 204 106 128 152
 143  30 167  41  73 172  91 166  84  32  51  82  28  81 240 187 217  55
  97 198  57  59 131 122 256 212 248   2  95 195  60 160 151  43  39 170
 171 141  25 145 169  64 163  72 144  80  85 252 201 202 109 219  86  89
  87 104 230 210 124  71  24  31  42 156  38 165  26  92  45 146 177 175
 157 176 168  52  62 218 136 182 247 193 180 244 227 137 200  29  88 142
 229  70 216 186 249 100 118 189 203  50 254 225 214 206  17 255 234 191
 126 183 149  90  67 196  44  68  27 161  76  79  66  77 174  74 223 101
  49  34 129  33 164   0  94 239 226 192  21 139 245  61 153 120  23 194
  36 133 105 178 148  35 179  99 14

## Preprocess the data

In [None]:
data_encoded.head()

## Create the Train and Test datasets

In [None]:
# Separate features and target variable
X = data_encoded.drop(['MIC_Interpretation'], axis=1)  # Features
y = data_encoded['MIC_Interpretation']  # Target variable


In [None]:
X.head()

In [None]:
y.head()

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Train the model

In [None]:
# Train the  models
xgb = XGBClassifier()



In [None]:
xgb.fit(X_train, y_train)

In [None]:
# Make predictions on the test set
y_pred = xgb.predict(X_test)


In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy_dct: %.2f%%" % (accuracy * 100.0))





In [None]:
# Calculate precision
precision = precision_score(y_test, y_pred)
print("Precision: %.2f%%" % (precision * 100.0))



In [None]:

for column in X.columns:
    print(column)



In [None]:
X.head(1000)

In [None]:
import pickle

In [None]:
filename = 'xgb_model_no_genes.pkl'
pickle.dump(xgb, open(filename, 'wb'))

In [None]:
label_encoder_filename = 'label_encoder.sav'
joblib.dump(label_encoder, label_encoder_filename)

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import joblib

# Load the XGBoost model and LabelEncoder
xgb_model_no_genes = joblib.load("xgb_model_no_genes.pkl")
label_encoder = joblib.load("label_encoder.sav")




In [25]:
# List of predictors
predictors = ["Acinetobacter baumannii", "Argentina",
    "Clinic / Office",
    "Abdominal Fluid",
    "ESBL",
    "Amikacin"]

# Convert the list to a DataFrame
predictors_df = pd.DataFrame(predictors, columns=["Predictor"])

# Print the DataFrame
print(predictors_df)

                 Predictor
0  Acinetobacter baumannii
1                Argentina
2          Clinic / Office
3          Abdominal Fluid
4                     ESBL
5                 Amikacin


In [None]:
# Convert the prediction back to the original label
prediction_label = label_encoder.inverse_transform(no_genes_prediction)


In [26]:
# Encode the predictors
prediction_encoded = predictors_df.apply(label_encoder.fit_transform)


In [27]:
prediction_encoded

Unnamed: 0,Predictor
0,1
1,3
2,4
3,0
4,5
5,2


In [30]:
# Convert the list to a NumPy array
predictors_array = np.array(predictors)

# Print the array
print(predictors_array)


['Acinetobacter baumannii' 'Argentina' 'Clinic / Office' 'Abdominal Fluid'
 'ESBL' 'Amikacin']


In [32]:
# Encode the predictors
prediction_encoded = np.vectorize(lambda x: label_encoder.transform([x])[0])(predictors_array)

# Print the encoded values
print(prediction_encoded)

[1 3 4 0 5 2]


In [33]:
# Predict using the encoded values
no_genes_prediction = xgb_model_no_genes.predict([prediction_encoded])



ValueError: training data did not have the following fields: Species, Country, Speciality, Source, Year, Phenotype, Antibiotics

In [None]:
# Convert the prediction back to the original label
prediction_label = label_encoder.inverse_transform(no_genes_prediction)

# Print the prediction
print("Prediction:", prediction_label)
