### Importing the Libraries 

In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

### Loading the Dataset

In [6]:
dataset = pd.read_csv(r"C:\Users\DELL\Documents\Anemia_classification\Data\anemia_dataset.csv")

### Encoding the Categorical Data

In [7]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
dataset['Diagnosis'] = encoder.fit_transform(dataset['Diagnosis'])

### Features and Target variables

In [8]:
X = dataset.iloc[:,0:-1].values
y = dataset.iloc[:, -1].values

In [9]:
mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Mapping of categories to labels:", mapping)

Mapping of categories to labels: {'Healthy': 0, 'Iron deficiency anemia': 1, 'Leukemia': 2, 'Leukemia with thrombocytopenia': 3, 'Macrocytic anemia': 4, 'Normocytic hypochromic anemia': 5, 'Normocytic normochromic anemia': 6, 'Other microcytic anemia': 7, 'Thrombocytopenia': 8}


### Split the dataset into training and testing 

In [10]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2 ,random_state = 2)

In [11]:
print(X_train.shape, X_test.shape)

(1024, 14) (257, 14)


In [12]:
print(y_train.shape)

(1024,)


### Standardization

In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## MODEL TRAINING 

### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

In [15]:
#Predictions 
y_pred = lr_model.predict(X_test)

In [16]:
#Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", accuracy)

Accuracy Score: 0.6575875486381323


In [17]:
# Confusion Matrix
print(confusion_matrix(y_test, y_pred))
print("Confusion Matrix:", confusion_matrix)

[[50  0  1  0  0  0  1  2  1]
 [ 0 42  1  0  0  0  0  3  0]
 [ 1  0  6  0  0  1  0  0  0]
 [ 0  0  1  1  0  0  0  1  0]
 [ 0  0  0  0  2  0  0  0  0]
 [ 9 12  0  0  1 28  2  6  2]
 [30  1  1  0  0  1 25  1  1]
 [ 0  5  0  0  0  0  0  3  0]
 [ 3  0  0  0  0  0  0  0 12]]
Confusion Matrix: <function confusion_matrix at 0x00000204124A5B20>


In [18]:
# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.91      0.68        55
           1       0.70      0.91      0.79        46
           2       0.60      0.75      0.67         8
           3       1.00      0.33      0.50         3
           4       0.67      1.00      0.80         2
           5       0.93      0.47      0.62        60
           6       0.89      0.42      0.57        60
           7       0.19      0.38      0.25         8
           8       0.75      0.80      0.77        15

    accuracy                           0.66       257
   macro avg       0.70      0.66      0.63       257
weighted avg       0.75      0.66      0.65       257



In [19]:
input_data = (9.3, 27.4, 64, 2.6, 5.9, 4.39, 12, 37.9, 86.4, 27.3, 31.6, 194, 15.9, 0.19)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = lr_model.predict(std_data)
print(prediction)

label_mapping = {
    0: 'Healthy',
    1: 'Iron deficiency anemia',
    2: 'Leukemia',
    3: 'Leukemia with thrombocytopenia',
    4: 'Macrocytic anemia',
    5: 'Normocytic hypochromic anemia',
    6: 'Normocytic normochromic anemia',
    7: 'Other microcytic anemia',
    8: 'Thrombocytopenia'
}

# Print the final prediction using the label mapping
print(f'The prediction is: {label_mapping[prediction[0]]}')

[[ 0.65031413  0.28414261 -1.00061539  0.80500563  0.51089761 -0.10042255
  -0.05361555 -0.16080061  0.14303515 -0.06117622 -0.07727654 -0.35318031
   0.90691118 -0.12237604]]
[5]
The prediction is: Normocytic hypochromic anemia


### Random Forest Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state = 42)
rf_model.fit(X_train, y_train)

In [21]:
rf_y_pred = rf_model.predict(X_test)

In [22]:
print(accuracy_score(y_test, rf_y_pred))

0.6614785992217899


### Support vector Classifier

In [23]:
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train, y_train)

In [24]:
svc_pred = svc_model.predict(X_test)

In [25]:
print(accuracy_score(y_test, svc_pred))

0.8171206225680934


In [28]:
# Predcitive system 
import numpy as np
input_data = (8.3,20,70.3,1.7,5.8,5.59,10,34.7,62.1, 17.8, 28.8,203,12.8,0.17)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = svc_model.predict(std_data)
print(prediction)

label_mapping = {
    0: 'Healthy',
    1: 'Iron deficiency anemia',
    2: 'Leukemia',
    3: 'Leukemia with thrombocytopenia',
    4: 'Macrocytic anemia',
    5: 'Normocytic hypochromic anemia',
    6: 'Normocytic normochromic anemia',
    7: 'Other microcytic anemia',
    8: 'Thrombocytopenia'
}

# Print the final prediction using the label mapping
print(f'The prediction is: {label_mapping[prediction[0]]}')

[[ 0.27407152 -0.78132215 -0.32647857 -0.14984784  0.44841612  0.12039966
  -0.788506   -0.23357701 -2.89322872 -0.11055201 -0.67838123 -0.24867252
  -0.789386   -0.14225403]]
[1]
The prediction is: Iron deficiency anemia


### Saving the Model

In [27]:
import joblib

joblib.dump(svc_model,'anemia_classification.pkl')

['anemia_classification.pkl']