In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [14]:
iron=pd.read_csv('/content/anemia.csv')
iron.head()
print(iron.count())
iron.describe()
iron.info()

WBC          1281
LYMp         1281
NEUTp        1281
LYMn         1281
NEUTn        1281
RBC          1281
HGB          1281
HCT          1281
MCV          1281
MCH          1281
MCHC         1281
PLT          1281
PDW          1281
PCT          1281
Diagnosis    1281
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1281 entries, 0 to 1280
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   WBC        1281 non-null   float64
 1   LYMp       1281 non-null   float64
 2   NEUTp      1281 non-null   float64
 3   LYMn       1281 non-null   float64
 4   NEUTn      1281 non-null   float64
 5   RBC        1281 non-null   float64
 6   HGB        1281 non-null   float64
 7   HCT        1281 non-null   float64
 8   MCV        1281 non-null   float64
 9   MCH        1281 non-null   float64
 10  MCHC       1281 non-null   float64
 11  PLT        1281 non-null   float64
 12  PDW        1281 non-null   float64
 13  PCT  

In [15]:
missing_values=iron.isnull().sum()
print(missing_values)

WBC          0
LYMp         0
NEUTp        0
LYMn         0
NEUTn        0
RBC          0
HGB          0
HCT          0
MCV          0
MCH          0
MCHC         0
PLT          0
PDW          0
PCT          0
Diagnosis    0
dtype: int64


In [16]:
for column in iron.columns:
  if pd.api.types.is_numeric_dtype(iron[column]):
    z_score=(iron[column]-iron[column].mean())/iron[column].std()
    outliers=iron[abs(z_score)>3]
    iron_filtered=iron.drop(outliers.index)
print(outliers.count())

WBC          4
LYMp         4
NEUTp        4
LYMn         4
NEUTn        4
RBC          4
HGB          4
HCT          4
MCV          4
MCH          4
MCHC         4
PLT          4
PDW          4
PCT          4
Diagnosis    4
dtype: int64


In [17]:
print(iron_filtered.count())

WBC          1277
LYMp         1277
NEUTp        1277
LYMn         1277
NEUTn        1277
RBC          1277
HGB          1277
HCT          1277
MCV          1277
MCH          1277
MCHC         1277
PLT          1277
PDW          1277
PCT          1277
Diagnosis    1277
dtype: int64


In [18]:
duplicate_count=iron_filtered.duplicated()
print('The duplicates in the data are:',duplicate_count.sum())

The duplicates in the data are: 49


In [19]:
iron_filtered=iron_filtered.drop_duplicates()
print(iron_filtered.count())

WBC          1228
LYMp         1228
NEUTp        1228
LYMn         1228
NEUTn        1228
RBC          1228
HGB          1228
HCT          1228
MCV          1228
MCH          1228
MCHC         1228
PLT          1228
PDW          1228
PCT          1228
Diagnosis    1228
dtype: int64


In [20]:
iron_filtered.to_csv('/content/anemia_filtered.csv')

In [23]:
X=iron_filtered.drop('Diagnosis',axis=1)
y=iron_filtered['Diagnosis']


# Now apply StandardScaler only to the features (X)
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Concatenate scaled features with the target variable
iron_filtered_scaled = pd.concat([X_scaled, y], axis=1)
iron_filtered_scaled.head()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [24]:
#Apply random forest model
random_forest = RandomForestClassifier(n_estimators=300, criterion='gini', max_depth=None, random_state=42)
model_fit=random_forest.fit(X_train, y_train)
#make predictions
y_pred = model_fit.predict(X_test)

In [25]:
# Training accuracy
training_accuracy = model_fit.score(X_train, y_train)
print(f"Training Accuracy: {training_accuracy:.2f}")


Training Accuracy: 1.00


In [26]:
print(classification_report(y_test,y_pred))

                                precision    recall  f1-score   support

                       Healthy       0.98      1.00      0.99        93
        Iron deficiency anemia       1.00      1.00      1.00        61
                      Leukemia       1.00      1.00      1.00         7
Leukemia with thrombocytopenia       0.60      1.00      0.75         3
             Macrocytic anemia       0.00      0.00      0.00         3
 Normocytic hypochromic anemia       0.97      0.99      0.98        88
Normocytic normochromic anemia       1.00      0.97      0.99        75
       Other microcytic anemia       1.00      1.00      1.00        17
              Thrombocytopenia       1.00      0.95      0.98        22

                      accuracy                           0.98       369
                     macro avg       0.84      0.88      0.85       369
                  weighted avg       0.98      0.98      0.98       369



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
important_features=random_forest.feature_importances_
print(important_features)

[0.06734645 0.00897653 0.01390997 0.00959447 0.02104669 0.09079506
 0.18960565 0.03956186 0.15977876 0.12719194 0.14003675 0.09210892
 0.00874922 0.03129772]


In [28]:
features=pd.DataFrame({'Feature':X.columns,'Importance':important_features})
features.sort_values(by='Importance',ascending=False)

Unnamed: 0,Feature,Importance
6,HGB,0.189606
8,MCV,0.159779
10,MCHC,0.140037
9,MCH,0.127192
11,PLT,0.092109
5,RBC,0.090795
0,WBC,0.067346
7,HCT,0.039562
13,PCT,0.031298
4,NEUTn,0.021047
