In [1]:
import os
import numpy as np
import cv2
import pandas as pd
import psutil
import time, datetime
import pickle
import opendatasets as od

import sklearn
from imutils import paths
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import auc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


In [None]:
#to download the dataset from kaggle
#od.download('https://www.kaggle.com/datasets/iarunava/cell-images-for-detecting-malaria')

In [3]:

class SimpleDatasetLoader:
    # Method: Constructor
    def __init__(self, preprocessors=None):
        """
        :param preprocessors: List of image preprocessors
        """
        self.preprocessors = preprocessors

        if self.preprocessors is None:
            self.preprocessors = []

    # Method: Used to load a list of images for pre-processing
    def load(self, image_paths, verbose=-1):
        """
        :param image_paths: List of image paths
        :param verbose: Parameter for printing information to console
        :return: Tuple of data and labels
        """
        data, labels = [], []

        for i, image_path in enumerate(image_paths):
            image = cv2.imread(image_path)
            label = image_path.split(os.path.sep)[-2]

            if self.preprocessors is not None:
                for p in self.preprocessors:
                    image = p.preprocess(image)

            data.append(image)
            labels.append(label)

            if verbose > 0 and i > 0 and (i+1) % verbose == 0:
                print('[INFO]: Processed {}/{}'.format(i+1, len(image_paths)))

        return (np.array(data), np.array(labels))

In [4]:
#Class Preprocessror 
class SimplePreprocessor:
    # Method: Constructor
    def __init__(self, width, height, interpolation=cv2.INTER_AREA):
        """
        :param width: Image width
        :param height: Image height
        :param interpolation: Interpolation algorithm
        """
        self.width = width
        self.height = height
        self.interpolation = interpolation

    # Method: Used to resize the image to a fixed size (ignoring the aspect ratio)
    def preprocess(self, image):
        """
        :param image: Image
        :return: Re-sized image
        """
        return cv2.resize(image, (self.width, self.height), interpolation=self.interpolation)

In [5]:
from __main__ import SimplePreprocessor
from __main__ import SimpleDatasetLoader

In [6]:
# Get list of image paths
image_paths = list(paths.list_images(r".\cell-images-for-detecting-malaria\cell_images\cell_images"))

# Initialize SimplePreprocessor and SimpleDatasetLoader and load data and labels
print('[INFO]: Images loading....')
sp = SimplePreprocessor(32, 32)
sdl = SimpleDatasetLoader(preprocessors=[sp])
(data, labels) = sdl.load(image_paths, verbose=13779)

[INFO]: Images loading....
[INFO]: Processed 13779/27558
[INFO]: Processed 27558/27558


In [7]:
print(data.shape)
print(labels.shape)

(27558, 32, 32, 3)
(27558,)


In [8]:
# Reshape 
data = data.reshape((data.shape[0], 3072))

In [9]:
print('Min: {}, Max: {}'.format(data.min(), data.max()))
Max = float(data.max())
data = data / Max

Min: 0, Max: 254


In [10]:
data, x_test, labels, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)

In [11]:
le = LabelEncoder()
labels = le.fit_transform(labels)

In [12]:
rf = RandomForestClassifier()

In [13]:
start = datetime.datetime.now()
time.sleep(10)

rf.fit(data, labels)
print(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)

end = datetime.datetime.now()
diff = (end - start)
print(f"RF Train Time: {diff}")

833.875
RF Train Time: 0:02:18.981265


In [14]:
start = datetime.datetime.now()
time.sleep(10)

preds = rf.predict(x_test)
print(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)

end = datetime.datetime.now()
diff = (end - start)
print(f"RF Inference Time: {diff}")

833.921875
RF Inference Time: 0:00:10.649278


In [15]:
y_test = le.fit_transform(y_test)

In [16]:
print(classification_report(y_test, preds))
rf_accuracy = metrics.accuracy_score(y_test, preds)
rf_auc = metrics.roc_auc_score(y_test, preds)
rf_mcc = metrics.matthews_corrcoef(y_test, preds)
rf_spec =recall_score(y_test, preds,pos_label=0)
print("Accuracy score: {}".format(rf_accuracy))
print("auc: {}".format(rf_auc))
print("mcc: {}".format(rf_mcc))
print("Specificity: {}".format(rf_spec))

              precision    recall  f1-score   support

           0       0.79      0.83      0.81      4146
           1       0.82      0.78      0.80      4122

    accuracy                           0.80      8268
   macro avg       0.80      0.80      0.80      8268
weighted avg       0.80      0.80      0.80      8268

Accuracy score: 0.8028543783260764
auc: 0.8027853085803401
mcc: 0.6063067290018137
Specificity: 0.826579835986493


In [17]:
pickle.dump(rf, open("rf_Cell_images.pkl", "wb"))