# Momala Malaria dataset

### The following script analyses data from Momala

# Imports

In [1]:
from matplotlib import pyplot as plt
from collections import Counter
from random import sample
from pprint import pprint
from os.path import join
from os import listdir
import os
import json
import cv2

# How many images?

In [2]:
folder_1 = join('imageSamples', 'AMREF17')
folder_2 = join('imageSamples', 'MR4')

files_in_1 = listdir(folder_1)
files_in_2 = listdir(folder_2)

print('Number of images')
print('In AMREF17: {}'.format(len(files_in_1)))
print('In MR4: {}'.format(len(files_in_2)))
print('In total: {}'.format(len(files_in_1) + len(files_in_2)))

Number of images
In AMREF17: 100
In MR4: 50
In total: 150


# How many annotations?

In [3]:
json_path = join('dataFiles', 'docs_extract_v1.0.json')

with open(json_path) as json_file:
     data = json.load(json_file)

image_grouping_names = list()
for group in data:
    image_grouping_names.append(group)
    
number_of_annotation = 0
for group in image_grouping_names:
    fields = data[group][0]['fields']
    
    for field in fields: 
        labels = field['labels']
        # Image doesn't contain annotations.
        if not labels:
            continue
        # Image does contain annotations.
        else: 
            number_of_annotation += len(labels)
print('There are {} annotations.'.format(number_of_annotation))

There are 3260 annotations.


# Lets only keep images with annotations

In [4]:
urls = list()
image_detections = dict()

for group in image_grouping_names:
    fields = data[group][0]['fields']
    
    for field in fields: 
        labels = field['labels']
        if not labels:
            continue
            
        else: 
            url = field['url']
            if url not in image_detections:
                image_detections[url] = list()
            
            for detection in labels:
                category = detection['type']
                coordinates = detection['coordinates']
                x_min = int(coordinates['left'])
                y_min = int(coordinates['top'])
                x_max = int(coordinates['right'])
                y_max = int(coordinates['bottom'])
                
                entry = (x_min, y_min, x_max, y_max, category)
                image_detections[url].append(entry)
                
print(image_detections)

{'MR4/HFAPER_pfrpaLBgz7Tqu5ayz_92.jpg': [(1354, 1049, 1369, 1064, 'white_blood_cell'), (428, 2180, 443, 2195, 'white_blood_cell'), (831, 2256, 846, 2271, 'white_blood_cell'), (926, 2859, 941, 2874, 'white_blood_cell'), (1545, 2814, 1560, 2829, 'white_blood_cell'), (1912, 2477, 1927, 2492, 'white_blood_cell'), (2525, 1853, 2540, 1868, 'white_blood_cell'), (1821, 2914, 1836, 2929, 'white_blood_cell'), (1479, 3146, 1494, 3161, 'white_blood_cell')], 'MR4/HFAPER_pfrpaLBgz7Tqu5ayz_91.jpg': [(1806, 1199, 1821, 1214, 'white_blood_cell'), (1912, 1375, 1927, 1390, 'white_blood_cell'), (826, 1999, 841, 2014, 'white_blood_cell'), (629, 2859, 644, 2874, 'white_blood_cell'), (871, 3005, 886, 3020, 'white_blood_cell'), (1515, 2814, 1530, 2829, 'white_blood_cell'), (1952, 2517, 1967, 2532, 'white_blood_cell'), (1635, 2965, 1650, 2980, 'white_blood_cell'), (2138, 2945, 2153, 2960, 'white_blood_cell')], 'MR4/HFAPER_pfrpaLBgz7Tqu5ayz_100.jpg': [(1555, 2228, 1570, 2243, 'white_blood_cell'), (2425, 2499, 2

# Show the first 3 images

In [5]:
# Make the plots bigger, to clearly see image
plt.rcParams["figure.figsize"] = (12,9)

# A helper function, to change BGR color 
# encoding that cv2 uses, to the RGB color 
# encoding matplotlib uses.
def convert_color(image):
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

In [6]:
for idx, image_url in enumerate(image_detections): 
    
    # Break after 10 images
    if idx >= 3:
        break
         
    # Make it windows compatible
    directory, file_name = image_url.split('/')
    image_path = join('imageSamples', directory, file_name)
    print(image_path)
    image = cv2.imread(image_path)
    image = convert_color(image)
    red   = (0, 0, 255)
    green = (0, 255, 0)
    thickness = 10
    color = green
    
    for detection in image_detections[image_url]:
        (x_min, y_min, x_max, y_max, category) = detection  
        image = cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, thickness)
        
    plt.imshow(image)
    plt.show()
    

imageSamples/MR4/HFAPER_pfrpaLBgz7Tqu5ayz_92.jpg


error: OpenCV(3.4.4) /io/opencv/modules/imgproc/src/color.cpp:181: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


# What else, aside from white blood cells? 

In [None]:
categories = set()
for image_url in image_detections:
    for detection in image_detections[image_url]:
        category = detection[4]
        categories.add(category)

print('The categories present are :', categories)

The categories refer to:
- Plasmodium malariae
- Plasmodium falciparum
- Plasmodium ovale

# How many images with plasmodium?

In [None]:
images_with_plasmodium_count = 0 
images_with_plasmodium = list()

plasmodium_types = ['malariae', 'falciparum', 'ovale']

for image_url in image_detections:
    for detection in image_detections[image_url]:
        category = detection[4]
        if category in plasmodium_types:
            images_with_plasmodium_count += 1
            images_with_plasmodium.append(image_url)
            break 
            
print('There are {} images with plasmodium'.format(images_with_plasmodium_count))

# How many plasmodium?

In [None]:
plasmodium_counter = Counter()

for image_url in image_detections:
    for detection in image_detections[image_url]:
        category = detection[4]
        plasmodium_counter[category] += 1
        
print(plasmodium_counter)

total_plasmodium = 0 
for plasmodium_type in plasmodium_types:
    total_plasmodium += plasmodium_counter[plasmodium_type]

print('\nWhich is a total of {} plasmodium'.format(total_plasmodium))

# Show 3 plasmodium images

In [None]:
nr_to_show = 3

for image_url in images_with_plasmodium[:nr_to_show]:
    directory, file_name = image_url.split('/')
    image_path = join('Momala_data', directory, file_name)
    image = cv2.imread(image_path)
    image = convert_color(image)
    for detection in image_detections[image_url]:
        (x_min, y_min, x_max, y_max, category) = detection
        if category not in ['white_blood_cell', 'debris']:        
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, thickness)
    plt.imshow(image)
    plt.show()

# What's in the green box? 

In [None]:
extracted_images = dict()

for plasmo in plasmodium_types:
    extracted_images[plasmo] = list()

for image_url in images_with_plasmodium:
    directory, file_name = image_url.split('/')
    image_path = join('Momala_data', directory, file_name)
    image = cv2.imread(image_path)
    image = convert_color(image)
    for detection in image_detections[image_url]:
        (x_min, y_min, x_max, y_max, category) = detection
        if category in plasmodium_types:
            extracted = image[y_min:y_max,x_min:x_max] # Extract the bounding box
            extracted_images[category].append(extracted)
            
fig, axis_array = plt.subplots(3, 5) 
fig.tight_layout()

print('First row: ',  plasmodium_types[0])
print('Second row: ', plasmodium_types[1])
print('Third row: ',  plasmodium_types[2])

for row_index, plasmodium_type in enumerate(plasmodium_types):
    for column_index in range(5):

        image_index = column_index
        image_to_plot = extracted_images[plasmodium_type][image_index]
        axis_array[row_index, column_index].set_axis_off()
        axis_array[row_index, column_index].imshow(image_to_plot)

# Save all plasmodium subimages

In [None]:
image_directory = 'plasmodium_subimages'
if not os.path.exists(image_directory):
    os.makedirs(image_directory)

for plasmodium_type in plasmodium_types:
    for idx, image in enumerate(extracted_images[plasmodium_type]):
        image = convert_color(image)
        image_name = '{}_{}.png'.format(plasmodium_type,idx)
        image_path = join(image_directory, image_name)
        cv2.imwrite(image_path, image)