In [5]:
import re
import pandas as pd
import numpy as np
import os
import glob

import util

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Boundinx box from predictions
Creates a consolidated dataframe of bounding box co-ordinates for predicted cells in each image.

In [30]:
def pred_to_bbd(result_dir):

  # blank dataframe to store data
  annotations=pd.DataFrame()
  # Extract all file paths from result directory
  result_paths= sorted(glob.glob(os.path.join(result_dir,'*.txt')))

  # Extract bounding box locations from each result
  for paths in result_paths:
    filename= paths.split('/')[-1]
    # Read text file line by line
    with open(paths) as f:
      lines = f.readlines()
    # Extract bounding box co-ordinates for each cell type
    RBC= [x for x in lines if x.startswith('RBC')]
    WBC= [x for x in lines if x.startswith('WBC')]
    PLT= [x for x in lines if x.startswith('Platelets')]

    # Add cell type and image id
    RBC_bbd=[re.findall('\d+', e) for e in  RBC ]
    RBC_bbd = [e+['RBC']+ [filename] for e in RBC_bbd]

    WBC_bbd=[re.findall('\d+', e) for e in  WBC ]
    WBC_bbd = [e+['WBC']+ [filename]for e in WBC_bbd]

    PLT_bbd=[re.findall('\d+', e) for e in  PLT ]
    PLT_bbd = [e+['PLT']+ [filename] for e in PLT_bbd]

    bbd= RBC_bbd + WBC_bbd + PLT_bbd
    df1= pd.DataFrame(bbd)
    annotations=pd.concat([annotations,df1], ignore_index=True)

  # Column names
  annotations.columns=['conf','xmin', 'ymin', 'w','h','cell_type','filename']
  # Convert to numeric 
  num_columns=['conf','xmin', 'ymin', 'w','h']
  annotations[num_columns] = annotations[num_columns].apply(pd.to_numeric, errors='coerce', axis=1)

  return annotations

In [31]:
result_dir='/content/drive/MyDrive/UpGrad/LJMU_MS/Data/YOLO_00/YOLO/Validation/Results'
validation_result=pred_to_bbd(result_dir)

## Recognition Accuracy
Recognition accuracy is a simple but effective way to measure the accuracy of the model.
$$RA_k = 1- \frac{|N_{truth}^{k} - N_{pred}^{k}|}{N_{truth}^{k}}$$
$N_{truth}^{k}$ is ground truth of total number of objects of type k , and total number of objects of type k as predicted by the model $N_{pred}^{k}$.
Here k is indicator of cell type (e.g. RBC, WBC, Platelets)

In [150]:
# GT_df: Data frame of ground truth bounding boxes
# PRED_df : Data frame of predicted bounding boxes

def RA(GT_df, PRED_df):
  # Calculate number of cells in each category ground truth annotations (order is: RBC, Platelets, WBC) across all the images
  # N_truth for each category
  true=list(GT_df['cell_type'].value_counts())

  # Create spaces to store RBC, WBC , Platelates count for each different threshold 
  RBC_summary=[]
  WBC_summary=[]
  PLT_summary=[]
  # For loop for each threshold 0.05 to 0.95 by 0.05
  for thresh in list(range(5,100,5)):
    # RBC - N_pred for each category for a threhold
    df_RBC = PRED_df[(PRED_df['conf'] >= thresh) & (PRED_df['cell_type']=='RBC')]
    lRBC = list(df_RBC['cell_type'].value_counts())
    # Replace blank , i.e no cell as 0
    if len(lRBC)==0:
      lRBC=[0]

    #WBC - N_pred for each category for a threhold
    df_WBC = PRED_df[(PRED_df['conf'] >= thresh) & (PRED_df['cell_type']=='WBC')]
    lWBC = list(df_WBC['cell_type'].value_counts())
    if len(lWBC)==0:
      lWBC=[0]

    #Platelets - N_pred for each category for a threhold
    df_PLT = PRED_df[(PRED_df['conf'] >= thresh) & (PRED_df['cell_type']=='PLT')]
    lPLT = list(df_PLT['cell_type'].value_counts())
    if len(lPLT)==0:
      lPLT=[0]
    #Append the results
    RBC_summary.append(lRBC[0])
    WBC_summary.append(lWBC[0])
    PLT_summary.append(lPLT[0])

  # Calculate relative accuracy for each threshold
  RA_RBC= [(1-abs(x-true[0])/true[0]) for x in RBC_summary]
  RA_WBC= [(1-abs(x-true[1])/true[1]) for x in WBC_summary]
  RA_PLT= [(1-abs(x-true[2])/true[2]) for x in PLT_summary]

  threshold = list(range(5,100,5))
  #Store to a data frame
  RA = pd.DataFrame(np.column_stack([threshold,RA_RBC, RA_WBC, RA_PLT]),columns=['Conf', 'RBC','WBC','PLT'])

  return RA

Implementation of recognition accuracy

In [None]:
annotation_path='/content/drive/MyDrive/UpGrad/LJMU_MS/Data/YOLO_00/YOLO/Validation/Annotations'
validation_true,_=util.load_data(annotation_path)

In [148]:
PRED_df = validation_result
GT_df = validation_true

In [151]:
RA(GT_df, PRED_df)

Unnamed: 0,Conf,RBC,WBC,PLT
0,5.0,0.604534,0.956522,0.095238
1,10.0,0.801008,0.956522,0.619048
2,15.0,0.914358,0.956522,0.619048
3,20.0,0.994962,0.913043,0.857143
4,25.0,0.921914,0.913043,1.0
5,30.0,0.828715,0.913043,0.761905
6,35.0,0.758186,0.913043,0.714286
7,40.0,0.675063,0.869565,0.47619
8,45.0,0.607053,0.869565,0.380952
9,50.0,0.536524,0.826087,0.333333


## Average Absolute Error (AAE)
It is often used as measure of correctness of choice of IoU or confoidence threshold values.
$$AAE_k = \frac{1}{N} \sum_{i=1}^{n} |N_{truth,i}^{k} - N_{pred,i}^{k}|$$
Here k is the category (e.g. RBC, WBC, Platelet) of an object, and N is the total number of images in the test set. Error in counting object of type k in the i-th image is the absolute difference actual number of cells ($N_{truth,i}^{k}$) and the predicted value ($𝑁_{pred,i}^{k}$ ). Lower the value, better the choice of threshold.

In [111]:
# Calculate N
nimg = len(glob.glob(os.path.join(annotation_path,'*.xml')))

In [74]:
# Prepare ground truth data frame
validation_true['filename']=validation_true['filename'].apply(lambda x: x.split('/')[-1]).str.replace('jpg','txt')
validation_true['cell_type']=validation_true['cell_type'].str.replace('Platelets','PLT')

In [160]:
# GT_df : Ground truth annotations as a dataframe
# PRED_df : Predicted annotations in a dataframe
# category : Type of cell
# nimg: Number of images in the directory (e.g: Train/test/validation)

def AAE(GT_df, PRED_df, category, nimg=21):
  #Calculate number of cells of for a category (RBC, WBC, Platelets) in ground truth annotations for each image
  # for each category calculate N_true_i
  x2= GT_df[(GT_df['cell_type']==category)].groupby(['filename'])['cell_type'].count()
  df2= pd.DataFrame(x2).reset_index()
  df2= df2.rename({'cell_type':'Truth'},axis=1)
  # Store results 
  result=[]

  for thresh in list(range(5,100,5)):
    # Calculate number of cells of for a cell_type (RBC, WBC, Platelets) in predicted annotations for each image for a confidence threshold
    #for each category calculate N_pred_i
    x1= PRED_df[(PRED_df['conf'] >= thresh) & (PRED_df['cell_type']==category)].groupby(['filename'])['cell_type'].count()
    df1= pd.DataFrame(x1).reset_index()
    df1= df1.rename({'cell_type':'Pred'},axis=1)

    # Merge predicted and ground truth cell count by image
    df3 = df1.merge(df2,how='outer',on='filename').fillna(0)
    #Calculate AAE for a category for each image |N_true_i - N_pred_i|
    df3['AAE']= abs(df3['Pred'] - df3['Truth'])
    # For all the image : i.e AAE for a category at each threshold 
    r=sum(df3['AAE'])/nimg
    result.append(r)

  return result

Implementation of AAE across different thresholds

In [164]:
AAE_WBC= AAE(GT_df, PRED_df,category='WBC',nimg=21)
AAE_RBC= AAE(GT_df, PRED_df,category='RBC',nimg=21)
AAE_PLT= AAE(GT_df, PRED_df,category='PLT',nimg=21)
thershold= list(np.arange(0.05,1,0.05)) 
pd.DataFrame(np.column_stack([threshold,AAE_RBC, AAE_WBC, AAE_PLT]),columns=['Conf', 'RBC','WBC','PLT'])

Unnamed: 0,Conf,RBC,WBC,PLT
0,5.0,7.47619,0.047619,0.809524
1,10.0,3.857143,0.047619,0.380952
2,15.0,2.761905,0.047619,0.380952
3,20.0,2.857143,0.0,0.238095
4,25.0,3.095238,0.0,0.190476
5,30.0,4.095238,0.0,0.333333
6,35.0,4.952381,0.0,0.380952
7,40.0,6.238095,0.047619,0.619048
8,45.0,7.428571,0.047619,0.714286
9,50.0,8.761905,0.095238,0.761905
