In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the paths used in the following
# cells resolve. This will prompt for authorization.
drive.mount('/content/drive')

In [None]:
import os

# Prerequisite: put a shortcut to the image processing folder in your Drive.

# Name of the pipeline results folder (inside Results) to post-process.
pipeFolder = '8.Prepro+glcm+fixed'

# Root folder holding the results of every pipeline.
RESULTS_DIR = os.path.join('/content',
                        'drive',
                        'MyDrive',
                        'Results')

# Result files of the selected pipeline; derived from RESULTS_DIR so the two
# paths cannot drift apart.
RESULTS_FILES_DIR = os.path.join(RESULTS_DIR, pipeFolder)

# Dataset folder (images, masks, ground truths, normals.txt).
DATA_DIR = os.path.join('/content',
                        'drive',
                        'MyDrive',
                        'Image Processing and Analysis 2022',
                        'projects',
                        'Calcification Detection',
                        'dataset')

# Output folder for the corrected label files. exist_ok=True makes re-running
# this cell safe without a separate existence check.
os.makedirs(os.path.join(RESULTS_DIR, pipeFolder+"+CorrectedLabels"), exist_ok=True)

# os.listdir returns entries in arbitrary order; sort them so every run
# processes the result files in the same order.
results_file = sorted(os.listdir(RESULTS_FILES_DIR))
print(results_file)

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
#!pip install fastprogress
from fastprogress import master_bar, progress_bar
import gc
import time

In [None]:
from google.colab.patches import cv2_imshow
#import plt for display
import matplotlib.pyplot as plt

# List the files of each dataset sub-folder.
# os.walk yields (root, dirnames, filenames) for every directory it visits;
# only the filenames of the top-level directory are needed, so the first two
# items are discarded.
_, _, images = next(os.walk(os.path.join(DATA_DIR,'images')))
_, _, breastMasks = next(os.walk(os.path.join(DATA_DIR,'masks')))
_, _, groundTruths = next(os.walk(os.path.join(DATA_DIR, 'groundtruths')))

# Sort each listing by file name.
images.sort()
breastMasks.sort()
groundTruths.sort()

# Read the numbers of the normal images, one entry per line.
# Only the line terminator is stripped: slicing off the last character
# unconditionally would truncate the final entry when the file does not end
# with a newline.
with open(os.path.join(DATA_DIR,'normals.txt')) as f:
    normals = [line.rstrip('\n') for line in f]

In [None]:
# function to get connected components of the ground truth binary image
import cv2

def componentsStatsGroundTruth(matrix):
  """Describe the connected components of a ground-truth image.

  Calls ``cv2.connectedComponentsWithStats`` on ``matrix`` with connectivity 8
  and ``cv2.CV_32S`` labels, then repackages the per-component statistics and
  centroids as dictionaries keyed by component index.

  Args:
    matrix: image handed to OpenCV unchanged (presumably the binary
      ground-truth mask -- confirm against callers).

  Returns:
    A tuple ``(num_labels, labels, dict_stat, dict_centroid)``:
      * num_labels: number of labels reported by OpenCV.
      * labels: the label matrix reported by OpenCV.
      * dict_stat: ``{index: {"leftmost_x", "topmost_y", "hor_size",
        "vert_size", "total_ares"}}`` -- bounding-box left x, top y, width,
        height and pixel area of each component. The ``"total_ares"``
        spelling is kept exactly as it is part of the returned data.
      * dict_centroid: ``{index: {"x", "y"}}`` centroid of each component.
  """
  # OpenCV returns (count, label matrix, stats rows, centroid rows).
  component_count, label_matrix, stat_rows, centroid_rows = \
      cv2.connectedComponentsWithStats(matrix, 8, cv2.CV_32S)

  # Column order of every stats row, as delivered by OpenCV.
  stat_fields = ("leftmost_x", "topmost_y", "hor_size", "vert_size", "total_ares")
  dict_stat = {
      idx: {field: stat_rows[idx][col] for col, field in enumerate(stat_fields)}
      for idx in range(len(stat_rows))
  }

  # Each centroid row is an (x, y) pair.
  dict_centroid = {
      idx: {"x": centroid_rows[idx][0], "y": centroid_rows[idx][1]}
      for idx in range(len(centroid_rows))
  }

  return component_count, label_matrix, dict_stat, dict_centroid

In [None]:
import pandas as pd
import gc
import json
from skimage.measure import label, regionprops

def create_groundTruthStatsFile(groundTruths):
  """Write connected-component statistics of the ground-truth images to CSV.

  Every file named in ``groundTruths`` is read in grayscale from
  ``DATA_DIR/groundtruths``, thresholded at 0.7 times its mean intensity and
  labelled with ``skimage.measure.label``. One row per image is written to
  ``RESULTS_DIR/groundTruthStatsFinal.csv`` with the columns ``centroids``,
  ``area``, ``num_labels`` and ``name``.

  The first row of a call truncates the file and writes the header; the
  remaining rows are appended. Calling the function again therefore replaces
  the previous content instead of stacking duplicate rows and a second header
  inside the same file. Rows are still written one image at a time, so
  partial progress is kept if a later image fails.

  Args:
    groundTruths: iterable of ground-truth image file names.

  Raises:
    OSError: if a ground-truth image cannot be read.
  """
  stats_path = os.path.join(RESULTS_DIR, 'groundTruthStatsFinal.csv')
  first_row = True

  for groundTruth in groundTruths:
    image_path = os.path.join(DATA_DIR, 'groundtruths', groundTruth)
    imgGroundTruth = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None rather than raising.
    if imgGroundTruth is None:
      raise OSError("Could not read ground truth image: " + image_path)

    # Threshold out the background.
    blobs = imgGroundTruth > 0.7 * imgGroundTruth.mean()
    # Label the connected components; count excludes the background.
    blobs_labels, count = label(blobs, background=0, return_num=True)

    # https://scikit-image.org/docs/stable/api/skimage.measure.html#skimage.measure.regionprops
    # regionprops exposes many properties, including area and centroid.
    props = regionprops(blobs_labels)
    centroids = [prop.centroid for prop in props]
    areas = [prop.area for prop in props]

    # Single-row frame: the two lists are stored as one cell each.
    gTComponents = pd.DataFrame()
    gTComponents['centroids'] = [centroids]
    gTComponents['area'] = [areas]
    gTComponents['num_labels'] = count
    gTComponents['name'] = groundTruth

    gTComponents.to_csv(stats_path,
                        mode='w' if first_row else 'a',
                        header=first_row,
                        index=False)
    first_row = False

    # Release the per-image data before loading the next image.
    del areas, centroids, props
    gc.collect()

  return


In [None]:
import pandas as pd
import gc
import json
from skimage.measure import label, regionprops


# Re-label the positive candidates of every result file of the selected
# pipeline. A positive candidate is dropped when every ground-truth component
# that overlaps its window is a large calcification; all negatives are kept.
# The corrected table is written under the same file name in the
# "<pipeFolder>+CorrectedLabels" folder.
path = RESULTS_FILES_DIR
output_dir = os.path.join(RESULTS_DIR, pipeFolder+"+CorrectedLabels")

# Half-size in pixels of the square window examined around each candidate.
ROI_HALF_SIZE = 7
# Area of a circle of 15 px diameter; components above it count as "big".
MAX_CALCIFICATION_AREA = np.floor(np.pi*(15/2.0)**2)

# Number of consecutive rows that form one candidate (the first row of the
# group provides the coordinates).
df_step = 1 if 'glcm' in pipeFolder else 7

for result in progress_bar(results_file):
  try:
    df = pd.read_csv(os.path.join(path, result))
  except pd.errors.EmptyDataError:
    # Skip the file: going on would reuse the previous file's table (or fail
    # with a NameError on the first iteration).
    print("Empty file ", result)
    continue
  print(len(df))

  df_positives = df.loc[df.label == 1]
  print('positives', len(df_positives))

  kept_positives = []
  if len(df_positives) > 0:
    # The ground truth is the same for every candidate of this file, so it is
    # read and labelled once instead of once per candidate.
    gt_path = os.path.join(DATA_DIR, 'groundtruths', result.split('.')[0][: 38]+'.tif')
    imgGroundTruth = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None rather than raising.
    if imgGroundTruth is None:
      raise OSError("Could not read ground truth image: " + gt_path)
    blobs = imgGroundTruth > 0.7 * imgGroundTruth.mean() # Thresholding the background
    blobs_labels = label(blobs, background=0) # Labels of the connected components, background excluded
    # https://scikit-image.org/docs/stable/api/skimage.measure.html#skimage.measure.regionprops
    area_by_label = {prop.label: prop.area for prop in regionprops(blobs_labels)}

  for start in range(0, len(df_positives), df_step):
    candidate = df_positives.iloc[start]
    # Clamp at 0: a negative slice start would wrap around to the far side of
    # the image and select the wrong (usually empty) region.
    top = max(int(candidate['y'] - ROI_HALF_SIZE), 0)
    bottom = int(candidate['y'] + ROI_HALF_SIZE)
    left = max(int(candidate['x'] - ROI_HALF_SIZE), 0)
    right = int(candidate['x'] + ROI_HALF_SIZE)

    roi = blobs_labels[top:bottom, left:right]
    # Ground-truth components touching the window (0 is the background).
    matching_labels = [lab for lab in np.unique(roi) if lab != 0]

    if not matching_labels:
      # A positive candidate is expected to overlap the ground truth.
      print("NONZERO ALERT YOU SHOULD NOT BE READING THIS")
      continue

    if all(area_by_label[lab] > MAX_CALCIFICATION_AREA for lab in matching_labels):
      # Every match corresponds to a big calcification.
      print('Area too big... ignored')
    else:
      # At least one match is not big: keep the candidate rows.
      kept_positives.append(df_positives.iloc[start:start+df_step, :])

  # Concatenate once; DataFrame.append is gone in pandas 2.0 and copies the
  # whole frame on every call.
  df_final = pd.concat([df.loc[df.label == 0]] + kept_positives)
  print(len(df_final))

  df_final.to_csv(os.path.join(output_dir, result), index=False)
