In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import shutil
import tensorflow as tf
import cv2

In [None]:
# Source folders of the two raw datasets.
processedImage1 = "./data_source/1/preprocessed_images"
baseImage1 = "./data_source/1/image_data"
baseImage2 = "./data_source/2/dataset"

# Destination roots of the train/validation split, one sub-folder per class.
train_dir = "./datasets/train"
validation_dir = "./datasets/validation"

# os.path.join inserts the separator itself, so no manual "/" is appended.
cataractTrain_dir = os.path.join(train_dir, 'cataract')
cataractVal_dir = os.path.join(validation_dir, 'cataract')
normalTrain_dir = os.path.join(train_dir, 'normal')
normalVal_dir = os.path.join(validation_dir, 'normal')

# Class folders inside the second dataset.
baseCataract2 = os.path.join(baseImage2, '2_cataract')
baseNormal2 = os.path.join(baseImage2, '1_normal')

# Filtering dataset 1

In [None]:
# Load the metadata table shipped with dataset 1 and preview its first rows.
df = pd.read_csv("./data_source/1/full_df.csv")
df.head()

In [None]:
def has_cataract(text):
    """Return 1 if the diagnostic keyword string mentions "cataract", else 0.

    Args:
        text: Diagnostic keywords of one eye. Non-string values (for example
            the NaN pandas uses for an empty cell) count as "no cataract".

    Returns:
        int: 1 when the substring "cataract" occurs in ``text``, otherwise 0.
    """
    # An empty CSV cell arrives as float NaN; `"cataract" in nan` would raise
    # TypeError and abort the whole .apply() call.
    if not isinstance(text, str):
        return 0
    return int("cataract" in text)

In [None]:
# Per eye, flag whether the diagnostic keywords mention a cataract (1) or not (0).
# The function is passed directly; wrapping it in a lambda adds nothing.
df["left_cataract"] = df["Left-Diagnostic Keywords"].apply(has_cataract)
df["right_cataract"] = df["Right-Diagnostic Keywords"].apply(has_cataract)

In [None]:
# A cell only renders its last expression, so the slice has to be printed
# explicitly; as a bare expression it was computed and silently discarded.
print(df["right_cataract"][1:5])
df.head()

In [None]:
# Left-eye image file names of rows with C == 1 (presumably the cataract label;
# confirm against the CSV schema) whose left-eye keywords mention a cataract.
left_cataract = df.loc[(df["C"] == 1) & (df["left_cataract"] == 1), "Left-Fundus"].values

In [None]:
# Right-eye image file names of rows with C == 1 whose right-eye keywords
# mention a cataract; the first 15 are shown as a sanity check.
right_cataract = df.loc[(df["C"] == 1) & (df["right_cataract"] == 1), "Right-Fundus"].values
right_cataract[:15]

In [None]:
# Report how many image names were selected per eye.
print(f"Number of images in left cataract: {len(left_cataract)}")
print(f"Number of images in right cataract: {len(right_cataract)}")

In [None]:
# Draw 250 image names per eye from rows with C == 0 whose keywords are exactly
# "normal fundus". The fixed random_state keeps the sample reproducible.
left_normal = (
    df.loc[(df["C"] == 0) & (df["Left-Diagnostic Keywords"] == "normal fundus"), "Left-Fundus"]
    .sample(n=250, random_state=42)
    .values
)
right_normal = (
    df.loc[(df["C"] == 0) & (df["Right-Diagnostic Keywords"] == "normal fundus"), "Right-Fundus"]
    .sample(n=250, random_state=42)
    .values
)
right_normal[:15]


In [None]:
# Merge the left-eye and right-eye name arrays of each class into one array
# (left-eye names first).
cataract = np.concatenate([left_cataract, right_cataract])
normal = np.concatenate([left_normal, right_normal])

In [None]:
# Print the per-eye cataract file-name arrays for inspection. The two
# commented lines print the merged arrays instead.
# print(cataract)
# print(normal)
print(left_cataract)
print(right_cataract)

In [None]:
# Positional train/validation split: the first 494 cataract names and the first
# 400 normal names are used for training, whatever remains for validation.
train_cataract, val_cataract = np.split(cataract, [494])
train_normal, val_normal = np.split(normal, [400])

In [None]:
# Sizes of the full name arrays, then of the train/validation parts per class.
print(len(cataract),len(normal))
print(len(train_cataract),len(val_cataract))
print(len(train_normal),len(val_normal))

In [None]:
def move_image(itemList, destinationDirectory, sourceDirectory):
    """Copy the files named in ``itemList`` from one directory into another.

    Despite its name the function copies (``shutil.copy``); the source files
    stay where they are.

    Args:
        itemList: Iterable of file names (not paths) to copy.
        destinationDirectory: Directory that receives the copies. It is
            created, including parents, when it does not exist yet.
        sourceDirectory: Directory that is scanned for the requested names.
    """
    # Without an existing directory shutil.copy treats the destination as a
    # file name: every image would overwrite the same single file (or the call
    # fails when the parent folder is missing too).
    os.makedirs(destinationDirectory, exist_ok=True)
    # Set lookup instead of rescanning the whole list for every directory entry.
    wanted = set(itemList)
    for image in os.listdir(sourceDirectory):
        if image not in wanted:
            continue
        name = os.path.join(sourceDirectory, image)
        if os.path.isfile(name):
            shutil.copy(name, destinationDirectory)
        else:
            # The entry is listed but is not a regular file (e.g. a sub-folder).
            print ("file does not exist", name)

In [None]:
# Copy each split's requested images out of the preprocessed dataset-1 folder
# into the matching class directory.
move_image(train_cataract, cataractTrain_dir, processedImage1)
move_image(val_cataract, cataractVal_dir, processedImage1)
move_image(train_normal, normalTrain_dir, processedImage1)
move_image(val_normal, normalVal_dir, processedImage1)

In [None]:
# Number of entries that ended up in each split directory.
tc = len(os.listdir(cataractTrain_dir))
vc = len(os.listdir(cataractVal_dir))
tn = len(os.listdir(normalTrain_dir))
vn = len(os.listdir(normalVal_dir))

# 400 from 594
print(tc) # 301 from 494 (miss 193)
print(vc) # 99 from 100 (miss 1)
# 484 from 500
print(tn) # 387 from 400 (miss 13)
print(vn) # 97 from 100 (miss 3)

In [None]:
# File names that are actually present in each split directory right now.
processed_trainCataract = os.listdir(cataractTrain_dir)
processed_valCataract = os.listdir(cataractVal_dir)
processed_trainNormal = os.listdir(normalTrain_dir)
processed_valNormal = os.listdir(normalVal_dir)

# Empty accumulators, one per split, to be filled with the requested names
# that are absent from the listings above.
missing_trainCataract = []
missing_valCataract = []
missing_trainNormal = []
missing_valNormal = []

In [None]:
def missing_image(itemList, missingList, processedList):
    """Append to ``missingList`` every entry of ``itemList`` absent from ``processedList``.

    Args:
        itemList: Names that were requested.
        missingList: List that is extended in place with the absent names, in
            their original order (repeated names are appended repeatedly).
        processedList: Names that are actually available.

    Returns:
        None. The result is delivered through ``missingList``.
    """
    absent = [name for name in itemList if name not in processedList]
    missingList.extend(absent)

In [None]:
# Fill the four "missing" lists with requested names that are not present in
# the corresponding split directory. missing_image appends, so running this
# cell again without resetting the lists adds the same names a second time.
missing_image(train_cataract, missing_trainCataract, processed_trainCataract)
missing_image(val_cataract, missing_valCataract, processed_valCataract)
missing_image(train_normal, missing_trainNormal, processed_trainNormal)
missing_image(val_normal, missing_valNormal, processed_valNormal)

In [None]:
# Show which requested images are absent from each split directory.
print(missing_trainCataract)
print(missing_valCataract)
print(missing_trainNormal)
print(missing_valNormal)
# Some cataract images are missing from both the train and validation folders.

In [None]:
# Count how often each name occurs in train_cataract and record every name the
# moment it shows up for the second time.
seen = {}
duplicate = []
for item in train_cataract:
    occurrences = seen.get(item, 0) + 1
    seen[item] = occurrences
    if occurrences == 2:
        duplicate.append(item)

print(train_cataract)
print(seen)
print(len(duplicate))

# After examining the list: it contains duplicates, which is why fewer images
# were copied than expected.
# The duplicates come from the CSV file included in the dataset: after id 4784
# the CSV repeats its content, duplicating the values.
# In addition, some images are genuinely missing from the preprocessed_images folder.

In [None]:
def crop_image(itemList, destinationDirectory, sourceDirectory, x_start=600, x_end=2900):
    """Crop a fixed column range out of selected images and save the result.

    Every file in ``sourceDirectory`` whose name is listed in ``itemList`` is
    read with OpenCV, reduced to the pixel columns ``x_start:x_end`` (all rows
    are kept) and written to ``destinationDirectory`` under the same name.
    Images that cannot be read, cropped to a non-empty area, or written are
    reported with a message and skipped.

    Args:
        itemList: Iterable of file names (not paths) to process.
        destinationDirectory: Directory receiving the cropped images; created,
            including parents, when it does not exist.
        sourceDirectory: Directory holding the original images.
        x_start: First pixel column kept. Defaults to 600, the value that was
            previously hard-coded.
        x_end: Column at which the crop stops (exclusive). Defaults to 2900,
            the value that was previously hard-coded.
    """
    # cv2.imwrite does not create missing folders, so make sure the target exists.
    os.makedirs(destinationDirectory, exist_ok=True)
    wanted = set(itemList)
    for image in os.listdir(sourceDirectory):
        if image not in wanted:
            continue
        image_path = os.path.join(sourceDirectory, image)
        image_temp = cv2.imread(image_path)
        if image_temp is None:
            # cv2.imread signals an unreadable file by returning None.
            print("could not read image", image_path)
            continue
        cropped_image = image_temp[:, x_start:x_end]
        if cropped_image.size == 0:
            # The image is narrower than x_start, so nothing is left to save.
            print("crop range is outside of image", image_path)
            continue
        output_path = os.path.join(destinationDirectory, image)
        if not cv2.imwrite(output_path, cropped_image):
            print("could not write image", output_path)

In [None]:
# Tried to crop all of the missing cataract images at once with one fixed
# column range. After checking the results, this only works for
# '2244_left.jpg' and '2251_left.jpg' in the cataract training set, because
# the pictures vary in pixel size (presumably their resolution).
crop_image(missing_trainCataract, cataractTrain_dir, baseImage1)
crop_image(missing_valCataract, cataractVal_dir, baseImage1)

In [None]:
# The remaining images are cropped manually in the cells below:
# training set ['2102_left.jpg', '2231_left.jpg', '1456_right.jpg'],
# validation set ['2229_right.jpg']

In [None]:
def cropped_path(destination_path ,image):
    """Build the output path of ``image`` inside ``destination_path``.

    Args:
        destination_path: Directory the image is written to. An empty string
            means "current directory".
        image: File name of the image.

    Returns:
        str: ``destination_path`` and ``image`` joined with the platform's
        path separator.
    """
    # No manual "/" is appended: it turned an empty directory into the
    # filesystem root ("/image") and doubled an existing trailing slash.
    return os.path.join(destination_path, image)

In [None]:
# 2102_left.jpg

In [None]:
# Load the original 2102_left image from the raw dataset-1 folder.
image = cv2.imread(baseImage1 + "/2102_left.jpg")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseImage1 + "/2102_left.jpg"

In [None]:
plt.title("2102_left original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Keep every row; retain only pixel columns 250 up to (not including) 1800.
cropped_image = image[:, 250:1800]

In [None]:
# Inspect the shape of the cropped array.
cropped_image.shape

In [None]:
plt.title("2102_left cropped image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

In [None]:
# Save the manual crop into the cataract training folder under its original name.
cv2.imwrite(cropped_path(cataractTrain_dir, "2102_left.jpg"), cropped_image)

In [None]:
# 2231_left.jpg

In [None]:
# Load the original 2231_left image from the raw dataset-1 folder.
image = cv2.imread(baseImage1 + "/2231_left.jpg")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseImage1 + "/2231_left.jpg"

In [None]:
plt.title("2231_left original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Keep every row; retain only pixel columns 400 up to (not including) 3800.
cropped_image = image[:, 400:3800]

In [None]:
plt.title("2231_left cropped image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

In [None]:
# Save the manual crop into the cataract training folder under its original name.
cv2.imwrite(cropped_path(cataractTrain_dir, "2231_left.jpg"), cropped_image)

In [None]:
# 1456_right.jpg

In [None]:
# Load the original 1456_right image from the raw dataset-1 folder.
image = cv2.imread(baseImage1 + "/1456_right.jpg")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseImage1 + "/1456_right.jpg"

In [None]:
plt.title("1456_right original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# no cropping needed after inspection

In [None]:
# Save the image as loaded (uncropped) into the cataract training folder under
# its original name.
cv2.imwrite(cropped_path(cataractTrain_dir, "1456_right.jpg"), image)

In [None]:
# 2229_right.jpg

In [None]:
# Load the original 2229_right image from the raw dataset-1 folder.
image = cv2.imread(baseImage1 + "/2229_right.jpg")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseImage1 + "/2229_right.jpg"

In [None]:
plt.title("2229_right original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Keep every row; retain only pixel columns 400 up to (not including) 3800.
cropped_image = image[:, 400:3800]

In [None]:
plt.title("2229_right cropped image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

In [None]:
# Save the manual crop into the cataract validation folder under its original name.
cv2.imwrite(cropped_path(cataractVal_dir, "2229_right.jpg"), cropped_image)

In [None]:
# Recount the entries of each split directory after the manual crops.
tc = len(os.listdir(cataractTrain_dir))
vc = len(os.listdir(cataractVal_dir))
tn = len(os.listdir(normalTrain_dir))
vn = len(os.listdir(normalVal_dir))

# 400 from 594
print(tc) # 306 from 494 (5 missing image solved)
print(vc) # 100 from 100 (1 missing image solved)
# 484 from 500
print(tn) # 387 from 400 (no missing image)
print(vn) # 97 from 100  (no missing image)

# Filtering dataset 2

In [None]:
# adding image from the second dataset to balance the dataset that will be used
cataract2 = os.listdir(baseCataract2)
normal2 = os.listdir(baseNormal2)
train_cataract2 = cataract2[:94] 
# val_cataract2 = 
train_normal2 = normal2[:13]
val_normal2 = normal2[13:16]

In [None]:
# Sizes of the two dataset-2 class listings, then of the selected subsets.
print(len(cataract2))
print(len(normal2))
print(len(train_cataract2))
print(len(train_normal2))
print(len(val_normal2))

In [None]:
def crop_image2(itemList, destinationDirectory, sourceDirectory, x_start=430, x_end=2150):
    """Crop a fixed column range out of selected dataset-2 images and save them.

    Every file in ``sourceDirectory`` whose name is listed in ``itemList`` is
    read with OpenCV, reduced to the pixel columns ``x_start:x_end`` (all rows
    are kept) and written to ``destinationDirectory`` under the same name.
    Images that cannot be read, cropped to a non-empty area, or written are
    reported with a message and skipped.

    Args:
        itemList: Iterable of file names (not paths) to process.
        destinationDirectory: Directory receiving the cropped images; created,
            including parents, when it does not exist.
        sourceDirectory: Directory holding the original images.
        x_start: First pixel column kept. Defaults to 430, the value that was
            previously hard-coded.
        x_end: Column at which the crop stops (exclusive). Defaults to 2150,
            the value that was previously hard-coded.
    """
    # cv2.imwrite does not create missing folders, so make sure the target exists.
    os.makedirs(destinationDirectory, exist_ok=True)
    wanted = set(itemList)
    for image in os.listdir(sourceDirectory):
        if image not in wanted:
            continue
        image_path = os.path.join(sourceDirectory, image)
        image_temp = cv2.imread(image_path)
        if image_temp is None:
            # cv2.imread signals an unreadable file by returning None.
            print("could not read image", image_path)
            continue
        cropped_image = image_temp[:, x_start:x_end]
        if cropped_image.size == 0:
            # The image is narrower than x_start, so nothing is left to save.
            print("crop range is outside of image", image_path)
            continue
        output_path = os.path.join(destinationDirectory, image)
        if not cv2.imwrite(output_path, cropped_image):
            print("could not write image", output_path)

In [None]:
# Crop the selected dataset-2 images with the fixed column range of crop_image2
# and write them into the matching train/validation class folders.
crop_image2(train_cataract2, cataractTrain_dir, baseCataract2)
crop_image2(train_normal2, normalTrain_dir, baseNormal2)
crop_image2(val_normal2, normalVal_dir, baseNormal2)

In [None]:
# Recount the entries of each split directory after adding dataset 2.
tc = len(os.listdir(cataractTrain_dir))
vc = len(os.listdir(cataractVal_dir))
tn = len(os.listdir(normalTrain_dir))
vn = len(os.listdir(normalVal_dir))

print(tc)
print(vc)

print(tn)
print(vn)

In [None]:
# After checking the results in the cataract train directory,
# some pictures turned out not to be centred by the fixed crop:
# [cataract_005, cataract_020, cataract_027, cataract_052, cataract_063]
# These are cropped manually, one by one, in the cells below.

In [None]:
# cataract_005

In [None]:
# Load the original cataract_005 image from the dataset-2 cataract folder.
image = cv2.imread(baseCataract2 + "/cataract_005.png")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseCataract2 + "/cataract_005.png"

In [None]:
plt.title("cataract_005 original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Keep every row; retain only pixel columns 250 up to (not including) 1600.
cropped_image = image[:, 250:1600]

In [None]:
plt.title("cataract_005 cropped image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

In [None]:
# Overwrite the automatic crop in the cataract training folder with the manual one.
cv2.imwrite(cropped_path(cataractTrain_dir, "cataract_005.png"), cropped_image)

In [None]:
# cataract_020

In [None]:
# Load the original cataract_020 image from the dataset-2 cataract folder.
image = cv2.imread(baseCataract2 + "/cataract_020.png")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseCataract2 + "/cataract_020.png"

In [None]:
plt.title("cataract_020 original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Keep every row; retain only pixel columns 250 up to (not including) 1600.
cropped_image = image[:, 250:1600]

In [None]:
plt.title("cataract_020 cropped image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

In [None]:
# Overwrite the automatic crop in the cataract training folder with the manual one.
cv2.imwrite(cropped_path(cataractTrain_dir, "cataract_020.png"), cropped_image)

In [None]:
# cataract_027

In [None]:
# Load the original cataract_027 image from the dataset-2 cataract folder.
image = cv2.imread(baseCataract2 + "/cataract_027.png")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseCataract2 + "/cataract_027.png"

In [None]:
plt.title("cataract_027 original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Keep every row; retain only pixel columns 250 up to (not including) 1600.
cropped_image = image[:, 250:1600]

In [None]:
plt.title("cataract_027 cropped image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

In [None]:
# Overwrite the automatic crop in the cataract training folder with the manual one.
cv2.imwrite(cropped_path(cataractTrain_dir, "cataract_027.png"), cropped_image)

In [None]:
# cataract_052

In [None]:
# Load the original cataract_052 image from the dataset-2 cataract folder.
image = cv2.imread(baseCataract2 + "/cataract_052.png")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseCataract2 + "/cataract_052.png"

In [None]:
plt.title("cataract_052 original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Keep every row; retain only pixel columns 250 up to (not including) 1600.
cropped_image = image[:, 250:1600]

In [None]:
plt.title("cataract_052 cropped image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

In [None]:
# Overwrite the automatic crop in the cataract training folder with the manual one.
cv2.imwrite(cropped_path(cataractTrain_dir, "cataract_052.png"), cropped_image)

In [None]:
# cataract_063

In [None]:
# Load the original cataract_063 image from the dataset-2 cataract folder.
image = cv2.imread(baseCataract2 + "/cataract_063.png")
# cv2.imread reports a missing or unreadable file by returning None, not by raising.
assert image is not None, "could not read " + baseCataract2 + "/cataract_063.png"

In [None]:
plt.title("cataract_063 original image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

In [None]:
# Keep every row; retain only pixel columns 250 up to (not including) 1600.
cropped_image = image[:, 250:1600]

In [None]:
plt.title("cataract_063 cropped image")
# OpenCV stores pixels in BGR order while matplotlib expects RGB; convert so
# the preview shows the true colours.
plt.imshow(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))

In [None]:
# Overwrite the automatic crop in the cataract training folder with the manual one.
cv2.imwrite(cropped_path(cataractTrain_dir, "cataract_063.png"), cropped_image)