In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os, json, pathlib, shutil, PIL
import cv2
import itertools

In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image

Dataset: Lung and Colon Cancer Histopathological Images (Kaggle) — https://www.kaggle.com/datasets/andrewmvd/lung-and-colon-cancer-histopathological-images

In [3]:
from tensorflow.keras.applications.vgg16 import VGG16

In [4]:
# VGG16 with ImageNet weights and without its top (include_top=False);
# it is used below purely as a fixed feature extractor for 224x224 RGB inputs.
vgg_layer = VGG16(
  include_top=False,
  weights='imagenet',
  input_shape=(224, 224, 3),
)

"resnet_layer = ResNet50V2(weights= 'imagenet', include_top= False, input_shape=(200, 200, 3))\ninception_layer = InceptionV3(weights= 'imagenet', include_top= False, input_shape=(200, 200, 3))\ndensenet_layer = DenseNet121(weights= 'imagenet', include_top= False, input_shape=(200, 200, 3))\n"

In [5]:
# Root folder of the lung image set; it holds one sub-directory per class.
folder_dir = "archive/lung_colon_image_set/lung_image_sets/"
file_dir = pathlib.Path(folder_dir)

# Sanity check: report whether the dataset folder is present.
dataset_found = file_dir.exists()
print(dataset_found)

True


In [6]:
# Count every JPEG image one level below the dataset root.
# The extension is compared case-insensitively: the pattern "*/*.JPEG" only
# matches the dataset's lower-case ".jpeg" files on Windows, where pathlib
# globbing ignores case, and would count 0 images on POSIX systems.
total_img_cnt = sum(
  1 for img_path in file_dir.glob("*/*") if img_path.suffix.lower() == ".jpeg"
)
print(total_img_cnt)

15000


In [7]:
# Class names are the names of the sub-directories of the dataset root.
# os.listdir returns entries in arbitrary order, so the names are sorted:
# Keras' flow_from_directory assigns class indices in alphanumeric order, and
# sorting keeps class_names[i] aligned with column i of the one-hot labels.
class_names = sorted(
  name for name in os.listdir(folder_dir)
  if os.path.isdir(os.path.join(folder_dir, name))
)
print(class_names)

['lung_aca', 'lung_n', 'lung_scc']


In [8]:
# Per-class (stratified) split assignment for every image:
#   0 = train, 1 = test (20 %), 2 = validation (whatever is left after 70 % + 20 %).
# A seeded generator makes the assignment reproducible across kernel restarts.
split_rng = np.random.default_rng(42)
total_nums = []
# Classes are visited in sorted order so that the concatenated assignment lines
# up with the alphanumeric class order used by Keras' flow_from_directory.
for class_name in sorted(class_names):
  class_dir = pathlib.Path(folder_dir, class_name)
  # Case-insensitive extension test; the pattern "*.JPEG" matches the lower-case
  # ".jpeg" files only where globbing ignores case (Windows).
  cl_length = sum(1 for img_path in class_dir.glob("*") if img_path.suffix.lower() == ".jpeg")
  # NOTE: despite the "_ratio" suffix these three values are image counts.
  train_ratio = int(cl_length * 0.7)
  test_ratio = int(cl_length * 0.2)
  valid_ratio = cl_length - (train_ratio + test_ratio)

  # Start with all zeros (train), mark the test and validation quotas, then
  # shuffle so the split codes are spread randomly over the class.
  nums = np.zeros(cl_length)
  nums[:test_ratio] = 1
  nums[test_ratio : test_ratio + valid_ratio] = 2
  split_rng.shuffle(nums)
  total_nums.append(list(nums))

# Flat list with one split code per image, class after class.
merged_nums = list(itertools.chain.from_iterable(total_nums))

In [9]:
# Batch size of the image generator; the same value is reused when fitting the classifier.
batch_size = 300
# Generator that multiplies every pixel value by 1/255 and applies no augmentation.
datagen = ImageDataGenerator(rescale=1. / 255)

In [10]:
# Read the images in a fixed (unshuffled) order so that features, labels and
# file names can later be matched by position.
generator = datagen.flow_from_directory(
  file_dir,
  target_size=(224, 224),
  batch_size=batch_size,
  class_mode='categorical',
  shuffle=False,
)

# Independent copies of the generator's file listings.
filepaths = list(generator.filepaths)
filenames = list(generator.filenames)

Found 15000 images belonging to 3 classes.


In [12]:
# Split every generator file name ("<class folder><separator><image file>")
# into its class folder (the ground-truth label) and the bare image file name.
# os.path.split honours the path separator of the running platform, whereas
# splitting on a hard-coded "\\" only works on Windows.
ground_truth_label = []
file_names = []
for file in filenames:
  class_folder, image_name = os.path.split(file)
  ground_truth_label.append(class_folder)
  file_names.append(image_name)

# Positional index 0 .. total_img_cnt - 1, one per image.
index_list = list(range(total_img_cnt))

In [13]:
def extract_features(generator, data_num, class_num, feature_shape, pretrained_model):
  """Run batches from `generator` through `pretrained_model` and collect the results.

  Args:
    generator: iterable yielding `(inputs_batch, labels_batch)` pairs. It may be
      endless; iteration stops once `data_num` samples have been collected.
    data_num: total number of samples to collect.
    class_num: width of one label row.
    feature_shape: shape of the returned feature array; its first axis is
      expected to be `data_num`.
    pretrained_model: object whose `predict(inputs_batch)` returns one feature
      entry per input sample.

  Returns:
    `(features, labels)`: arrays of shape `feature_shape` and
    `(data_num, class_num)`. Rows that were never filled stay zero.
  """
  features = np.zeros(shape = feature_shape)
  labels = np.zeros(shape=(data_num, class_num))
  filled = 0
  for inputs_batch, labels_batch in generator:
    # Use the real size of this batch instead of a global batch size, so the
    # function works for any generator and for a short or overshooting last batch.
    take = min(len(inputs_batch), data_num - filled)
    features_batch = pretrained_model.predict(inputs_batch)
    features[filled : filled + take] = features_batch[:take]
    labels[filled : filled + take] = labels_batch[:take]
    filled += take
    if filled >= data_num:
      break

  return features, labels

In [14]:
# Output shape of the last VGG16 layer as a plain list.
vgg_last_layer = vgg_layer.layers[-1]
vgg_final_layer = list(vgg_last_layer.output_shape)
print(f"final layer of VGG16 : {vgg_last_layer!s} and its shape : {vgg_final_layer!s}")

# (name, output shape) of every layer whose class name contains 'Conv'.
vgg_conv_layers = [
  (layer.name, layer.output.shape)
  for layer in vgg_layer.layers
  if 'Conv' in layer.__class__.__name__
]

# Shape of one sample's feature map: the final shape without its None entries.
vgg_conv_base_shape = [dim for dim in vgg_final_layer if dim is not None]
print("conv_base_shape : ", vgg_conv_base_shape)

# Shape of the array that will hold the features of all images.
vgg_feat_shape = tuple([total_img_cnt] + vgg_conv_base_shape)
print(vgg_feat_shape)

# Length of one flattened feature vector (input size of the dense classifier).
vgg_input_dimension = np.prod(vgg_conv_base_shape)
print(vgg_input_dimension)

final layer of VGG16 : <keras.layers.pooling.MaxPooling2D object at 0x000001C0E71A5BC8> and its shape : [None, 7, 7, 512]
conv_base_shape :  [7, 7, 512]
(15000, 7, 7, 512)
25088


In [15]:
# Push every image through the VGG16 base once and keep features and one-hot labels.
vgg_features, vgg_labels = extract_features(
  generator=generator,
  data_num=total_img_cnt,
  class_num=len(class_names),
  feature_shape=vgg_feat_shape,
  pretrained_model=vgg_layer,
)

In [16]:
from json import JSONEncoder
class NumpyArrayEncoder(JSONEncoder):
  """JSON encoder that also understands NumPy values.

  ndarrays are written as (nested) lists and NumPy scalars such as np.int64,
  np.float32 or np.bool_ as the matching built-in Python value. Every other
  unsupported object is handed to the base JSONEncoder, which raises TypeError.
  """

  def default(self, obj):
    if isinstance(obj, np.ndarray):
      return obj.tolist()
    if isinstance(obj, np.generic):
      # NumPy scalar (e.g. an element produced by np.argmax) -> Python scalar.
      return obj.item()
    return super().default(obj)


In [17]:
# Integer class index of every one-hot label row (position of the row maximum).
total_labels_int = list(np.argmax(vgg_labels, axis=1))

In [18]:
# Flatten each image's feature map into a single row vector.
vgg_features = vgg_features.reshape((total_img_cnt, vgg_input_dimension))

In [20]:
# Round all positive feature values to 4 decimals, in place (presumably to keep
# the JSON dump written afterwards compact). Values <= 0 are left untouched.
# Each row is processed with one vectorised call instead of formatting every
# single element through a string in a Python loop, which took
# total_img_cnt * vgg_input_dimension interpreter-level iterations.
for cur_fea in vgg_features:
  positive = cur_fea > 0.0
  # cur_fea is a view of one row, so this assignment updates vgg_features itself.
  cur_fea[positive] = np.round(cur_fea[positive], 4)


In [21]:
# Persist the features and one-hot labels; NumpyArrayEncoder writes the arrays as lists.
total_feas = dict(features=vgg_features, labels=vgg_labels)
with open("vgg_extracted.json", mode="w") as outfile:
  json.dump(total_feas, outfile, cls=NumpyArrayEncoder)

In [22]:
# Container for the classifier models built in this notebook.
models = list()

In [23]:
# Add classifier on pre-trained model
# The classifier consumes the already flattened feature vectors, so no Reshape
# layer is needed in front of it:
#vgg_model.add(keras.layers.Reshape((vgg_input_dimension,), input_shape = tuple(vgg_conv_base_shape)))
vgg_model = keras.models.Sequential([
  keras.layers.Dense(512, activation='relu', input_dim=vgg_input_dimension),
  keras.layers.Dense(512, activation='relu'),
  # One softmax output per class.
  keras.layers.Dense(len(class_names), activation='softmax'),
])
vgg_model.compile(
  optimizer=keras.optimizers.Adam(),
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

models.append(vgg_model)

In [24]:
# An earlier version of this cell drew one global 70 % / 20 % / 10 % split over
# all images at once; that code was disabled and the split codes now come from
# the per-class assignment stored in `merged_nums`.
#
# Random row order used to shuffle the combined data frame. The generator is
# seeded so the order is reproducible across kernel restarts.
pmt_order = np.random.default_rng(42).permutation(total_img_cnt)

10500
3000
1500


In [25]:
# One row per image: position, file name, flattened feature vector, one-hot
# label, integer label and split code (0 = train, 1 = test, 2 = validation).
vgg_data_df = pd.DataFrame({
  'index': index_list,
  'file_names': file_names,
  'feature': list(vgg_features),
  'label': list(vgg_labels),
  'int_label': total_labels_int,
  'assign': merged_nums,
})


In [26]:
# Display the combined frame as the cell output.
vgg_data_df

Unnamed: 0,index,file_names,feature,label,int_label,assign
0,0,lungaca1.jpeg,"[0.5731, 0.0, 1.0246, 0.0, 0.1702, 0.0, 0.0, 0...","[1.0, 0.0, 0.0]",0,0.0
1,1,lungaca10.jpeg,"[0.2123, 0.0, 1.8726, 0.0, 0.3249, 0.0, 0.0, 0...","[1.0, 0.0, 0.0]",0,0.0
2,2,lungaca100.jpeg,"[0.4521, 0.0, 0.1181, 0.0, 0.0, 0.0, 0.0, 0.06...","[1.0, 0.0, 0.0]",0,0.0
3,3,lungaca1000.jpeg,"[0.0, 0.0, 1.6311, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[1.0, 0.0, 0.0]",0,0.0
4,4,lungaca1001.jpeg,"[0.1413, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0731,...","[1.0, 0.0, 0.0]",0,1.0
...,...,...,...,...,...,...
14995,14995,lungscc995.jpeg,"[0.2491, 0.0, 1.148, 0.1432, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 1.0]",2,1.0
14996,14996,lungscc996.jpeg,"[0.2842, 0.0, 0.0703, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 1.0]",2,0.0
14997,14997,lungscc997.jpeg,"[0.0, 0.0, 1.2031, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 1.0]",2,0.0
14998,14998,lungscc998.jpeg,"[0.0, 0.0, 0.6882, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 1.0]",2,1.0


In [27]:
# Reorder the rows by position according to the random permutation.
data_df_sf = vgg_data_df.iloc[pmt_order, :]

In [28]:
# Peek at the first five shuffled rows.
data_df_sf.head(n=5)

Unnamed: 0,index,file_names,feature,label,int_label,assign
6632,6632,lungn2468.jpeg,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1523, 0....","[0.0, 1.0, 0.0]",1,0.0
12091,12091,lungscc2881.jpeg,"[0.2156, 0.0, 0.9643, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 1.0]",2,0.0
10568,10568,lungscc151.jpeg,"[0.6224, 0.0, 1.8294, 0.0, 0.0992, 0.0, 0.0, 0...","[0.0, 0.0, 1.0]",2,0.0
3000,3000,lungaca37.jpeg,"[0.2925, 0.0, 1.5497, 0.0, 0.0, 0.0, 0.0, 0.10...","[1.0, 0.0, 0.0]",0,0.0
6290,6290,lungn216.jpeg,"[0.7258, 0.0, 0.0, 0.0035, 0.0, 0.0, 0.0, 0.0,...","[0.0, 1.0, 0.0]",1,2.0


In [29]:
# Partition the shuffled rows by split code: 0 = train, 1 = test, 2 = validation.
assign_codes = data_df_sf['assign']
vgg_train_df = data_df_sf.loc[assign_codes == 0]
vgg_test_df = data_df_sf.loc[assign_codes == 1]
vgg_valid_df = data_df_sf.loc[assign_codes == 2]

In [30]:
def _split_arrays(split_df):
  """Return (features, labels) of one split as 2-D arrays with one row per image."""
  row_count = len(split_df)
  features = np.reshape(list(split_df['feature']), (row_count, vgg_input_dimension))
  labels = np.reshape(list(split_df['label']), (row_count, len(class_names)))
  return features, labels

# Same conversion for each of the three splits.
vgg_train_features, vgg_train_labels = _split_arrays(vgg_train_df)
vgg_test_features, vgg_test_labels = _split_arrays(vgg_test_df)
vgg_valid_features, vgg_valid_labels = _split_arrays(vgg_valid_df)

In [31]:
# Train the dense classifier on the extracted features; the validation split is
# evaluated after every epoch.
vgg_train_history = vgg_model.fit(
  vgg_train_features,
  vgg_train_labels,
  epochs=15,
  batch_size=batch_size,
  validation_data=(vgg_valid_features, vgg_valid_labels),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [32]:
# Loss and accuracy of the trained classifier on the test split.
vgg_loss, vgg_acc = vgg_model.evaluate(x=vgg_test_features, y=vgg_test_labels)



In [33]:
# Softmax scores of the classifier for every test row.
vgg_test_prediction_score = vgg_model.predict(x=vgg_test_features)

In [34]:
# Predicted class index = position of the highest score in each row.
vgg_test_predicted_label = vgg_test_prediction_score.argmax(axis=-1)

In [35]:
# Display the test split as the cell output.
vgg_test_df

Unnamed: 0,index,file_names,feature,label,int_label,assign
10471,10471,lungscc1422.jpeg,"[0.3556, 0.0, 1.4198, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 1.0]",2,1.0
4894,4894,lungaca903.jpeg,"[0.5033, 0.0, 1.7585, 0.0, 0.0582, 0.0, 0.0, 0...","[1.0, 0.0, 0.0]",0,1.0
3329,3329,lungaca3996.jpeg,"[0.3309, 0.0, 1.9817, 0.0, 0.1838, 0.0, 0.0, 0...","[1.0, 0.0, 0.0]",0,1.0
9620,9620,lungn657.jpeg,"[0.2525, 0.0, 0.2764, 0.0, 0.0004, 0.0, 0.0, 0...","[0.0, 1.0, 0.0]",1,1.0
11231,11231,lungscc2106.jpeg,"[0.0, 0.0, 0.8669, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 1.0]",2,1.0
...,...,...,...,...,...,...
1308,1308,lungaca2176.jpeg,"[0.9655, 0.0, 1.7963, 0.0, 0.0, 0.0, 0.0, 0.11...","[1.0, 0.0, 0.0]",0,1.0
8362,8362,lungn4024.jpeg,"[0.601, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.0, 1.0, 0.0]",1,1.0
494,494,lungaca1443.jpeg,"[0.2403, 0.0, 2.5408, 0.0, 0.3922, 0.0, 0.0, 0...","[1.0, 0.0, 0.0]",0,1.0
3946,3946,lungaca4550.jpeg,"[0.394, 0.0, 0.1738, 0.0, 0.0, 0.0, 0.0, 0.033...","[1.0, 0.0, 0.0]",0,1.0


In [36]:
# File name and predicted class index of every test image, in the row order of vgg_test_df.
test_result = {
  'filename': list(vgg_test_df['file_names']),
  #'prediction_score': vgg_test_prediction_score,
  'predicted_label': vgg_test_predicted_label,
}

In [None]:
# Write the test predictions as indented JSON; the encoder converts the label array.
with open("vgg_result.json", mode="w") as outfile:
  json.dump(test_result, outfile, cls=NumpyArrayEncoder, indent=3)