In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os, json, pathlib, shutil, PIL
import matplotlib.pyplot as plt
import cv2
import itertools

In [2]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image

Dataset source: https://www.kaggle.com/code/shadym0hamed/lung-cancer-classification/data

In [3]:
from tensorflow.keras.applications.vgg16 import VGG16

In [4]:
# VGG16 without its dense top, loaded with ImageNet weights, for 224x224 RGB input.
vgg_layer = VGG16(
  include_top=False,
  weights='imagenet',
  input_shape=(224, 224, 3),
)

"resnet_layer = ResNet50V2(weights= 'imagenet', include_top= False, input_shape=(200, 200, 3))\ninception_layer = InceptionV3(weights= 'imagenet', include_top= False, input_shape=(200, 200, 3))\ndensenet_layer = DenseNet121(weights= 'imagenet', include_top= False, input_shape=(200, 200, 3))\n"

In [5]:
# Dataset root; the later cells treat each sub-folder as one class.
folder_dir = "lung_cancer/Dataset/"
file_dir = pathlib.Path(folder_dir)

# Show whether the dataset folder is reachable from the working directory.
dataset_found = file_dir.exists()
print(dataset_found)

True


In [6]:
# Number of *.jpg files located exactly one folder below the dataset root.
total_img_cnt = sum(1 for _ in file_dir.glob("*/*.jpg"))
print(total_img_cnt)

2002


In [7]:
# Class names are the sub-folder names of the dataset root.
# os.listdir() returns entries in arbitrary, platform-dependent order, while
# Keras' flow_from_directory maps label indices to the alphanumerically sorted
# folder names. Sorting keeps class_names[i] aligned with label index i and with
# the order in which the generator lists the files.
class_names = sorted(
  name for name in os.listdir(folder_dir)
  if os.path.isdir(os.path.join(folder_dir, name))
)
print(class_names)

['Adeno', 'Carci', 'Normal', 'Squamos']


In [8]:
# Stratified split: every class gets its own shuffled vector of flags
#   0 = train (everything not flagged below), 1 = test (20 %), 2 = validation (remainder).
# A dedicated, seeded RNG makes the split reproducible across kernel restarts.
split_rng = np.random.default_rng(42)

total_nums = []
# Iterate in sorted order: the flags are later attached to rows that follow the
# image generator's file order, which lists classes alphanumerically.
for class_name in sorted(class_names):
  class_dir = pathlib.Path(folder_dir, class_name)
  cl_length = len(list(class_dir.glob("*.jpg")))
  train_ratio = int(cl_length * 0.7)
  test_ratio = int(cl_length * 0.2)
  valid_ratio = cl_length - (train_ratio + test_ratio)

  nums = np.zeros(cl_length)
  nums[:test_ratio] = 1
  nums[test_ratio : test_ratio + valid_ratio] = 2
  split_rng.shuffle(nums)
  total_nums.append(list(nums))

# Flatten the per-class flag lists into one list with one entry per image.
merged_nums = list(itertools.chain.from_iterable(total_nums))

In [9]:
# Sanity check: number of split flags produced (one per counted *.jpg file).
len(merged_nums)

2002

In [10]:
# Batch size used by the image generator and reused when fitting the classifier.
batch_size = 32
# Image generator whose only transformation is multiplying pixel values by 1/255.
datagen = ImageDataGenerator(rescale=1.0 / 255)

In [11]:
# Unshuffled generator over the dataset folder, so batches follow the file listing order.
generator = datagen.flow_from_directory(
  file_dir,
  target_size=(224, 224),
  batch_size=batch_size,
  class_mode='categorical',
  shuffle=False,
)

# Independent copies of the generator's file listings.
filepaths = list(generator.filepaths)
filenames = list(generator.filenames)

Found 2002 images belonging to 4 classes.


In [12]:
# Sanity check: number of files listed by the generator.
len(filenames)

2002

In [13]:
# Split every generator-relative path ("<class folder><sep><image file>") into its
# class folder (ground-truth label) and image file name.
# Both "\\" and "/" are accepted as separators: splitting on "\\" alone only works
# on Windows and raises IndexError on f[1] where paths are joined with "/".
ground_truth_label = []
file_names = []
for file in filenames:
  f = file.replace("\\", "/").split("/")
  ground_truth_label.append(f[0])
  file_names.append(f[1])

# Positional index of every image, used as the 'index' column later on.
index_list = list(range(0, total_img_cnt))

In [14]:
def extract_features(generator, data_num, class_num, feature_shape, pretrained_model):
  """Run batches from `generator` through `pretrained_model` and collect the outputs.

  Args:
    generator: iterable yielding `(inputs_batch, labels_batch)` pairs. It may be
      endless; iteration stops once `data_num` samples have been collected.
    data_num: number of samples to collect.
    class_num: number of columns of each label row.
    feature_shape: shape of the returned feature array; its first dimension is
      expected to equal `data_num`.
    pretrained_model: object exposing `predict(inputs_batch)`.

  Returns:
    Tuple `(features, labels)`: `features` is a zero-initialised array of shape
    `feature_shape` filled with the predictions, `labels` has shape
    `(data_num, class_num)` and holds the matching label rows.
  """
  features = np.zeros(shape = feature_shape)
  labels = np.zeros(shape=(data_num, class_num))
  if data_num <= 0:
    # Nothing to collect; avoid pulling (and predicting) a batch for no reason.
    return features, labels

  filled = 0  # number of samples written so far
  for inputs_batch, labels_batch in generator:
    features_batch = pretrained_model.predict(inputs_batch)
    # Use the real batch length instead of a global batch size, and never write
    # past data_num even if the generator delivers more samples than requested.
    take = min(len(features_batch), data_num - filled)
    features[filled : filled + take] = features_batch[:take]
    labels[filled : filled + take] = labels_batch[:take]
    filled += take
    if filled >= data_num:
      break

  return features, labels

In [15]:
# Output shape of the last layer of the VGG16 base, as a plain list.
vgg_final_layer = list(vgg_layer.layers[-1].output_shape)
print("final layer of VGG16 : " +  str(vgg_layer.layers[-1]) + " and its shape : " + str(vgg_final_layer))

# (name, output shape) of every layer whose class name contains 'Conv'.
vgg_conv_layers = [
  (layer.name, layer.output.shape)
  for layer in vgg_layer.layers
  if 'Conv' in layer.__class__.__name__
]

# Same shape with the None entries removed.
vgg_conv_base_shape = [dim for dim in vgg_final_layer if dim is not None]
print("conv_base_shape : ", vgg_conv_base_shape)

# Shape of the array that will hold the extracted features of all images.
vgg_feat_shape = tuple([total_img_cnt] + vgg_conv_base_shape)
print(vgg_feat_shape)

# Length of one feature map once flattened; used as the classifier's input size.
vgg_input_dimension = np.prod(vgg_conv_base_shape)
print(vgg_input_dimension)

final layer of VGG16 : <keras.layers.pooling.MaxPooling2D object at 0x00000176DA944F08> and its shape : [None, 7, 7, 512]
conv_base_shape :  [7, 7, 512]
(2002, 7, 7, 512)
25088


In [16]:
# Push every image through the VGG16 base once and keep features plus one-hot labels.
vgg_features, vgg_labels = extract_features(
  generator=generator,
  data_num=total_img_cnt,
  class_num=len(class_names),
  feature_shape=vgg_feat_shape,
  pretrained_model=vgg_layer,
)

In [17]:
from json import JSONEncoder
class NumpyArrayEncoder(JSONEncoder):
  """JSON encoder that serialises NumPy arrays and NumPy scalar values."""

  def default(self, obj):
    """Convert NumPy objects to built-in Python objects.

    Args:
      obj: object the standard encoder could not serialise.

    Returns:
      A (nested) list for an `np.ndarray`, or a built-in Python scalar for a
      NumPy scalar such as `np.int64`, `np.float32` or `np.bool_`.

    Raises:
      TypeError: for any other unsupported type (raised by the base encoder).
    """
    # tolist() exists on both arrays and NumPy scalars and returns built-in types.
    if isinstance(obj, (np.ndarray, np.generic)):
      return obj.tolist()
    return super().default(obj)


In [18]:
# Integer class index per sample: position of the largest entry in its label row.
total_labels_int = [np.argmax(label_row) for label_row in vgg_labels]

In [19]:
# Flatten each feature map into a single row of length vgg_input_dimension.
vgg_features = vgg_features.reshape((total_img_cnt, vgg_input_dimension))

In [20]:
# Peek at the flattened feature vector of the first image.
vgg_features[0]

array([0.06951904, 0.        , 0.        , ..., 0.        , 0.89786679,
       0.        ])

In [21]:
# Round every positive feature value to 4 decimal places, in place
# (presumably to keep the JSON export written afterwards compact - confirm).
# Vectorised per row: formatting each of the ~50 million values through an
# f-string in a Python loop is orders of magnitude slower. Values that are not
# positive are left untouched, as before. np.round scales by 10**4 before
# rounding, so results can differ from decimal string formatting only for
# values sitting exactly on a rounding tie.
for cur_fea in vgg_features:
  # cur_fea is a view of one row, so the assignment below updates vgg_features.
  positive = cur_fea > 0.0
  cur_fea[positive] = np.round(cur_fea[positive], 4)


In [22]:
# Persist the extracted features and one-hot labels as JSON; NumpyArrayEncoder
# turns the arrays into nested lists.
total_feas = dict(features=vgg_features, labels=vgg_labels)
with open("vgg_extracted_lung_cancer.json", "w") as outfile:
  json.dump(total_feas, outfile, cls=NumpyArrayEncoder)

In [23]:
# Registry of classifier models; vgg_model is appended to it once compiled.
models = []

In [24]:
# Classifier trained on the flattened VGG16 features:
# two 512-unit ReLU layers followed by a softmax with one unit per class.
vgg_model = keras.models.Sequential([
  keras.layers.Dense(512, activation='relu', input_dim=vgg_input_dimension),
  keras.layers.Dense(512, activation='relu'),
  keras.layers.Dense(len(class_names), activation='softmax'),
])
vgg_model.compile(
  loss='categorical_crossentropy',
  optimizer=keras.optimizers.Adam(),
  metrics=['accuracy'],
)

models.append(vgg_model)

In [25]:
# Sizes of a 70 % / 20 % / remainder split taken over the whole dataset.
train_ratio = int(total_img_cnt * 0.7)
test_ratio = int(total_img_cnt * 0.2)
valid_ratio = total_img_cnt - (train_ratio + test_ratio)
print(train_ratio, test_ratio, valid_ratio, sep="\n")

# Dataset-wide flag vector (1 = test, 2 = validation, 0 = rest), shuffled.
# NOTE(review): no later visible cell reads `nums`; the 'assign' column is built
# from merged_nums instead. The shuffle is kept because it advances the global
# NumPy RNG before the permutation below is drawn.
nums = np.zeros(total_img_cnt)
nums[:test_ratio] = 1
nums[test_ratio:test_ratio + valid_ratio] = 2
np.random.shuffle(nums)

# Random row order applied to the data frame before it is split.
pmt_order = np.random.permutation(np.arange(total_img_cnt))

1401
400
201


In [26]:
# One row per image: position, file name, flattened features, one-hot label,
# integer label and split flag.
vgg_data_df = pd.DataFrame({
  'index': index_list,
  'file_names': file_names,
  'feature': list(vgg_features),
  'label': list(vgg_labels),
  'int_label': total_labels_int,
  'assign': merged_nums,
})


In [27]:
# Display the assembled table.
vgg_data_df

Unnamed: 0,index,file_names,feature,label,int_label,assign
0,0,1-01 (10).jpg,"[0.0695, 0.0, 0.0, 0.0, 0.245, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0]",0,0.0
1,1,1-01 (11).jpg,"[0.0524, 0.0, 0.0, 0.0181, 0.2809, 0.0, 0.0, 0...","[1.0, 0.0, 0.0, 0.0]",0,2.0
2,2,1-01 (12).jpg,"[0.3959, 0.0, 0.0, 0.0567, 0.1739, 0.0, 0.0, 0...","[1.0, 0.0, 0.0, 0.0]",0,0.0
3,3,1-01 (13).jpg,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0]",0,1.0
4,4,1-01.jpg,"[0.1423, 0.0, 0.0, 0.0664, 0.3667, 0.0, 0.0, 0...","[1.0, 0.0, 0.0, 0.0]",0,1.0
...,...,...,...,...,...,...
1997,1997,1-86.jpg,"[0.2823, 0.0, 0.0, 0.0063, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 1.0]",3,1.0
1998,1998,1-87.jpg,"[0.2797, 0.0, 0.0, 0.0018, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 1.0]",3,0.0
1999,1999,1-88.jpg,"[0.2592, 0.0, 0.0, 0.0145, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 1.0]",3,0.0
2000,2000,2-1 (2).jpg,"[0.1332, 0.0, 0.0, 0.0, 0.0581, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 1.0]",3,2.0


In [28]:
# Reorder the rows by the random permutation; the original index labels are kept.
data_df_sf = vgg_data_df.iloc[pmt_order, :]

In [29]:
# First rows of the shuffled table.
data_df_sf.head()

Unnamed: 0,index,file_names,feature,label,int_label,assign
1892,1892,1-440.jpg,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0]",3,0.0
1353,1353,6 (3) - Copy.jpg,"[0.2611, 0.0, 0.0, 0.0, 0.0026, 0.0, 0.0, 0.0,...","[0.0, 0.0, 1.0, 0.0]",2,0.0
307,307,1-29 (6).jpg,"[0.0589, 0.0, 0.0, 0.019, 0.3036, 0.0, 0.0, 0....","[1.0, 0.0, 0.0, 0.0]",0,1.0
323,323,1-31 (5).jpg,"[0.0809, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[1.0, 0.0, 0.0, 0.0]",0,0.0
1893,1893,1-441.jpg,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0]",3,0.0


In [30]:
# Partition the shuffled rows by split flag: 0 -> train, 1 -> test, 2 -> validation.
assign_col = data_df_sf['assign']
vgg_train_df = data_df_sf.loc[assign_col == 0]
vgg_test_df = data_df_sf.loc[assign_col == 1]
vgg_valid_df = data_df_sf.loc[assign_col == 2]

In [31]:
def _features_and_labels(split_df):
  """Return the `(features, labels)` arrays of one split.

  Args:
    split_df: data frame with a 'feature' column (one flattened feature vector
      per row) and a 'label' column (one one-hot label vector per row).

  Returns:
    Tuple `(features, labels)` of 2-D arrays shaped
    `(len(split_df), vgg_input_dimension)` and `(len(split_df), len(class_names))`.
  """
  n_rows = len(split_df)
  features = np.reshape(list(split_df['feature']), (n_rows, vgg_input_dimension))
  labels = np.reshape(list(split_df['label']), (n_rows, len(class_names)))
  return features, labels


# One helper call per split replaces three copies of the same conversion.
vgg_train_features, vgg_train_labels = _features_and_labels(vgg_train_df)
vgg_test_features, vgg_test_labels = _features_and_labels(vgg_test_df)
vgg_valid_features, vgg_valid_labels = _features_and_labels(vgg_valid_df)

In [32]:
# Train the classifier for 5 epochs, evaluating on the validation split after each epoch.
vgg_train_history = vgg_model.fit(
  vgg_train_features,
  vgg_train_labels,
  epochs=5,
  batch_size=batch_size,
  validation_data=(vgg_valid_features, vgg_valid_labels),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
# Loss and accuracy of the trained classifier on the test split.
vgg_loss, vgg_acc = vgg_model.evaluate(x=vgg_test_features, y=vgg_test_labels)



In [34]:
# Per-class softmax scores for every test sample.
vgg_test_prediction_score = vgg_model.predict(x=vgg_test_features)

In [35]:
# Predicted class index = position of the highest score along the last axis.
vgg_test_predicted_label = vgg_test_prediction_score.argmax(axis=-1)

In [36]:
# Display the rows that make up the test split.
vgg_test_df

Unnamed: 0,index,file_names,feature,label,int_label,assign
307,307,1-29 (6).jpg,"[0.0589, 0.0, 0.0, 0.019, 0.3036, 0.0, 0.0, 0....","[1.0, 0.0, 0.0, 0.0]",0,1.0
1778,1778,1-337.jpg,"[0.0807, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 1.0]",3,1.0
1339,1339,4 - Copy - Copy.jpg,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 1.0, 0.0]",2,1.0
1645,1645,1-216.jpg,"[0.0154, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 1.0]",3,1.0
1261,1261,13 - Copy (2).jpg,"[0.3362, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1673,...","[0.0, 0.0, 1.0, 0.0]",2,1.0
...,...,...,...,...,...,...
39,39,1-04 (9).jpg,"[0.0821, 0.0, 0.0, 0.006, 0.2262, 0.0, 0.0, 0....","[1.0, 0.0, 0.0, 0.0]",0,1.0
1949,1949,1-492.jpg,"[0.0, 0.0, 0.0, 0.0, 0.1099, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 1.0]",3,1.0
1560,1560,1-139.jpg,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0]",3,1.0
992,992,1-209 (2).jpg,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0]",1,1.0


In [37]:
# Test-set file names paired with the predicted class index of each file.
# The raw prediction scores are not included:
#   test_result['prediction_score'] = vgg_test_prediction_score
test_result = {
  'filename': vgg_test_df['file_names'].tolist(),
  'predicted_label': vgg_test_predicted_label,
}

In [38]:
# Write the test predictions as indented JSON; NumpyArrayEncoder converts the label array.
with open("vgg_result_lung_cancer.json", "w") as outfile:
  json.dump(test_result, outfile, cls=NumpyArrayEncoder, indent=3)