In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import h5py
import time
import random
import os
from datetime import datetime, timedelta
# Seed NumPy's global random number generator for repeatable NumPy draws.
# Note: this call seeds NumPy only; Python's `random` module and TensorFlow
# keep their own random state and are not seeded here.
np.random.seed(1)

In [3]:
import tensorflow as tf

In [3]:
# Show the installed TensorFlow version.
tf.__version__

'2.1.0'

In [4]:
# Check GPU availability: first whether this TensorFlow build was compiled
# with CUDA, then which physical GPU devices TensorFlow can see.
print(tf.test.is_built_with_cuda())
print(tf.config.list_physical_devices('GPU'))

True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# 이미지 데이터 처리

## 이미 훈련된 VGG16 모델을 사용한 전이 학습 및 특성 추출

In [5]:
from PIL import Image
from tensorflow.keras import Input
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import optimizers, initializers, regularizers, metrics
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout

In [6]:
from tensorflow.keras.applications import VGG16
# Build the VGG16 convolutional base used as the image feature extractor:
#   weights='imagenet'  -> start from the ImageNet-pretrained weights
#   include_top=False   -> leave out the fully connected classifier head
#   input_shape         -> 150x150 RGB images
conv_base = VGG16(weights='imagenet',
                  include_top=False,
                  input_shape=(150, 150, 3))

In [7]:
# Print the layer-by-layer architecture (output shapes and parameter counts).
conv_base.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150, 150, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 150, 150, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 150, 150, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 75, 75, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 75, 75, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 75, 75, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 37, 37, 128)       0     

In [10]:
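# Total number of images collected: 153,576
# Image files that contained no image: 1
# Total number of images after removing it: 153,575
# The dataset was judged large enough that image augmentation is not needed.
# Fine-tuning is used so that feature extraction is better adapted to the collected images.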

In [8]:
# Fine-tuning attempt: list the layers of conv_base to pick which to unfreeze.

conv_base.layers

[<tensorflow.python.keras.engine.input_layer.InputLayer at 0x7fcb84d71c90>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcbb43b9ad0>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcb0abde950>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x7fcb0abdeed0>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcb00120710>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcb00120110>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x7fcb001339d0>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcb00139890>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcb00144f90>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcb0014ac50>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x7fcb0014ffd0>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcb00156e50>,
 <tensorflow.python.keras.layers.convolutional.Conv2D at 0x7fcb000e2d50>,
 <tensorflow.python.keras.layers.con

In [9]:
# Unfreeze block5_conv1 and every layer after it; keep all earlier layers frozen.
# NOTE(review): the trainable flags only matter if conv_base is subsequently
# trained; no training step is visible in this part of the notebook — confirm.
conv_base.trainable = True

set_trainable = False
for layer in conv_base.layers:
    # The flag latches to True at block5_conv1 and stays True for the rest.
    set_trainable = set_trainable or layer.name == 'block5_conv1'
    layer.trainable = set_trainable

In [10]:
# Print the summary again after changing the layers' trainable flags.
conv_base.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150, 150, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 150, 150, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 150, 150, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 75, 75, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 75, 75, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 75, 75, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 37, 37, 128)       0     

In [11]:
# Load images from a directory and build the generator used for feature extraction.
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Pixel values are only rescaled by 1/255; no augmentation is configured.
datagen = ImageDataGenerator(rescale=1./255)
batch_size = 512

def extract_features(directory, sample_count):
    """
    Read images from `directory` and run them through the module-level
    `conv_base` (VGG16 convolutional base) to extract feature maps.

    Parameters
    ----------
    directory : str
        Root directory handed to `flow_from_directory` (images live in
        sub-directories of it).
    sample_count : int
        Maximum number of images to process. It is capped at the number of
        images the generator actually found, so the iterator never wraps
        around and re-reads images.

    Returns
    -------
    features : numpy.ndarray, float32, shape (n, 4, 4, 512)
        One feature map per image, where n = min(sample_count, images found).
    filenames : list of str
        The n file names (relative to `directory`); `filenames[k]` is the
        image that produced `features[k]`.
    """
    # shuffle=False is essential: `generator.filenames` is always in directory
    # order, so batches must be yielded in that same order for the returned
    # names to line up with the feature rows. (The default is shuffle=True.)
    # class_mode=None makes the generator yield image batches only, no labels.
    generator = datagen.flow_from_directory(directory,
                                            target_size=(150, 150),  # resize every image to 150x150
                                            batch_size=batch_size,
                                            class_mode=None,
                                            shuffle=False)

    # Never request more rows than there are images: Keras iterators loop
    # forever and would silently start a second pass over the data.
    total = min(sample_count, generator.samples)

    # conv_base.predict returns float32, so float32 storage loses nothing and
    # needs half the memory of NumPy's default float64.
    features = np.zeros(shape=(total, 4, 4, 512), dtype=np.float32)

    filled = 0  # number of feature rows written so far
    while filled < total:
        inputs_batch = next(generator)
        feature_map_batch = conv_base.predict(inputs_batch)
        # The final batch may be shorter than batch_size, or longer than the
        # space that is left; copy only what fits.
        take = min(len(feature_map_batch), total - filled)
        features[filled:filled + take] = feature_map_batch[:take]
        filled += take

    return features, generator.filenames[:total]

In [None]:
# Image feature extraction: run every image through conv_base, flatten each
# feature map into one row, attach the image id and save the table as a pickle.
image_dir = './image_2015_2019_folder'
try:
    features, filenames = extract_features(image_dir, 153575)
    # Flatten each (4, 4, 512) feature map into a single 8192-wide row. The
    # row count is taken from the array instead of being hard-coded again.
    image_data = features.reshape(len(features), -1)
    df_image = pd.DataFrame(image_data)

    try:
        # File names look like "<sub-directory>/<id>.<ext>"; os.path.basename
        # handles whichever path separator the platform uses.
        image_ids = [int(os.path.basename(filename).split('.')[0])
                     for filename in filenames]
    except (ValueError, IndexError) as e:
        # A name is not a plain integer id: keep the raw file names so the
        # extracted features are not lost, and record why parsing failed.
        df_image['index'] = filenames
        split_error = pd.DataFrame([repr(e)], columns=['error'])
        split_error.to_csv('filename_split_error.csv', index=False)
    else:
        df_image['index'] = np.asarray(image_ids, dtype='int64')
        df_image = df_image.set_index('index').sort_index().reset_index()

    df_image.to_pickle('image_feature_df.pkl')

except Exception as ex:
    # Leave a record on disk for unattended runs, then re-raise so the failure
    # is visible and later cells do not continue with missing or stale data.
    error = pd.DataFrame([repr(ex)], columns=['error'])
    error.to_csv('image_feature_extraction_failed.csv', index=False)
    raise

Found 153575 images belonging to 1 classes.


  "Palette images with Transparency expressed in bytes should be "


In [1]:
import pandas as pd
# Load the feature table written by the extraction step. That step saves to
# 'image_feature_df.pkl'; no cell in this notebook writes 'image_df.pkl', so
# reading that name only worked if the file had been renamed by hand.
image_df = pd.read_pickle('image_feature_df.pkl')

In [9]:
# (rows, columns) of the loaded feature table.
image_df.shape

(153575, 8193)

In [None]:
# The data is too large: it exceeds 10 GB.

In [11]:
# Collapse all feature columns into a single value per image: the row mean.
# The 'index' column holds the image id, not a feature, so it is dropped
# before averaging; otherwise the id would be mixed into every average.
avg_image_df = image_df.drop(columns=['index'], errors='ignore').mean(axis=1)
avg_image_df.shape

(153575,)

In [13]:
# Check what kind of object the row-wise mean produced.
type(avg_image_df)

pandas.core.series.Series

In [15]:
# Save the per-image averaged features to disk.
avg_image_df.to_pickle('avg_image_features.pkl')