# [Module 1.0] Cifar10 데이터 세트 다운로드 및 TFRecord 생성

본 워크샵의 모든 노트북은 **<font color="red">conda_tensorflow2_p36</font>** 를 사용합니다.

# 1. 기본 세팅
- 텐서플로우 버전이 2.4.1인지 확인합니다.

In [1]:
import tensorflow as tf
# Print the installed TensorFlow version; this notebook expects 2.4.1.
print(tf.__version__)

2.4.1


# 2. 전처리 유틸리티 함수

In [2]:
import argparse
import os
import sys

import tarfile
from six.moves import cPickle as pickle
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Directory the archive unpacks into; it holds the pickled batch files.
CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py'
# Archive file name and the URL it is downloaded from.
CIFAR_FILENAME = 'cifar-10-python.tar.gz'
CIFAR_DOWNLOAD_URL = ''.join(('https://www.cs.toronto.edu/~kriz/', CIFAR_FILENAME))


def download_and_extract(data_dir):
    """
    Download the CIFAR-10 archive and extract it into ``data_dir``.

    Args:
        data_dir: Directory that receives both the downloaded archive and the
            extracted files. It is created if it does not exist yet.
    """
    # Create the target folder, resolved against the current working directory.
    file_dir = os.path.join(os.getcwd(), data_dir)
    os.makedirs(file_dir, exist_ok=True)
    print("data folder: ", file_dir)

    # Download the archive into the target folder.
    archive_path = os.path.join(file_dir, CIFAR_FILENAME)
    download_dir = tf.keras.utils.get_file(fname=archive_path, origin=CIFAR_DOWNLOAD_URL)

    # Extract inside a context manager so the archive handle is always closed,
    # even when extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this with the fixed, trusted download URL above.
    with tarfile.open(download_dir, 'r:gz') as archive:
        archive.extractall(data_dir)

def _get_file_names():
    '''
    train, validataion, eval 의 데이터 세트의 파일 이름을 제공 함
    '''
    file_names = {}
    file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)]
    file_names['validation'] = ['data_batch_5']
    file_names['eval'] = ['test_batch']
    return file_names

    
def read_pickle_from_file(filename):
    """
    Load and return the object stored in the pickle file ``filename``.

    Under Python 3 the file is unpickled with ``encoding='bytes'``; callers in
    this file accordingly index the result with bytes keys such as ``b'data'``.

    NOTE(review): unpickling can execute arbitrary code, so only pass files
    that come from a trusted source.
    """
    # Only Python 3's pickle.load accepts the ``encoding`` keyword.
    load_kwargs = {'encoding': 'bytes'} if sys.version_info >= (3, 0) else {}
    with tf.io.gfile.GFile(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file, **load_kwargs)

def _int64_feature(value):
    """Wrap the single integer ``value`` in a ``tf.train.Feature`` (int64 list)."""
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)


def _bytes_feature(value):
    """Wrap the single bytes ``value`` in a ``tf.train.Feature`` (bytes list)."""
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)



def convert_to_tfrecord(input_files, output_file):
    """
    Read pickled batch files and write all of their records to one TFRecord file.

    Every entry of every input file becomes one ``tf.train.Example`` with two
    features: 'image' (the raw bytes of the entry's data row) and 'label'
    (the entry's integer label).

    Args:
        input_files: Iterable of paths to pickle files; each must contain the
            keys ``b'data'`` and ``b'labels'``.
        output_file: Path of the TFRecord file to write.
    """
    print('Generating %s' % output_file)
    with tf.io.TFRecordWriter(output_file) as record_writer:
        for input_file in input_files:
            # Load one pickled batch and pull out its images and labels.
            data_dict = read_pickle_from_file(input_file)
            data = data_dict[b'data']
            labels = data_dict[b'labels']

            num_entries_in_batch = len(labels)
            print("num_entries_in_batch: ", num_entries_in_batch)

            # Serialize each image/label pair of THIS batch into a TF Example
            # and append it to the TFRecord file. This loop must stay inside
            # the per-file loop; otherwise only the last input file is written.
            for i in range(num_entries_in_batch):
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'image': _bytes_feature(data[i].tobytes()),
                        'label': _int64_feature(labels[i])
                    }))
                record_writer.write(example.SerializeToString())


# 3. 데이터 다운로드 및 압축 풀기

In [3]:
import os

# Directory that receives the downloaded archive and the extracted files.
data_dir = 'data/cifar10'
print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL))
download_and_extract(data_dir)
# List the directory contents (shown as the cell result in a notebook).
os.listdir(data_dir)


Download from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and extract.
data folder:  /home/ec2-user/SageMaker/SageMaker-Tensorflow-Step-By-Step/code/phase0/data/cifar10


['cifar-10-python.tar.gz',
 'cifar-10-batches-py',
 'validation',
 'train',
 'eval',
 '.ipynb_checkpoints']

In [4]:
# Batch file names grouped by split ('train', 'validation', 'eval').
file_names = _get_file_names()
print("file_names: \n", file_names)
# Folder inside data_dir that contains the extracted batch files.
input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER)
print("files folder: ", input_dir)

file_names: 
 {'train': ['data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4'], 'validation': ['data_batch_5'], 'eval': ['test_batch']}
files folder:  data/cifar10/cifar-10-batches-py


# 4. Pickle 파일을 읽어서 3개의 TFRecord 파일 생성
- 훈련
    - input_files:  ['data/cifar10/cifar-10-batches-py/data_batch_1', 'data/cifar10/cifar-10-batches-py/data_batch_2', 'data/cifar10/cifar-10-batches-py/data_batch_3', 'data/cifar10/cifar-10-batches-py/data_batch_4']
    - output_file:  data/cifar10/train/train.tfrecords
- 검증    
    - input_files:  ['data/cifar10/cifar-10-batches-py/data_batch_5']
    - output_file:  data/cifar10/validation/validation.tfrecords
- 테스트
    - input_files:  ['data/cifar10/cifar-10-batches-py/test_batch']
    - output_file:  data/cifar10/eval/eval.tfrecords


- 아래 함수에 입력 파일, 출력 파일을 제공하여 최종 TF 레코드 파일을 생성
    - convert_to_tfrecord(input_files, output_file)    

In [5]:
# Run once per split: train, validation and eval.
for mode, files in file_names.items():
    input_files = [os.path.join(input_dir, f) for f in files]
    # Each split gets its own sub-folder: <data_dir>/<mode>/<mode>.tfrecords
    output_dir = os.path.join(data_dir, mode)
    output_file = os.path.join(output_dir, mode + '.tfrecords')

    print("\nMode: ", mode)
    print("input_files: ", input_files)
    print("output_file: ", output_file)

    # exist_ok=True replaces the check-then-create pattern and matches how
    # download_and_extract creates its folder.
    os.makedirs(output_dir, exist_ok=True)
    # Best effort: remove a stale output file from an earlier run; a missing
    # file is not an error.
    try:
        os.remove(output_file)
    except OSError:
        pass

    # Convert to tf.train.Example and write them to TFRecords.
    convert_to_tfrecord(input_files, output_file)



Mode:  train
input_files:  ['data/cifar10/cifar-10-batches-py/data_batch_1', 'data/cifar10/cifar-10-batches-py/data_batch_2', 'data/cifar10/cifar-10-batches-py/data_batch_3', 'data/cifar10/cifar-10-batches-py/data_batch_4']
output_file:  data/cifar10/train/train.tfrecords
Generating data/cifar10/train/train.tfrecords
num_entries_in_batch:  10000
num_entries_in_batch:  10000
num_entries_in_batch:  10000
num_entries_in_batch:  10000

Mode:  validation
input_files:  ['data/cifar10/cifar-10-batches-py/data_batch_5']
output_file:  data/cifar10/validation/validation.tfrecords
Generating data/cifar10/validation/validation.tfrecords
num_entries_in_batch:  10000

Mode:  eval
input_files:  ['data/cifar10/cifar-10-batches-py/test_batch']
output_file:  data/cifar10/eval/eval.tfrecords
Generating data/cifar10/eval/eval.tfrecords
num_entries_in_batch:  10000
