In [1]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
print("sklearn version: ", sklearn.__version__)
# Compare (major, minor) as integers. Comparing the version *strings* is
# lexicographic, so e.g. "0.100" >= "0.20" would wrongly evaluate to False.
_sklearn_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_version >= (0, 20)

# Detect Google Colab: the magic below is only registered there, so when it
# fails the except branch records that the notebook is running elsewhere.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import re
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
print("TF version: ", tf.__version__)
# Compare (major, minor) as integers. Comparing the version *strings* is
# lexicographic, so e.g. "10.0" >= "2.0" would wrongly evaluate to False.
_tf_version = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", tf.__version__).groups()
)
assert _tf_version >= (2, 0)

# Warn early when TensorFlow cannot see any GPU device.
if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# GPU test
# NOTE(review): is_built_with_gpu_support() describes the TensorFlow build,
# not whether a GPU is actually present; the printed label is kept as-is.
print("GPU installed: ",tf.test.is_built_with_gpu_support())

# To prevent "CUDNN_STATUS_ALLOC_FAILED" error with GPUs
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    # (the error is only printed, so the notebook keeps running).
    print(e)
    
# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
# (seeds NumPy's global RNG and TensorFlow's global seed with the same value)
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
# (inline backend, then default font sizes for axis labels and tick labels)
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


sklearn version:  1.2.2



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



TF version:  2.12.0
No GPU was detected. CNNs can be very slow without a GPU.
GPU installed:  False


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.layers import BatchNormalization, GlobalAveragePooling2D, MaxPooling2D, Add,  Dense, Conv2D, Activation
from keras.models import Sequential

# 나. Dataset 분류: training/validation/test dataset으로 분배

### data load

In [30]:
# Read the raw EMNIST ByClass CSVs. header=None because the files are read as
# pure data rows, which gives integer column labels (0, 1, 2, ...).
X_train, X_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('emnist-byclass-train.csv', 'emnist-byclass-test.csv')
)

In [3]:
def data_split(data_type_train,split_ratio=0.1):
    """Split a labelled DataFrame into stratified train / validation parts.

    Args:
        data_type_train: DataFrame whose first column holds the class label
            used for stratification.
        split_ratio: Fraction of rows assigned to the validation part
            (passed to ``StratifiedShuffleSplit`` as ``test_size``).

    Returns:
        ``(x_train, x_valid)`` — two DataFrames containing the selected rows
        with all original columns.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=split_ratio, random_state=42)
    labels = data_type_train.iloc[:, 0]
    # n_splits=1, so the generator yields exactly one (train, valid) index pair.
    train_idx, valid_idx = next(splitter.split(data_type_train, labels))
    # The splitter yields *positional* indices, so rows must be selected with
    # .iloc; .loc is label-based and picks wrong rows (or raises) whenever the
    # frame does not have a default 0..n-1 index, e.g. an already-split frame.
    x_train = data_type_train.iloc[train_idx]
    x_valid = data_type_train.iloc[valid_idx]
    return x_train,x_valid

In [4]:
def data_load(data_type_train,data_type_valid,data_type_test):
    """Separate labels from pixels and shape the pixels as 28x28x1 images.

    Each input DataFrame is expected to hold the label in its first column and
    the 784 pixel values in the remaining columns.

    Args:
        data_type_train: Training DataFrame.
        data_type_valid: Validation DataFrame.
        data_type_test: Test DataFrame.

    Returns:
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)`` where every
        ``x_*`` is an array of shape ``(rows, 28, 28, 1)`` and every ``y_*`` is
        a 1-D array of labels.
    """
    def _features_and_labels(frame):
        # Column 0 is the label; every other column is a pixel.
        labels = frame.iloc[:, 0].values
        pixels = frame.iloc[:, 1:].values
        return pixels.reshape(pixels.shape[0], 28, 28, 1), labels

    x_train, y_train = _features_and_labels(data_type_train)
    # The validation pixels used to be returned flat (rows, 784) while train
    # and test were (rows, 28, 28, 1); all three splits are now consistent.
    x_valid, y_valid = _features_and_labels(data_type_valid)
    x_test, y_test = _features_and_labels(data_type_test)
    return x_train,y_train,x_valid,y_valid,x_test,y_test

### data split(train->train&valid)

In [33]:
# Hold out 23.5% of the training rows as a stratified validation set.
# NOTE: X_train is rebound to the reduced frame, so this cell is not
# idempotent; reload the CSVs before running it again.
X_train,X_valid=data_split(X_train,0.235)

### data split(785->1,784)
### ie. split label & feature

In [35]:
# Separate the label column from the pixel columns for every split.
# NOTE: X_train / X_valid / X_test are rebound from DataFrames to NumPy
# arrays here, so this cell cannot be re-run without redoing the steps above.
X_train,y_train,X_valid,y_valid,X_test,y_test=data_load(X_train,X_valid,X_test)

### npy 형태로 저장
#### 실습하는 장소(혹은 pc)가 바뀌더라도 동일한 데이터로 사용 가능 (압축 저장이 필요하면 `np.savez_compressed` 사용)

In [36]:
# Persist every split as a uint8 array, one file per split, so the exact same
# data can be reloaded later without repeating the CSV parsing and splitting.
for _stem, _split in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_valid', X_valid),
    ('y_valid', y_valid),
    ('X_test', X_test),
    ('y_test', y_test),
):
    np.save(_stem, np.array(_split, dtype=np.uint8))

In [5]:
import csv
def load_Emist():
    """Load the six cached EMNIST arrays from the current working directory.

    Returns:
        A tuple ``(X_train, y_train, X_valid, y_valid, X_test, y_test)`` of the
        arrays stored in the correspondingly named ``.npy`` files.
    """
    npy_files = (
        'X_train.npy',
        'y_train.npy',
        'X_valid.npy',
        'y_valid.npy',
        'X_test.npy',
        'y_test.npy',
    )
    return tuple(np.load(npy_file) for npy_file in npy_files)


### load data from .npy files

In [6]:
# Reload the cached splits from disk instead of re-parsing the raw CSVs.
X_train, y_train, X_valid, y_valid, X_test, y_test = load_Emist()
# Sanity check: show the shape and dtype of the training images.
X_train.shape,X_train.dtype

((533917, 28, 28, 1), dtype('uint8'))

### tf.data.Dataset으로 만들기  
#### 교재 13장 참고

In [7]:
# Dataset sizes (number of samples per split) and the mini-batch size
train_size=len(X_train)
valid_size=len(X_valid)
batch_size=32

In [8]:
# Flatten every image to a 784-value row so a label column can be appended.
X_train = X_train.reshape(-1, 784)
X_valid = X_valid.reshape(-1, 784)

In [9]:
# Turn the label vectors into single-column matrices (one label per row).
y_train = y_train.reshape(-1, 1)
y_valid = y_valid.reshape(-1, 1)

In [10]:
# Build one table per split: the pixel columns followed by the label column.
train_full = np.concatenate((X_train, y_train), axis=1)
valid_full = np.concatenate((X_valid, y_valid), axis=1)

In [49]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Write ``data`` row by row into ``n_parts`` CSV files.

    Files are created under ``datasets/Emnist`` and named
    ``my_<name_prefix>_<NN>.csv``. A part whose file already exists is left
    untouched (files are opened in exclusive-creation mode), which makes the
    function safe to call again. Any other error is raised instead of being
    silently swallowed.

    Args:
        data: Indexable sequence of rows (e.g. a 2-D array); every row must be
            iterable over its column values.
        name_prefix: Label inserted into each file name.
        header: Optional header line written first in every newly created
            file; ``None`` writes no header line.
        n_parts: Number of files the rows are distributed over.

    Returns:
        List of all part file paths, including parts that already existed.

    Raises:
        OSError: If a file cannot be created for a reason other than already
            existing.
    """
    Emnist_dir = os.path.join("datasets", "Emnist")
    os.makedirs(Emnist_dir, exist_ok=True)
    path_format = os.path.join(Emnist_dir, "my_{}_{:02d}.csv")

    def _format_cell(value):
        # NumPy >= 2 reprs scalars as e.g. "np.uint8(5)", which would corrupt
        # the CSV; str() yields the plain number on every NumPy version.
        if isinstance(value, (np.number, np.bool_)):
            return str(value)
        return repr(value)

    filepaths = []
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        try:
            # "x" = exclusive creation: fails if the part was already written.
            f = open(part_csv, "xt", encoding="utf-8")
        except FileExistsError:
            # Keep the existing part; only this specific condition is skipped.
            continue
        with f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join([_format_cell(col) for col in data[row_idx]]))
                f.write("\n")
    return filepaths

In [4]:
# Write the train / validation tables as CSV parts (10 files each by default,
# no header line) and keep the resulting file paths for the input pipeline.
# Parts that already exist on disk are kept as they are, not rewritten.
train_filepaths = save_to_multiple_csv_files(train_full, "train")
valid_filepaths = save_to_multiple_csv_files(valid_full, "valid")

### 각 모델에 맞는 preprocess 함수 정의 & tf.data.Dataset로 만들어 주기
#### 나) 에서는 mobilenet.preprocess_input에 대한 preprocess만 나타내었으나 
#### 이후 모델을 사용할때는 각 모델에 대해 맞게 바꿔서 사용하였다.

In [12]:
# Number of pixel columns per CSV row (the label column comes after them).
n_inputs = X_train.shape[-1]
def preprocess(line):
    """Parse one CSV line into a MobileNet-ready image and its label.

    Args:
        line: Scalar string tensor holding ``n_inputs`` pixel values followed
            by one label value, comma separated.

    Returns:
        ``(image, label)``: ``image`` is the 28x28 input resized to 224x224,
        expanded to three channels and passed through MobileNet's
        ``preprocess_input``; ``label`` is a one-element float tensor.
    """
    # Pixel fields default to 0.0; the label field gets an empty float default.
    record_defaults = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    columns = tf.io.decode_csv(line, record_defaults=record_defaults)

    label = tf.stack(columns[-1:])
    image = tf.reshape(tf.stack(columns[:-1]), [28, 28, 1])
    image = tf.image.resize(image, [224, 224])

    # Repeat the single grey channel three times along the channel axis.
    if image.shape[2] == 1:
        image = tf.concat([image, image, image], 2)

    return keras.applications.mobilenet.preprocess_input(image), label

In [13]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32, n_header_lines=0):
    """Build a shuffled, batched ``tf.data`` pipeline over CSV part files.

    Lines are read from ``n_readers`` files at a time (interleaved), shuffled,
    parsed with ``preprocess`` and grouped into batches.

    Args:
        filepaths: CSV file paths or glob patterns to read.
        repeat: How many times the file list is repeated; ``None`` repeats it
            indefinitely.
        n_readers: Number of files interleaved concurrently.
        n_read_threads: Parallelism of the interleave step.
        shuffle_buffer_size: Size of the shuffle buffer, in lines.
        n_parse_threads: Parallelism of the ``preprocess`` map step.
        batch_size: Number of samples per batch.
        n_header_lines: Number of leading lines to drop from every file.
            Defaults to 0 because the part files in this notebook are written
            without a header; the previous unconditional ``skip(1)`` silently
            discarded the first sample of each file. Pass 1 for files that do
            start with a header line.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches, with one
        batch prefetched.
    """
    def _read_lines(filepath):
        lines = tf.data.TextLineDataset(filepath)
        # Only skip when the caller says the files actually carry a header.
        return lines.skip(n_header_lines) if n_header_lines else lines

    dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    dataset = dataset.interleave(
        _read_lines,
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

In [14]:
# Build the training and validation input pipelines from the CSV parts.
# repeat=None makes both datasets repeat indefinitely, so whatever consumes
# them must bound each epoch explicitly (e.g. with a step count derived from
# train_size / valid_size and batch_size).
train_set = csv_reader_dataset(train_filepaths,batch_size=batch_size, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths,batch_size=batch_size, repeat=None)