In [1]:
# Load the base environment by executing ./env.ipynb.
# What that notebook defines is not visible from this file.
%run ./env.ipynb

# ETL

EDA를 통해 데이터 분석이 어느 정도 완료되면, ETL을 통해 모델링에 사용할 데이터셋을 만듭니다.

In [2]:
from utils import *

# Resolve the run parameters through the `utils` helpers; each call passes the
# value shown as its `default` argument.
sdate = get_env_sdate(default="2018070108")
path_base = get_env_path_base(default="/root/mnt/dfs/notebooks-skp/mnist")
# NOTE(review): the helper is spelled `get_env_path_date` although it feeds
# `path_data` — confirm the intended name in utils.
path_data = get_env_path_date(default="/root/mnt/dfs/data/mnist")

# Echo the resolved settings, one per line, so the run is traceable from the output.
print(
    "sdate: {}".format(sdate),
    "path_base: {}".format(path_base),
    "path_data: {}".format(path_data),
    sep="\n",
)

sdate: 2018070108
path_base: /root/mnt/dfs/notebooks-skp/mnist
path_data: /root/mnt/dfs/data/mnist


In [3]:
# Load data
import os

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Raw pickles are read from <path_data>/raw/<sdate>/.
path_raw = os.path.join(path_data, "raw")
path_raw_sdate = os.path.join(path_raw, sdate)

# NOTE: joblib.load unpickles the file, which can run arbitrary code; these
# paths must only ever point at trusted data.
path_train_xs = os.path.join(path_raw_sdate, "train_xs.pkl")
path_train_ys = os.path.join(path_raw_sdate, "train_ys.pkl")
np_train_xs = joblib.load(path_train_xs)
np_train_ys = joblib.load(path_train_ys)
print(np_train_xs.shape, np_train_ys.shape)

path_test_xs = os.path.join(path_raw_sdate, "test_xs.pkl")
path_test_ys = os.path.join(path_raw_sdate, "test_ys.pkl")
np_test_xs = joblib.load(path_test_xs)
np_test_ys = joblib.load(path_test_ys)
print(np_test_xs.shape, np_test_ys.shape)

(60000, 28, 28) (60000,)
(10000, 28, 28) (10000,)


In [4]:
# Preprocessing - flatten each 2D image (rows x cols) into a 1D vector so it can be scaled.
# This cell overwrites np_train_xs / np_test_xs, so re-running it without first
# re-running the load cell fails (the flattened arrays no longer have a third axis).
img_rows, img_cols = np_train_xs.shape[1:3]
dim_x = img_rows * img_cols  # number of values per flattened image
dim_y = 10  # presumably the number of label classes; not used in this cell
np_train_xs = np_train_xs.astype(float).reshape(-1, dim_x)
np_test_xs = np_test_xs.astype(float).reshape(-1, dim_x)
print(np_train_xs.shape, np_test_xs.shape, sep="\n")

(60000, 784)
(10000, 784)


In [5]:
# Preprocessing - Min-Max scale the data into the 0~1 range
from sklearn import preprocessing

# Output location: <path_data>/etl/<sdate>/cnn-scaler.pkl (directory created if missing).
path_etl = os.path.join(path_data, "etl")
path_etl_sdate = os.path.join(path_etl, sdate)
path_scaler = os.path.join(path_etl_sdate, "cnn-scaler.pkl")
os.makedirs(path_etl_sdate, exist_ok=True)

# Fit the scaler on the training set only, then rescale the training set with it.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(np_train_xs)
np_train_xs = scaler.transform(np_train_xs)

# Persist the fitted scaler, read it back, and apply the reloaded copy to the test set.
joblib.dump(scaler, path_scaler)
scaler = joblib.load(path_scaler)
np_test_xs = scaler.transform(np_test_xs)

['/root/mnt/dfs/data/mnist/etl/2018070108/cnn-scaler.pkl']

In [6]:
# Reshape the flattened images to (rows, cols, 1) per sample to fit the CNN model.
input_shape = (img_rows, img_cols, 1)
np_train_xs = np_train_xs.reshape((len(np_train_xs),) + input_shape)
np_test_xs = np_test_xs.reshape((len(np_test_xs),) + input_shape)

In [7]:
# Preprocessing - one-hot encode the labels
from sklearn import preprocessing

# scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old name was removed
# in 1.4). An unknown keyword raises TypeError, so older releases take the fallback.
try:
    enc = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    enc = preprocessing.OneHotEncoder(sparse=False)

print(np_train_ys[0:5])
# The encoder takes a 2-D (n_samples, 1) column, hence the reshape.
np_train_ys = enc.fit_transform(np_train_ys.reshape(-1, 1))
print(np_train_ys[0:5])

# Reuse the categories learned from the training labels. Refitting on the test
# labels could map classes to different columns if the test set lacked a class.
np_test_ys = enc.transform(np_test_ys.reshape(-1, 1))

[5 0 4 1 9]
[[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [8]:
# Save data
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Write the preprocessed arrays into the ETL directory with a "cnn-" prefix.
# The path_* names used for the raw inputs are rebound to the output files here.
path_train_xs = os.path.join(path_etl_sdate, "cnn-train_xs.pkl")
path_train_ys = os.path.join(path_etl_sdate, "cnn-train_ys.pkl")
joblib.dump(np_train_xs, path_train_xs)
joblib.dump(np_train_ys, path_train_ys)

path_test_xs = os.path.join(path_etl_sdate, "cnn-test_xs.pkl")
path_test_ys = os.path.join(path_etl_sdate, "cnn-test_ys.pkl")
joblib.dump(np_test_xs, path_test_xs)
joblib.dump(np_test_ys, path_test_ys)

['/root/mnt/dfs/data/mnist/etl/2018070108/cnn-train_xs.pkl']

['/root/mnt/dfs/data/mnist/etl/2018070108/cnn-train_ys.pkl']

['/root/mnt/dfs/data/mnist/etl/2018070108/cnn-test_xs.pkl']

['/root/mnt/dfs/data/mnist/etl/2018070108/cnn-test_ys.pkl']