In [21]:
import os
import cv2
import pandas as pd
from glob import glob

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from time import time

# 基本設定

In [22]:
data_dir_path = './'

# 建立 target labels 清單

In [23]:
target_label_file_name = 'mapping.txt'

target_label_file_path = '/'.join((data_dir_path, target_label_file_name))

with open(target_label_file_path) as f:
    all_lines = [line.split(', ') for line in f.read().splitlines()]

target_labels = dict()
for line in all_lines:
    target_class, target_label = line
    target_labels[target_class] = target_label

In [24]:
target_labels

{'rika': '0', 'risa': '1', 'yui': '2', 'akane': '3', 'neru': '4'}

# 建立資料清單

In [25]:
# 指定存放 train 資料集的資料夾
train_dir = '/'.join((data_dir_path, 'train'))
img_path_list = []
img_class_list = []
for key in target_labels.keys():
    for file_path in glob('{}/{}/*.png'.format(train_dir, key)):
        img_class_list.append(target_labels[key])
        img_path_list.append(file_path)

data_list = pd.DataFrame({'class': img_class_list, 'path': img_path_list})

In [26]:
data_list.head(10)

Unnamed: 0,class,path
0,0,.//train/rika/061.png
1,0,.//train/rika/008.png
2,0,.//train/rika/041.png
3,0,.//train/rika/032.png
4,0,.//train/rika/031.png
5,0,.//train/rika/081.png
6,0,.//train/rika/055.png
7,0,.//train/rika/053.png
8,0,.//train/rika/023.png
9,0,.//train/rika/050.png


## Shuffle and split the list into training set and validation set

In [27]:
# 將資料清單隨機打亂
rand_seed = int(time())
data_list = shuffle(data_list, random_state=rand_seed)
data_list.head(10)

Unnamed: 0,class,path
230,2,.//train/yui/046.png
327,3,.//train/akane/001.png
182,1,.//train/risa/010.png
99,0,.//train/rika/088.png
166,1,.//train/risa/003.png
145,1,.//train/risa/027.png
54,0,.//train/rika/075.png
44,0,.//train/rika/002.png
277,2,.//train/yui/080.png
121,1,.//train/risa/028.png


In [28]:
X = pd.DataFrame(data_list['path'], columns=['path'])
y = pd.DataFrame(data_list['class'], columns=['class'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25)

經過 train_test_split 切割出來的 train 及 test 資料，samples 跟 labels 清單的 index 順序是一樣的，可以安心拿來訓練 model。

In [29]:
print('Some training samples:\n----------------------')
print(X_train.head())

print('\nSome training labels:\n---------------------')
print(y_train.head())

print('\nSome validation samples:\n------------------------')
print(X_valid.head())

print('\nSome validation labels:\n-----------------------')
print(y_valid.head())

Some training samples:
----------------------
                       path
336  .//train/akane/042.png
411  .//train/akane/018.png
268    .//train/yui/074.png
339  .//train/akane/007.png
118   .//train/risa/042.png

Some training labels:
---------------------
    class
336     3
411     3
268     2
339     3
118     1

Some validation samples:
------------------------
                       path
178   .//train/risa/067.png
376  .//train/akane/075.png
166   .//train/risa/003.png
42    .//train/rika/019.png
366  .//train/akane/044.png

Some validation labels:
-----------------------
    class
178     1
376     3
166     1
42      0
366     3


把 DataFrame 存為 .csv 檔

In [30]:
# Save to csv
X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_valid.to_csv('X_valid.csv', index=False)
y_valid.to_csv('y_valid.csv', index=False)

建立用來儲存模型的資料夾

In [31]:
path = 'saved_models'
if not os.path.exists('saved_models'):
    os.mkdir('saved_models')