In [1]:
import os
import cv2
import pandas as pd
from glob import glob

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from time import time

# 基本設定

In [6]:
data_dir_path = './'

# 建立 target labels 清單

In [7]:
target_label_file_name = 'mapping.txt'

target_label_file_path = '/'.join((data_dir_path, target_label_file_name))

with open(target_label_file_path) as f:
    all_lines = [line.split(', ') for line in f.read().splitlines()]

target_labels = dict()
for line in all_lines:
    target_class, target_label = line
    target_labels[target_class] = target_label

In [8]:
target_labels

{'CALsuburb': '9',
 'PARoffice': '7',
 'bedroom': '12',
 'coast': '10',
 'forest': '4',
 'highway': '14',
 'industrial': '2',
 'insidecity': '3',
 'kitchen': '0',
 'livingroom': '5',
 'mountain': '8',
 'opencountry': '6',
 'store': '11',
 'street': '1',
 'tallbuilding': '13'}

# 建立資料清單

In [9]:
# 指定存放 train 資料集的資料夾
train_dir = '/'.join((data_dir_path, 'train'))
img_path_list = []
img_class_list = []
for key in target_labels.keys():
    for file_path in glob('{}/{}/*.jpg'.format(train_dir, key)):
        img_class_list.append(target_labels[key])
        img_path_list.append(file_path)

data_list = pd.DataFrame({'class': img_class_list, 'path': img_path_list})

In [10]:
data_list.head(10)

Unnamed: 0,class,path
0,9,.//train/CALsuburb/image_0012.jpg
1,9,.//train/CALsuburb/image_0108.jpg
2,9,.//train/CALsuburb/image_0010.jpg
3,9,.//train/CALsuburb/image_0013.jpg
4,9,.//train/CALsuburb/image_0084.jpg
5,9,.//train/CALsuburb/image_0034.jpg
6,9,.//train/CALsuburb/image_0052.jpg
7,9,.//train/CALsuburb/image_0105.jpg
8,9,.//train/CALsuburb/image_0008.jpg
9,9,.//train/CALsuburb/image_0063.jpg


## Shuffle and split the list into training set and validation set

In [11]:
# 將資料清單隨機打亂
rand_seed = int(time())
data_list = shuffle(data_list, random_state=rand_seed)
data_list.head(10)

Unnamed: 0,class,path
2673,1,.//train/street/image_0085.jpg
29,9,.//train/CALsuburb/image_0055.jpg
179,7,.//train/PARoffice/image_0059.jpg
2697,1,.//train/street/image_0033.jpg
858,4,.//train/forest/image_0054.jpg
746,4,.//train/forest/image_0186.jpg
1183,2,.//train/industrial/image_0100.jpg
2075,6,.//train/opencountry/image_0214.jpg
1989,8,.//train/mountain/image_0123.jpg
1495,0,.//train/kitchen/image_0043.jpg


In [12]:
X = pd.DataFrame(data_list['path'], columns=['path'])
y = pd.DataFrame(data_list['class'], columns=['class'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25)

經過 train_test_split 切割出來的 train 及 test 資料，samples 跟 labels 清單的 index 順序是一樣的，可以安心拿來訓練 model。

In [13]:
print('Some training samples:\n----------------------')
print(X_train.head())

print('\nSome training labels:\n---------------------')
print(y_train.head())

print('\nSome validation samples:\n------------------------')
print(X_valid.head())

print('\nSome validation labels:\n-----------------------')
print(y_valid.head())

Some training samples:
----------------------
                                     path
2044  .//train/opencountry/image_0092.jpg
729        .//train/forest/image_0088.jpg
2335        .//train/store/image_0158.jpg
1080   .//train/industrial/image_0194.jpg
1828     .//train/mountain/image_0128.jpg

Some training labels:
---------------------
     class
2044     6
729      4
2335    11
1080     2
1828     8

Some validation samples:
------------------------
                                     path
900       .//train/highway/image_0160.jpg
1201   .//train/industrial/image_0021.jpg
1829     .//train/mountain/image_0149.jpg
2135  .//train/opencountry/image_0254.jpg
2380        .//train/store/image_0124.jpg

Some validation labels:
-----------------------
     class
900     14
1201     2
1829     8
2135     6
2380    11


把 DataFrame 存為 .csv 檔

In [14]:
# Save to csv
X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
X_valid.to_csv('X_valid.csv', index=False)
y_valid.to_csv('y_valid.csv', index=False)

建立用來儲存模型的資料夾

In [15]:
path = 'saved_models'
if not os.path.exists('saved_models'):
    os.mkdir('saved_models')