In [1]:
import os
import cv2
import pandas as pd
from glob import glob

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from time import time

# 基本設定

In [2]:
# Root folder of the dataset (the label mapping file and the train/ folder live under it).
# Set the WHERE_AM_I_DATA_DIR environment variable to point the notebook at another
# location; when it is unset the original path is used.
data_dir_path = os.environ.get(
    'WHERE_AM_I_DATA_DIR',
    '/data/examples/may_the_4_be_with_u/where_am_i',
)

# 建立 target labels 清單

In [3]:
target_label_file_name = 'mapping.txt'
# Resolves to <data_dir_path>/mapping.txt; each line is expected to look like
# "<class name>, <label>".
target_label_file_path = '/'.join((data_dir_path, target_label_file_name))

with open(target_label_file_path) as f:
    # Split on the comma and strip each field, so "a, 1" and "a,1" both parse.
    # Blank lines (e.g. a trailing empty line) are skipped instead of crashing
    # the unpacking below.
    all_lines = [
        [field.strip() for field in line.split(',')]
        for line in f.read().splitlines()
        if line.strip()
    ]

# Map class name -> label. Labels stay strings, exactly as written in the file.
target_labels = dict()
for target_class, target_label in all_lines:
    target_labels[target_class] = target_label

In [4]:
# Show the class-name -> label mapping; the labels are strings read from the mapping file.
target_labels

{'CALsuburb': '9',
 'PARoffice': '7',
 'bedroom': '12',
 'coast': '10',
 'forest': '4',
 'highway': '14',
 'industrial': '2',
 'insidecity': '3',
 'kitchen': '0',
 'livingroom': '5',
 'mountain': '8',
 'opencountry': '6',
 'store': '11',
 'street': '1',
 'tallbuilding': '13'}

# 建立資料清單

In [5]:
# Folder that holds the training set; images are looked up as <train_dir>/<class name>/*.jpg.
train_dir = '/'.join((data_dir_path, 'train'))
img_path_list = []
img_class_list = []
for key in target_labels.keys():
    # glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
    # the row order (and therefore the row indices) identical on every run and machine.
    for file_path in sorted(glob('{}/{}/*.jpg'.format(train_dir, key))):
        img_class_list.append(target_labels[key])
        img_path_list.append(file_path)

# One row per image: its label ('class') and its file path ('path').
data_list = pd.DataFrame({'class': img_class_list, 'path': img_path_list})

In [6]:
# Preview the first 10 rows of the class/path table.
data_list.head(10)

Unnamed: 0,class,path
0,9,/data/examples/may_the_4_be_with_u/where_am_i/...
1,9,/data/examples/may_the_4_be_with_u/where_am_i/...
2,9,/data/examples/may_the_4_be_with_u/where_am_i/...
3,9,/data/examples/may_the_4_be_with_u/where_am_i/...
4,9,/data/examples/may_the_4_be_with_u/where_am_i/...
5,9,/data/examples/may_the_4_be_with_u/where_am_i/...
6,9,/data/examples/may_the_4_be_with_u/where_am_i/...
7,9,/data/examples/may_the_4_be_with_u/where_am_i/...
8,9,/data/examples/may_the_4_be_with_u/where_am_i/...
9,9,/data/examples/may_the_4_be_with_u/where_am_i/...


## Shuffle and split the list into training set and validation set

In [7]:
# Shuffle the rows of the data list.
# A fixed seed is used instead of int(time()): a clock-derived seed produces a
# different ordering on every run, so the exported split could never be regenerated.
# Note: this cell reassigns data_list, so re-running it on its own shuffles the
# already-shuffled frame; re-run from the cell that builds data_list to get the
# same ordering again.
rand_seed = 42
data_list = shuffle(data_list, random_state=rand_seed)
data_list.head(10)

Unnamed: 0,class,path
2570,1,/data/examples/may_the_4_be_with_u/where_am_i/...
1151,2,/data/examples/may_the_4_be_with_u/where_am_i/...
226,7,/data/examples/may_the_4_be_with_u/where_am_i/...
641,4,/data/examples/may_the_4_be_with_u/where_am_i/...
1825,8,/data/examples/may_the_4_be_with_u/where_am_i/...
2704,1,/data/examples/may_the_4_be_with_u/where_am_i/...
953,14,/data/examples/may_the_4_be_with_u/where_am_i/...
543,10,/data/examples/may_the_4_be_with_u/where_am_i/...
186,7,/data/examples/may_the_4_be_with_u/where_am_i/...
2242,6,/data/examples/may_the_4_be_with_u/where_am_i/...


In [8]:
# Samples (paths) and labels (classes) as single-column DataFrames that keep
# data_list's row index.
X = data_list[['path']]
y = data_list[['class']]
# 75% training / 25% validation. Passing random_state ties the split to rand_seed;
# without it train_test_split draws a fresh random permutation on every call.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=rand_seed
)

經過 train_test_split 切割出來的 train 及 validation 資料，samples 跟 labels 清單的 index 順序是一樣的，可以安心拿來訓練 model。

In [9]:
# Print the first rows of every split under its own banner, so samples and
# labels can be compared index by index.
previews = (
    ('Some training samples:\n----------------------', X_train),
    ('\nSome training labels:\n---------------------', y_train),
    ('\nSome validation samples:\n------------------------', X_valid),
    ('\nSome validation labels:\n-----------------------', y_valid),
)
for banner, frame in previews:
    print(banner)
    print(frame.head())

Some training samples:
----------------------
                                                   path
757   /data/examples/may_the_4_be_with_u/where_am_i/...
2469  /data/examples/may_the_4_be_with_u/where_am_i/...
2473  /data/examples/may_the_4_be_with_u/where_am_i/...
2262  /data/examples/may_the_4_be_with_u/where_am_i/...
684   /data/examples/may_the_4_be_with_u/where_am_i/...

Some training labels:
---------------------
     class
757      4
2469    11
2473    11
2262     6
684      4

Some validation samples:
------------------------
                                                   path
751   /data/examples/may_the_4_be_with_u/where_am_i/...
1220  /data/examples/may_the_4_be_with_u/where_am_i/...
2885  /data/examples/may_the_4_be_with_u/where_am_i/...
1312  /data/examples/may_the_4_be_with_u/where_am_i/...
463   /data/examples/may_the_4_be_with_u/where_am_i/...

Some validation labels:
-----------------------
     class
751      4
1220     2
2885    13
1312     3
463     10


把 DataFrame 存為 .csv 檔

In [10]:
# Write each split to its own CSV file in the working directory, without the row index.
csv_outputs = (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('X_valid.csv', X_valid),
    ('y_valid.csv', y_valid),
)
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index=False)