# Download Dataset

In [None]:
# https://data.caltech.edu/records/nyy15-4j048
!wget https://data.caltech.edu/records/nyy15-4j048/files/256_ObjectCategories.tar

In [None]:
!tar -xvf 256_ObjectCategories.tar

# Data Preparation

In [55]:
import os
import glob
from pathlib import Path
import pandas as pd

In [56]:
# Folder that is scanned below for category sub-folders
# (presumably the one produced by extracting 256_ObjectCategories.tar).
ROOT_DIR = "256_ObjectCategories"

Previously I had trouble reproducing results because the order of the files changed on every run.

So now we sort both the category directories and the files inside each of them.

In [57]:
def get_file_number_for_sort(path):
    """Return the integer that follows the first underscore in the file's stem.

    Example: "256_ObjectCategories/001.ak47/001_0012.jpg" -> 12.
    Intended as a numeric sort key for file paths.

    Raises:
        ValueError: if the stem has no underscore or the remainder is not an integer.
    """
    stem = Path(path).stem  # file name without directory and extension
    first_underscore = stem.index("_")
    number_text = stem[first_underscore + 1:]
    return int(number_text)

In [58]:
dataset_records = []

print("[NOTE]: It will print first 5 logs for senity check.")


def _category_number(name):
    """Return the integer prefix of a '<number>.<label>' name, or None if there is none."""
    prefix, dot, _ = name.partition(".")
    if not dot:
        return None
    try:
        return int(prefix)
    except ValueError:
        return None


# os.listdir returns entries in arbitrary order and also lists stray entries
# (e.g. .ipynb_checkpoints, .DS_Store) whose missing numeric prefix would crash
# the numeric sort. Keep only numbered category directories, then sort them by
# that number so the record order is the same on every run.
dir_ls = [
    name
    for name in os.listdir(ROOT_DIR)
    if _category_number(name) is not None
    and os.path.isdir(os.path.join(ROOT_DIR, name))
]
dir_ls.sort(key=_category_number)

for i, category_name in enumerate(dir_ls):

    # "*.*" matches only entries that contain a dot in their name.
    file_paths = glob.glob(ROOT_DIR + "/" + category_name + "/*.*")
    # Order files by the running number after the underscore in the file name.
    file_paths.sort(key=get_file_number_for_sort)

    if i < 5:
        print("Category Name:", category_name, ", Total file:", len(file_paths))

    # One record per image: its category folder name and its path.
    dataset_records.extend(
        {"category_name": category_name, "file_path": file_path}
        for file_path in file_paths
    )

print("Total Records:", len(dataset_records))

[NOTE]: It will print first 5 logs for senity check.
Category Name: 001.ak47 , Total file: 98
Category Name: 002.american-flag , Total file: 97
Category Name: 003.backpack , Total file: 151
Category Name: 004.baseball-bat , Total file: 127
Category Name: 005.baseball-glove , Total file: 148
Total Records: 30607


In [59]:
# Turn the collected records into one table with the columns
# "category_name" and "file_path" (the keys of each record dict).
df = pd.DataFrame(dataset_records)

In [60]:
# Show the assembled table as the cell output.
df

Unnamed: 0,category_name,file_path
0,001.ak47,256_ObjectCategories/001.ak47/001_0001.jpg
1,001.ak47,256_ObjectCategories/001.ak47/001_0002.jpg
2,001.ak47,256_ObjectCategories/001.ak47/001_0003.jpg
3,001.ak47,256_ObjectCategories/001.ak47/001_0004.jpg
4,001.ak47,256_ObjectCategories/001.ak47/001_0005.jpg
...,...,...
30602,257.clutter,256_ObjectCategories/257.clutter/257_0823.jpg
30603,257.clutter,256_ObjectCategories/257.clutter/257_0824.jpg
30604,257.clutter,256_ObjectCategories/257.clutter/257_0825.jpg
30605,257.clutter,256_ObjectCategories/257.clutter/257_0826.jpg


## Train/Test split

In [61]:
from sklearn.model_selection import train_test_split

In [68]:
# Hold out 20% of all records as the test set. The split is stratified on the
# category column and uses a fixed random_state so it is repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.20,
    random_state=1,
    stratify=df["category_name"],
)
train_df.shape, test_df.shape

((24485, 2), (6122, 2))

In [70]:
# Split the remaining training records again: 20% of them become the validation
# set, stratified on the category column with the same fixed random_state.
new_train_df, val_df = train_test_split(
    train_df,
    test_size=0.20,
    random_state=1,
    stratify=train_df["category_name"],
)
new_train_df.shape, val_df.shape

((19588, 2), (4897, 2))

In [71]:
# Preview the first rows of the final training split.
new_train_df.head()

Unnamed: 0,category_name,file_path
8292,077.french-horn,256_ObjectCategories/077.french-horn/077_0020.jpg
30526,257.clutter,256_ObjectCategories/257.clutter/257_0747.jpg
313,003.backpack,256_ObjectCategories/003.backpack/003_0119.jpg
13298,121.kangaroo-101,256_ObjectCategories/121.kangaroo-101/121_0073...
14940,134.llama-101,256_ObjectCategories/134.llama-101/134_0053.jpg


In [72]:
# Preview the first rows of the validation split.
val_df.head()

Unnamed: 0,category_name,file_path
21836,192.snowmobile,256_ObjectCategories/192.snowmobile/192_0016.jpg
18862,161.photocopier,256_ObjectCategories/161.photocopier/161_0082.jpg
9976,092.grapes,256_ObjectCategories/092.grapes/092_0152.jpg
26920,239.washing-machine,256_ObjectCategories/239.washing-machine/239_0...
21585,189.snail,256_ObjectCategories/189.snail/189_0107.jpg


In [73]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,category_name,file_path
19600,169.radio-telescope,256_ObjectCategories/169.radio-telescope/169_0...
25539,228.triceratops,256_ObjectCategories/228.triceratops/228_0009.jpg
11621,105.horse,256_ObjectCategories/105.horse/105_0213.jpg
23553,208.swiss-army-knife,256_ObjectCategories/208.swiss-army-knife/208_...
16921,145.motorbikes-101,256_ObjectCategories/145.motorbikes-101/145_07...


In [75]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs("dataset", exist_ok=True)

# The frame index is written as well (to_csv default), as the first, unnamed CSV column.
new_train_df.to_csv("dataset/train.csv")
val_df.to_csv("dataset/val.csv")
test_df.to_csv("dataset/test.csv")

In [76]:
# Load And verify

In [77]:
# Read the training CSV back and check its size. The row count should match
# new_train_df; there is one extra column because the saved index is read back
# as an ordinary column.
load_train_df = pd.read_csv("dataset/train.csv")
load_train_df.shape

(19588, 3)

In [78]:
# Read the validation CSV back and check its size. The row count should match
# val_df; the extra column is the saved index read back as an ordinary column.
load_val_df = pd.read_csv("dataset/val.csv")
load_val_df.shape

(4897, 3)

In [79]:
# Read the test CSV back and check its size. The row count should match
# test_df; the extra column is the saved index read back as an ordinary column.
load_test_df = pd.read_csv("dataset/test.csv")
load_test_df.shape

(6122, 3)