# Introduction

Notebook para fazer download de datasets do TFDS (TensorFlow Datasets) e gravá-los em disco como arquivos de imagem e CSVs de rótulos.

Datasets baixados:
* CIFAR-10

# Import Libraries

In [1]:
import os
import cv2 
import shutil
import sklearn

import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

2022-07-13 21:26:05.378401: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


# Datasets Root Directory Path

In [2]:
# Root directory under which every exported dataset is written.
# Set the DATASETS_ROOT_DIR environment variable to relocate the output
# without editing the notebook; the original path remains the default.
DATASETS_ROOT_DIR = os.environ.get(
    'DATASETS_ROOT_DIR',
    '/home/guilherme/data1/Dropbox/Link to Desktop/Doutorado/Datasets/',
)

# Utility Functions

In [9]:
def format_cifar10_split(ds):
    """Convert a CIFAR-10 TFDS split into a DataFrame with one-hot labels.

    Args:
        ds: Dataset split as returned by ``tfds.load('cifar10', ...)``.

    Returns:
        DataFrame holding the columns produced by ``tfds.as_dataframe`` plus
        ten one-hot label columns ``n_0`` .. ``n_9`` and an ``img_name``
        column (``image_<row position>``).
    """
    num_classes = 10  # CIFAR-10 has ten classes

    df = tfds.as_dataframe(ds)

    # Force the full 0..9 column set: get_dummies only emits columns for the
    # labels that actually occur, so a split lacking a class would otherwise
    # miss an ``n_<k>`` column. uint8 keeps the values as 0/1 regardless of
    # the pandas version (newer versions default to bool).
    one_hot = (
        pd.get_dummies(df['label'])
        .reindex(columns=range(num_classes), fill_value=0)
        .astype('uint8')
        .add_prefix('n_')
    )
    df = pd.concat([df, one_hot], axis=1)

    # Number the images by row count rather than ds.cardinality(), which is
    # negative when the dataset size is unknown or infinite.
    df['img_name'] = [f'image_{x}' for x in range(len(df))]
    return df


def record_dataset(df, ds_split, split_name, ds_name):
    """Write one split's images as JPEG files plus a CSV with their labels.

    Images are written to ``<DATASETS_ROOT_DIR>/<ds_name>/<split_name>/`` and
    the label table to ``<DATASETS_ROOT_DIR>/<ds_name>/<split_name>_data.csv``.
    An existing split directory is removed first. ``df`` is not modified, so
    the function can be re-run safely on the same frame.

    Args:
        df: DataFrame with ``image``, ``img_name``, ``id`` and ``n_0`` ..
            ``n_9`` columns.
        ds_split: Unused; kept so existing calls keep working.
        split_name: Name of the split; used for the image sub-directory and
            as the prefix of the CSV file name.
        ds_name: Name of the dataset directory below ``DATASETS_ROOT_DIR``.

    Returns:
        The label DataFrame that was written to the CSV, with ``img_name``
        holding the full path of each image file.

    Raises:
        IOError: If OpenCV reports that an image could not be written.
    """
    dir_path = os.path.join(f'{DATASETS_ROOT_DIR}', ds_name, split_name)
    shutil.rmtree(dir_path, ignore_errors=True)
    os.makedirs(dir_path, exist_ok=True)

    img_paths = []
    for img_name, image in zip(df['img_name'], df['image']):
        img_path = os.path.join(dir_path, img_name + '.jpg')

        # cv2.imwrite expects BGR channel order, while TFDS image features
        # are RGB; without the swap the saved files have red and blue flipped.
        if getattr(image, 'ndim', 0) == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

        # imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(img_path, image):
            raise IOError(f'Could not write image file: {img_path}')
        img_paths.append(img_path)

    # Build the label table on a copy so the caller's frame keeps its
    # original ``img_name`` values.
    split_labels_df = df[['id'] + [f'n_{x}' for x in range(10)]].copy()
    split_labels_df.insert(0, 'img_name', img_paths)

    # The dataset directory already exists: it is a parent of ``dir_path``.
    labels_file_path = os.path.join(
        DATASETS_ROOT_DIR, ds_name, split_name + '_data.csv'
    )
    split_labels_df.to_csv(labels_file_path, index=False)
    return split_labels_df
                    

# CIFAR 10 Dataset

In [8]:
# The last 20% of the official training set is held out as validation data.
# shuffle_files=False keeps the example order deterministic, so the
# ``image_<i>`` names assigned below refer to the same examples on every run.
ds_train, ds_valid, ds_test = tfds.load(
    'cifar10',
    split=['train[:80%]', 'train[80%:]', 'test'],
    shuffle_files=False,
)

df_train = format_cifar10_split(ds_train)
df_valid = format_cifar10_split(ds_valid)
df_test = format_cifar10_split(ds_test)

display(df_train.head())
display(df_valid.head())
display(df_test.head())

Unnamed: 0,id,image,label,n_0,n_1,n_2,n_3,n_4,n_5,n_6,n_7,n_8,n_9,img_name
0,b'train_01680',"[[[203, 214, 234], [191, 207, 226], [178, 200,...",8,0,0,0,0,0,0,0,0,1,0,image_0
1,b'train_17307',"[[[53, 53, 60], [63, 62, 66], [74, 72, 75], [8...",4,0,0,0,0,1,0,0,0,0,0,image_1
2,b'train_16399',"[[[143, 96, 70], [141, 96, 72], [135, 93, 72],...",7,0,0,0,0,0,0,0,1,0,0,image_2
3,b'train_27051',"[[[22, 23, 15], [15, 16, 8], [23, 24, 16], [42...",6,0,0,0,0,0,0,1,0,0,0,image_3
4,b'train_19135',"[[[115, 112, 59], [111, 108, 59], [110, 109, 5...",6,0,0,0,0,0,0,1,0,0,0,image_4


Unnamed: 0,id,image,label,n_0,n_1,n_2,n_3,n_4,n_5,n_6,n_7,n_8,n_9,img_name
0,b'train_09325',"[[[166, 188, 190], [166, 187, 189], [167, 189,...",0,1,0,0,0,0,0,0,0,0,0,image_0
1,b'train_33798',"[[[175, 176, 187], [142, 144, 157], [153, 155,...",4,0,0,0,0,1,0,0,0,0,0,image_1
2,b'train_14440',"[[[69, 81, 53], [69, 81, 52], [57, 69, 41], [5...",4,0,0,0,0,1,0,0,0,0,0,image_2
3,b'train_41397',"[[[60, 64, 65], [65, 69, 68], [64, 69, 65], [6...",7,0,0,0,0,0,0,0,1,0,0,image_3
4,b'train_10001',"[[[20, 15, 12], [20, 15, 12], [18, 13, 10], [1...",6,0,0,0,0,0,0,1,0,0,0,image_4


Unnamed: 0,id,image,label,n_0,n_1,n_2,n_3,n_4,n_5,n_6,n_7,n_8,n_9,img_name
0,b'test_09933',"[[[180, 186, 189], [161, 167, 168], [140, 146,...",7,0,0,0,0,0,0,0,1,0,0,image_0
1,b'test_05669',"[[[116, 190, 251], [116, 185, 243], [108, 177,...",0,1,0,0,0,0,0,0,0,0,0,image_1
2,b'test_00163',"[[[109, 86, 56], [91, 86, 54], [148, 144, 125]...",6,0,0,0,0,0,0,1,0,0,0,image_2
3,b'test_06686',"[[[213, 255, 253], [215, 254, 253], [218, 255,...",9,0,0,0,0,0,0,0,0,0,1,image_3
4,b'test_04747',"[[[28, 38, 17], [16, 22, 12], [16, 24, 12], [1...",5,0,0,0,0,0,1,0,0,0,0,image_4


## Record Image Files

In [10]:
# Export every split to disk: one image folder and one labels CSV per split.
cifar10_splits = (
    ('train', df_train, ds_train),
    ('valid', df_valid, ds_valid),
    ('test', df_test, ds_test),
)
for split_name, split_df, split_ds in cifar10_splits:
    record_dataset(split_df, split_ds, split_name, 'cifar_10')