In [1]:
import os
import h5py
import numpy as np
import torch
from torch.utils.data import TensorDataset

In [2]:
# Open the federated EMNIST train/test HDF5 archives read-only.
TRAIN_H5 = 'D:/data/FederatedEMNIST/fed_emnist_train.h5'
TEST_H5 = 'D:/data/FederatedEMNIST/fed_emnist_test.h5'
train_data = h5py.File(TRAIN_H5, mode='r')
test_data = h5py.File(TEST_H5, mode='r')

In [3]:
# Client identifiers are the keys of the top-level 'examples' group in each file.
train_ids = [*train_data['examples'].keys()]
test_ids = [*test_data['examples'].keys()]
num_clients_train, num_clients_test = len(train_ids), len(test_ids)
print(num_clients_train, num_clients_test)

3400 3400


In [4]:
# Count the distinct labels over every example of every training client.
# Looking only at the first label of each client undercounts whenever some
# class never happens to be the first example of any client.
all_train_labels = np.concatenate(
    [train_data['examples'][client_id]['label'][()] for client_id in train_ids]
)
num_classes = len(np.unique(all_train_labels))
print(num_classes)

62


In [5]:
# local dataset
def _client_tensor_dataset(h5_file, client_id):
    """Read one client's pixels/labels from ``h5_file``; return (TensorDataset, n_examples)."""
    group = h5_file['examples'][client_id]
    pixels = group['pixels'][()]
    labels = group['label'][()]
    tensors = TensorDataset(torch.tensor(pixels), torch.tensor(labels, dtype=torch.long))
    return tensors, len(pixels)


train_data_local_dict, train_data_local_num_dict = {}, {}
test_data_local_dict, test_data_local_num_dict = {}, {}

# Clients are keyed by their position in train_ids; the same client id is
# looked up in both the train and the test file.
for client_idx, client_id in enumerate(train_ids):
    (train_data_local_dict[client_idx],
     train_data_local_num_dict[client_idx]) = _client_tensor_dataset(train_data, client_id)

    (test_data_local_dict[client_idx],
     test_data_local_num_dict[client_idx]) = _client_tensor_dataset(test_data, client_id)

    # Report clients that have no test examples at all.
    if test_data_local_num_dict[client_idx] == 0:
        print(client_idx)

In [6]:
# Release both HDF5 handles; the per-client arrays were already read out above.
for h5_handle in (train_data, test_data):
    h5_handle.close()

In [7]:
# Bundle the per-client datasets and their sizes, split by train/test.
dataset = {
    'train': {
        'data_sizes': train_data_local_num_dict,
        'data': train_data_local_dict,
    },
    'test': {
        'data_sizes': test_data_local_num_dict,
        'data': test_data_local_dict,
    },
}

In [8]:
import pickle
# Serialize the bundled train/test dictionaries to a single pickle file.
PREPROCESSED_PICKLE = 'D:/data/FederatedEMNIST/FederatedEMNIST_preprocessed.pickle'
with open(PREPROCESSED_PICKLE, 'wb') as pickle_sink:
    pickle.dump(dataset, pickle_sink)

In [57]:
import os
import torchvision.datasets as D
from torch.utils.data import Subset
from tqdm import tqdm

In [23]:
train_idx_file = 'D:/data/img_clf/ExtractedFeatures/Caltech256_train.txt'
test_idx_file = 'D:/data/img_clf/ExtractedFeatures/Caltech256_test.txt'

# One image name per line; remove the newline characters while reading.
with open(train_idx_file, 'r') as f:
    train_img_names = [line.replace('\n', '') for line in f]
with open(test_idx_file, 'r') as f:
    test_img_names = [line.replace('\n', '') for line in f]

In [59]:
# Index the Caltech256 image tree (no transform attached here).
img_path = 'D:/data/img_clf/Caltech256'
data = D.ImageFolder(img_path)

In [60]:
# Peek at the first three (file path, class index) entries of the image folder.
data.imgs[:3]

[('D:/data/img_clf/Caltech256\\001.ak47\\001_0001.jpg', 0),
 ('D:/data/img_clf/Caltech256\\001.ak47\\001_0002.jpg', 0),
 ('D:/data/img_clf/Caltech256\\001.ak47\\001_0003.jpg', 0)]

In [61]:
# Sort both name lists in place, then show the first three test names.
train_img_names.sort()
test_img_names.sort()
test_img_names[:3]

['001.ak47/001_0003.jpg', '001.ak47/001_0004.jpg', '001.ak47/001_0007.jpg']

In [49]:
# NOTE(review): `img_name` is not assigned in this cell; it is presumably the
# leftover loop variable of the train/test index-splitting loop, so this cell
# fails on a fresh kernel if run before that loop — confirm or remove.
img_name

'257.clutter/257_0827.jpg'

In [53]:
# Show the first five training image names ("<class dir>/<file name>").
train_img_names[:5]

['001.ak47/001_0001.jpg',
 '001.ak47/001_0002.jpg',
 '001.ak47/001_0005.jpg',
 '001.ak47/001_0006.jpg',
 '001.ak47/001_0008.jpg']

In [63]:
# Split the ImageFolder sample positions into train/test using the name lists.
# A set gives constant-time membership tests instead of scanning the whole
# training list once per image.
train_name_set = set(train_img_names)
train_indices, test_indices = [], []
# Wrapping the sequence (rather than the enumerate object) lets tqdm show a total.
for idx, (img, label) in enumerate(tqdm(data.imgs)):
    # Reduce the full path to "<class dir>/<file name>", accepting either
    # path separator so the match does not depend on Windows-style paths.
    img_name = '/'.join(img.replace('\\', '/').split('/')[-2:])
    if img_name in train_name_set:
        train_indices.append(idx)
    else:
        # Everything not listed as a training image is treated as test.
        test_indices.append(idx)

30607it [00:03, 8784.24it/s] 


In [64]:
# Sizes of the two index lists produced by the split.
len(train_indices), len(test_indices)

(22897, 7710)

In [71]:
# Save the training indices under the file name the reader cells of this
# notebook open (Caltech256_train_idx.txt); the previous name
# (Caltech_train_idx.txt) was never read back.
with open('D:/data/img_clf/ExtractedFeatures/Caltech256_train_idx.txt', 'w') as f:
    # Every index, including the last, is followed by a comma: the readers
    # split on ',' and discard the final (empty) field.
    f.write(''.join(str(x) + ',' for x in train_indices))

In [72]:
# Save the test indices under the file name the reader cells of this notebook
# open (Caltech256_test_idx.txt); the previous name (Caltech_test_idx.txt)
# was never read back.
with open('D:/data/img_clf/ExtractedFeatures/Caltech256_test_idx.txt', 'w') as f:
    # Every index, including the last, is followed by a comma: the readers
    # split on ',' and discard the final (empty) field.
    f.write(''.join(str(x) + ',' for x in test_indices))

In [27]:
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Subset
import torchvision.transforms as T
import torchvision.datasets as D
from tqdm import tqdm

In [28]:
### to normalize images
def get_img_mean(Dataset):
    """Compute the per-channel mean and standard deviation of an image dataset.

    Parameters
    ----------
    Dataset : torch.utils.data.Dataset
        Yields ``(image, target)`` pairs where ``image`` is a ``(C, H, W)``
        tensor. All images must share one shape so they can be batched.

    Returns
    -------
    (mean, std) : tuple of lists
        One value per channel, rounded to 4 decimals. ``std`` is the sample
        standard deviation (divides by ``n - 1``) over all pixels of a channel.

    Raises
    ------
    ValueError
        If the dataset yields no images.
    """
    loader = DataLoader(
        Dataset,
        batch_size=1000, num_workers=0, shuffle=False
    )

    pixel_sum = None      # per-channel sum of pixel values
    pixel_sq_sum = None   # per-channel sum of squared pixel values
    n_pixels = 0          # number of pixels seen per channel
    print('--> get mean&stdv of images')
    for data, _ in tqdm(loader):
        # Accumulate in float64: the sums run over every pixel of the dataset.
        batch = data.double()
        if pixel_sum is None:
            # Channel count comes from the data instead of being fixed to 3.
            pixel_sum = torch.zeros(batch.size(1), dtype=torch.float64)
            pixel_sq_sum = torch.zeros(batch.size(1), dtype=torch.float64)
        pixel_sum += torch.sum(batch, dim=(0, 2, 3), keepdim=False)
        pixel_sq_sum += torch.sum(batch ** 2, dim=(0, 2, 3), keepdim=False)
        # Count H * W pixels per image (not H ** 2), so non-square images work.
        n_pixels += batch.size(0) * batch.size(2) * batch.size(3)

    if n_pixels == 0:
        raise ValueError('cannot compute mean/std of an empty dataset')

    mean = pixel_sum / n_pixels
    # Sample variance from the running sums: (sum(x^2) - n * mean^2) / (n - 1).
    std = torch.sqrt((pixel_sq_sum - n_pixels * (mean ** 2)) / (n_pixels - 1))

    # Cast back to float32 so the returned values keep their original dtype.
    mean = list(np.around(mean.float().numpy(), 4))
    std = list(np.around(std.float().numpy(), 4))
    return mean, std

In [8]:
import os

train_idx_file = 'D:/data/img_clf/ExtractedFeatures/Caltech256_train_idx.txt'
test_idx_file = 'D:/data/img_clf/ExtractedFeatures/Caltech256_test_idx.txt'

# Each file holds one line of comma-terminated integers.
with open(train_idx_file, 'r') as f:
    train_line = f.readlines()[0]
with open(test_idx_file, 'r') as f:
    test_line = f.readlines()[0]

# The field after the final comma is dropped before converting to int.
train_indices = [int(tok) for tok in train_line.split(',')[:-1]]
test_indices = [int(tok) for tok in test_line.split(',')[:-1]]
print(len(train_indices), len(test_indices))

22897 7710


In [30]:
# Resize every image to 224x224 and convert it to a tensor before measuring.
resize_then_tensor = [T.Resize((224, 224)), T.ToTensor()]
transform = T.Compose(resize_then_tensor)

# Statistics are computed over the training subset only.
train_data = Subset(
    D.ImageFolder(root='D:/data/img_clf/Caltech256', transform=transform),
    train_indices,
)

get_img_mean(train_data)

--> get mean&stdv of images


100%|██████████| 23/23 [04:12<00:00, 10.96s/it]


([0.5511, 0.5335, 0.5052], [0.3151, 0.3116, 0.3257])

In [1]:
import random

# Seeded random permutation of all sample positions, cut into train/test.
N_TRAIN, N_TEST = 22897, 7710
indices = list(range(N_TRAIN + N_TEST))
random.seed(0)
random.shuffle(indices)
train_indices = indices[:N_TRAIN]
test_indices = indices[N_TRAIN:]

In [4]:
import torchvision.datasets as D
from torch.utils.data import DataLoader, Subset

# Index the image tree, then restrict it to the training positions.
data = D.ImageFolder('D:/data/img_clf/Caltech256')
train_data = Subset(dataset=data, indices=train_indices)

In [10]:
# Display the subset's indices in ascending order.
# NOTE(review): this prints the entire index list; consider slicing the result
# (e.g. the first few entries) to keep the notebook output readable.
sorted(train_data.indices)

[0,
 1,
 3,
 5,
 6,
 9,
 10,
 11,
 13,
 15,
 17,
 18,
 19,
 20,
 22,
 24,
 25,
 27,
 28,
 29,
 30,
 31,
 32,
 35,
 36,
 39,
 41,
 43,
 44,
 46,
 47,
 50,
 52,
 54,
 55,
 56,
 57,
 58,
 61,
 62,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 73,
 74,
 75,
 76,
 78,
 79,
 81,
 82,
 83,
 84,
 85,
 87,
 88,
 89,
 91,
 92,
 93,
 95,
 97,
 98,
 99,
 102,
 103,
 104,
 106,
 107,
 108,
 109,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 123,
 124,
 125,
 127,
 128,
 129,
 130,
 131,
 132,
 134,
 136,
 137,
 140,
 141,
 142,
 144,
 145,
 146,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 156,
 157,
 158,
 159,
 160,
 162,
 163,
 165,
 166,
 167,
 169,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 180,
 181,
 182,
 183,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 195,
 197,
 198,
 199,
 200,
 201,
 202,
 204,
 205,
 206,
 207,
 208,
 209,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 220,
 221,
 222,
 224,
 227,
 228,
 230,
 234,
 236,
 237,
 238,
 239,
 241,
 243,
 244,
 245,
 247,
 

In [20]:
# Keep only the file path of every (path, class index) sample, then preview three.
data_imgs = [path for path, _class_idx in data.samples]
data_imgs[:3]

['D:/data/img_clf/Caltech256\\001.ak47\\001_0001.jpg',
 'D:/data/img_clf/Caltech256\\001.ak47\\001_0002.jpg',
 'D:/data/img_clf/Caltech256\\001.ak47\\001_0003.jpg']

In [26]:
# Sort the training indices in place, then look up the image paths at the
# subset's indices (taken in ascending order).
# NOTE(review): presumably `train_data.indices` refers to this same list, so the
# in-place sort may also reorder the subset — confirm.
train_indices.sort()
np.take(data_imgs, sorted(train_data.indices))

array(['D:/data/img_clf/Caltech256\\001.ak47\\001_0001.jpg',
       'D:/data/img_clf/Caltech256\\001.ak47\\001_0002.jpg',
       'D:/data/img_clf/Caltech256\\001.ak47\\001_0004.jpg', ...,
       'D:/data/img_clf/Caltech256\\257.clutter\\257_0825.jpg',
       'D:/data/img_clf/Caltech256\\257.clutter\\257_0826.jpg',
       'D:/data/img_clf/Caltech256\\257.clutter\\257_0827.jpg'],
      dtype='<U69')

In [25]:
# Display the training image names.
# NOTE(review): this prints the whole list; a slice would keep the output short.
train_img_names

['001.ak47/001_0001.jpg',
 '001.ak47/001_0002.jpg',
 '001.ak47/001_0005.jpg',
 '001.ak47/001_0006.jpg',
 '001.ak47/001_0008.jpg',
 '001.ak47/001_0009.jpg',
 '001.ak47/001_0010.jpg',
 '001.ak47/001_0011.jpg',
 '001.ak47/001_0012.jpg',
 '001.ak47/001_0013.jpg',
 '001.ak47/001_0014.jpg',
 '001.ak47/001_0015.jpg',
 '001.ak47/001_0016.jpg',
 '001.ak47/001_0017.jpg',
 '001.ak47/001_0018.jpg',
 '001.ak47/001_0019.jpg',
 '001.ak47/001_0021.jpg',
 '001.ak47/001_0022.jpg',
 '001.ak47/001_0024.jpg',
 '001.ak47/001_0027.jpg',
 '001.ak47/001_0028.jpg',
 '001.ak47/001_0030.jpg',
 '001.ak47/001_0031.jpg',
 '001.ak47/001_0032.jpg',
 '001.ak47/001_0033.jpg',
 '001.ak47/001_0035.jpg',
 '001.ak47/001_0036.jpg',
 '001.ak47/001_0038.jpg',
 '001.ak47/001_0039.jpg',
 '001.ak47/001_0040.jpg',
 '001.ak47/001_0041.jpg',
 '001.ak47/001_0042.jpg',
 '001.ak47/001_0043.jpg',
 '001.ak47/001_0045.jpg',
 '001.ak47/001_0047.jpg',
 '001.ak47/001_0048.jpg',
 '001.ak47/001_0050.jpg',
 '001.ak47/001_0051.jpg',
 '001.ak47/0

In [16]:
import numpy as np

In [9]:
# Display the training indices.
# NOTE(review): this prints the whole list; a slice would keep the output short.
train_indices

[0,
 1,
 4,
 5,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 20,
 21,
 23,
 26,
 27,
 29,
 30,
 31,
 32,
 34,
 35,
 37,
 38,
 39,
 40,
 41,
 42,
 44,
 46,
 47,
 49,
 50,
 51,
 53,
 54,
 56,
 61,
 62,
 64,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 76,
 77,
 78,
 79,
 80,
 81,
 84,
 85,
 86,
 87,
 88,
 89,
 93,
 94,
 95,
 98,
 99,
 100,
 101,
 102,
 104,
 107,
 111,
 112,
 113,
 117,
 119,
 120,
 121,
 122,
 123,
 125,
 126,
 128,
 129,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 141,
 142,
 143,
 144,
 145,
 147,
 149,
 150,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 163,
 164,
 165,
 166,
 168,
 169,
 176,
 177,
 178,
 180,
 181,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 193,
 195,
 197,
 198,
 200,
 201,
 202,
 205,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 218,
 219,
 220,
 221,
 222,
 224,
 225,
 228,
 230,
 231,
 232,
 234,
 236,
 237,
 238,
 239,
 240,
 241,
 243,
 244,
 246,
 247,
 249,
 251,
 252,
 253,
 254,
 255,
 256,
 25

In [1]:
# NOTE(review): machine-specific absolute path; no later cell visible in this
# notebook reads `img_path` — confirm whether this cell is still needed.
img_path = 'C:/Users/unistmlv/OneDrive - UNIST/UNIST/MLV/KETI과제/코드/ssl_inpainting/images'

In [1]:
import numpy as np
from torch.utils.data import DataLoader, Dataset, TensorDataset, Subset
import torchvision.transforms as T
import torchvision.datasets as D
from scipy.io import loadmat
import os
from torch import Tensor

In [None]:
class MyDataset(Dataset):
    """Caltech256 train or test subset whose items also carry their position.

    Parameters
    ----------
    dataset_name :
        Accepted for interface compatibility; not used.
    train_flag :
        Truthy selects the train index file, falsy the test index file.
    transf :
        Transform forwarded to ``ImageFolder``.
    data_root : str, optional
        Root of the image tree. Defaults to the folder the other cells of this
        notebook index (``D:/data/img_clf/Caltech256``), which is the tree the
        saved indices refer to.
    idx_dir : str, optional
        Directory holding ``caltech256_train_idx.txt`` / ``caltech256_test_idx.txt``.

    Raises
    ------
    FileNotFoundError
        If the index file for the requested split does not exist.
    """

    def __init__(self, dataset_name, train_flag, transf,
                 data_root='D:/data/img_clf/Caltech256',
                 idx_dir='D:/data/img_clf/ExtractedFeatures'):
        split = 'train' if train_flag else 'test'
        idx_file = os.path.join(idx_dir, 'caltech256_{}_idx.txt'.format(split))

        # Fail with a clear error instead of hitting an unbound variable later.
        if not os.path.exists(idx_file):
            raise FileNotFoundError('index file not found: {}'.format(idx_file))
        # Only the index file of the requested split is read.
        indices = self._read_indices(idx_file)

        data = D.ImageFolder(root=data_root, transform=transf)
        self.dataset = Subset(data, indices)

    @staticmethod
    def _read_indices(idx_file):
        """Parse a file of comma-separated integers (trailing comma optional)."""
        with open(idx_file, 'r') as f:
            text = f.read()
        # Skipping blank fields handles the trailing comma without dropping
        # the last index when the comma is absent.
        return [int(tok) for tok in text.split(',') if tok.strip()]

    def __getitem__(self, index):
        # Return the position within the subset alongside the sample.
        data, target = self.dataset[index]
        return data, target, index

    def __len__(self):
        return len(self.dataset)

In [2]:
train_idx_file = 'D:/data/img_clf/ExtractedFeatures/caltech256_train_idx.txt'
test_idx_file = 'D:/data/img_clf/ExtractedFeatures/caltech256_test_idx.txt'

# Parse both index files only when the train index file is present; otherwise
# neither list is assigned by this cell.
if os.path.exists(train_idx_file):
    with open(train_idx_file, 'r') as f:
        # First line, split on commas, final (empty) field dropped.
        train_indices = [int(tok) for tok in f.readlines()[0].split(',')[:-1]]
    with open(test_idx_file, 'r') as f:
        test_indices = [int(tok) for tok in f.readlines()[0].split(',')[:-1]]

In [5]:
# Rebuild the training subset from the saved indices (no transform attached).
data = D.ImageFolder('D:/data/img_clf/Caltech256/')
dataset = Subset(data, indices=train_indices)

In [7]:
# Number of training indices loaded from the index file.
len(train_indices)

22897

In [9]:
# Fetch the last element of the subset (22896 == len(train_indices) - 1).
dataset[22896]

(<PIL.Image.Image image mode=RGB size=293x251 at 0x1F57B05C6A0>, 256)