In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import glob
import json
import numpy
from tqdm.notebook import trange, tqdm
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
#import matplotlib.pyplot as plt
from numpy import vstack
from sklearn.metrics import accuracy_score
from PIL import Image
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# Root directory of the FEMNIST data; the code below reads JSON shards from
# its "train" and "test" subdirectories.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
dir_path = "/home/ubuntu/capstone/HorizontalHFL/datasets/FEMNIST"

def getUsers():
  """Collect every client ID listed in the FEMNIST training shards.

  Reads each ``*.json`` file under ``dir_path + "/train"`` and concatenates
  their ``'users'`` lists.

  Returns:
    list: client IDs, shard by shard. Shards are visited in sorted path
    order so that positional lookups into the result (e.g. ``users[80]``)
    pick the same client on every machine and every run.
  """
  users = []
  path = dir_path + "/train/*.json"
  # glob.glob() returns paths in arbitrary, filesystem-dependent order;
  # sorting makes the resulting user list reproducible.
  for file in tqdm(sorted(glob.glob(path))):
    with open(file) as f:
      shard = json.load(f)
    users.extend(shard['users'])
  return users

def GetFile(path, clientID):
  """Locate the JSON shard that lists ``clientID`` among its users.

  Args:
    path: glob pattern selecting the JSON files to search.
    clientID: client identifier to look for in each file's ``'users'`` entry.

  Returns:
    Path of the first matching file in ``glob.glob(path)`` order, or None
    when no file lists the client.
  """
  for candidate in glob.glob(path):
    with open(candidate) as handle:
      shard = json.load(handle)
      if any(listed == clientID for listed in shard['users']):
        return candidate
  return None

# Gather all training client IDs once; the bare expression below displays how
# many were found.
users = getUsers()
len(users)

  0%|          | 0/36 [00:00<?, ?it/s]

379

In [3]:
#train_data = torchvision.datasets.EMNIST('.', 'mnist', download='True', train='True', transform=transforms.ToTensor())
#test_data = torchvision.datasets.EMNIST('.', 'mnist', download='True', train='False', transform=transforms.ToTensor())


class FEMNISTDataset(Dataset):
  """Samples of a single FEMNIST client, read from the JSON shards.

  Args:
    test: when truthy the client is looked up under ``dir_path + "/test"``,
      otherwise under ``dir_path + "/train"``.
    clientID: ID of the client whose samples are loaded.

  Raises:
    ValueError: if no shard of the selected split lists ``clientID``.

  Indexing returns an ``(image, label)`` pair: a float32 tensor of shape
  (1, 28, 28) and a 0-dim int64 label tensor.
  """

  def __init__(self, test, clientID):
    self.images = []
    self.labels = []
    # Find file with the client ID, get its data
    if test:
      path = dir_path + "/test/*.json"
    else:
      path = dir_path + "/train/*.json"
    file = GetFile(path, clientID)
    if file is None:
      # GetFile signals "not found" with None; fail with a clear message
      # rather than the opaque TypeError that open(None) would raise.
      raise ValueError("client " + str(clientID) + " not found in " + path)
    with open(file) as f:
      # Named `shard` so the builtin `dict` is not shadowed.
      shard = json.load(f)
    index = shard['users'].index(clientID)
    # Sample count as recorded in the shard's metadata for this client.
    self.numSamples = shard['num_samples'][index]
    print(str(clientID) + " has " + str(self.numSamples) + " samples")
    self.images = shard['user_data'][clientID]['x']
    self.labels = shard['user_data'][clientID]['y']

  def __len__(self):
    """Return the number of samples recorded for this client."""
    return self.numSamples

  def __getitem__(self, idx):
    """Return sample ``idx`` as ``(image tensor, label tensor)``."""
    # Each stored image is a flat sequence that is reshaped to 1x28x28.
    image = numpy.array(self.images[idx]).reshape(1,28,28)
    img_tensor = torch.from_numpy(image).float()
    # Class-index targets must be int64 for the classification losses.
    label_id = torch.tensor(self.labels[idx], dtype=torch.long)
    return img_tensor, label_id



# Train and test splits of one client; 80 is just a position in `users`.
train_data = FEMNISTDataset(False, users[80])
test_data = FEMNISTDataset(True, users[80])

# Shuffle the training samples every epoch. Without it SGD sees identical
# batches in identical order each epoch, and because drop_last=True the same
# leftover samples of the final partial batch would never be trained on.
trainLoader = DataLoader(train_data, batch_size=20, shuffle=True, drop_last=True)
# Evaluation keeps file order and every sample (the last batch may be smaller).
testLoader = DataLoader(test_data, batch_size=10, drop_last=False)

#img, label = train_data[2]
#plt.figure()
#plt.title(label)
#plt.axis("off")
#plt.imshow(img, cmap="gray")
#plt.show()

f0692_03 has 219 samples
f0692_03 has 25 samples


In [4]:
# Sanity check: walk the test loader once and show each batch's labels and
# image tensor shape.
for ii, (img_, label_) in enumerate(testLoader):
    print(f'Batch Number: {ii}, Label: {label_}, Image size: {img_.shape}')

Batch Number: 0, Label: tensor([2, 3, 5, 7, 1, 8, 3, 9, 9, 6]), Image size: torch.Size([10, 1, 28, 28])
Batch Number: 1, Label: tensor([ 9,  6,  6, 61, 47, 42, 11, 28, 28, 47]), Image size: torch.Size([10, 1, 28, 28])
Batch Number: 2, Label: tensor([21, 11, 21, 35, 24]), Image size: torch.Size([5, 1, 28, 28])


In [5]:
class CustomCNN(nn.Module):
    """Two-convolution CNN for 1x28x28 images that outputs log-probabilities.

    Args:
        num_classes: width of the output layer. Defaults to 62, the value
            that was previously hard-coded (labels printed in this notebook
            go up to 61).
    """

    def __init__(self, num_classes=62):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        # 28x28 -> 3x3 conv -> 26x26 -> 3x3 conv -> 24x24 -> 2x2 pool -> 12x12;
        # 64 channels * 12 * 12 = 9216 flattened features.
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a (batch, 1, 28, 28) tensor to (batch, num_classes) log-probabilities."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten channels and spatial dims.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
        

In [6]:
def train_model(trainLoader, model, epochs, lr=0.05, momentum=0.9, log_interval=10):
    """Train ``model`` in place with SGD and cross-entropy loss.

    Args:
        trainLoader: iterable yielding ``(inputs, targets)`` mini-batches.
        model: the ``nn.Module`` to optimise; it is left in training mode.
        epochs: number of passes over ``trainLoader``.
        lr: SGD learning rate.
        momentum: SGD momentum.
        log_interval: the loss of the last mini-batch is printed every
            ``log_interval`` epochs, starting with epoch 0.

    Raises:
        ValueError: if ``trainLoader`` yields no batch during an epoch (for
            example when the dataset is smaller than the batch size and
            ``drop_last=True``), instead of failing later on an unbound
            ``loss`` variable.
    """
    # CrossEntropyLoss applies log-softmax itself. For a model that already
    # emits log-probabilities that extra normalisation is a no-op, so this
    # criterion is correct for both logits and log-probabilities.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    model.train()
    # enumerate epochs
    for epoch in trange(epochs):
        loss = None
        # enumerate mini batches
        for inputs, targets in trainLoader:
            # clear the gradients
            optimizer.zero_grad()
            # compute the model output
            yhat = model(inputs)
            # calculate loss
            loss = criterion(yhat, targets)
            # credit assignment
            loss.backward()
            # update model weights
            optimizer.step()

        if loss is None:
            raise ValueError("trainLoader yielded no batches; nothing to train on")

        if epoch % log_interval == 0:
            print(f'Train Epoch: {epoch:<2} \tLoss: {loss.item():.6f}')


In [7]:
# Fresh network, trained for 200 epochs on the training loader built above.
model = CustomCNN()
# print(model)
train_model(trainLoader, model, 200)

  0%|          | 0/200 [00:00<?, ?it/s]

Train Epoch: 0  	Loss: 4.096976
Train Epoch: 10 	Loss: 3.466539
Train Epoch: 20 	Loss: 2.308545
Train Epoch: 30 	Loss: 1.023801
Train Epoch: 40 	Loss: 1.155928
Train Epoch: 50 	Loss: 0.725960
Train Epoch: 60 	Loss: 1.292399
Train Epoch: 70 	Loss: 0.389874
Train Epoch: 80 	Loss: 0.777501
Train Epoch: 90 	Loss: 0.634902
Train Epoch: 100 	Loss: 1.529010
Train Epoch: 110 	Loss: 0.735609
Train Epoch: 120 	Loss: 1.138335
Train Epoch: 130 	Loss: 1.424874
Train Epoch: 140 	Loss: 1.079714
Train Epoch: 150 	Loss: 2.573204
Train Epoch: 160 	Loss: 1.359037
Train Epoch: 170 	Loss: 1.253479
Train Epoch: 180 	Loss: 1.148955
Train Epoch: 190 	Loss: 1.505768


In [8]:
# evaluate the model
def evaluate_model(testLoader, model):
    """Return the fraction of samples in ``testLoader`` that ``model`` classifies correctly.

    The model is switched to evaluation mode for the duration of the call so
    that train-time-only layers such as dropout do not perturb predictions,
    and gradient tracking is disabled. The model's previous train/eval mode
    is restored before returning.

    Args:
        testLoader: iterable yielding ``(inputs, targets)`` batches, where
            ``targets`` holds class indices.
        model: module mapping ``inputs`` to per-class scores of shape
            (batch, classes); the predicted class is the argmax over dim 1.

    Returns:
        float: accuracy in [0, 1].

    Raises:
        ValueError: if ``testLoader`` yields no samples.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, targets in testLoader:
                # pick the highest-scoring class for every sample
                predicted = model(inputs).argmax(dim=1)
                correct += (predicted == targets.reshape(-1)).sum().item()
                total += targets.numel()
    finally:
        # leave the model in whatever mode the caller had it in
        model.train(was_training)
    if total == 0:
        raise ValueError("testLoader yielded no samples to evaluate")
    # calculate accuracy
    return correct / total

In [9]:
# Accuracy of the trained model on the test loader built above.
acc = evaluate_model(testLoader, model)
print('Accuracy: %.3f' % acc)

Accuracy: 0.400
