In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import time
import torch
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt
from model import CNNModel

%matplotlib inline

In [2]:
# Number of this trainer. Every coordination and data path below is derived
# from it, so the trainer number lives in exactly one place.
trainer_id = 3

# Flag files: the training loop checks for their existence and deletes them
# once seen.
start_training_file = 'models/start_training{}'.format(trainer_id)
stop_training_file = 'models/stop_training{}'.format(trainer_id)

# Weights loaded at the start of a round / written at the end of a round.
global_model_file = 'models/global_trainer{}.pt'.format(trainer_id)
local_model_file = 'models/trainer{}_cnn.pt'.format(trainer_id)

# CSV holding this trainer's labelled samples.
data_file = 'data/trainer{}.csv'.format(trainer_id)

In [3]:
# Load this trainer's CSV and report which class labels it contains.
df_train = pd.read_csv(data_file)
labels_present = df_train['label'].unique()
print('Labels in this training set: ', labels_present)

The history saving thread hit an unexpected error (OperationalError('disk I/O error',)).History will not be written to the database.
Labels in this training set:  [9 6 7 8]


In [4]:
# Split the frame by column position: column 0 is the target, the following
# 784 columns (28 * 28, the image size used when rows are reshaped later)
# are the features.
pixel_count = 28 * 28
df_label = df_train.iloc[:, 0]
df_features = df_train.iloc[:, 1:1 + pixel_count]

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(
    df_features,
    df_label,
    test_size=0.2,
    random_state=1234,
)
X_train, X_valid, y_train, y_valid = split_parts

In [6]:
# Convert both feature splits to NumPy arrays, keeping their existing
# (rows, columns) shape.
train_shape_2d = tuple(X_train.shape[:2])
X_train = np.array(X_train).reshape(train_shape_2d)

valid_shape_2d = tuple(X_valid.shape[:2])
X_valid = np.array(X_valid).reshape(valid_shape_2d)

In [7]:
# Labels become a tensor built from a NumPy copy of the label column.
y_train = torch.from_numpy(np.array(y_train))

# Each feature row becomes a single-channel 28x28 image, stored as floats.
X_train = torch.from_numpy(
    X_train.reshape(X_train.shape[0], 1, 28, 28)
).float()

In [8]:
# Sanity check: show the shapes of the training images and labels.
print(*(split_part.shape for split_part in (X_train, y_train)))

torch.Size([19186, 1, 28, 28]) torch.Size([19186])


In [9]:
# Same conversion as for the training split: labels to a tensor, feature
# rows to single-channel 28x28 float images.
y_valid = torch.from_numpy(np.array(y_valid))
X_valid = torch.from_numpy(
    X_valid.reshape(X_valid.shape[0], 1, 28, 28)
).float()

# Sanity check: show the shapes of the validation images and labels.
print(*(split_part.shape for split_part in (X_valid, y_valid)))

torch.Size([4797, 1, 28, 28]) torch.Size([4797])


In [10]:
batch_size = 100

# Pair each image tensor with its label tensor so they can be batched together.
train = torch.utils.data.TensorDataset(X_train, y_train)
valid = torch.utils.data.TensorDataset(X_valid, y_valid)

# Both loaders walk their dataset in fixed order, batch_size samples at a time.
# NOTE(review): shuffling is disabled for the training loader as well, so
# every epoch sees the batches in the same order.
loader_options = dict(batch_size=batch_size, shuffle=False)
train_loader = torch.utils.data.DataLoader(train, **loader_options)
valid_loader = torch.utils.data.DataLoader(valid, **loader_options)

In [11]:
# Hyperparameters for one round of local training.
# The epoch count is chosen so that roughly n_iters optimizer steps are taken:
# n_iters divided by the number of batches per epoch, truncated to an integer.
n_iters = 1500
batches_per_epoch = len(X_train) / batch_size
num_epochs = int(n_iters / batches_per_epoch)

print('Total Local Epochs: ', num_epochs)

# Loss function, model and optimizer used by the training loop.
error = nn.CrossEntropyLoss()

learning_rate = 0.001
model = CNNModel()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

Total Local Epochs:  7


In [12]:
def _validation_accuracy(net, loader):
    """Return the percentage of samples in `loader` that `net` classifies correctly.

    The model is switched to eval mode and autograd is disabled while the
    loader is consumed, so no computation graph is built and layers that
    behave differently in training are evaluated in inference mode. The
    model's previous train/eval mode is restored before returning.

    Returns NaN when the loader yields no samples.
    """
    was_training = net.training
    net.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for val_images, val_labels in loader:
            # Predicted class = index of the largest output per sample.
            predicted = torch.max(net(val_images), 1)[1]
            n_seen += len(val_labels)
            n_correct += (predicted == val_labels).sum().item()
    net.train(was_training)
    if n_seen == 0:
        return float('nan')
    return 100.0 * n_correct / n_seen


global_epoch = 1   # number of the current round, used in log messages
wait_counter = 0   # seconds spent waiting since the last "Waiting..." message cycle began

# Poll once per second for flag files written by the coordinator:
#   * stop flag  -> delete it and leave the loop
#   * start flag -> delete it, load the global weights, train locally for
#                   num_epochs epochs, then save the local weights
while True:
    if os.path.exists(stop_training_file):
        os.remove(stop_training_file)
        print('Received Stop training from Coordinator. Exiting...')
        break

    # Progress message every 30 polls.
    if wait_counter % 30 == 0:
        print('Waiting for global model...')

    time.sleep(1)
    wait_counter += 1

    if not os.path.exists(start_training_file):
        continue

    os.remove(start_training_file)

    # NOTE: torch.load unpickles the file; only load checkpoints that come
    # from a trusted coordinator.
    model.load_state_dict(torch.load(global_model_file))
    os.remove(global_model_file)

    print('Received Global Model.')

    # Per-round history, sampled every 50 optimizer steps.
    count = 0
    loss_list = []
    iteration_list = []
    accuracy_list = []

    for local_epoch in range(num_epochs):
        # Batch variables get their own names so the `train` / `valid`
        # datasets built earlier in the notebook are not overwritten.
        for batch_images, batch_labels in train_loader:
            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            # Forward pass and cross-entropy loss.
            outputs = model(batch_images)
            loss = error(outputs, batch_labels)

            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()

            count += 1
            if count % 50 == 0:
                accuracy = _validation_accuracy(model, valid_loader)
                loss_value = loss.item()

                # Store plain Python numbers rather than tensors.
                loss_list.append(loss_value)
                iteration_list.append(count)
                accuracy_list.append(accuracy)

                print('Global epoch:{} Iteration: {}  Loss: {}  Accuracy: {} %'.format(global_epoch, count, loss_value, accuracy))

    print('Completed global epoch: ', global_epoch)

    global_epoch = global_epoch + 1

    # NOTE(review): the file is written in place; if the coordinator polls
    # for this path it could observe a partially written file - confirm how
    # the coordinator detects completion.
    torch.save(model.state_dict(), local_model_file)
    print('Saved local model file.')

    # Restart the wait cycle so the "Waiting..." message is printed as soon
    # as polling resumes after a finished round.
    wait_counter = 0

Waiting for global model...
Received Global Model.
Global epoch:1 Iteration: 50  Loss: 0.12239894270896912  Accuracy: 93.24578094482422 %
Global epoch:1 Iteration: 100  Loss: 0.18824806809425354  Accuracy: 96.81050872802734 %
Global epoch:1 Iteration: 150  Loss: 0.24342495203018188  Accuracy: 97.18573760986328 %
Global epoch:1 Iteration: 200  Loss: 0.011505067348480225  Accuracy: 97.10235595703125 %
Global epoch:1 Iteration: 250  Loss: 0.020901383832097054  Accuracy: 97.9570541381836 %
Global epoch:1 Iteration: 300  Loss: 0.10124529153108597  Accuracy: 98.33229064941406 %
Global epoch:1 Iteration: 350  Loss: 0.05030046030879021  Accuracy: 98.51990509033203 %
Global epoch:1 Iteration: 400  Loss: 0.04335092008113861  Accuracy: 98.45736694335938 %
Global epoch:1 Iteration: 450  Loss: 0.03336590901017189  Accuracy: 98.31144714355469 %
Global epoch:1 Iteration: 500  Loss: 0.002189640887081623  Accuracy: 98.70752716064453 %
Global epoch:1 Iteration: 550  Loss: 0.010665645822882652  Accuracy:

Global epoch:4 Iteration: 650  Loss: 0.008141819387674332  Accuracy: 98.6032943725586 %
Global epoch:4 Iteration: 700  Loss: 0.004073319956660271  Accuracy: 98.9159927368164 %
Global epoch:4 Iteration: 750  Loss: 0.006425023544579744  Accuracy: 98.79090881347656 %
Global epoch:4 Iteration: 800  Loss: 0.0074178297072649  Accuracy: 98.97853088378906 %
Global epoch:4 Iteration: 850  Loss: 0.0024715771432965994  Accuracy: 98.77006530761719 %
Global epoch:4 Iteration: 900  Loss: 0.0011352479923516512  Accuracy: 98.8951416015625 %
Global epoch:4 Iteration: 950  Loss: 0.00519704120233655  Accuracy: 98.9159927368164 %
Global epoch:4 Iteration: 1000  Loss: 0.013355194590985775  Accuracy: 99.02021789550781 %
Global epoch:4 Iteration: 1050  Loss: 0.01854417473077774  Accuracy: 98.83260345458984 %
Global epoch:4 Iteration: 1100  Loss: 0.005673118866980076  Accuracy: 98.93683624267578 %
Global epoch:4 Iteration: 1150  Loss: 0.019603922963142395  Accuracy: 98.8951416015625 %
Global epoch:4 Iteration

Global epoch:7 Iteration: 1250  Loss: 0.00013844255590811372  Accuracy: 99.2495346069336 %
Global epoch:7 Iteration: 1300  Loss: 0.0010164399864152074  Accuracy: 99.20783996582031 %
Completed global epoch:  7
Saved local model file.
Waiting for global model...
Received Global Model.
Global epoch:8 Iteration: 50  Loss: 0.014825339429080486  Accuracy: 98.97853088378906 %
Global epoch:8 Iteration: 100  Loss: 0.027639493346214294  Accuracy: 98.83260345458984 %
Global epoch:8 Iteration: 150  Loss: 0.02110414020717144  Accuracy: 99.04106903076172 %
Global epoch:8 Iteration: 200  Loss: 0.0018484055763110518  Accuracy: 99.0619125366211 %
Global epoch:8 Iteration: 250  Loss: 0.026340117678046227  Accuracy: 99.10360717773438 %
Global epoch:8 Iteration: 300  Loss: 0.0004435527080204338  Accuracy: 98.81175994873047 %
Global epoch:8 Iteration: 350  Loss: 0.0018618060275912285  Accuracy: 99.16614532470703 %
Global epoch:8 Iteration: 400  Loss: 0.0009365890873596072  Accuracy: 99.1869888305664 %
Glob

Global epoch:11 Iteration: 450  Loss: 0.005782656837254763  Accuracy: 99.20783996582031 %
Global epoch:11 Iteration: 500  Loss: 0.0003513602423481643  Accuracy: 99.0619125366211 %
Global epoch:11 Iteration: 550  Loss: 0.00012648847769014537  Accuracy: 99.16614532470703 %
Global epoch:11 Iteration: 600  Loss: 0.01408204436302185  Accuracy: 98.97853088378906 %
Global epoch:11 Iteration: 650  Loss: 3.100178582826629e-05  Accuracy: 99.14530181884766 %
Global epoch:11 Iteration: 700  Loss: 0.000346334622008726  Accuracy: 99.14530181884766 %
Global epoch:11 Iteration: 750  Loss: 2.2139547581900842e-05  Accuracy: 99.1869888305664 %
Global epoch:11 Iteration: 800  Loss: 3.2318843295797706e-05  Accuracy: 99.1869888305664 %
Global epoch:11 Iteration: 850  Loss: 0.001517441007308662  Accuracy: 99.16614532470703 %
Global epoch:11 Iteration: 900  Loss: 0.00022821220045443624  Accuracy: 99.20783996582031 %
Global epoch:11 Iteration: 950  Loss: 0.012441695667803288  Accuracy: 98.97853088378906 %
Glob

KeyboardInterrupt: 

In [None]:
# df_train = pd.read_csv('data/trainer1.csv')
# df_features = df_train.iloc[:, 1:785]
# df_label = df_train.iloc[:, 0]
# X_train, X_valid, y_train, y_valid = train_test_split(df_features, df_label, 
#                                                       test_size = 0.2,
#                                                       random_state = 1234)
# X_valid = np.array(X_valid).reshape(X_valid.shape[0], X_valid.shape[1])

In [None]:
# sample = 10
# img = X_valid[sample] #shape (784,1)
# img = img.reshape(1, 1, 28, 28) #shape (1,1,28,28)
# img  = torch.from_numpy(img).float() #tensor

# prediction = model(img).detach().numpy()[0].argmax()
# print(prediction)

In [None]:
# fig = plt.figure
# plt.imshow(X_valid[sample].reshape(28,28), cmap='gray')
# plt.show()