Feature scaling - data normalization - gets higher accuracy faster - not always the right thing to do 

#
When we normalize a dataset, we typically encode some form of information about each particular value relative to the dataset at large and rescale the data


When we normalize data, we are rescaling it.

The idea of data normalization is a general concept that refers to the act of transforming the original values of a dataset to new values. The new values are typically encoded relative to the dataset itself and are scaled in some way.

Feature scaling : 
#
For this reason, another name for data normalization that is sometimes used is feature scaling. This term refers to the fact that when normalizing data, we often transform different features of a given dataset to a similar scale.
#
In this case, we are not just thinking of a dataset of values but rather a dataset of elements that have multiple features, each with its own value.

Suppose, for example, that we are dealing with a dataset of people, and we have two relevant features in our dataset: age and weight. In this case, we can observe that the magnitudes or scales of these two feature sets are different, i.e., the weights are on average larger than the ages.
#
This difference in magnitude can be problematic when comparing or computing using machine learning algorithms. Hence, this can be one reason we might want to scale the values of these features to some similar scale via feature scaling.

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim 
import torchvision
import torchvision.transforms as transforms

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import json

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter #allows to send data to tensorboard files 
from IPython.display import display, clear_output

from collections import OrderedDict
from collections import namedtuple
from itertools import product

# Notebook-wide torch settings: wider tensor printouts and autograd switched on.
torch.set_printoptions(linewidth=120)
torch.set_grad_enabled(mode=True)

<torch.autograd.grad_mode.set_grad_enabled at 0x2bfd71f60d0>

In [7]:
# FashionMNIST training split. The transform pipeline only converts images to
# tensors; no Normalize step is applied yet.
train_set = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        # a Normalize transform would go here
    ]),
)

In [6]:
# Easy way: pull the whole training set through the loader as one single batch
# and read the statistics straight off the image tensor (data[0]).
loader = DataLoader(dataset=train_set, batch_size=len(train_set), num_workers=1)
data = next(iter(loader))
data[0].mean(), data[0].std()



(tensor(0.2861), tensor(0.3530))

In [9]:
# Harder way (when the data is too big to fit in memory as one batch):
# stream small batches and accumulate the statistics in two passes.

loader = DataLoader(train_set, batch_size=5, num_workers=1)

# Pass 1: pixel count and pixel sum.
# The count is taken from the batches themselves instead of hard-coding the
# 28*28 image size, and the running sum is kept in float64: a float32 running
# total over this many values drifts (the recorded float32 result was 0.2860
# versus 0.2861 from the single-batch computation).
num_of_pixels = 0
total_sum = torch.zeros((), dtype=torch.float64)

for batch in loader:
    num_of_pixels += batch[0].numel()
    total_sum += batch[0].sum(dtype=torch.float64)

# Cast back to float32 so `mean` has the same dtype as the image tensors.
mean = (total_sum / num_of_pixels).float()

# Pass 2: population standard deviation (divides by N, not N - 1).
sum_of_squared_error = torch.zeros((), dtype=torch.float64)
for batch in loader:
    sum_of_squared_error += (batch[0] - mean).pow(2).sum(dtype=torch.float64)
std = torch.sqrt(sum_of_squared_error / num_of_pixels).float()

mean, std

(tensor(0.2860), tensor(0.3530))

In [11]:
# Histogram of every pixel value in the batch, with a vertical line at the mean.
plt.hist(data[0].reshape(-1))
plt.axvline(x=data[0].mean())

KeyboardInterrupt: 

Using the mean and std values 

In [None]:
# Same training split, but each image tensor is additionally passed through
# Normalize with the mean and std computed above.
train_set_normal = torchvision.datasets.FashionMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]),
)

In [None]:
# Re-compute the statistics on the *normalized* dataset to confirm the effect
# of Normalize. The loader must wrap train_set_normal: loading train_set here
# would only reproduce the un-normalized statistics.
loader = DataLoader(train_set_normal, batch_size=len(train_set_normal), num_workers=1)
data = next(iter(loader))
data[0].mean(), data[0].std()

In [None]:
# Histogram of the pixel values currently held in `data`, with a vertical line
# at their mean. For normalized data that line should sit at zero.
plt.hist(data[0].reshape(-1))
plt.axvline(x=data[0].mean())

In [None]:
class RunBuilder():
    """Expands a mapping of hyperparameter value lists into individual runs."""

    @staticmethod
    def get_runs(params):
        """Build the runs described by ``params``.

        ``params`` maps each parameter name to an iterable of candidate
        values. The result is a list with one ``Run`` namedtuple per element
        of the Cartesian product of those value lists; the namedtuple fields
        are the keys of ``params``, in the mapping's iteration order.
        """
        Run = namedtuple('Run', params.keys())
        return [Run(*combo) for combo in product(*params.values())]
        

In [None]:
def get_num_correct(preds, labels):
    """Return how many rows of ``preds`` have their arg-max (along dim 1) equal to the matching entry of ``labels``.

    The count is returned as a plain Python number via ``.item()``.
    """
    predicted = preds.argmax(dim=1)
    matches = predicted.eq(labels)
    return matches.sum().item()

In [None]:
class Network(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    The first convolution takes 1 input channel and the output layer has 10
    units. The flatten size of 12*4*4 corresponds to 28x28 inputs
    (28 -> 24 -> 12 -> 8 -> 4 through the two 5x5 convs and 2x2 pools).
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        # Fully connected head.
        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Run ``t`` through the network and return the raw output-layer values."""
        # Conv stage 1: convolution -> ReLU -> 2x2 max-pool.
        t = F.max_pool2d(F.relu(self.conv1(t)), kernel_size=2, stride=2)

        # Conv stage 2: same pattern.
        t = F.max_pool2d(F.relu(self.conv2(t)), kernel_size=2, stride=2)

        # Flatten to rows of 12*4*4 features for the linear layers.
        t = t.reshape(-1, 12 * 4 * 4)

        # Two hidden linear layers, each followed by ReLU.
        t = F.relu(self.fc1(t))
        t = F.relu(self.fc2(t))

        # Output layer: no activation is applied here.
        return self.out(t)

In [None]:
#Extract a class - refactoring techniques 

class Epoch():
    """Plain holder for the values tracked during a single epoch."""

    def __init__(self):
        # The three counters start at zero; the start time is unset until
        # something assigns it.
        self.count = self.loss = self.num_correct = 0
        self.start_time = None

#Then, we'll replace these variables with an instance of the Epoch class.
#We might even rename the count variable to something more intuitive, such as number or id.

In [None]:

class RunManager():
    """Tracks loss/accuracy across training runs and logs them to TensorBoard.

    Expected call sequence for each run::

        begin_run(...)
            begin_epoch()
                track_loss(...) / track_num_correct(...)   # once per batch
            end_epoch()
        end_run()

    ``save`` writes everything collected so far to CSV and JSON.
    """

    def __init__(self):
        # NOTE: the shared ``epoch_`` / ``run_`` prefixes below mark data that
        # belongs together; such groups are candidates for being encapsulated
        # in small classes of their own.
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.epoch_start_time = None

        self.run_params = None
        self.run_count = 0
        self.run_data = []  # one results dict per epoch, across all runs
        self.run_start_time = None  # used to compute run durations

        self.network = None  # network of the current run
        self.loader = None  # data loader of the current run
        self.tb = None  # SummaryWriter for TensorBoard

    def begin_run(self, run, network, loader):
        """Start a run: remember its parameters/objects and open a SummaryWriter.

        Also logs the first batch of ``loader`` as an image grid and the
        network graph to TensorBoard.
        """
        self.run_start_time = time.time()

        self.run_params = run
        self.run_count += 1

        self.network = network
        self.loader = loader
        self.tb = SummaryWriter(comment=f'-{run}')

        images, labels = next(iter(self.loader))
        grid = torchvision.utils.make_grid(images)

        self.tb.add_image('images', grid)
        # The sample batch is moved to the run's device (falling back to
        # 'cpu' when the run has no ``device`` field) before tracing the graph.
        self.tb.add_graph(self.network, images.to(getattr(run, 'device', 'cpu')))

    def end_run(self):
        """Close the TensorBoard writer and reset the epoch counter."""
        self.tb.close()
        self.epoch_count = 0

    def begin_epoch(self):
        """Start an epoch: stamp the start time and reset the accumulators."""
        self.epoch_start_time = time.time()

        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0

    def end_epoch(self):
        """Finish an epoch: log metrics, record a results row, redraw the table."""
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time

        num_samples = len(self.loader.dataset)
        loss = self.epoch_loss / num_samples
        accuracy = self.epoch_num_correct / num_samples

        self.tb.add_scalar('Loss', loss, self.epoch_count)
        self.tb.add_scalar('Accuracy', accuracy, self.epoch_count)

        for name, param in self.network.named_parameters():
            self.tb.add_histogram(name, param, self.epoch_count)
            # Parameters that never took part in a backward pass have no
            # gradient; skip their gradient histogram instead of failing.
            if param.grad is not None:
                self.tb.add_histogram(f'{name}.grad', param.grad, self.epoch_count)

        results = OrderedDict()
        results['run'] = self.run_count
        results['epoch'] = self.epoch_count
        results['loss'] = loss
        results['accuracy'] = accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        results.update(self.run_params._asdict())
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')

        # Jupyter-specific: replace the previous output with the new table.
        clear_output(wait=True)
        display(df)

    def track_loss(self, loss, batch_size=None):
        """Add one batch's loss to the epoch total, weighted by batch size.

        ``loss`` is the batch loss tensor. ``batch_size`` is the number of
        samples it covers; when omitted, the current loader's ``batch_size``
        is used (pass the real size explicitly for a shorter final batch).
        """
        if batch_size is None:
            batch_size = self.loader.batch_size
        self.epoch_loss += loss.item() * batch_size

    def track_num_correct(self, preds, labels):
        """Add the number of correct predictions in this batch to the epoch total."""
        self.epoch_num_correct += self._get_num_correct(preds, labels)

    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        """Count rows of ``preds`` whose arg-max along dim 1 equals the label."""
        return preds.argmax(dim=1).eq(labels).sum().item()

    def save(self, fileName):
        """Write the collected results to ``<fileName>.csv`` and ``<fileName>.json``."""
        pd.DataFrame.from_dict(self.run_data, orient='columns').to_csv(f'{fileName}.csv')
        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)


In [None]:
# Lookup table from the dataset name used in the run parameters to the
# dataset object itself.
trainsets = dict(not_normal=train_set, normal=train_set_normal)

In [None]:
# Hyperparameter grid: one run is executed per combination of these values.
params = OrderedDict(
    lr=[.01],
    batch_size=[1000, 10000, 20000],
    num_workers=[0, 1],
    # Only sweep over 'cuda' when a GPU is actually present; otherwise the
    # first run would fail when moving the network to the device.
    device=['cuda', 'cpu'] if torch.cuda.is_available() else ['cpu'],
    trainset=['not_normal', 'normal'],
)

m = RunManager()

for run in RunBuilder.get_runs(params):

    device = torch.device(run.device)  # 'cpu' or 'cuda', as chosen by the run
    network = Network().to(device)  # fresh network on the run's device
    # The dict is named `trainsets` and the run field `trainset`; the lookup
    # must use exactly those names.
    train_loader = DataLoader(
        trainsets[run.trainset],
        batch_size=run.batch_size,
        num_workers=run.num_workers,
    )
    optimizer = optim.Adam(network.parameters(), lr=run.lr)

    m.begin_run(run, network, train_loader)

    for epoch in range(1):
        m.begin_epoch()
        for batch in train_loader:

            images = batch[0].to(device)  # unpack and send to the device
            labels = batch[1].to(device)
            preds = network(images)
            loss = F.cross_entropy(preds, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            m.track_loss(loss)
            m.track_num_correct(preds, labels)

        m.end_epoch()
    m.end_run()

m.save('results')

In [None]:
# Show every recorded epoch, ranked from highest to lowest accuracy.
pd.DataFrame.from_dict(m.run_data).sort_values(by='accuracy', ascending=False)

