# import packages

In [1]:
# Numerical Operations
import math
import numpy as np

# For Progress Bar
from tqdm import tqdm


import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import os


# Dataset

### video data


### audio data
the whole audio is divided into shorted window frames and then converted into MFCCs.

In [2]:
class TinyM2NetDataset(Dataset):
    '''
    x: audio mfcc vector   44x13x1.
    y: image vector        32x32x3
    y: Targets:(cat,dog,duck,rabbit), if none, do prediction.
    '''
    def __init__(self, x,y,z=None):
        if y is None:
            self.z = z
        else:
            self.z = torch.FloatTensor(z)
        self.x = torch.FloatTensor(x)
        self.y = torch.FloatTensor(y)
    def __getitem__(self, idx):
        if self.z is None:
            return self.x[idx],self.y[idx]
        else:
            return self.x[idx], self.y[idx], self.z[idx]
    def __len__(self):
        return len(self.x)

# Neural Network Model

In [22]:
class SeparableConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, bias=False):
        super(SeparableConv2d, self).__init__()
        self.depthwise = nn.Conv2d(in_channels,in_channels,kernel_size,groups=in_channels,padding=1)
        self.pointwise = nn.Conv2d(in_channels,out_channels,kernel_size=kernel_size,padding=1)
        self.outlayer = nn.ReLU()
    def forward(self, x):
        out = self.depthwise(x)
        out = self.pointwise(out)
        out = self.outlayer(out)
        return out
    
class myConv2d(nn.Module):
    def __init__(self,input_channels,output_channels,kernel_size,dense_dim,bn_dim):
        super(myConv2d, self).__init__()
        self.conv2d = nn.Sequential(
            nn.Conv2d(input_channels,output_channels,kernel_size,padding=1),
            nn.BatchNorm2d(bn_dim),
            nn.ReLU()
        )
        self.spconv2d1 = nn.Sequential(
            SeparableConv2d(output_channels,32,kernel_size),
            nn.MaxPool2d((2,2)),
            nn.Dropout(0.2)
        )
        self.spconv2d2 = nn.Sequential(
            SeparableConv2d(32,output_channels,kernel_size),
            nn.MaxPool2d((2,2)),
            nn.Dropout(0.2)
        )
        self.outlayer = nn.Sequential(
            nn.Linear(dense_dim,output_channels),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
    def forward(self,x):
#         print(x.shape)
        out = self.conv2d(x)
#         print(out.shape)
        out = self.spconv2d1(out)
#         print(out.shape)
        out = self.spconv2d2(out)
#         print(out.shape)
        out = torch.flatten(out)
#         print(out.shape)
        out = self.outlayer(out)
#         print(out.shape)
        return out
    
class Tiny2Net(nn.Module):
    def __init__(self, labels,batch_size):
        super(Tiny2Net, self).__init__()
#         self.args = args
        self.videoNet = myConv2d(3,64,(3,3),4096*batch_size,64)  #(3,64,(3,3),4096,32)
        self.audioNet = myConv2d(1,64,(3,3),2112*batch_size,64) #(1,64,(3,3),2112,44)
        self.layer1 = nn.Sequential(
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(64,labels),
            nn.Softmax(dim=0)
        )
    def forward(self,x,y):
        """
        input x   MFCC Vector     size:  44x13x1
        input y   Image Vector   size: 32x32x3
        """
        x = self.audioNet(x)
        y = self.videoNet(y)
        z = torch.cat((x,y),0)
        z = self.layer1(z)
        z = self.layer2(z)
        return z

# Training Loop

In [45]:
def trainer(train_loader, valid_loader, model, config, device):

    criterion = nn.MSELoss(reduction='mean')
    
#     optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9) 
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=1e-5, amsgrad=False)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.975, last_epoch=-1, verbose=True)
    writer = SummaryWriter() # Writer of tensoboard.

    if not os.path.isdir('./models'):
        os.mkdir('./models') 
        
    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0
    
    for epoch in range(n_epochs):
        model.train() # Set the model to train mode.
        loss_record = []

        train_pbar = tqdm(train_loader, position=0, leave=True)
        
        for x,y,z in train_pbar:
            optimizer.zero_grad()               # Set gradient to zero.
            
            x, y, z = x.to(device), y.to(device), z.to(device)  
            pred = model(x,y)       
            print("train: ",pred)
            loss = criterion(pred, z)
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())
            
            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})
        scheduler.step()
        mean_train_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/train', mean_train_loss, step)

        model.eval() # Set the model to evaluation mode.
        loss_record = []
        for x, y, z in valid_loader:
            x, y, z = x.to(device), y.to(device), z.to(device)  
            with torch.no_grad():
                pred = model(x,y)
                print("vaild: ",pred)
                loss = criterion(pred, z)

            loss_record.append(loss.item())
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save the best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

# Configurations

In [46]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 42,   
    'valid_ratio': 0.5,
    'n_epochs': 10,        
    'batch_size': 3, 
    'learning_rate': 5e-3,              
    'early_stop': 3,    
    'save_path': './models/model.ckpt',  # model will be saved here.
    'data_path': './data/dog_train.npz'
}

# Dataloader

In [47]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def train_valid_split(x,y,z, valid_ratio, seed):
    '''Split provided training data into training set and validation set'''
    valid_set_size = int(valid_ratio * len(x)) 
    train_set_size = len(x) - valid_set_size
    data_index = np.arange(len(x))
    train_index, valid_index = random_split(data_index, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))
    train_index, valid_index = np.array(train_index), np.array(valid_index)
#     print(train_index,valid_index)
    return x[train_index],y[train_index],z[train_index],x[valid_index],y[valid_index],z[valid_index]

def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    for x,y in tqdm(test_loader):
        x,y = x.to(device),y.to(device)                        
        with torch.no_grad():                   
            pred = model(x,y)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

In [48]:
#set seed
same_seed(config['seed'])
# load data
data = np.load(config['data_path'])
x = data["x"]
y = data["y"]
z = data["z"]
# if len(x)<4:
#     # for test, there is only 1 data in data.npz, so add one
#     x = np.concatenate((x, np.array([x[0]])))
#     y = np.concatenate((y, np.array([y[0]])))
#     z = np.concatenate((z, np.array([z[0]])))
#     x = np.concatenate((x, np.array([x[0]])))
#     y = np.concatenate((y, np.array([y[0]])))
#     z = np.concatenate((z, np.array([z[0]])))
#     x = np.concatenate((x, np.array([x[0]])))
#     y = np.concatenate((y, np.array([y[0]])))
#     z = np.concatenate((z, np.array([z[0]])))
print(x.shape)
print(y.shape)
print(z.shape)
print(len(x))

(30, 1, 44, 13)
(30, 3, 32, 32)
(30, 3)
30


In [49]:
print(z[0])

[0. 1. 0.]


In [50]:
train_x,train_y,train_z, valid_x,valid_y,valid_z = train_valid_split(x,y,z,config['valid_ratio'], config['seed'])
print(len(train_x))
print(len(valid_x))

15
15


In [51]:

train_dataset, valid_dataset = TinyM2NetDataset(train_x,train_y,train_z),  TinyM2NetDataset(valid_x,valid_y,valid_z)

# Pytorch data loader loads pytorch dataset into batches.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
# test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

# Start training

In [52]:
print(config)
print(device)

{'seed': 42, 'valid_ratio': 0.5, 'n_epochs': 10, 'batch_size': 3, 'learning_rate': 0.005, 'early_stop': 3, 'save_path': './models/model.ckpt', 'data_path': './data/dog_train.npz'}
cuda


In [53]:
model = Tiny2Net(z.shape[1],config["batch_size"]).to(device)

In [54]:
trainer(train_loader, valid_loader, model, config, device)

Epoch [1/10]: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 54.38it/s, loss=0]
Epoch [2/10]:   0%|                                                                        | 0/5 [00:00<?, ?it/s, loss=0]

Adjusting learning rate of group 0 to 5.0000e-03.
train:  tensor([0.3261, 0.3461, 0.3278], device='cuda:0', grad_fn=<SoftmaxBackward0>)
train:  tensor([0.0769, 0.9107, 0.0125], device='cuda:0', grad_fn=<SoftmaxBackward0>)
train:  tensor([7.3856e-11, 1.0000e+00, 3.4147e-14], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
train:  tensor([5.8947e-30, 1.0000e+00, 4.1053e-35], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
train:  tensor([0.0000e+00, 1.0000e+00, 1.6816e-44], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
Adjusting learning rate of group 0 to 4.8750e-03.
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
Epoch [1/10]: Train loss: 0.0437, Valid loss: 0.0000
Saving model with loss 0.000...
train:  tensor([0., 1., 0.], device='cuda:0', grad_fn=<SoftmaxBackward0>)
train:  tensor([0., 1., 

Epoch [2/10]: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 67.04it/s, loss=0]
Epoch [3/10]: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 67.75it/s, loss=0]
Epoch [4/10]:   0%|                                                                        | 0/5 [00:00<?, ?it/s, loss=0]

train:  tensor([0., 1., 0.], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Adjusting learning rate of group 0 to 4.7531e-03.
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
Epoch [2/10]: Train loss: 0.0000, Valid loss: 0.0000
train:  tensor([0., 1., 0.], device='cuda:0', grad_fn=<SoftmaxBackward0>)
train:  tensor([0., 1., 0.], device='cuda:0', grad_fn=<SoftmaxBackward0>)
train:  tensor([0., 1., 0.], device='cuda:0', grad_fn=<SoftmaxBackward0>)
train:  tensor([0., 1., 0.], device='cuda:0', grad_fn=<SoftmaxBackward0>)
train:  tensor([0., 1., 0.], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Adjusting learning rate of group 0 to 4.6343e-03.
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device

Epoch [4/10]: 100%|████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 68.68it/s, loss=0]

train:  tensor([0., 1., 0.], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Adjusting learning rate of group 0 to 4.5184e-03.
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
vaild:  tensor([0., 1., 0.], device='cuda:0')
Epoch [4/10]: Train loss: 0.0000, Valid loss: 0.0000

Model is not improving, so we halt the training session.





In [44]:
%reload_ext tensorboard
%tensorboard --logdir=./runs/