In [None]:
import collections
import os
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torch.utils import data

from datasets import SemiSupervisedMNIST
from models import Discriminator, Generator
from losses import DiscriminatorLoss, GeneratorLoss
from metrics import AverageAccuracy, FakeAccuracy, Loss, ClassAccuracy, RunTime
from history import History
from trainers import GAN_Trainer, Trainer

In [None]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')

# Seed every random number generator the notebook relies on.
# `pd.np` was only an alias of the numpy module (deprecated in pandas 1.0 and
# removed in 2.0), so the former `pd.np.random.seed` call was redundant with
# `np.random.seed` and raised AttributeError on current pandas.
SEED = 876
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed_all(SEED)

### [Option 1] Load MNIST dataset for semi-supervised learning

In [None]:
# Generator input: vectors of length `noise_size` drawn from a standard normal.
noise_size = 100
distribution = torch.distributions.normal.Normal(0, 1)

# MNIST has ten digit classes (0-9). The original `range(9)` left digit 9
# without an entry and gave 'fake' the index 9, colliding with that digit.
# 'fake' now takes the next free index (10).
NUM_DIGITS = 10
label_encoding = {digit: digit for digit in range(NUM_DIGITS)}
label_encoding['fake'] = len(label_encoding)

# Convert images to tensors, normalise with the commonly used MNIST mean/std,
# and flatten each image into a 1-D float vector.
transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    torchvision.transforms.Lambda(lambda x: x.flatten().float())
])

# NOTE(review): download=False requires the data to already exist under
# ./mnist_data — confirm this is intended for first-time users.
train_dataset = SemiSupervisedMNIST(num_labeled=10,
                                    noise_size=noise_size,
                                    distribution=distribution,
                                    label_encoding=label_encoding,
                                    root='./mnist_data', train=True, transform=transforms, download=False)
test_dataset = torchvision.datasets.MNIST(root='./mnist_data', train=False, transform=transforms, download=False)
# Attach the same encoding to the plain torchvision test set.
test_dataset.label_encoding = label_encoding

### [Option 2] Load MREO haptics dataset
* Compact dataset (1 GB) (can be used to compute tables 1, 2, 3, 4, and 6): https://goo.gl/WiqSjJ
* https://github.com/Healthcare-Robotics/mr-gan

#### Parse data

In [None]:
import librosa

# When True, the 'contact' signal is converted to a log-scaled mel spectrogram.
mel = True
# Sensor channels read for every recorded sample.
modalities = ['temperature', 'force0', 'force1', 'contact']

data_dir = './haptics_data/'
# os.listdir returns entries in arbitrary, platform-dependent order. Sorting
# keeps the material order (and the label indices later derived from it)
# reproducible across machines and runs.
data_files = sorted(os.listdir(data_dir))

In [None]:
# Read every file in the data directory into `raw`, keyed by the material
# name, which is the third '_'-separated token of the filename.
raw = {}
materials = []
for filename in data_files:
    material = filename.split('_')[2]
    materials.append(material)
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # data_dir at files from a trusted source.
    with open(os.path.join(data_dir, filename), 'rb') as handle:
        raw[material] = pickle.load(handle, encoding='latin1')

In [None]:
# Flatten the nested raw[material][obj][modality][sample] structure into
# parallel lists with one entry per recorded sample.
d_material = []
d_obj = []
d_obj_sample_num = []
d_data = collections.defaultdict(list)
for material in materials:
    c = raw[material]
    for obj in c:
        # The number of 'temperature' recordings sets the sample count per object.
        for obj_sample_num in range(len(c[obj]['temperature'])):
            d_material.append(material)
            d_obj.append(obj)
            d_obj_sample_num.append(obj_sample_num)

            for modality in modalities:
                modality_data = c[obj][modality][obj_sample_num]
                # Compare by value: `is` on a string literal tests identity,
                # which is implementation-dependent and a SyntaxWarning on
                # Python 3.8+.
                if modality == 'contact' and mel:
                    # `y` is passed by keyword because recent librosa releases
                    # no longer accept the signal positionally.
                    S = librosa.feature.melspectrogram(y=np.array(modality_data), sr=48000, n_mels=128)
                    # Convert to log scale (dB)
                    # NOTE(review): melspectrogram returns a power spectrogram
                    # by default, for which librosa documents power_to_db;
                    # kept as-is to preserve the existing feature values.
                    log_S = librosa.amplitude_to_db(S, ref=np.max)
                    d_data[modality].append(log_S.flatten())
                else:
                    d_data[modality].append(modality_data)

In [None]:
# Gather the per-sample identifiers and one column per modality into a single
# mapping of column name -> list of values.
d = {'material': d_material, 'obj': d_obj, 'obj_sample_num': d_obj_sample_num}
d.update({modality: d_data[modality] for modality in modalities})

In [None]:
# One row per recorded sample; data_i holds the row positions used for splitting.
df = pd.DataFrame(data=d)
data_i = list(range(len(df)))
# Show only a preview as the cell's rich output instead of printing the whole
# frame, whose cells hold long per-sample sequences.
df.head()

#### Supervised learning dataset example

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Stratified 80/20 split over row positions so each material keeps its share
# in both subsets. train_test_split draws from NumPy's global RNG here.
train_i, test_i = train_test_split(data_i, test_size=0.2, stratify=df['material'].iloc[data_i])

# Scale data to zero mean and unit variance
for modality in modalities:
    # Fit on the training rows only, then apply the same scaling to the test rows.
    scaler = preprocessing.StandardScaler()
    train_norm = scaler.fit_transform(np.stack(df[modality].iloc[train_i].values))
    test_norm = scaler.transform(np.stack(df[modality].iloc[test_i].values))

    # Write the scaled rows back as a whole new column. The former
    # `df[modality].iloc[...] = ...` was chained assignment, which triggers
    # SettingWithCopyWarning and does not modify `df` under copy-on-write.
    scaled_rows = df[modality].tolist()
    for position, row in zip(train_i, train_norm.tolist()):
        scaled_rows[position] = row
    for position, row in zip(test_i, test_norm.tolist()):
        scaled_rows[position] = row
    df[modality] = pd.Series(scaled_rows, index=df.index, dtype=object)

# NOTE(review): this cell rescales `df` in place, so running it twice scales
# already-scaled data.

# Map each material name to an integer class index, in order of first appearance.
label_encoding = {m: i for i, m in enumerate(list(df['material'].unique()))}

# NOTE(review): MaterialDataset is not imported in the visible cells —
# confirm which module provides it and import it.
datasets_i = {'train': train_i, 'test': test_i}
datasets = {l: MaterialDataset(modalities, label_encoding, df=df.iloc[i].reset_index().rename(columns={'index': 'sample_id'})) for l, i in datasets_i.items()}
phases = list(datasets.keys())

#### Semi-supervised learning dataset

In [None]:
# Settings for the semi-supervised split and the generator noise input.
percent_unlabeled = 0.25
noise_size = 100
distribution = torch.distributions.normal.Normal(0, 1)

# Integer class index per material, plus one extra 'fake' class at the end.
label_encoding = dict((m, i) for i, m in enumerate(df['material'].unique()))
label_encoding['fake'] = len(label_encoding)

# Keep the original row number of every sample as a 'sample_id' column.
train_df = df.iloc[train_i].reset_index().rename(columns={'index': 'sample_id'})
test_df = df.iloc[test_i].reset_index().rename(columns={'index': 'sample_id'})

# NOTE(review): SemiSupervisedMaterialDataset and MaterialDataset are not
# imported in the visible cells — confirm which module provides them.
datasets = {
    'train': SemiSupervisedMaterialDataset(
        modalities,
        label_encoding,
        df=train_df,
        percent_unlabeled=percent_unlabeled,
        noise_size=noise_size,
        distribution=distribution,
    ),
    'test': MaterialDataset(modalities, label_encoding, test_df),
}
phases = list(datasets)

### Make dataloader

In [None]:
# NOTE(review): this cell rebinds `datasets` to the MNIST datasets (Option 1),
# replacing any haptics datasets built in the cells above.
datasets = dict(train=train_dataset, test=test_dataset)
phases = [*datasets]

# Options common to both loaders; only batch size and shuffling differ.
shared_loader_options = {'num_workers': 8, 'pin_memory': use_cuda}
dataloader_params = {
    'train': {'batch_size': 64, 'shuffle': True, **shared_loader_options},
    'test': {'batch_size': 1, 'shuffle': False, **shared_loader_options},
}
dataloaders = {
    phase: data.DataLoader(dataset, **dataloader_params[phase])
    for phase, dataset in datasets.items()
}

### Define model

#### Supervised learning example with fully-connected network

In [None]:
# NOTE(review): Model_NN and to_onehot are not imported in the visible cells —
# confirm which module provides them.
train_shape = datasets['train'].shape
net = Model_NN(train_shape[0], train_shape[1])
optimizer = optim.Adam(net.parameters(), lr=0.0006, betas=(0.5, 0.999))


def _onehot_targets(y_pred, y):
    """Pair the predictions with one-hot float targets for the MSE metric."""
    return (y_pred, to_onehot(y, len(label_encoding)).float())


# Metrics recorded by the history for every phase.
metrics = [
    AverageAccuracy(),
    ClassAccuracy(len(label_encoding)),
    Loss(nn.CrossEntropyLoss(), name='CrossEntropy'),
    Loss(nn.MSELoss(), name='MSE', output_transform=_onehot_targets),
    RunTime(),
]

viz_params = dict(to_viz=True, bands=False)
history = History(metrics=metrics, viz_params=viz_params, phases=list(datasets))

trainer_kwargs = {
    'model': net,
    'dataloaders': dataloaders,
    'optimizer': optimizer,
    'criterion': nn.CrossEntropyLoss(),
    'history': history,
}
t = Trainer(**trainer_kwargs)

#### Semi-supervised learning GAN

In [None]:
# Build the discriminator/generator pair from the training dataset's shape.
train_shape = datasets['train'].shape
nets = {
    'D': Discriminator(train_shape[0], train_shape[1], feature_matching=True, leaky=0.2),
    'G': Generator(noise_size, train_shape[0]),
}

# Both networks use Adam with identical hyper-parameters.
optimizers = {
    name: optim.Adam(network.parameters(), lr=0.0006, betas=(0.5, 0.999))
    for name, network in nets.items()
}

criterions = {
    'D': DiscriminatorLoss(return_all=True),
    'G': GeneratorLoss(),
}


def _targets_as_long(x):
    """Return (x[0], x[1]) with the second element cast to long."""
    return (x[0], x[1].long())


metrics = {
    'D': [
        AverageAccuracy(),
        FakeAccuracy(output_transform=_targets_as_long),
        Loss(criterions['D'], name='loss_D'),
        Loss(criterions['D'], name='loss_labeled'),
        Loss(criterions['D'], name='loss_unlabeled'),
        ClassAccuracy(len(label_encoding)),
        RunTime(),
    ],
    'G': [
        FakeAccuracy(output_transform=_targets_as_long),
        Loss(criterions['G'], name='loss_G'),
        RunTime(),
    ],
}

In [None]:
# Visualise only the discriminator's metrics; the generator's stay hidden.
viz_params = {
    'D': dict(to_viz=True, bands=False),
    'G': dict(to_viz=False, bands=False),
}
# One History per network, each covering every dataset phase.
history = {
    name: History(metrics=metrics[name],
                  viz_params=viz_params[name],
                  phases=list(datasets),
                  verbose=0)
    for name in ('D', 'G')
}

In [None]:
# Bundle the per-network dictionaries and the target device for the trainer.
trainer_kwargs = dict(
    model=nets,
    dataloaders=dataloaders,
    optimizer=optimizers,
    criterion=criterions,
    history=history,
    device=device,
)
t = GAN_Trainer(**trainer_kwargs)

### Visualizations

In [None]:
from bokeh.io import output_notebook
from bokeh.plotting import show
from bokeh.layouts import gridplot

In [None]:
# Route bokeh output to the notebook, then lay out the discriminator plots.
output_notebook()
grid_D = history['D'].viz()
discriminator_layout = gridplot(grid_D)
# Store the notebook handle on the history object.
history['D'].viz_handle = show(discriminator_layout, notebook_handle=True)

### Training and evaluating model

In [None]:
# Uncomment the next line to restore a saved checkpoint before running.
# t.load('checkpoints/checkpoint_50.pt')
# NOTE(review): max_epoch=0 presumably performs no training epochs — confirm
# against the trainer and raise the value for a real run.
t.run(max_epoch=0)

In [None]:
# Display the generator history as this cell's output (presumably a DataFrame,
# going by the method name — confirm).
history['G'].to_df()

In [None]:
# Print both histories as text, separated by a 100-character rule.
history_sections = (('discriminator history:', 'D'), ('generator history:', 'G'))
for position, (title, key) in enumerate(history_sections):
    if position:
        print('-' * 100)
    print(title)
    print(history[key].to_df())

In [None]:
# Save the trainer state. No path is passed, so the destination is decided by
# the trainer itself (presumably a checkpoint file — confirm).
t.save()