# Learnings

1. Increase sequence length to have greater understanding of long term patterns in music
2. Increase batch size to take better advantage of the GPU

Goal is to achieve validation accuracy of about 50%

In [1]:
from __future__ import print_function
from math import ceil
import numpy as np
import sys
import pdb

import torch
import torch.optim as optim
import torch.nn as nn

import generator
import discriminator
import helpers

import random

# Define Hyperparameters

In [2]:
CUDA = False               # when True, the models and data tensors are moved to the GPU (see the cast cell below)

MAX_SEQ_LEN = 256          # tokens per sequence passed to the batching helper and both models; try longer
POS_NEG_SAMPLES = 40000    # sample count passed to train_val_split and used as the training-loop size; try max of 500000
# VOCAB_SIZE = 5000        # not set here: VOCAB_SIZE is returned by helpers.load_music_file

START_LETTER = 0           # token id passed as start_letter when preparing generator input batches
BATCH_SIZE = 128           # minibatch size used by the training loops

# Embedding / hidden sizes handed to the Generator and Discriminator constructors
GEN_EMBEDDING_DIM = 32
GEN_HIDDEN_DIM = 32
DIS_EMBEDDING_DIM = 64
DIS_HIDDEN_DIM = 64

MLE_TRAIN_EPOCHS = 50    # generator MLE pretraining epochs; original: 100, run01: 20, run02: 50
ADV_TRAIN_EPOCHS = 30    # outer adversarial epochs; original: 50

DIS_D_STEPS = 3          # d_steps for discriminator pretraining; original: 50, run01: 5, run02: 3
ADV_D_STEPS = 3          # d_steps for the discriminator inside each adversarial epoch; original: 5
DIS_EPOCH = 3            # discriminator epochs per d_step; original: 3

N_SAMPLES = 3            # number of sequences sampled and written to disk per adversarial epoch

# use the following settings for rapid dev and testing
# POS_NEG_SAMPLES = 1000
# MLE_TRAIN_EPOCHS = 1
# ADV_TRAIN_EPOCHS = 1
# DIS_D_STEPS = 1
# ADV_D_STEPS = 1
# DIS_EPOCH = 1

## Modification 1

oracle_samples is a tensor of 10,000 sequences each 20 integers long
where each integer represents a token in a vocabulary size of 5000

1. replace oracle_samples with mozart_text of size (10000, 20), i.e. 10,000 sequences that are each 20 integers long
    - add function to helper file
2. change the following hyperparameters:
    - BATCH_SIZE
    - MAX_SEQ_LEN


# Load True Distribution Generator
# Load True Distribution Data Sample

In [3]:
# Path to the raw Mozart text file (relative to the notebook's working directory).
mozart_data = "./data/mozart.txt"

# Returns the vocabulary size, the two lookup tables (int2word is used later to
# decode generated samples) and the encoded data.
VOCAB_SIZE, word2int, int2word, encoded_data = helpers.load_music_file(mozart_data)

# Cut the encoded data into sequences of MAX_SEQ_LEN tokens.
# In the recorded run below this produced 44363 sequences of length 256
# (the earlier "(567850, 20)" figure was for MAX_SEQ_LEN = 20).
real_data_samples = helpers.batch_music_samples(encoded_data, MAX_SEQ_LEN)

In [4]:
# Sanity check: (number of sequences, length of the first sequence).
len(real_data_samples), len(real_data_samples[0])

(44363, 256)

In [5]:
# Split the real sequences into a training set and a validation set.
# POS_NEG_SAMPLES is passed as the split size - presumably the number of training
# rows, giving a (40000, 256) train tensor; TODO confirm in helpers.train_val_split.
# NOTE(review): with 44363 sequences in total the validation remainder would be
# 4363 rows rather than the 4000 previously noted here - confirm.
real_train, real_val = helpers.train_val_split(real_data_samples, POS_NEG_SAMPLES)

## Modification 1 complete

At this point:
1. the mozart data has been ingested
2. it has been prepared for train (500000, 20) and val (67850, 20)
3. hyperparameters have been modified to accept a larger dataset and smaller vocab size


May have to change the starting letter?

Why do I need to know oracle loss? To compare the difference between the oracle and the generator?

How fast does it train on the GPU?

# Build Fake Generator
# Build Discriminator

In [6]:
# Instantiate the generator and the discriminator. Both receive the data-derived
# VOCAB_SIZE and MAX_SEQ_LEN; `gpu=CUDA` tells each model whether it will run on the GPU.
gen = generator.Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)
dis = discriminator.Discriminator(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)

# Cast tensors to GPU if available

In [7]:
# Move both models and both real-data tensors to the GPU when CUDA is enabled.
# Note: this only follows the CUDA flag set in the hyperparameter cell; it does
# not itself check whether a GPU is actually present.
if CUDA:
    gen = gen.cuda()
    dis = dis.cuda()
    real_train = real_train.cuda()
    real_val = real_val.cuda()

## Modification 2

1. Remove the calculation of Oracle Loss
    - Alternatively, change how the oracle samples or generates data for batch NLLLoss calculation


# Step 1: Train Generator MLE

In [None]:
def train_generator_MLE(gen, gen_opt, real_data_samples, epochs):
    """
    Max Likelihood Pretraining for the generator.

    Args:
        gen: generator model; must provide ``batchNLLLoss(inp, target)``.
        gen_opt: optimizer built over the generator's parameters.
        real_data_samples: real token sequences, sliced along the first axis
            into minibatches of BATCH_SIZE rows.
        epochs: number of full passes over ``real_data_samples``.

    Raises:
        ValueError: if ``real_data_samples`` is empty.

    Prints one line per epoch: a progress dot roughly every 10% of the epoch,
    followed by the average training NLL.
    """
    # Size the loop from the data that was actually passed in rather than from the
    # global POS_NEG_SAMPLES, so a dataset of a different size neither produces
    # empty trailing batches nor silently skips data.
    num_samples = len(real_data_samples)
    if num_samples == 0:
        raise ValueError('real_data_samples is empty; nothing to train on')

    num_batches = ceil(num_samples / float(BATCH_SIZE))
    dot_every = ceil(num_batches / 10.)  # print a dot roughly every 10% of an epoch

    for epoch in range(epochs):

        # just a log for Generator MLE training
        print('epoch %d : ' % (epoch + 1), end='')
        sys.stdout.flush()
        total_loss = 0

        # start takes the values 0, BATCH_SIZE, 2 * BATCH_SIZE, ...
        for batch_idx, start in enumerate(range(0, num_samples, BATCH_SIZE)):

            # Build (input, target) for next-token prediction. Going by the worked
            # example kept from the original notebook, the input is the sequence
            # shifted right with the start letter prepended, and the target is the
            # sequence itself:
            #
            #   start_letter = 0
            #   sequence     = [11, 12, 13, 14]
            #   input        = [ 0, 11, 12, 13]
            #   target       = [11, 12, 13, 14]
            #
            # Both presumably have shape (rows in batch, MAX_SEQ_LEN) - confirm in
            # helpers.prepare_generator_batch.
            inp, target = helpers.prepare_generator_batch(real_data_samples[start:start + BATCH_SIZE],
                                                          start_letter=START_LETTER,
                                                          gpu=CUDA)

            # in order:
            #   1. set gradients to zero
            #   2. compute training loss
            #   3. backpropagate gradient
            #   4. update weights of the network
            gen_opt.zero_grad()
            loss = gen.batchNLLLoss(inp, target)
            loss.backward()
            gen_opt.step()

            total_loss += loss.item()

            if batch_idx % dot_every == 0:
                print('.', end='')
                sys.stdout.flush()

        # Average over batches, then divide by MAX_SEQ_LEN to report a per-token
        # figure (assumes batchNLLLoss returns a per-sequence loss summed over
        # tokens - TODO confirm in generator.batchNLLLoss).
        total_loss = total_loss / num_batches / MAX_SEQ_LEN

        print(' average_train_NLL = %.4f' % (total_loss))

## Modification 3
1. Change how positive validation (real data) is sampled
    - create a function to choose from the positive validation
    
2. Understand how def batchwise_oracle_nll works 

In [None]:
# GENERATOR MLE TRAINING
print('Starting Generator MLE Training...')
gen_optimizer = optim.Adam(gen.parameters(), lr=1e-2)
train_generator_MLE(gen, gen_optimizer, real_train, MLE_TRAIN_EPOCHS)

Starting Generator MLE Training...
epoch 1 : .......... average_train_NLL = 2.6763
epoch 2 : .......... average_train_NLL = 2.0054
epoch 3 : .......... average_train_NLL = 1.8825
epoch 4 : ............ average_train_NLL = 1.7649
epoch 8 : .......... average_train_NLL = 1.7525
epoch 9 : .......... average_train_NLL = 1.7435
epoch 10 : .......... average_train_NLL = 1.7351
epoch 11 : .......... average_train_NLL = 1.7281
epoch 12 : .......... average_train_NLL = 1.7226
epoch 13 : .......... average_train_NLL = 1.7171
epoch 14 : .......... average_train_NLL = 1.7122
epoch 15 : .......... average_train_NLL = 1.7071
epoch 16 : .......... average_train_NLL = 1.7025
epoch 17 : .......... average_train_NLL = 1.6985
epoch 18 : .......... average_train_NLL = 1.6947
epoch 19 : .......... average_train_NLL = 1.6913
epoch 20 : .......... average_train_NLL = 1.6895
epoch 21 : .......... average_train_NLL = 1.6869
epoch 22 : .......... average_train_NLL = 1.6834
epoch 23 : .......... average_train_NL

# Step 2: Train Discriminator

In [None]:
def train_discriminator(discriminator, dis_opt, real_data_samples, generator, real_val, d_steps, epochs):
    """
    Training the discriminator on real_data_samples (positive) and generated samples from generator (negative).
    Samples are drawn d_steps times, and the discriminator is trained for epochs epochs.

    Args:
        discriminator: model providing ``batchClassify(inp)``, whose outputs are
            thresholded at 0.5 and fed to BCELoss.
        dis_opt: optimizer built over the discriminator's parameters.
        real_data_samples: real sequences used as positive training examples.
        generator: model providing ``sample(num_samples, seq_len)``; used for negatives.
        real_val: held-out real sequences from which positive validation examples are drawn.
        d_steps: number of times a fresh set of negative samples is generated.
        epochs: number of passes over each freshly assembled training set.

    Raises:
        ValueError: if ``real_data_samples`` is empty.

    Prints one line per (d_step, epoch) with the average loss, the training
    accuracy and the accuracy on a small validation set built once up front.
    """
    num_real = len(real_data_samples)
    if num_real == 0:
        raise ValueError('real_data_samples is empty; nothing to train on')

    # PREPARE VALIDATION SET BEFORE TRAINING
    #
    # pos_val: 100 real sequences drawn from real_val
    # neg_val: 100 sequences sampled from the generator
    # val_inp / val_target: the two joined, plus a label per row telling real from
    # generated (presumably shuffled, with 1 = real and 0 = generated - confirm in
    # helpers.prepare_discriminator_data).
    # pos_val = oracle.sample(100)
    pos_val = helpers.positive_sample(real_val, 100)
    neg_val = generator.sample(100, MAX_SEQ_LEN)
    val_inp, val_target = helpers.prepare_discriminator_data(pos_val, neg_val, gpu=CUDA)
    # Derive the denominator from the data instead of hard-coding 200, so the
    # reported accuracy stays correct if the validation sample size changes.
    num_val = len(val_target)

    # The loss function is stateless, so build it once instead of per minibatch.
    loss_fn = nn.BCELoss()

    for d_step in range(d_steps):

        # Generate as many fake sequences as there are real ones so the two classes
        # are balanced, then join them with the real data and their labels.
        s = helpers.batchwise_sample(generator, num_real, BATCH_SIZE, MAX_SEQ_LEN)

        dis_inp, dis_target = helpers.prepare_discriminator_data(real_data_samples, s, gpu=CUDA)

        # Drive the batch loop and the normalisation from the assembled data rather
        # than from the global POS_NEG_SAMPLES.
        num_total = len(dis_target)
        num_batches = ceil(num_total / float(BATCH_SIZE))
        dot_every = ceil(num_batches / 10.)  # print a dot roughly every 10% of an epoch

        for epoch in range(epochs):
            print('d-step %d epoch %d : ' % (d_step + 1, epoch + 1), end='')
            sys.stdout.flush()
            total_loss = 0
            total_acc = 0

            # loop through every real and generated sample once
            for batch_idx, start in enumerate(range(0, num_total, BATCH_SIZE)):

                # subset input based on batch size
                inp = dis_inp[start:start + BATCH_SIZE]
                target = dis_target[start:start + BATCH_SIZE]

                # zero all gradients
                dis_opt.zero_grad()

                # discriminator output for this batch, compared against the labels
                out = discriminator.batchClassify(inp)
                loss = loss_fn(out, target)

                # backpropagate and update weights
                loss.backward()
                dis_opt.step()

                total_loss += loss.item()
                # count predictions that land on the same side of 0.5 as the label
                total_acc += torch.sum((out > 0.5) == (target > 0.5)).item()

                if batch_idx % dot_every == 0:
                    print('.', end='')
                    sys.stdout.flush()

            total_loss /= num_batches
            total_acc /= float(num_total)

            # Validation needs no gradients; skip building the autograd graph.
            with torch.no_grad():
                val_pred = discriminator.batchClassify(val_inp)
            val_acc = torch.sum((val_pred > 0.5) == (val_target > 0.5)).item() / float(num_val)

            print(' average_loss = %.4f, train_acc = %.4f, val_acc = %.4f' % (
                total_loss, total_acc, val_acc))

In [None]:
# PRETRAIN DISCRIMINATOR
print('\nStarting Discriminator Training...')
dis_optimizer = optim.Adagrad(dis.parameters())
train_discriminator(dis, dis_optimizer, real_train, gen, real_val, DIS_D_STEPS, DIS_EPOCH)


Starting Discriminator Training...
d-step 1 epoch 1 : .......... average_loss = 0.5715, train_acc = 0.6957, val_acc = 0.7600
d-step 1 epoch 2 : .......... average_loss = 0.3679, train_acc = 0.8379, val_acc = 0.9200
d-step 1 epoch 3 : .......... average_loss = 0.1937, train_acc = 0.9236, val_acc = 0.9600
d-step 2 epoch 1 : .......... average_loss = 0.1220, train_acc = 0.9536, val_acc = 0.9700
d-step 2 epoch 2 : .......... average_loss = 0.0781, train_acc = 0.9713, val_acc = 0.9750
d-step 2 epoch 3 : .......... average_loss = 0.0569, train_acc = 0.9796, val_acc = 0.9850
d-step 3 epoch 1 : .......... average_loss = 0.0503, train_acc = 0.9817, val_acc = 0.9800
d-step 3 epoch 2 : .......... average_loss = 0.0389, train_acc = 0.9862, val_acc = 0.9900
d-step 3 epoch 3 : .......... average_loss = 0.0317, train_acc = 0.9889, val_acc = 0.9850


In [None]:
def train_generator_PG(gen, gen_opt, dis, num_batches):
    """
    The generator is trained using policy gradients, using the reward from the discriminator.
    Training is done for num_batches batches.

    Args:
        gen: generator providing ``sample(num_samples, seq_len)`` and
            ``batchPGLoss(inp, target, rewards)``.
        gen_opt: optimizer built over the generator's parameters.
        dis: discriminator providing ``batchClassify(inp)``.
        num_batches: number of policy-gradient updates to perform.
    """

    for batch in range(num_batches):

        # 1. the generator samples a batch of BATCH_SIZE * 2 sequences of length MAX_SEQ_LEN
        # 2. input (start letter prepended) and target are prepared from those samples,
        #    the same way as in train_generator_MLE
        # 3. the discriminator scores every generated sequence; that score is the reward
        #    (one value per sequence, presumably a sigmoid output in [0, 1] - confirm in
        #    discriminator.batchClassify)
        # 4. the rewards weight the loss computed by gen.batchPGLoss
        #
        # NOTE(review): the original comment here read "64 works best", but with
        # BATCH_SIZE = 128 this samples 256 sequences per update - confirm the intent.
        s = gen.sample(BATCH_SIZE*2, MAX_SEQ_LEN)
        inp, target = helpers.prepare_generator_batch(s, start_letter=START_LETTER, gpu=CUDA)

        # The rewards are a fixed signal for the generator update. Computing them
        # without autograd keeps pg_loss.backward() from also backpropagating through
        # the discriminator and leaving gradients on its parameters.
        with torch.no_grad():
            rewards = dis.batchClassify(target)

        gen_opt.zero_grad()
        pg_loss = gen.batchPGLoss(inp, target, rewards)
        pg_loss.backward()
        gen_opt.step()

    # sample from generator and compute oracle NLL
#     oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN,
#                                                    start_letter=START_LETTER, gpu=CUDA)

#     print(' oracle_sample_NLL = %.4f' % oracle_loss)

In [None]:
# ADVERSARIAL TRAINING
print('\nStarting Adversarial Training...')
# oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN,
#                                            start_letter=START_LETTER, gpu=CUDA)
# print('\nInitial Oracle Sample Loss : %.4f' % oracle_loss)


for epoch in range(ADV_TRAIN_EPOCHS):
    print('\n--------\nEPOCH %d\n--------' % (epoch+1))
    # TRAIN GENERATOR
    # one policy-gradient update per adversarial epoch
    print('\nAdversarial Training Generator: ', end='')
    sys.stdout.flush()
    train_generator_PG(gen, gen_optimizer, dis, 1)

    # TRAIN DISCRIMINATOR
    print('\nAdversarial Training Discriminator : ')
    train_discriminator(dis, dis_optimizer, real_train, gen, real_val, ADV_D_STEPS, DIS_EPOCH)

    # Generate output samples at two lengths and write each one to its own text
    # file. File names use the 0-based epoch index, the sequence length and the
    # sample index, exactly as before.
    for seq_len in (256, 512):
        samples = gen.sample(N_SAMPLES, seq_len)

        for i in range(N_SAMPLES):
            # Tensor.tolist() yields plain Python ints and also works when the
            # samples live on the GPU, where np.array(tensor) would raise.
            decoded = ' '.join([int2word[token] for token in samples[i].tolist()])
            out_path = "./data/output/result_epoch" + str(epoch) + "_" + str(seq_len) + "_sample" + str(i) + ".txt"
            with open(out_path, "w") as outfile:
                outfile.write(decoded)
                
        


Starting Adversarial Training...

--------
EPOCH 1
--------

Adversarial Training Generator: 
Adversarial Training Discriminator : 
d-step 1 epoch 1 : .......... average_loss = 0.0270, train_acc = 0.9908, val_acc = 0.9950
d-step 1 epoch 2 : .......... average_loss = 0.0216, train_acc = 0.9926, val_acc = 0.9900
d-step 1 epoch 3 : .......... average_loss = 0.0185, train_acc = 0.9939, val_acc = 0.9850
d-step 2 epoch 1 : .......... average_loss = 0.0191, train_acc = 0.9933, val_acc = 0.9950
d-step 2 epoch 2 : .......... average_loss = 0.0151, train_acc = 0.9950, val_acc = 0.9900
d-step 2 epoch 3 : .......... average_loss = 0.0124, train_acc = 0.9960, val_acc = 0.9950
d-step 3 epoch 1 : ............d-step 3 epoch 1 : ........ average_loss = 0.0045, train_acc = 0.9987, val_acc = 0.9850
d-step 1 epoch 3 : .......... average_loss = 0.0035, train_acc = 0.9989, val_acc = 1.0000
d-step 2 epoch 1 : .......... average_loss = 0.0052, train_acc = 0.9984, val_acc = 0.9850
d-step 2 epoch 2 : .........

KeyboardInterrupt: 

In [None]:
# Persist both models. `epoch` is the loop variable left over from the adversarial
# training cell, so the file name reflects the (0-based) epoch that was running
# when that cell finished or was interrupted; this cell fails on a fresh kernel
# if the adversarial cell has not been run.
# NOTE(review): torch.save on the module object pickles the whole model, which
# ties the file to the current generator/discriminator class definitions; saving
# `state_dict()` is the more portable option - consider switching.
torch.save(gen, './models/gen_adv_epoch' + str(epoch))
torch.save(dis, './models/dis_adv_epoch' + str(epoch))