In [1]:
require 'hdf5'
require 'nn'

In [2]:
myFile = hdf5.open('6-grams.hdf5','r')
data = myFile:all()
myFile:close()
print(data)

{
  test : LongTensor - size: 3761x55
  nwords : LongTensor - size: 1
  train_1000_nocounts : LongTensor - size: 696825x7
  train_nocounts : DoubleTensor - size: 887522x6
  train_1000 : DoubleTensor - size: 887522x6
  train : LongTensor - size: 772670x7
  valid : LongTensor - size: 3370x55
  valid_txt : DoubleTensor - size: 70391x6
  valid_output : LongTensor - size: 3370x50
}


In [8]:
train = data['train_nocounts']:narrow(2,1,6)
train_input = train:narrow(2,1,5)
train_output = train:narrow(2,6,1)

valid_txt = data['valid_txt']:narrow(2,1,6)
valid_txt_input = valid_txt:narrow(2,1,5)
valid_txt_output = valid_txt:narrow(2,6,1)

valid_topredict = data['valid']:narrow(2,1,50)
valid_input = data['valid']:narrow(2,51,5)
valid_output = data['valid_output']

# Model

In [9]:
-- Model
nwords = 10001
N = 5
dwin = N
hid1 = 30
hid2 = 100

-- To store the whole model
dnnlm = nn.Sequential()

-- Layer to embedd (and put the words along the window into one vector)
LT = nn.Sequential()
LT_ = nn.LookupTable(nwords,hid1)
LT:add(LT_)
LT:add(nn.View(-1, hid1*dwin))

dnnlm:add(LT)

concat = nn.ConcatTable()

lin_tanh = nn.Sequential()
lin_tanh:add(nn.Linear(hid1*dwin,hid2))
lin_tanh:add(nn.Tanh())

id = nn.Identity()

concat:add(lin_tanh)
concat:add(id)

dnnlm:add(concat)
dnnlm:add(nn.JoinTable(2))
dnnlm:add(nn.Linear(hid1*dwin + hid2, nwords))
dnnlm:add(nn.LogSoftMax())

-- Loss
criterion = nn.ClassNLLCriterion()

In [34]:
-- Testing the model

input = train_input:narrow(1,2,1)
input_batch = train_input:narrow(1,2,2)

In [35]:
print(input_batch:size())

 2
 4
[torch.LongStorage of size 2]



In [92]:
LT:forward(input_batch):size()

   2
 120
[torch.LongStorage of size 2]



In [97]:
output_lt = LT:forward(input_batch)

In [99]:
output_lin_tanh = lin_tanh:forward(output_lt)

In [104]:
nn.JoinTable(2):forward(concat:forward(output_lt)):size()

   2
 220
[torch.LongStorage of size 2]



In [37]:
dnnlm:forward(input_batch):size()

     2
 10001
[torch.LongStorage of size 2]



# Optimization: SGD

In [10]:
nEpochs = 1
batchSize = 32
eta = 0.01
av_L = 0

inputs_batch = torch.DoubleTensor(batchSize,dwin)
targets_batch = torch.DoubleTensor(batchSize)
outputs = torch.DoubleTensor(batchSize, nwords)
df_do = torch.DoubleTensor(batchSize, nwords)

kag_pred_valid = torch.Tensor(valid_input:size(1),50)
norm_mat = torch.Tensor(valid_input:size(1),50)

In [13]:
for i = 20, 23 do
    -- timing the epoch
    timer = torch.Timer()
    av_L = 0
    
    -- max renorm
    LT_.weight:renorm(2,1,1)
    
    -- mini batch loop
    for t = 1, train_input:size(1), batchSize do
        -- Mini batch data
        current_batch_size = math.min(batchSize,train_input:size(1)-t)
        inputs_batch:narrow(1,1,current_batch_size):copy(train_input:narrow(1,t,current_batch_size))
        targets_batch:narrow(1,1,current_batch_size):copy(train_output:narrow(1,t,current_batch_size))
        
        -- reset gradients
        dnnlm:zeroGradParameters()
        --gradParameters:zero()

        -- Forward pass (selection of inputs_batch in case the batch is not full, ie last batch)
        outputs:narrow(1,1,current_batch_size):copy(dnnlm:forward(inputs_batch:narrow(1,1,current_batch_size)))

        -- Average loss computation
        f = criterion:forward(outputs:narrow(1,1,current_batch_size), targets_batch:narrow(1,1,current_batch_size))
        av_L = av_L +f

        -- Backward pass
        df_do:narrow(1,1,current_batch_size):copy(criterion:backward(outputs:narrow(1,1,current_batch_size), targets_batch:narrow(1,1,current_batch_size)))
        dnnlm:backward(inputs_batch:narrow(1,1,current_batch_size), df_do:narrow(1,1,current_batch_size))
        dnnlm:updateParameters(eta)
        
    end
        
    print('Epoch '..i..': '..timer:time().real)
    print('Average Loss: '..av_L/math.floor(train_input:size(1)/batchSize))
   
    
    -- Evaluating perplexity on valiadation_txt:
    
    print('Perplexity on valid.txt: '..math.exp(criterion:forward(dnnlm:forward(valid_txt_input),valid_txt_output:squeeze())))
    
    -- Evaluatin perplexity on validation kaggle:
    kag_pred_valid:zero()

    for ii = 1, valid_input:size(1) do
        kag_pred_valid[ii]:copy(dnnlm:forward(valid_input[ii]):index(2, valid_topredict[ii])):exp()
    end
    
    norm_mat:zero()
    norm_mat:copy(torch.expandAs(kag_pred_valid:sum(2), kag_pred_valid))
    kag_pred_valid:cdiv(norm_mat)
    
    CE = 0
    for iii = 1, valid_input:size(1) do
        mm,aa = valid_output[iii]:max(1)
        CE = CE + math.log(kag_pred_valid[iii][aa[1]])
    end
    
    val_res = math.exp(-CE/kag_pred_valid:size(1))
    print('Perplexity on valid: '..val_res)
end

Epoch 20: 815.05766415596	
Average Loss: 5.5035628995259	


Perplexity on valid.txt: 263.29226061724	


Perplexity on valid: 5.7555084760772	


Epoch 21: 870.12553620338	
Average Loss: 5.4824292901193	


Perplexity on valid.txt: 259.40302385606	


Perplexity on valid: 5.723432269876	


Epoch 22: 860.29746389389	
Average Loss: 5.4620796918255	


Perplexity on valid.txt: 255.75137596138	


Perplexity on valid: 5.6930143741932	


Epoch 23: 830.15432596207	
Average Loss: 5.4424575791206	


Perplexity on valid.txt: 252.31421534691	


Perplexity on valid: 5.6641789819409	


# Results on test

In [21]:
test_topredict = data['test']:narrow(2, 1, 50)
test_input = data['test']:narrow(2, 51, 5)
kag_pred_test = torch.Tensor(test_input:size(1), 50)
norm_mat_test = torch.Tensor(test_input:size(1), 50)

for ii = 1, test_input:size(1) do
    kag_pred_test[ii]:copy(dnnlm:forward(test_input[ii]):index(2, test_topredict[ii])):exp()
end

norm_mat_test:zero()
norm_mat_test:copy(torch.expandAs(kag_pred_test:sum(2), kag_pred_test))
kag_pred_test:cdiv(norm_mat_test)

for ii = 1, valid_input:size(1) do
    kag_pred_valid[ii]:copy(dnnlm:forward(valid_input[ii]):index(2, valid_topredict[ii])):exp()
end

norm_mat:zero()
norm_mat:copy(torch.expandAs(kag_pred_valid:sum(2), kag_pred_valid))
kag_pred_valid:cdiv(norm_mat)

In [None]:
kag_pred_test:size()

In [28]:
filename='bengio_3_nico.f5'
myFile = hdf5.open(filename, 'w')
myFile:write('test', kag_pred_test)
myFile:write('valid', kag_pred_valid)
myFile:close()

## nEpochs = 5
batchSize = 32
eta = 0.001
av_L = 0

inputs_batch = torch.DoubleTensor(batchSize,dwin)
targets_batch = torch.DoubleTensor(batchSize)
outputs = torch.DoubleTensor(batchSize, nwords)
df_do = torch.DoubleTensor(batchSize, nwords)


for i = 1, nEpochs do
    -- timing the epoch
    timer = torch.Timer()
    av_L = 0
    
    -- max renorm
    LT_.weight:renorm(2,1,1)
    
    -- mini batch loop
    for t = 1, train_input_1000:size(1), batchSize do
        -- Mini batch data
        current_batch_size = math.min(batchSize,train_input_1000:size(1)-t)
        inputs_batch:narrow(1,1,current_batch_size):copy(train_input_1000:narrow(1,t,current_batch_size))
        targets_batch:narrow(1,1,current_batch_size):copy(train_output_1000:narrow(1,t,current_batch_size))
        
        -- reset gradients
        dnnlm:zeroGradParameters()
        --gradParameters:zero()

        -- Forward pass (selection of inputs_batch in case the batch is not full, ie last batch)
        outputs:narrow(1,1,current_batch_size):copy(dnnlm:forward(inputs_batch:narrow(1,1,current_batch_size)))

        -- Average loss computation
        f = criterion:forward(outputs:narrow(1,1,current_batch_size), targets_batch:narrow(1,1,current_batch_size))
        av_L = av_L +f

        -- Backward pass
        df_do:narrow(1,1,current_batch_size):copy(criterion:backward(outputs:narrow(1,1,current_batch_size), targets_batch:narrow(1,1,current_batch_size)))
        dnnlm:backward(inputs_batch:narrow(1,1,current_batch_size), df_do:narrow(1,1,current_batch_size))
        dnnlm:updateParameters(eta)
        
    end
        
    print('Epoch '..i..': '..timer:time().real)
    print('Average Loss: '..av_L/math.floor(train_input_1000:size(1)/batchSize))
    
end

In [10]:
nEpochs = 5
batchSize = 64
eta = 0.01
av_L = 0

inputs_batch = torch.DoubleTensor(batchSize,dwin)
targets_batch = torch.DoubleTensor(batchSize)
outputs = torch.DoubleTensor(batchSize, nwords)
df_do = torch.DoubleTensor(batchSize, nwords)


for i = 1, nEpochs do
    -- timing the epoch
    timer = torch.Timer()
    av_L = 0
    
    -- max renorm
    LT_.weight:renorm(2,1,1)
    
    -- mini batch loop
    for t = 1, train_input_1000:size(1), batchSize do
        -- Mini batch data
        current_batch_size = math.min(batchSize,train_input_1000:size(1)-t)
        inputs_batch:narrow(1,1,current_batch_size):copy(train_input_1000:narrow(1,t,current_batch_size))
        targets_batch:narrow(1,1,current_batch_size):copy(train_output_1000:narrow(1,t,current_batch_size))
        
        -- reset gradients
        dnnlm:zeroGradParameters()
        --gradParameters:zero()

        -- Forward pass (selection of inputs_batch in case the batch is not full, ie last batch)
        outputs:narrow(1,1,current_batch_size):copy(dnnlm:forward(inputs_batch:narrow(1,1,current_batch_size)))

        -- Average loss computation
        f = criterion:forward(outputs:narrow(1,1,current_batch_size), targets_batch:narrow(1,1,current_batch_size))
        av_L = av_L +f

        -- Backward pass
        df_do:narrow(1,1,current_batch_size):copy(criterion:backward(outputs:narrow(1,1,current_batch_size), targets_batch:narrow(1,1,current_batch_size)))
        dnnlm:backward(inputs_batch:narrow(1,1,current_batch_size), df_do:narrow(1,1,current_batch_size))
        dnnlm:updateParameters(eta)
        
    end
        
    print('Epoch '..i..': '..timer:time().real)
    print('Average Loss: '..av_L/math.floor(train_input_1000:size(1)/batchSize))
    
end

Epoch 1: 75.719837903976	
Average Loss: 4.8516148841494	


Epoch 2: 70.374673128128	
Average Loss: 4.7898740237402	


Epoch 3: 65.336530208588	
Average Loss: 4.737851349316	


Epoch 4: 67.845108985901	
Average Loss: 4.687586092071	


Epoch 5: 75.712246894836	
Average Loss: 4.640044784213	


# Whole dataset

In [15]:
nwords = data['nwords'][1]
train = data['train_nocounts']:narrow(2,1,5)
train_input = train:narrow(2,1,4)
train_output = train:narrow(2,5,1)
print(train:size())

 887522
      5
[torch.LongStorage of size 2]



In [16]:
nEpochs = 1
batchSize = 32
eta = 0.01
av_L = 0

inputs_batch = torch.DoubleTensor(batchSize,dwin)
targets_batch = torch.DoubleTensor(batchSize)
outputs = torch.DoubleTensor(batchSize, nwords)
df_do = torch.DoubleTensor(batchSize, nwords)


for i = 1, nEpochs do
    -- timing the epoch
    timer = torch.Timer()
    av_L = 0
    
    -- max renorm
    LT_.weight:renorm(2,1,1)
    
    -- mini batch loop
    for t = 1, train_input:size(1), batchSize do
        -- Mini batch data
        current_batch_size = math.min(batchSize,train_input:size(1)-t)
        inputs_batch:narrow(1,1,current_batch_size):copy(train_input:narrow(1,t,current_batch_size))
        targets_batch:narrow(1,1,current_batch_size):copy(train_output:narrow(1,t,current_batch_size))
        
        -- reset gradients
        dnnlm:zeroGradParameters()
        --gradParameters:zero()

        -- Forward pass (selection of inputs_batch in case the batch is not full, ie last batch)
        outputs:narrow(1,1,current_batch_size):copy(dnnlm:forward(inputs_batch:narrow(1,1,current_batch_size)))

        -- Average loss computation
        f = criterion:forward(outputs:narrow(1,1,current_batch_size), targets_batch:narrow(1,1,current_batch_size))
        av_L = av_L +f

        -- Backward pass
        df_do:narrow(1,1,current_batch_size):copy(criterion:backward(outputs:narrow(1,1,current_batch_size),
                targets_batch:narrow(1,1,current_batch_size)))
        dnnlm:backward(inputs_batch:narrow(1,1,current_batch_size), df_do:narrow(1,1,current_batch_size))
        dnnlm:updateParameters(eta)
        
    end
        
    print('Epoch '..i..': '..timer:time().real)
    print('Average Loss: '..av_L/math.floor(train_input:size(1)/batchSize))
    
end

Epoch 1: 957.66832304001	
Average Loss: 6.4620778781752	
