In [1]:
require 'hdf5'
require 'rnn'

In [2]:
-- Load every dataset of the N-gram HDF5 file into notebook globals.
N = 2
myFile = hdf5.open('../data_preprocessed/'..tostring(N)..'-grams.hdf5','r')
-- Read the whole file at once; `data` is indexed by dataset name below.
data = myFile:all()
F_train = data['F_train']
input_data_valid = data['input_data_valid']
output_matrix_train = data['output_matrix_train']
input_matrix_train = data['input_matrix_train']
input_data_train = data['input_data_train']
input_data_valid_nospace = data['input_data_valid_nospace']
input_data_test = data['input_data_test']
-- Everything needed has been copied out of the file handle.
myFile:close()

In [3]:
-- Sanity check: print the sizes of the training tensors loaded above.
print(input_matrix_train:size())
print(output_matrix_train:size())
print(input_data_train:size())

 599903
      1
[torch.LongStorage of size 2]

 599903
[torch.LongStorage of size 1]

 599905
[torch.LongStorage of size 1]



In [185]:
-- Formatting the input
-- Pads a 1-D tensor of token ids and cuts it into mini-batches.
--
-- input      : 1-D tensor of token ids; this function treats 1 as the
--              "space" token and uses 2 both as padding and as the
--              "no space follows" label
-- len        : number of time steps per mini-batch
-- batch_size : number of parallel sequences per mini-batch
--
-- Returns two tables (t_input, t_output); entry k of each is a
-- batch_size x len tensor. t_output holds, for every position, 1 when the
-- next input token is a space and 2 otherwise.
function get_train_input(input, len, batch_size)
    local n = input:size(1)
    local chunk = len * batch_size

    -- Smallest multiple of batch_size*len that is >= n
    local factor = math.ceil(n / chunk)
    local n_new = factor * chunk

    local input_new = torch.DoubleTensor(n_new)
    input_new:narrow(1, 1, n):copy(input)
    -- Pad only the slots AFTER the real data. Starting the fill at index n
    -- would overwrite the last real token, and nothing must be filled at all
    -- when n is already a multiple of the chunk size.
    if n_new > n then
        input_new:narrow(1, n + 1, n_new - n):fill(2)
    end

    -- Building output: label of position i-1 depends on the token at i
    local output = torch.DoubleTensor(n_new)
    for i = 2, n_new do
        if input_new[i] == 1 then
            output[i-1] = 1
        else
            output[i-1] = 2
        end
    end
    -- The last position has no successor: predict padding
    output[n_new] = 2

    -- n_new is a multiple of batch_size*len, so every row splits evenly
    local row_len = n_new / batch_size
    local t_input = torch.split(input_new:view(batch_size, row_len), len, 2)
    local t_output = torch.split(output:view(batch_size, row_len), len, 2)
    return t_input, t_output
end

In [146]:
-- Builds an nn.Recurrent module of width embed_dim.
-- embed_dim : size passed to nn.Recurrent and to both linear layers
-- rho       : forwarded unchanged as the last nn.Recurrent argument
function build_RNN(embed_dim, rho)
    -- Construction order (linear, linear, tanh) is kept as in a single call
    local first_linear = nn.Linear(embed_dim, embed_dim)
    local second_linear = nn.Linear(embed_dim, embed_dim)
    local transfer = nn.Tanh()
    return nn.Recurrent(embed_dim, first_linear, second_linear, transfer, rho)
end

-- Builds an nn.FastLSTM whose first two constructor arguments are both
-- embed_dim; rho is forwarded unchanged as the third argument.
function build_LSTM(embed_dim, rho)
    local in_dim, out_dim = embed_dim, embed_dim
    return nn.FastLSTM(in_dim, out_dim, rho)
end

-- Builds an nn.GRU whose first two constructor arguments are both embed_dim;
-- rho and dropout_p are forwarded unchanged as the third and fourth arguments.
function build_GRU(embed_dim, rho, dropout_p)
    local in_dim, out_dim = embed_dim, embed_dim
    return nn.GRU(in_dim, out_dim, rho, dropout_p)
end

-- Builds the sequence tagger: lookup table -> split into a table of time
-- steps -> recurrent layer -> linear layer with 2 outputs -> LogSoftMax.
--
-- embed_dim       : embedding size, also the input size of the output layer
-- vocab_size      : number of rows of the lookup table
-- batch_size      : kept for interface compatibility; not used by the network
-- recurrent_model : recurrent module wrapped in nn.Sequencer
--                   (e.g. the result of build_RNN / build_LSTM / build_GRU)
-- len             : unused, kept for interface compatibility
--
-- Returns the network, its flattened parameters and flattened gradients.
function build_rnn(embed_dim, vocab_size, batch_size, recurrent_model, len)
    assert(recurrent_model, 'build_rnn: recurrent_model is required (4th argument)')

    -- generic RNN transduced
    local batchRNN = nn.Sequential()
    batchRNN:add(nn.LookupTable(vocab_size, embed_dim))
    -- Split along dimension 1. The second SplitTable argument is nInputDims,
    -- not a batch size: passing batch_size there makes the module shift the
    -- split dimension whenever the input has batch_size + 1 dimensions.
    batchRNN:add(nn.SplitTable(1))
    batchRNN:add(nn.Sequencer(recurrent_model))

    -- Output
    batchRNN:add(nn.Sequencer(nn.Linear(embed_dim, 2)))
    batchRNN:add(nn.Sequencer(nn.LogSoftMax()))
    batchRNN:remember('both')

    -- Retrieve parameters (To do only once!!!)
    local params, grad_params = batchRNN:getParameters()
    -- Initializing all the parameters between -0.05 and 0.05 in one call
    params:uniform(-0.05, 0.05)

    return batchRNN, params, grad_params
end

In [158]:
-- Building output valid matrix
-- output_valid[i-1] is 1 when input_data_valid[i] is 1, and 2 otherwise.
output_valid = torch.DoubleTensor(input_data_valid:size(1))
for i=2, input_data_valid:size(1) do
    if input_data_valid[i] ~= 1 then
        output_valid[i-1] = 2
    else
        output_valid[i-1] = input_data_valid[i]
    end
end
-- The loop never writes the last slot and torch.DoubleTensor(n) does not
-- initialise its memory; label it 2 as get_train_input does for its last slot.
if input_data_valid:size(1) > 0 then
    output_valid[input_data_valid:size(1)] = 2
end

In [62]:
-- Builds the FastLSTM variant of the tagger: lookup table -> split into a
-- table of time steps -> FastLSTM -> linear layer with 2 outputs -> LogSoftMax.
--
-- embed_dim  : embedding size and LSTM width
-- vocab_size : number of rows of the lookup table
-- batch_size : kept for interface compatibility; not used by the network
--
-- Returns the network only (parameters are not flattened here).
function build_lstm(embed_dim, vocab_size, batch_size)
    -- Fast LSTM
    local batchRNN = nn.Sequential()
    batchRNN:add(nn.LookupTable(vocab_size, embed_dim))
    -- Split along dimension 1. The second SplitTable argument is nInputDims,
    -- not a batch size, so batch_size must not be passed there.
    batchRNN:add(nn.SplitTable(1))
    batchRNN:add(nn.Sequencer(nn.FastLSTM(embed_dim, embed_dim)))
    -- Output
    batchRNN:add(nn.Sequencer(nn.Linear(embed_dim, 2)))
    batchRNN:add(nn.Sequencer(nn.LogSoftMax()))
    batchRNN:remember('both')

    return batchRNN
end

# Training

In [8]:
-- Train the model with a mini batch SGD and print the time and loss of
-- every epoch.
--
-- t_input, t_output : tables of mini-batches; entry k is used as a
--                     batch_size x len tensor
-- model             : network driven through forward / backward /
--                     updateParameters
-- params            : flattened parameters (not read by this function)
-- grad_params       : flattened gradients, zeroed before every mini-batch
-- criterion         : loss applied to the table of per-step predictions
-- eta               : initial learning rate
-- nEpochs           : number of passes over the mini-batches
-- batch_size, len   : mini-batch geometry
-- n                 : total number of tokens covered by t_input
--
-- standard parameters are
-- nEpochs = 1
-- batchSize = 32
-- eta = 0.01
function train_model(t_input, t_output, model, params, grad_params,
                     criterion, eta, nEpochs, batch_size, len, n)
    local t_inputT = torch.DoubleTensor(len, batch_size)
    local delta = 0.2
    local n_batches = math.floor(n / (batch_size * len))

    -- Summed loss of the current and of the previous epoch
    local av_L = 0
    local old_L = 0

    for i = 1, nEpochs do
        -- timing the epoch
        local timer = torch.Timer()
        old_L = av_L
        av_L = 0

        -- mini batch loop
        for k = 1, n_batches do
            -- The model consumes len x batch_size, the data is stored transposed
            t_inputT:copy(t_input[k]:t())
            -- One target vector per time step
            local t_output_table = torch.split(t_output[k], 1, 2)
            for j = 1, len do
                t_output_table[j] = t_output_table[j]:squeeze()
            end

            -- reset gradients
            grad_params:zero()

            -- Forward loop
            local pred = model:forward(t_inputT)
            local loss = criterion:forward(pred, t_output_table)
            av_L = av_L + loss

            -- Backward loop
            local dLdPred = criterion:backward(pred, t_output_table)
            model:backward(t_inputT, dLdPred)

            -- gradient normalization with max norm 5
            -- NOTE(review): Torch documents renorm as (p, dim, maxnorm), so
            -- this call appears to clip the 1-norm although the original
            -- comment says l2 - confirm which one is intended.
            grad_params:view(grad_params:size(1), 1):renorm(1, 2, 5)
            model:updateParameters(eta)
        end

        print('Epoch '..i..': '..timer:time().real)
        print('Average Loss: '..av_L/math.floor(n/batch_size))

        -- Learning-rate decay. It has to run inside the epoch loop to have
        -- any effect: eta is a by-value parameter, so adjusting it after the
        -- last epoch changes nothing. The first epoch is skipped because
        -- there is no previous loss to compare with.
        if i > 1 then
            if (old_L - av_L) < delta then
                eta = eta/2
                delta = delta/2
            end

            if (eta < 0.001) then eta = 0.1 end
        end
    end
end

In [191]:
-- Train the model with a mini batch SGD, printing the time and loss of every
-- epoch and the validation perplexity every `step` epochs. The learning rate
-- is halved when the validation perplexity moves by less than `delta`.
--
-- t_input, t_output : tables of mini-batches; entry k is used as a
--                     batch_size x len tensor
-- model             : network being trained
-- model_flattened   : network used for validation (fed one token at a time
--                     by compute_perplexity)
-- params_flattened  : flattened parameters of model_flattened; overwritten
--                     with `params` before every validation run
-- params            : flattened parameters of `model`
-- grad_params       : flattened gradients of `model`
-- criterion         : loss applied to the table of per-step predictions
-- eta               : initial learning rate
-- nEpochs           : number of passes over the mini-batches
-- batch_size, len   : mini-batch geometry
-- n                 : total number of tokens covered by t_input
-- input_valid       : 1-D validation input
-- output_valid      : validation labels aligned with input_valid
-- step              : validate every `step` epochs (optional, default 1)
--
-- standard parameters are
-- nEpochs = 1
-- batchSize = 32
-- eta = 0.01
function train_model_with_perp(t_input, t_output, model, model_flattened, params_flattened,
        params, grad_params, criterion, eta, nEpochs, batch_size, len, n, input_valid, output_valid, step)
    -- Older call sites omit `step`; `i % nil` would raise an error
    step = step or 1

    local t_inputT = torch.DoubleTensor(len, batch_size)
    local delta = 0.2
    local n_batches = math.floor(n / (batch_size * len))

    -- To store the loss and the validation perplexity
    local av_L = 0
    local perp = 0
    local old_perp = 0

    for i = 1, nEpochs do
        -- timing the epoch
        local timer = torch.Timer()
        old_perp = perp
        av_L = 0

        -- mini batch loop
        for k = 1, n_batches do
            -- The model consumes len x batch_size, the data is stored transposed
            t_inputT:copy(t_input[k]:t())
            -- One target vector per time step
            local t_output_table = torch.split(t_output[k], 1, 2)
            for j = 1, len do
                t_output_table[j] = t_output_table[j]:squeeze()
            end

            -- reset gradients
            grad_params:zero()

            -- Forward loop
            local pred = model:forward(t_inputT)
            local loss = criterion:forward(pred, t_output_table)
            av_L = av_L + loss

            -- Backward loop
            local dLdPred = criterion:backward(pred, t_output_table)
            model:backward(t_inputT, dLdPred)

            -- gradient normalization with max norm 5
            -- NOTE(review): Torch documents renorm as (p, dim, maxnorm), so
            -- this call appears to clip the 1-norm although the original
            -- comment says l2 - confirm which one is intended.
            grad_params:view(grad_params:size(1), 1):renorm(1, 2, 5)
            model:updateParameters(eta)
        end

        print('Epoch '..i..': '..timer:time().real)
        print('Average Loss: '..av_L/math.floor(n/batch_size))
        -- Print perplexity validity every step of iteration
        if (i%step == 0) then
            -- The last validation token has no label, hence size - 1
            local size = input_valid:size(1) - 1
            -- Sync the validation network with the weights just trained
            params_flattened:copy(params)
            perp = compute_perplexity(input_valid:narrow(1,1,size):view(size,1), output_valid, model_flattened)
            print('Valid perplexity: '..perp)

            if math.abs(old_perp - perp) < delta then
                eta = eta/2
                delta = delta/2
            end

            if (eta < 0.0001) then eta = 0.1 end

        end
    end
end

In [142]:
-- Building model
-- build_rnn takes (embed_dim, vocab_size, batch_size, recurrent_model, len);
-- the recurrent module has to be passed explicitly as the 4th argument.
-- NOTE(review): build_RNN is used here to mirror the "EXP RNN" cell; swap in
-- build_LSTM / build_GRU if this cell was meant for another architecture.
batchRNN, params, grad_params = build_rnn(embed_dim, vocab_size, batch_size, build_RNN(embed_dim, len), len)
-- Validation copy fed one token at a time (batch size 1)
batchRNN_valid, params_valid, grad_params_valid = build_rnn(embed_dim, vocab_size, 1, build_RNN(embed_dim), len)

crit = nn.SequencerCriterion(nn.ClassNLLCriterion())

In [19]:
-- Run 3 epochs of training without validation perplexity.
-- NOTE(review): t_input / t_output are not assigned in any visible cell
-- (later cells create t_input_new / t_output_new) - confirm where they come from.
nEpochs = 3
train_model(t_input, t_output, batchRNN, params, grad_params,
                     crit, eta, nEpochs, batch_size, len, n_new)

Epoch 1: 6.8074369430542	
Average Loss: 0.31228050204274	


Epoch 2: 6.7043259143829	
Average Loss: 0.26151257461365	


Epoch 3: 6.2467911243439	
Average Loss: 0.23146731025455	


In [94]:
nEpochs = 10
-- The trailing 1 is `step`: train_model_with_perp evaluates `i%step`, so the
-- argument must be a number; 1 reports validation perplexity after every epoch.
-- NOTE(review): `n` is not assigned in any visible cell (later cells pass n_new) - confirm.
train_model_with_perp(t_input, t_output, batchRNN, batchRNN_valid, params_valid,
        params, grad_params, crit, eta, nEpochs, batch_size, len, n, input_data_valid, output_valid, 1)

Epoch 1: 6.0373430252075	
Average Loss: 0.31127603024063	


Valid perplexity: 1.3167066034457	


Epoch 2: 6.0228929519653	
Average Loss: 0.2595806616951	


Valid perplexity: 1.272589419065	


Epoch 3: 6.0724031925201	
Average Loss: 0.23316074090171	


Valid perplexity: 1.2505899734403	


Epoch 4: 6.7879350185394	
Average Loss: 0.21455134532284	


Valid perplexity: 1.2305572804994	


Epoch 5: 6.1380410194397	
Average Loss: 0.20159015092112	


Valid perplexity: 1.2191191235031	


Epoch 6: 6.0522859096527	
Average Loss: 0.19299361409045	


Valid perplexity: 1.2106942406731	


Epoch 7: 6.0655410289764	
Average Loss: 0.18630843005958	


Valid perplexity: 1.2040019027776	


Epoch 8: 6.0618779659271	
Average Loss: 0.18083623497307	


Valid perplexity: 1.1988415676649	


Epoch 9: 6.0254361629486	
Average Loss: 0.17666558202226	


Valid perplexity: 1.1948545425466	


Epoch 10: 6.0559229850769	
Average Loss: 0.17341161985071	


Valid perplexity: 1.191752075552	


In [96]:
nEpochs = 10
-- The trailing 1 is `step`: train_model_with_perp evaluates `i%step`, so the
-- argument must be a number; 1 reports validation perplexity after every epoch.
-- NOTE(review): `n` is not assigned in any visible cell (later cells pass n_new) - confirm.
train_model_with_perp(t_input, t_output, batchRNN, batchRNN_valid, params_valid,
        params, grad_params, crit, eta, nEpochs, batch_size, len, n, input_data_valid, output_valid, 1)

Epoch 1: 6.1608598232269	
Average Loss: 0.17066010915357	


Valid perplexity: 1.1893514698956	


Epoch 2: 6.0460610389709	
Average Loss: 0.16842283821082	


Valid perplexity: 1.1876828130608	


Epoch 3: 6.1154489517212	
Average Loss: 0.16659270468897	


Valid perplexity: 1.1864067460075	


Epoch 4: 6.0670311450958	
Average Loss: 0.16518998250815	


Valid perplexity: 1.1853287486702	


Epoch 5: 6.1075801849365	
Average Loss: 0.16408510411659	


Valid perplexity: 1.1845620295284	


Epoch 6: 6.3404068946838	
Average Loss: 0.16316840120459	


Valid perplexity: 1.1840970283144	


Epoch 7: 6.0670158863068	
Average Loss: 0.16239600637254	


Valid perplexity: 1.1837860540369	


Epoch 8: 6.0414438247681	


Average Loss: 0.16169741966653	


Valid perplexity: 1.1833747002796	


Epoch 9: 6.0382349491119	
Average Loss: 0.16102260010531	


Valid perplexity: 1.1828553687182	


Epoch 10: 6.026673078537	
Average Loss: 0.16036868317739	


Valid perplexity: 1.1826217175028	


In [97]:
nEpochs = 10
-- The trailing 1 is `step`: train_model_with_perp evaluates `i%step`, so the
-- argument must be a number; 1 reports validation perplexity after every epoch.
-- NOTE(review): `n` is not assigned in any visible cell (later cells pass n_new) - confirm.
train_model_with_perp(t_input, t_output, batchRNN, batchRNN_valid, params_valid,
        params, grad_params, crit, eta, nEpochs, batch_size, len, n, input_data_valid, output_valid, 1)

Epoch 1: 6.1355979442596	
Average Loss: 0.15972079728122	


Valid perplexity: 1.1824428813963	


Epoch 2: 6.4839720726013	
Average Loss: 0.15909980418245	


Valid perplexity: 1.182109999531	


Epoch 3: 6.2839608192444	
Average Loss: 0.15852116560753	


Valid perplexity: 1.181668471896	


Epoch 4: 6.2338001728058	
Average Loss: 0.15796460040097	


Valid perplexity: 1.1811241082699	


Epoch 5: 6.8121101856232	
Average Loss: 0.15744148391804	


Valid perplexity: 1.1806673353204	


Epoch 6: 6.2398941516876	
Average Loss: 0.15692715807122	


Valid perplexity: 1.1801969411692	


Epoch 7: 6.1931447982788	
Average Loss: 0.15642420072222	


Valid perplexity: 1.1797258097392	


Epoch 8: 6.2221410274506	
Average Loss: 0.15592075426583	


Valid perplexity: 1.1796751236756	


Epoch 9: 6.2164981365204	
Average Loss: 0.15546064818585	


Valid perplexity: 1.179654980452	


Epoch 10: 6.8618688583374	
Average Loss: 0.15505075502542	


Valid perplexity: 1.1795317990726	


# EXP RNN

In [194]:
-- Settings of the RNN experiment (notebook globals read by the cells below).
len = 40          -- time steps per mini-batch; also passed as rho to build_RNN
batch_size = 16   -- parallel sequences per mini-batch
vocab_size = 49   -- rows of the lookup table
embed_dim = 20    -- embedding size and recurrent layer width
eta = 0.5         -- learning rate handed to the training functions
nEpochs = 40

-- Pad the training data and cut it into mini-batches.
t_input_new, t_output_new = get_train_input(input_data_train, len, batch_size)
-- Padded size, recovered from the number of mini-batches.
n_new = len * batch_size *(#t_input_new)
print('Input size is '..n_new)

-- Building model
batchRNN, params, grad_params = build_rnn(embed_dim, vocab_size, batch_size, build_RNN(embed_dim, len), len)
-- Second network built with batch size 1; it is the one passed to the
-- perplexity and prediction functions.
batchRNN_valid, params_valid, grad_params_valid = build_rnn(embed_dim, vocab_size, 1,build_RNN(embed_dim))

-- Loss applied at every time step of the sequence.
crit = nn.SequencerCriterion(nn.ClassNLLCriterion())

Input size is 600320	


In [None]:
-- Adaptive learning rate
-- Train for 10 epochs; step = 1 makes train_model_with_perp print the
-- validation perplexity (and possibly adjust eta) after every epoch.
nEpochs = 10
step = 1
train_model_with_perp(t_input_new, t_output_new, batchRNN, batchRNN_valid, params_valid,
        params, grad_params, crit, eta, nEpochs, batch_size, len, n_new, input_data_valid, output_valid, step)

Epoch 1: 7.482204914093	
Average Loss: 0.29908019873378	


# EXP GRU

In [None]:
-- Settings of the GRU experiment (notebook globals read by the cells below).
len = 60          -- time steps per mini-batch; also passed as rho to build_GRU
batch_size = 16   -- parallel sequences per mini-batch
vocab_size = 49   -- rows of the lookup table
embed_dim = 20    -- embedding size and recurrent layer width
eta = 0.5         -- learning rate handed to the training functions
nEpochs = 40

-- Pad the training data and cut it into mini-batches.
t_input_new, t_output_new = get_train_input(input_data_train, len, batch_size)
-- Padded size, recovered from the number of mini-batches.
n_new = len * batch_size *(#t_input_new)
print('Input size is '..n_new)

-- Building model
batchRNN, params, grad_params = build_rnn(embed_dim, vocab_size, batch_size, build_GRU(embed_dim, len), len)
-- Second network built with batch size 1; it is the one passed to the
-- perplexity and prediction functions.
batchRNN_valid, params_valid, grad_params_valid = build_rnn(embed_dim, vocab_size, 1,build_GRU(embed_dim))

-- Loss applied at every time step of the sequence.
crit = nn.SequencerCriterion(nn.ClassNLLCriterion())

# Predictions

In [16]:
-- Runs `model` on `input` reshaped to a single column.
-- input : tensor whose first dimension gives the number of rows of the view
-- Returns whatever model:forward returns for the (rows x 1) view.
function compute_probability_model(model, input)
    local n_rows = input:size(1)
    local as_column = input:view(n_rows, 1)
    return model:forward(as_column)
end

In [23]:
-- Method to compute manually the perplexity
-- Feeds `input` to `model` one row at a time and accumulates the score the
-- model gives to the expected label of each row.
--
-- input  : tensor iterated along its first dimension
-- output : expected label of every row; used as an index (1 or 2) into the
--          two scores produced by the model
-- model  : network called through compute_probability_model
--          (assumes the scores are log-probabilities, as produced by the
--          LogSoftMax networks built in this file - confirm for other models)
--
-- Returns exp(-mean accumulated score).
function compute_perplexity(input, output, model)
    local n_steps = input:size(1)
    local scores = torch.DoubleTensor(2)
    local total = 0

    for step = 1, n_steps do
        -- The model remembers its previous states, so a single row is enough
        local one_row = input:narrow(1, step, 1)
        local prediction = compute_probability_model(model, one_row)
        -- Flatten the first prediction into the 2-element buffer
        scores:copy(prediction[1])
        total = total + scores[output[step]]
    end

    return math.exp(-total / n_steps)
end

In [51]:
-- Computing perplexity
-- Times compute_perplexity on the training data with the batch-size-1 network.
-- NOTE(review): the sizes printed earlier in this notebook show 599905 entries
-- for input_data_train but 599903 for output_matrix_train, so narrowing the
-- latter to `size` looks out of range - confirm the intended pairing.
timer = torch.Timer()
size = input_data_train:size(1)
perp = compute_perplexity(input_data_train:narrow(1,1,size):view(size,1), output_matrix_train:narrow(1,1,size), batchRNN_valid)
print('Time elasped : '..timer:time().real)
print(perp)

Time elasped : 0.17292881011963	
1.2779225376256	


In [161]:
-- Computing perplexity on valid
-- Only the first 1000 validation tokens are scored; the commented-out line
-- below would use the full validation set instead.
timer = torch.Timer()
--size = input_data_valid:size(1)
size = 1000
perp = compute_perplexity(input_data_valid:narrow(1,1,size):view(size,1), output_valid:narrow(1,1,size), batchRNN_valid)
print('Time elasped : '..timer:time().real)
print(perp)

Time elasped : 0.15830206871033	
1.2654810365825	


In [98]:
-- Prediction on test
-- Greedy segmentation: copies `input` token by token into a new sequence and
-- inserts a space (token 1) whenever the model ranks class 1 highest after
-- the token just written.
--
-- input : 1-D tensor of token ids without spaces
-- len   : unused, kept for interface compatibility
-- model : network called through compute_probability_model; it is fed the
--         last written token only, relying on its remembered state
--
-- Returns the filled prefix of the prediction buffer.
function predict_rnn_greedy(input, len, model)
    local n_input = input:size(1)
    -- We allocate the maximum of memory that could be needed
    local capacity = 2 * n_input

    -- Last Position filled in predictions
    local position_prediction = 1
    -- Position to predict in input
    local position_input = 1
    -- Default value is -1 (to know where predictions end afterwards)
    local predictions = torch.ones(capacity):mul(-1)
    -- Copy the first entry
    predictions[position_prediction] = input[position_input]
    local probability = torch.zeros(2)

    -- Build mapping
    while position_input < n_input do
        -- The model remember the states before, just need to feed into it a character
        local probability_table = compute_probability_model(model, predictions:narrow(1, position_prediction, 1))
        probability:copy(probability_table[1])

        -- Kept local: these used to leak into the global namespace
        local _, best = probability:max(1)

        position_prediction = position_prediction + 1
        -- A space is only written while the buffer can still hold every
        -- remaining input token afterwards; otherwise a model that keeps
        -- predicting spaces would run past the end of the buffer.
        local remaining = n_input - position_input
        if best[1] == 1 and position_prediction + remaining <= capacity then
            -- Case space predicted
            predictions[position_prediction] = 1
        else
            -- Copying next character
            position_input = position_input + 1
            predictions[position_prediction] = input[position_input]
        end
    end
    -- Cutting the output
    return predictions:narrow(1, 1, position_prediction)
end

In [99]:
-- Greedy prediction on the validation data without spaces, timed.
timer = torch.Timer()
size = input_data_valid_nospace:size(1)
pred_valid = predict_rnn_greedy(input_data_valid_nospace:narrow(1,1,size), len, batchRNN_valid)
print('Time elasped : '..timer:time().real)

Time elasped : 13.989942073822	


In [104]:
-- Pred on test
-- Greedy prediction on the full test input, timed.
timer = torch.Timer()
size = input_data_test:size(1)
pred_test = predict_rnn_greedy(input_data_test:narrow(1,1,size), len, batchRNN_valid)
print('Time elasped : '..timer:time().real)

Time elasped : 56.393217802048	


In [104]:
-- Greedy prediction on the validation data with a second model, timed.
-- NOTE(review): `batch_lsm` is not assigned in any visible cell - confirm the
-- name of the LSTM network this cell is meant to use.
timer = torch.Timer()
size = input_data_valid_nospace:size(1)
pred_valid_lstm = predict_rnn_greedy(input_data_valid_nospace:narrow(1,1,size), len, batch_lsm)
print('Time elasped : '..timer:time().real)

Time elasped : 38.907659053802	


In [100]:
-- Converts a token sequence into one row per sentence: (sentence index,
-- number of spaces in that sentence).
--
-- predictions_test : 1-D sequence of tokens; 2 closes a sentence and 1 counts
--                    as a space
-- N                : scanning starts at index N-1
--
-- Returns a (number of sentences) x 2 DoubleTensor. Tokens after the last 2
-- do not produce a row.
function get_kaggle_format(predictions_test, N)
    local first = N - 1
    local last = predictions_test:size(1)

    -- Single scan: remember the space tally of every closed sentence
    local tallies = {}
    local pending = 0
    for idx = first, last do
        local token = predictions_test[idx]
        if token == 2 then
            tallies[#tallies + 1] = pending
            pending = 0
        elseif token == 1 then
            pending = pending + 1
        end
    end

    -- Copy the tallies into the result tensor
    local num_spaces = torch.DoubleTensor(#tallies, 2)
    for sentence, spaces in ipairs(tallies) do
        num_spaces[{sentence, 1}] = sentence
        num_spaces[{sentence, 2}] = spaces
    end
    return num_spaces
end

In [101]:
-- Root-mean-square error between the second columns of two tables of rows.
--
-- true_kaggle : reference rows; its first dimension sets the number of rows
--               compared
-- pred_kaggle : predicted rows, read at the same indices
--
-- Returns sqrt(mean((true[i,2] - pred[i,2])^2)).
function compute_rmse(true_kaggle, pred_kaggle)
    local n_rows = true_kaggle:size(1)
    local squared_error = 0
    for i = 1, n_rows do
        local diff = true_kaggle[{i,2}] - pred_kaggle[{i,2}]
        -- `^` works on every Lua version; math.pow is gone from Lua 5.4
        squared_error = squared_error + diff ^ 2
    end
    return math.sqrt(squared_error / n_rows)
end

In [102]:
-- Reference space counts per sentence, taken from the validation data itself.
kaggle_true_valid = get_kaggle_format(input_data_valid,2)

In [103]:
-- Space counts of the RNN prediction and their RMSE against the reference.
kaggle_rnn_valid = get_kaggle_format(pred_valid,2)
print('RMSE RNN')
rsme_rnn = compute_rmse(kaggle_true_valid, kaggle_rnn_valid)
print(rsme_rnn)

RMSE RNN	


4.2979129688982	


In [107]:
-- Space counts of the LSTM prediction and their RMSE against the reference.
-- NOTE(review): the result is stored in `rsme_rnn`, overwriting the RNN score
-- computed in the previous cell - confirm this reuse is intended.
kaggle_lstm_valid = get_kaggle_format(pred_valid_lstm,2)
print('RMSE LSTM')
rsme_rnn = compute_rmse(kaggle_true_valid, kaggle_lstm_valid)
print(rsme_rnn)

RMSE LSTM	


5.3705971151496	


In [105]:
-- Saving test prediction
-- Convert the test prediction into per-sentence space counts.
kaggle_test = get_kaggle_format(pred_test,2)

In [106]:
-- Saving the Kaggle format output
-- Writes kaggle_test under the dataset name 'num_spaces'.
myFile = hdf5.open('../submission/pred_rnn_50_16', 'w')
myFile:write('num_spaces', kaggle_test)
myFile:close()