In [9]:
require 'nn'
require 'hdf5'

# Loading data

In [10]:
-- Train format is (number of N-grams, N-gram size + 1): the last column
-- holds the count of the N-gram stored on that row.

-- Validation format is (number of words to predict, 50 + N-gram size - 1):
-- the first 50 columns are the 50 candidate words for the prediction and
-- the remaining columns hold the current context (the goal is to predict
-- the Nth word).

-- Each section below opens one '<N>-grams.hdf5' file read-only, reads all of
-- its datasets into `data`, keeps the needed entries in globals and closes
-- the file. `myFile` and `data` are reused (overwritten) by every section.

-- 1-grams: only the 'train' dataset is kept.
myFile = hdf5.open('1-grams.hdf5','r')
data = myFile:all()
train_1 = data['train']
myFile:close()

-- 2-grams: 'train', 'valid' and 'valid_output' (stored in validation_output).
myFile = hdf5.open('2-grams.hdf5','r')
data = myFile:all()
train_2 = data['train']
validation_2 = data['valid']
validation_output = data['valid_output']
myFile:close()

-- 3-grams: 'train', 'test' and 'valid'.
myFile = hdf5.open('3-grams.hdf5','r')
data = myFile:all()
train_3 = data['train']
test_3 = data['test']
validation_3 = data['valid']
myFile:close()

-- 4-grams: 'train' and 'valid'.
myFile = hdf5.open('4-grams.hdf5','r')
data = myFile:all()
train_4 = data['train']
validation_4 = data['valid']
myFile:close()

In [55]:
-- 5-grams: keep the 'train', 'test' and 'valid' datasets in globals.
myFile = hdf5.open('5-grams.hdf5','r')
data = myFile:all()
train_5 = data['train']
test_5 = data['test']
validation_5 = data['valid']
myFile:close()

In [193]:
-- 6-grams: keep the 'train', 'test', 'valid_output' and 'valid' datasets.
-- Note: this reassigns the global validation_output, which is also set when
-- the 2-grams file is loaded.
myFile = hdf5.open('6-grams.hdf5','r')
data = myFile:all()
train_6 = data['train']
test_6 = data['test']
validation_output = data['valid_output']
validation_6 = data['valid']
myFile:close()

In [2]:
-- 7-grams: keep the 'train', 'test' and 'valid' datasets in globals.
myFile = hdf5.open('7-grams.hdf5','r')
data = myFile:all()
train_7 = data['train']
test_7 = data['test']
validation_7 = data['valid']
myFile:close()

# Maximum Likelihood Estimation

In [11]:
--- Load the training count tensor for N-grams of size N.
-- Opens '<N>-grams.hdf5' read-only, reads all of its datasets and returns
-- the one stored under the 'train' key. The file is closed before returning.
-- @param N N-gram size, used to build the file name
-- @return the 'train' entry of the file
function get_train(N)
    local filename = N .. '-grams.hdf5'
    -- The handle and the result are kept local: the previous version assigned
    -- them to the globals `myFile` and `train`, silently clobbering the
    -- notebook-level variables of the same name.
    local file = hdf5.open(filename, 'r')
    local train = file:all()['train']
    file:close()
    return train
end

In [16]:
--- Perplexity of a predicted distribution against the true words.
-- Computes exp(-mean(log p_i)), where p_i is the probability that row i of
-- `distribution` gives to the column holding the maximum of row i of
-- `true_words` (the one-hot position of the true word).
-- @param distribution (N_words, n_candidates) predicted probabilities
-- @param true_words (N_words, n_candidates) one-hot encoding of the true word
-- @return the perplexity as a number
function perplexity(distribution, true_words)
    local log_prob_sum = 0
    local N = true_words:size(1)
    for i = 1, N do
        -- `max` returns (values, indices); only the index of the true word is
        -- needed. Both are local so nothing leaks into the global scope
        -- (the previous version wrote to the globals `mm` and `aa`).
        local _, true_index = true_words[i]:max(1)
        log_prob_sum = log_prob_sum + math.log(distribution[{i, true_index[1]}])
    end
    return math.exp(-log_prob_sum / N)
end

In [95]:
-- Second version: reversing the order in F_c_w

--- Build the count dictionaries of an N-gram count tensor.
-- Each row of `count_tensor` is (index1, ..., indexN, count). The context key
-- is the string 'index1-...-indexN-1'; for a two-column tensor (unigrams) the
-- key is the word index itself.
-- @param count_tensor (number of N-grams, N + 1) tensor of indexes and counts
-- @return F_c_w {context key : {indexN : count}}
-- @return F_c   {context key : sum of the counts of the rows with that context}
-- @return N_c   {context key : number of rows with that context}
function build_context_count(count_tensor)
    local ngram_counts = {}
    local context_totals = {}
    local context_types = {}

    local n_rows = count_tensor:size(1)
    local n_cols = count_tensor:size(2)

    for row = 1, n_rows do
        local word = count_tensor[{row, n_cols - 1}]

        -- Context key: column 1, then columns 2 .. n_cols - 2, joined by '-'
        local parts = { tostring(count_tensor[{row, 1}]) }
        for col = 2, n_cols - 2 do
            parts[#parts + 1] = tostring(count_tensor[{row, col}])
        end
        local context = table.concat(parts, '-')

        local count = count_tensor[{row, n_cols}]

        -- Count of this exact N-gram (a repeated row overwrites the value)
        local by_word = ngram_counts[context]
        if by_word == nil then
            ngram_counts[context] = {[word] = count}
        else
            by_word[word] = count
        end

        -- Per-context totals: summed count and number of rows seen
        local running_total = context_totals[context]
        if running_total == nil then
            context_totals[context] = count
            context_types[context] = 1
        else
            context_totals[context] = count + running_total
            context_types[context] = 1 + context_types[context]
        end
    end

    return ngram_counts, context_totals, context_types
end

In [109]:
--- Additive-smoothed maximum likelihood estimate for one entry.
-- The first 50 columns of `entry` are the candidate words; the context key is
-- built from the last column and the columns just before it, joined with '-'
-- (N - 1 columns when N >= 2).
-- @param N N-gram size used for the prediction
-- @param entry (1, n_cols) row: 50 candidate words followed by the context
-- @param F_c_w {context key : {word index : count}}
-- @param alpha value added to every candidate count
-- @return tensor (50): counts + alpha divided by their sum; every candidate
--         gets alpha before normalisation when the context is unseen
function compute_mle_line(N, entry, F_c_w, alpha)
    local scores = torch.zeros(50)
    local last_col = entry:size(2)

    -- Context key, built right to left starting from the last column
    local context = tostring(entry[{1, last_col}])
    for col = last_col - 1, last_col - N + 2, -1 do
        context = tostring(entry[{1, col}]) .. '-' .. context
    end

    local seen_words = F_c_w[context]
    if seen_words == nil then
        -- Unseen context: every candidate gets the same pseudo-count
        scores:fill(alpha)
    else
        for j = 1, 50 do
            local count = seen_words[entry[{1, j}]]
            if count == nil then
                -- Candidate never seen after this context
                scores[j] = alpha
            else
                scores[j] = count + alpha
            end
        end
    end

    return scores:div(scores:sum())
end

In [97]:
-- MLE prediction with additive (alpha) smoothing, without back-off or
-- interpolation.

--- Predict a distribution over the 50 candidates for every row of `data`.
-- Loads the training counts of order N with get_train, builds the count
-- dictionary with build_context_count and applies compute_mle_line row by row.
-- @param N N-gram size
-- @param data (N_data, n_cols) entries to predict
-- @param alpha smoothing value forwarded to compute_mle_line
-- @return tensor (N_data, 50) with one predicted distribution per row
function mle_proba(N, data, alpha)
    local n_rows = data:size(1)

    -- "Training" is just counting: only the first dictionary (F_c_w) is kept
    local F_c_w = build_context_count(get_train(N))

    local distribution = torch.zeros(n_rows, 50)
    for row = 1, n_rows do
        local line = compute_mle_line(N, data:narrow(1, row, 1), F_c_w, alpha)
        distribution:narrow(1, row, 1):copy(line)
    end

    return distribution
end

In [120]:
-- Test of the MLE version: 2-gram model on the 2-gram validation entries,
-- with alpha = 0.01. The result is kept in the global distribution_mle.
distribution_mle = mle_proba(2, validation_2, 0.01)

In [121]:
-- Perplexity of the smoothed 2-gram MLE distribution against validation_output.
print('Result on alpha smoothing 2grams', perplexity(distribution_mle, validation_output))

Result on alpha smoothing 2grams	6.0175980155407	


# Witten Bell Model

In [122]:
--- Interpolated Witten-Bell estimate for one entry.
-- Falls back to the model of order N - 1 when the context is unseen, or when
-- none of the 50 candidates was seen after that context.
--
-- The counts used in the denominator are restricted to the 50 candidate words
-- of the entry: the distribution is built on a sub-sample of the dictionary,
-- so only the counts of these words are used.
-- @param N N-gram size used for the prediction (the context has N - 1 words)
-- @param entry (1, n_cols) row: 50 candidate words followed by the context
-- @param F_c_w_table {order : {context key : {word index : count}}}
-- @param alpha added to the unigram counts; only used at order 1
-- @return tensor (50) with the scores of the 50 candidates
function compute_wb_line(N, entry, F_c_w_table, alpha)
    local scores = torch.zeros(50)

    -- Order 1: smoothed unigram counts, normalised over the candidates
    if N == 1 then
        local unigrams = F_c_w_table[1]
        for j = 1, 50 do
            local word = entry[{1, j}]
            local bucket = unigrams[tostring(word)]
            if bucket ~= nil then
                scores[j] = bucket[word] + alpha
            else
                -- Candidate absent from the dictionary (case for <s>)
                scores[j] = 0
            end
        end
        return scores:div(scores:sum(1)[1])
    end

    -- Context key, built right to left starting from the last column
    local last_col = entry:size(2)
    local context = tostring(entry[{1, last_col}])
    for col = last_col - 1, last_col - N + 2, -1 do
        context = tostring(entry[{1, col}]) .. '-' .. context
    end

    -- Unseen context: use the lower-order model directly
    local seen_words = F_c_w_table[N][context]
    if seen_words == nil then
        return compute_wb_line(N - 1, entry, F_c_w_table, alpha)
    end

    -- Raw counts of the candidates seen after this context.
    -- local_total accumulates (count + 1) per seen candidate, i.e. the summed
    -- counts plus the number of distinct seen candidates.
    local local_total = 0
    local local_types = 0
    for j = 1, 50 do
        local count = seen_words[entry[{1, j}]]
        if count ~= nil then
            scores[j] = count
            local_total = local_total + count + 1
            local_types = local_types + 1
        end
    end

    -- No candidate seen after this context: use the lower-order model
    if scores:sum(1)[1] == 0 then
        return compute_wb_line(N - 1, entry, F_c_w_table, alpha)
    end

    -- (counts + local_types * p_lower) / local_total
    local lower = compute_wb_line(N - 1, entry, F_c_w_table, alpha)
    scores:add(lower:mul(local_types)):div(local_total)
    -- No further renormalisation of the result is applied here.
    return scores
end

In [123]:
-- Witten Bell: new version, computation done at once line by line
--
-- p_wb(w|c) = (F_c_w + N_c_. * p_wb(w|c'))/(N_c_. + F_c_.)

--- Predict a Witten-Bell distribution for every row of `data`.
-- Builds the count dictionaries of every order from 1 to N, then applies
-- compute_wb_line to each row.
-- @param N highest N-gram order used
-- @param data (N_data, n_cols) entries to predict
-- @param alpha smoothing value forwarded to compute_wb_line
-- @return tensor (N_data, 50) with one row of scores per entry
function distribution_proba_WB(N, data, alpha)
    local N_data = data:size(1)

    -- Count dictionaries for each order i = 1..N. Only the first value
    -- returned by build_context_count (F_c_w) is stored.
    local F_c_w_table = {}
    for i = 1, N do
        -- `train` is local: the previous version overwrote a global `train`
        local train = get_train(i)
        F_c_w_table[i] = build_context_count(train)
    end

    local distribution = torch.zeros(N_data, 50)
    for i = 1, N_data do
        -- Compute Witten-Bell for the whole line i
        distribution:narrow(1, i, 1):copy(compute_wb_line(N, data:narrow(1, i, 1), F_c_w_table, alpha))
    end
    -- Row normalisation intentionally left disabled, as in the original:
    --distribution:cdiv(distribution:sum(2):expand(distribution:size(1), distribution:size(2)))
    return distribution
end

In [126]:
-- Witten-Bell with orders 1..2 on the 2-gram validation entries, alpha = 3.
distribution_wb = distribution_proba_WB(2, validation_2, 3)

In [127]:
-- NOTE(review): the label still reads 'alpha smoothing 2grams', but
-- distribution_wb is produced by distribution_proba_WB (Witten-Bell).
print('Result on alpha smoothing 2grams', perplexity(distribution_wb, validation_output))

Result on alpha smoothing 2grams	3.6367209232302	


In [204]:
-- Witten-Bell with orders 1..6 on the 6-gram validation entries, alpha = 10.
-- This overwrites the global distribution_wb.
distribution_wb = distribution_proba_WB(6, validation_6, 10)
-- NOTE(review): the label still reads 'alpha smoothing 2grams' although this
-- evaluates the Witten-Bell model built with N = 6.
print('Result on alpha smoothing 2grams', perplexity(distribution_wb, validation_output))

Result on alpha smoothing 2grams	3.6201823235489	


# Modified Kneser-Ney smoothing

In [128]:
-- Version tailored for modified Kneser-Ney

--- Build the count dictionaries and discounts used by modified Kneser-Ney.
-- Each row of `count_tensor` is (index1, ..., indexN, count). The context key
-- is the string 'index1-...-indexN-1' (the word index itself for unigrams).
--
-- Counts are expected to be >= 1: a count of 0 has no slot in the tables
-- below. D[k] divides by the number of N-grams with count k, so it is not
-- finite when no N-gram has that count.
-- @param count_tensor (number of N-grams, N + 1) tensor of indexes and counts
-- @param K number of separate count cases (need K > 1, usually K = 3)
-- @return F_c_w     {context key : {indexN : count}}
-- @return F_c       {context key : sum of the counts of the rows with that context}
-- @return N_c_split {k : {context key : number of rows whose count is k,
--                   counts above K being grouped under k = K}}
-- @return D         {k : k - (k + 1) * Y * n[k + 1] / n[k]} for k = 1..K, with
--                   Y = n[1] / (n[1] + 2 * n[2]) and n[k] the number of rows
--                   whose count is exactly k
function build_context_count_split(count_tensor, K)
    local F_c_w = {}
    local F_c = {}
    local N_c_split = {}
    for k = 1, K do
        N_c_split[k] = {}
    end
    -- n_table[k]: number of N-grams with exactly k occurrences, k = 1..K+1
    local n_table = {}
    for k = 1, K + 1 do
        n_table[k] = 0
    end

    local N = count_tensor:size(1)
    local M = count_tensor:size(2)

    for i = 1, N do
        local indexN = count_tensor[{i, M - 1}]
        local count = count_tensor[{i, M}]

        -- Build the key index1-...-indexN-1. `indexes` and `key_N_c` are
        -- locals: the previous version leaked both into the global scope.
        local indexes = tostring(count_tensor[{i, 1}])
        for j = 2, M - 2 do
            indexes = indexes .. '-' .. tostring(count_tensor[{i, j}])
        end

        -- Filling F_c_w
        if F_c_w[indexes] == nil then
            F_c_w[indexes] = {[indexN] = count}
        else
            F_c_w[indexes][indexN] = count
        end

        -- Counts above K share the last bucket of N_c_split
        local key_N_c = math.min(count, K)
        N_c_split[key_N_c][indexes] = (N_c_split[key_N_c][indexes] or 0) + 1

        -- Updating F_c
        F_c[indexes] = (F_c[indexes] or 0) + count

        -- Updating n_table (only exact counts up to K + 1 are tracked)
        if count <= K + 1 then
            n_table[count] = n_table[count] + 1
        end
    end

    -- Compute the discounts D from n_table
    local Y = n_table[1] / (n_table[1] + 2 * n_table[2])
    local D = {}
    for k = 1, K do
        D[k] = k - (1 + k) * Y * n_table[1 + k] / n_table[k]
    end
    return F_c_w, F_c, N_c_split, D
end

In [129]:
--- Modified Kneser-Ney scores for one entry.
-- Falls back to the model of order N - 1 when the context is unseen, or when
-- the discounted counts of the 50 candidates sum to 0.
--
-- The result is not normalised here (except at order 1); the caller is
-- expected to normalise it.
-- @param N N-gram size used for the prediction (the context has N - 1 words)
-- @param entry (1, n_cols) row: 50 candidate words followed by the context
-- @param F_c_w_table {order : {context key : {word index : count}}}
-- @param F_c_table {order : {context key : summed count of the context}}
-- @param N_c_split_table {order : {k : {context key : number of words whose
--        count falls in bucket k}}}
-- @param D_table {order : {k : discount}}
-- @param alpha added to the unigram counts; only used at order 1
-- @param K number of discount buckets (counts above K use bucket K)
-- @return tensor (50) with the scores of the 50 candidates
function compute_mkn_line(N, entry, F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K)
    local prediction = torch.zeros(50)
    local indexN

    -- Case where the computation only uses the prior (unigram counts)
    if N == 1 then
        for j = 1, 50 do
            indexN = entry[{1, j}]
            -- Corner case: candidate not in the dictionary (case for <s>)
            if F_c_w_table[1][tostring(indexN)] == nil then
                prediction[j] = 0
            else
                prediction[j] = F_c_w_table[1][tostring(indexN)][indexN] + alpha
            end
        end
        -- Normalizing
        return prediction:div(prediction:sum(1)[1])
    end

    -- Context key (at least one element), built right to left
    local indexes = tostring(entry[{1, entry:size(2)}])
    for j = entry:size(2) - 1, entry:size(2) - 1 - (N - 3), -1 do
        indexes = tostring(entry[{1, j}]) .. '-' .. indexes
    end

    -- Unseen context: go to the next (shorter) context
    local counts = F_c_w_table[N][indexes]
    if counts == nil then
        return compute_mkn_line(N - 1, entry, F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K)
    end

    -- Current order: discounted count of every seen candidate
    for j = 1, 50 do
        indexN = entry[{1, j}]
        local count = counts[indexN]
        if count ~= nil then
            -- Discount bucket; local, whereas the previous version wrote it
            -- to a global `key_N_c`
            local key_N_c = math.min(count, K)
            prediction[j] = count - D_table[N][key_N_c]
        end
    end

    -- Nothing predicted at this order: go to the next context
    if prediction:sum(1)[1] == 0 then
        return compute_mkn_line(N - 1, entry, F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K)
    end

    -- Weight of the lower-order model: sum over k of D[k] * N_k(context)
    local gamma = 0
    for k = 1, K do
        if N_c_split_table[N][k][indexes] ~= nil then
            gamma = gamma + D_table[N][k] * N_c_split_table[N][k][indexes]
        end
    end
    if gamma == 0 then
        print('gamma error')
    end

    -- (discounted counts + gamma * p_lower) / F_c(context)
    prediction:add(compute_mkn_line(N - 1, entry, F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K):mul(gamma)):div(F_c_table[N][indexes])
    return prediction
end

In [138]:
-- Modified Kneser Ney: computation done at once line by line
--
-- score(w|c) = (F_c_w - D[k(F_c_w)] + gamma(c) * score(w|c')) / F_c
-- with gamma(c) = sum_k D[k] * N_k(c); each row is normalised at the end.

--- Predict a modified Kneser-Ney distribution for every row of `data`.
-- Builds the count dictionaries and discounts of every order from 1 to N,
-- applies compute_mkn_line to each row and divides each row by its sum.
-- @param N highest N-gram order used
-- @param data (N_data, n_cols) entries to predict
-- @param alpha smoothing value forwarded to compute_mkn_line
-- @param K number of discount buckets
-- @return tensor (N_data, 50) whose rows are divided by their sum
function distribution_proba_mKN(N, data, alpha, K)
    local N_data = data:size(1)

    -- Count dictionaries and discounts for each order i = 1..N
    local F_c_w_table = {}
    local F_c_table = {}
    local N_c_split_table = {}
    local D_table = {}
    for i = 1, N do
        -- `train` is local: the previous version overwrote a global `train`
        local train = get_train(i)
        F_c_w_table[i], F_c_table[i], N_c_split_table[i], D_table[i] = build_context_count_split(train, K)
    end

    local distribution = torch.zeros(N_data, 50)
    for i = 1, N_data do
        -- Compute modified Kneser-Ney for the whole line i
        distribution:narrow(1, i, 1):copy(compute_mkn_line(N, data:narrow(1, i, 1), F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K))
    end
    -- Normalise every row by its sum
    distribution:cdiv(distribution:sum(2):expand(distribution:size(1), distribution:size(2)))
    return distribution
end

In [135]:
-- Modified Kneser-Ney with orders 1..4 on the 4-gram validation entries,
-- alpha = 1 and K = 3 discount buckets.
distribution_mkn = distribution_proba_mKN(4, validation_4, 1, 3)

In [137]:
-- NOTE(review): the label still reads 'alpha smoothing 2grams', but
-- distribution_mkn is produced by distribution_proba_mKN with N = 4.
print('Result on alpha smoothing 2grams', perplexity(distribution_mkn, validation_output))

Result on alpha smoothing 2grams	4.283108778554	


In [16]:
-- Test on the current best configuration with 6-grams (alpha = 7, K = 2).
-- The result is stored in the global `distribution`.
distribution = distribution_proba_mKN(6, validation_6, 7, 2)
print('Result on 6grams, alpha 7, K 2', perplexity(distribution, validation_output))

Result on 6grams, alpha 7, K 2	4.2067088977711	


# New local normalization on mKN

In [139]:
-- Version tailored for modified Kneser-Ney.
-- Modification: the discounts D are no longer computed here; the function
-- returns per-word tables so that D can be computed locally
-- (based on the sub-vocabulary used in the validation and test sets).

--- Build the N-gram counts and the per-word count-of-counts table.
-- Each row of `count_tensor` is (index1, ..., indexN, count). The context key
-- is the string 'index1-...-indexN-1' (the word index itself for unigrams).
--
-- Counts are expected to be >= 1: a count of 0 has no slot in n_table.
-- @param count_tensor (number of N-grams, N + 1) tensor of indexes and counts
-- @param K number of separate count cases (need K > 1, usually K = 3)
-- @return F_c_w   {context key : {indexN : count}}
-- @return n_table {k : {indexN : number of N-grams ending with indexN that
--                 have exactly k occurrences}} for k = 1..K+1
function build_context_count_split2(count_tensor, K)
    local F_c_w = {}
    local n_table = {}
    for k = 1, K + 1 do
        n_table[k] = {}
    end

    local N = count_tensor:size(1)
    local M = count_tensor:size(2)

    for i = 1, N do
        local indexN = count_tensor[{i, M - 1}]
        local count = count_tensor[{i, M}]

        -- Build the key index1-...-indexN-1. `indexes` is local: the previous
        -- version leaked it (and an unused `key_N_c`) into the global scope.
        local indexes = tostring(count_tensor[{i, 1}])
        for j = 2, M - 2 do
            indexes = indexes .. '-' .. tostring(count_tensor[{i, j}])
        end

        -- Filling F_c_w
        if F_c_w[indexes] == nil then
            F_c_w[indexes] = {[indexN] = count}
        else
            F_c_w[indexes][indexN] = count
        end

        -- Updating n_table (only exact counts up to K + 1 are tracked)
        if count <= K + 1 then
            local per_word = n_table[count]
            per_word[indexN] = (per_word[indexN] or 0) + 1
        end
    end

    return F_c_w, n_table
end

In [217]:
-- V2: with local normalization on the validation sub vocabulary

--- Modified Kneser-Ney scores for one entry, with counts restricted to the
-- 50 candidate words of the entry.
-- Falls back to the model of order N - 1 when the context is unseen, when the
-- discounted counts of the candidates sum to 0, or when gamma is negative.
--
-- NOTE(review): when D is nil it is computed at the first order that reaches
-- the discount step and is then passed unchanged to the lower-order recursive
-- calls, so lower orders reuse it rather than computing their own. Confirm
-- that this is intended.
-- @param N N-gram size used for the prediction (the context has N - 1 words)
-- @param entry (1, n_cols) row: 50 candidate words followed by the context
-- @param F_c_w_table {order : {context key : {word index : count}}}
-- @param n_table {order : {k : {word index : number of N-grams ending with
--        that word with exactly k occurrences}}}
-- @param alpha added to the unigram counts; only used at order 1
-- @param K number of discount buckets (counts above K use bucket K)
-- @param D optional {k : discount}; computed from the local counts when nil
-- @return tensor (50) with the scores of the 50 candidates
function compute_mkn_line2(N, entry, F_c_w_table, n_table, alpha, K, D)
    local prediction = torch.zeros(50)
    local indexN

    -- Case where the computation only uses the prior (unigram counts)
    if N == 1 then
        for j = 1, 50 do
            indexN = entry[{1, j}]
            -- Corner case: candidate not in the dictionary (case for <s>)
            if F_c_w_table[1][tostring(indexN)] == nil then
                prediction[j] = 0
            else
                prediction[j] = F_c_w_table[1][tostring(indexN)][indexN] + alpha
            end
        end
        -- Normalizing
        return prediction:div(prediction:sum(1)[1])
    end

    -- Number of seen candidates per discount bucket, for this entry
    local N_c_local = {}
    for k = 1, K do
        N_c_local[k] = 0
    end
    -- Count-of-counts restricted to the 50 candidates of this entry
    local n_table_local = {}
    for k = 1, K + 1 do
        n_table_local[k] = 0
    end

    -- Context key (at least one element), built right to left
    local indexes = tostring(entry[{1, entry:size(2)}])
    for j = entry:size(2) - 1, entry:size(2) - 1 - (N - 3), -1 do
        indexes = tostring(entry[{1, j}]) .. '-' .. indexes
    end

    -- Unseen context: go to the next (shorter) context
    local counts = F_c_w_table[N][indexes]
    if counts == nil then
        return compute_mkn_line2(N - 1, entry, F_c_w_table, n_table, alpha, K, D)
    end

    -- Building the local n_table over the candidates
    for j = 1, 50 do
        indexN = entry[{1, j}]
        for k = 1, K + 1 do
            -- There may be no N-gram ending with indexN that has count k
            local per_word = n_table[N][k][indexN]
            if per_word ~= nil then
                n_table_local[k] = n_table_local[k] + per_word
            end
        end
    end

    -- Replace empty buckets by 1 so the discount formula never divides by 0.
    -- indexN is the last candidate of the entry at this point.
    for k = 1, K + 1 do
        if n_table_local[k] == 0 then
            print('0 count in n_table_local for ', indexN, k, N)
            n_table_local[k] = 1
        end
    end

    -- Local discounts, only when the caller did not provide D
    if D == nil then
        local Y = n_table_local[1] / (n_table_local[1] + 2 * n_table_local[2])
        D = {}
        for k = 1, K do
            D[k] = k - (1 + k) * Y * n_table_local[1 + k] / n_table_local[k]
        end
    end

    -- Current order: discounted count of every seen candidate
    local F_local = 0
    for j = 1, 50 do
        indexN = entry[{1, j}]
        local count = counts[indexN]
        if count ~= nil then
            -- Discount bucket; local, whereas the previous version wrote it
            -- to a global `key_N_c`
            local key_N_c = math.min(count, K)
            prediction[j] = count - D[key_N_c]
            F_local = F_local + count
            N_c_local[key_N_c] = N_c_local[key_N_c] + 1
        end
    end

    -- Nothing predicted at this order: go to the next context
    if prediction:sum(1)[1] == 0 then
        return compute_mkn_line2(N - 1, entry, F_c_w_table, n_table, alpha, K, D)
    end

    -- Weight of the lower-order model: sum over k of D[k] * N_c_local[k]
    -- (N_c_local[k] is always set, so no nil check is needed)
    local gamma = 0
    for k = 1, K do
        gamma = gamma + D[k] * N_c_local[k]
    end
    if gamma < 0 then
        return compute_mkn_line2(N - 1, entry, F_c_w_table, n_table, alpha, K, D)
    end

    -- (discounted counts + gamma * p_lower) / F_local
    prediction:add(compute_mkn_line2(N - 1, entry, F_c_w_table, n_table, alpha, K, D):mul(gamma)):div(F_local)
    return prediction
end

In [158]:
-- Modified Kneser Ney (local version): computation done at once line by line
--
-- score(w|c) = (F_c_w - D[k(F_c_w)] + gamma(c) * score(w|c')) / F_local(c)
-- where gamma(c) and F_local(c) only use the 50 candidate words of the line.

--- Predict modified Kneser-Ney scores for every row of `data`.
-- Builds the count dictionaries of every order from 1 to N, then applies
-- compute_mkn_line2 to each row. No final row normalisation is applied.
-- @param N highest N-gram order used
-- @param data (N_data, n_cols) entries to predict
-- @param alpha smoothing value forwarded to compute_mkn_line2
-- @param K number of discount buckets
-- @param D optional {k : discount} forwarded to compute_mkn_line2
-- @return tensor (N_data, 50) with one row of scores per entry
function distribution_proba_mKN2(N, data, alpha, K, D)
    local N_data = data:size(1)

    -- Count dictionaries for each order i = 1..N
    local F_c_w_table = {}
    local n_table = {}
    for i = 1, N do
        -- `train` is local: the previous version overwrote a global `train`
        local train = get_train(i)
        F_c_w_table[i], n_table[i] = build_context_count_split2(train, K)
    end

    local distribution = torch.zeros(N_data, 50)
    for i = 1, N_data do
        -- Compute modified Kneser-Ney for the whole line i
        distribution:narrow(1, i, 1):copy(compute_mkn_line2(N, data:narrow(1, i, 1), F_c_w_table, n_table, alpha, K, D))
    end
    -- Row normalisation intentionally left disabled, as in the original:
    --distribution:cdiv(distribution:sum(2):expand(distribution:size(1), distribution:size(2)))
    return distribution
end

In [152]:
-- Local modified Kneser-Ney with orders 1..3, alpha = 15, K = 3; D is omitted
-- so the discounts are computed inside compute_mkn_line2.
-- NOTE(review): the variable is named distribution_6_mkn although the model
-- is built with N = 3 on validation_3.
distribution_6_mkn = distribution_proba_mKN2(3, validation_3, 15, 3)

print('Perplexity:', perplexity(distribution_6_mkn, validation_output))

In [162]:
-- Grid search on the discounts D: every combination of the three candidate
-- values for D[1], D[2] and D[3] (3 * 3 * 3 = 27 runs) is evaluated on the
-- validation set with the 3-gram local mKN model (alpha = 15, K = 3).
-- perp[c] holds the perplexity of run c and indextoD[c] the D table used.
perp = torch.zeros(27)
indextoD = {}
D1 = {0.4, 0.5, 0.6}
D2 = {1.2, 1.3, 1.4}
D3 = {1.6, 1.75, 1.9}
count = 1

for i=1, 3 do
    for j=1,3 do
        for k=1,3 do
            -- A new table is created for every run, so the entries stored in
            -- indextoD are distinct tables.
            D = {[1] = D1[i], [2] = D2[j], [3] = D3[k]}
            -- Each call reloads the training files and rebuilds the counts.
            distribution_mkn = distribution_proba_mKN2(3, validation_3, 15, 3, D)
            p = perplexity(distribution_mkn, validation_output)
            print('Perplexity:', p, 'D', D)
            perp[count] = p
            indextoD[count] = D
            count = count + 1
        end
    end
end

Perplexity:	3.5349357395564	D	{
  1 : 0.4
  2 : 1.2
  3 : 1.6
}


Perplexity:	3.5330696515949	D	{
  1 : 0.4
  2 : 1.2
  3 : 1.75
}


Perplexity:	3.5332328151701	D	{
  1 : 0.4
  2 : 1.2
  3 : 1.9
}


Perplexity:	3.5404224547111	D	{
  1 : 0.4
  2 : 1.3
  3 : 1.6
}


Perplexity:	3.5386501324031	D	{
  1 : 0.4
  2 : 1.3
  3 : 1.75
}


Perplexity:	3.5389002759927	D	{
  1 : 0.4
  2 : 1.3
  3 : 1.9
}


Perplexity:	3.5477734285685	D	{
  1 : 0.4
  2 : 1.4
  3 : 1.6
}


Perplexity:	3.5460833817531	D	{
  1 : 0.4
  2 : 1.4
  3 : 1.75
}


Perplexity:	3.5464116718229	D	{
  1 : 0.4
  2 : 1.4
  3 : 1.9
}


Perplexity:	3.4978931128999	D	{
  1 : 0.5
  2 : 1.2
  3 : 1.6
}


Perplexity:	3.4965478301411	D	{
  1 : 0.5
  2 : 1.2
  3 : 1.75
}


Perplexity:	3.4971593351065	D	{
  1 : 0.5
  2 : 1.2
  3 : 1.9
}


Perplexity:	3.5036019166258	D	{
  1 : 0.5
  2 : 1.3
  3 : 1.6
}


Perplexity:	3.5023434772225	D	{
  1 : 0.5
  2 : 1.3
  3 : 1.75
}


Perplexity:	3.5030362842412	D	{
  1 : 0.5
  2 : 1.3
  3 : 1.9
}


Perplexity:	3.5111287870464	D	{
  1 : 0.5
  2 : 1.4
  3 : 1.6
}


Perplexity:	3.5099467633812	D	{
  1 : 0.5
  2 : 1.4
  3 : 1.75
}


Perplexity:	3.5107129355715	D	{
  1 : 0.5
  2 : 1.4
  3 : 1.9
}


Perplexity:	3.4782131364604	D	{
  1 : 0.6
  2 : 1.2
  3 : 1.6
}


Perplexity:	3.4773065411832	D	{
  1 : 0.6
  2 : 1.2
  3 : 1.75
}


Perplexity:	3.4783068978757	D	{
  1 : 0.6
  2 : 1.2
  3 : 1.9
}


Perplexity:	3.4841469964138	D	{
  1 : 0.6
  2 : 1.3
  3 : 1.6
}


Perplexity:	3.4833220525745	D	{
  1 : 0.6
  2 : 1.3
  3 : 1.75
}


Perplexity:	3.4843995222808	D	{
  1 : 0.6
  2 : 1.3
  3 : 1.9
}


Perplexity:	3.491870129018	D	{
  1 : 0.6
  2 : 1.4
  3 : 1.6
}


Perplexity:	3.4911172611547	D	{
  1 : 0.6
  2 : 1.4
  3 : 1.75
}


Perplexity:	3.492264636559	D	{
  1 : 0.6
  2 : 1.4
  3 : 1.9
}


In [177]:
-- Pick the run with the lowest perplexity: `a` holds the index of the minimum
-- of perp, which is used to look up the corresponding D table.
m, a = perp:min(1)
print('Best D is', indextoD[a[1]])
-- best_D references the same table as indextoD[a[1]] (no copy is made).
best_D = indextoD[a[1]]

Best D is	{
  1 : 0.6
  2 : 1.2
  3 : 1.75
}


In [203]:
-- Add a fourth discount and rerun the 3-gram local mKN model with K = 4.
-- Since best_D is the table stored in indextoD, this also modifies that entry.
best_D[4] = 2.1
distribution_mkn = distribution_proba_mKN2(3, validation_3, 15, 4, best_D)

print('Perplexity:', perplexity(distribution_mkn, validation_output))

0 count in n_table_local for 	1454	5	3	


0 count in n_table_local for 	3720	5	3	


0 count in n_table_local for 	5928	5	3	


0 count in n_table_local for 	5000	5	3	


0 count in n_table_local for 	1038	5	3	


0 count in n_table_local for 	1480	5	3	


Perplexity:	3.4756255721969	


In [216]:
-- Mixing 2 models: linear interpolation w * mKN + (1 - w) * Witten-Bell.
-- Uses whatever the globals distribution_mkn and distribution_wb currently
-- hold; both must have the same size.
w = 0.7

distribution_mixed = torch.mul(distribution_mkn, w):add(torch.mul(distribution_wb,(1 - w)))
print('Perplexity:', perplexity(distribution_mixed, validation_output))

Perplexity:	3.4619516690308	


# Applying on test Kaggle

In [51]:
-- Applying on test: Witten-Bell with orders 1..6 on the 6-gram test entries,
-- alpha = 8.
distribution_test = distribution_proba_WB(6, test_6, 8)

In [24]:
-- Saving the current best model: writes distribution_test under the
-- 'distribution' key of the HDF5 file 'pred_test_wb_fnorm_6'.
myFile = hdf5.open('pred_test_wb_fnorm_6', 'w')
myFile:write('distribution', distribution_test)
myFile:close()

In [29]:
-- Build, as globals, the count tables of orders 1..3 with K = 3 so that
-- compute_mkn_line2 can be called by hand on a single entry.
F_c_w_table = {}
n_table = {}
for i=1,3 do
    train = get_train(i)
    F_c_w_table[i], n_table[i] = build_context_count_split2(train, 3)
end


In [49]:
-- Debug call on the single test entry 2357 with alpha = 5 and K = 3; D is
-- omitted, so the discounts are computed inside compute_mkn_line2.
-- NOTE(review): the recorded output below prints D tables and 'gamma error',
-- which the current compute_mkn_line2 does not print; it presumably comes
-- from an earlier version of the function.
t = compute_mkn_line2(3, test_3:narrow(1,2357,1), F_c_w_table, n_table, 5, 3)

{
  1 : 0.83764940239044
  2 : 1.4604098941657
  3 : -0.063403528742174
}
41	37.063403528742	


gamma error	
{
  1 : 0.71739130434783
  2 : 1.3678929765886
  3 : 1.0869565217391
}
37	0.28260869565217	
41	38.913043478261	


In [218]:
-- Test-set predictions of the two models, mixed with the same weight w as on
-- the validation set: 3-gram local mKN (alpha = 15, K = 4, best_D) and
-- 6-gram Witten-Bell (alpha = 10).
distribution_mkn_test = distribution_proba_mKN2(3, test_3, 15, 4, best_D)
distribution_wb_test = distribution_proba_WB(6, test_6, 10)
distribution_mixed_test = torch.mul(distribution_mkn_test, w):add(torch.mul(distribution_wb_test,(1 - w)))

0 count in n_table_local for 	1454	5	3	


0 count in n_table_local for 	1017	5	3	


0 count in n_table_local for 	5928	5	3	


In [220]:
-- Display the first row of the mixed test prediction.
distribution_mixed_test:narrow(1,1,1)

Columns 1 to 10
 0.0021  0.0002  0.0006  0.0034  0.0006  0.0002  0.0002  0.0002  0.0027  0.0002

Columns 11 to 20
 0.0002  0.0004  0.0002  0.0002  0.0002  0.0002  0.0002  0.0010  0.0002  0.0002

Columns 21 to 30
 0.0004  0.0001  0.4397  0.0002  0.0002  0.0002  0.0002  0.0007  0.0002  0.0006

Columns 31 to 40
 0.0002  0.0002  0.0002  0.0002  0.0004  0.0002  0.0002  0.0002  0.0003  0.0002

Columns 41 to 50
 0.3504  0.1890  0.0002  0.0003  0.0002  0.0003  0.0002  0.0011  0.0002  0.0003
[torch.DoubleTensor of size 1x50]



In [219]:
-- Saving the current best model: writes distribution_mixed_test under the
-- 'distribution' key of the HDF5 file 'pred_test_mixed_cb'.
myFile = hdf5.open('pred_test_mixed_cb', 'w')
myFile:write('distribution', distribution_mixed_test)
myFile:close()

# Accuracy

In [12]:
-- Column index of the maximum of each row of validation_output, kept in
-- true_output (the position of the true word when the rows are one-hot).
m, true_output = validation_output:max(2)

In [13]:
--- Fraction of rows whose highest-scoring column matches the true index.
-- @param pred (N, n_candidates) predicted scores; the argmax is taken over
--        dimension 2
-- @param true_output (N, 1) index of the true candidate for each row
-- @return number of matching rows divided by true_output:size(1)
function compute_accuracy(pred, true_output)
    -- Everything is local: the previous version wrote its intermediate
    -- values to the globals `max`, `argmax`, `acc` and `score`.
    local _, argmax = pred:max(2)
    local n_rows = true_output:size(1)
    local hits = 0
    for i = 1, n_rows do
        if argmax[i][1] == true_output[i][1] then
            hits = hits + 1
        end
    end
    return hits / n_rows
end

In [14]:
-- Accuracy of a prediction against true_output.
-- NOTE(review): distribution_2 is not defined anywhere in the visible part of
-- this file; it presumably comes from a cell that is not shown. Confirm.
print('Result on alpha smoothing ', compute_accuracy(distribution_2, true_output))


Result on alpha smoothing 	0.59614243323442	
