In [1]:
require 'nn'
require 'hdf5'

# Loading data

In [2]:
-- Train tensors have shape (number of N-grams, N + 1): the first N columns
-- hold the word indexes of the N-gram and the last column holds its count.

-- Validation tensors have shape (number of words to predict, 50 + N - 1):
-- the first 50 columns are the candidate words for the prediction and the
-- remaining columns hold the current context (the goal is to predict the
-- Nth word).

-- Unigrams: only the training counts are kept.
myFile = hdf5.open('1-grams.hdf5','r')
data = myFile:all()
train_1 = data['train']
myFile:close()

-- Bigrams: training counts, validation entries and the validation targets
-- ('valid_output', later passed to perplexity() as true_words).
myFile = hdf5.open('2-grams.hdf5','r')
data = myFile:all()
train_2 = data['train']
validation_2 = data['valid']
validation_output = data['valid_output']
myFile:close()

-- Trigrams: training counts and validation entries.
myFile = hdf5.open('3-grams.hdf5','r')
data = myFile:all()
train_3 = data['train']
validation_3 = data['valid']
myFile:close()

-- 4-grams: training counts and validation entries.
myFile = hdf5.open('4-grams.hdf5','r')
data = myFile:all()
train_4 = data['train']
validation_4 = data['valid']
myFile:close()

In [None]:
-- 5-grams: training counts, test entries and validation entries.
myFile = hdf5.open('5-grams.hdf5','r')
data = myFile:all()
train_5 = data['train']
test_5 = data['test']
validation_5 = data['valid']
myFile:close()

In [11]:
-- 6-grams: training counts, test entries and validation entries.
-- Note: validation_output is (re)assigned here from this file's
-- 'valid_output' dataset, replacing any value loaded previously.
myFile = hdf5.open('6-grams.hdf5','r')
data = myFile:all()
train_6 = data['train']
test_6 = data['test']
validation_output = data['valid_output']
validation_6 = data['valid']
myFile:close()

In [2]:
-- 7-grams: training counts, test entries and validation entries.
-- The file name now matches the *_7 variables (the original cell opened
-- '6-grams.hdf5', a copy-paste of the previous cell).
-- NOTE(review): assumes '7-grams.hdf5' has been generated like the others.
myFile = hdf5.open('7-grams.hdf5','r')
data = myFile:all()
train_7 = data['train']
test_7 = data['test']
validation_7 = data['valid']
myFile:close()

# Maximum Likelihood Estimation

In [3]:
-- Second version: F_c_w is keyed by context first, then by word.
--
-- Build the count dictionaries of an N-gram count tensor.
--
-- count_tensor: 2D tensor; with M = count_tensor:size(2), each row holds the
--   context word indexes in columns 1..M-2, the predicted word index in
--   column M-1 and the N-gram count in column M. For unigrams (M == 2) the
--   single word index is used both as the context key and as the word.
--
-- Returns three tables keyed by the context string 'index1-...-indexN-1':
--   F_c_w: {context: {word index: count}}
--   F_c:   {context: sum of the counts of every row with that context}
--   N_c:   {context: number of rows with that context}
function build_context_count(count_tensor)
    local F_c_w = {}
    local F_c = {}
    local N_c = {}

    local N = count_tensor:size(1)
    local M = count_tensor:size(2)

    for i=1, N do
        -- Locals: these used to leak into the global environment
        local indexN = count_tensor[{i, M-1}]
        local count = count_tensor[{i, M}]

        -- build the key index1-...-indexN-1
        local indexes = tostring(count_tensor[{i,1}])
        for j=2, M - 2 do
            indexes = indexes .. '-' .. tostring(count_tensor[{i,j}])
        end

        -- The three dictionaries always gain a context together, so a
        -- single nil check covers F_c_w, F_c and N_c.
        if F_c_w[indexes] == nil then
            F_c_w[indexes] = {[indexN] = count}
            F_c[indexes] = count
            N_c[indexes] = 1
        else
            F_c_w[indexes][indexN] = count
            F_c[indexes] = count + F_c[indexes]
            N_c[indexes] = 1 + N_c[indexes]
        end
    end

    return F_c_w, F_c, N_c
end

In [4]:
-- Prediction with the MLE (with Laplace smoothing)
--
-- data:  2D tensor; columns 1..50 of each row are the candidate word
--        indexes, columns 51..M the context word indexes.
-- F_c_w: {context key: {word index: count}} as returned by
--        build_context_count (context first, then word).
-- alpha: additive smoothing constant added to every candidate count.
--
-- Returns an (N, 50) DoubleTensor where each row is normalized over the 50
-- candidates only.
function mle_proba(data, F_c_w, alpha)
    local N = data:size(1)
    local M = data:size(2)
    local distribution = torch.DoubleTensor(N, 50)

    for i=1, N do
        -- build the key index1-...-indexN-1
        local indexes = tostring(data[{i,51}])
        for j=52, M do
            indexes = indexes .. '-' .. tostring(data[{i,j}])
        end
        -- Counts of the words seen after this context (nil if unseen).
        -- The lookup is context-first to agree with build_context_count.
        local context_counts = F_c_w[indexes]

        -- Look up in the dictionary for the 50 possible ngrams asked
        for j=1, 50 do
            local indexN = data[{i,j}]
            if context_counts == nil or context_counts[indexN] == nil then
                distribution[{i,j}] = alpha
            else
                distribution[{i,j}] = context_counts[indexN] + alpha
            end
        end
        -- Case where no n-gram was found (only possible when alpha=0.)
        if distribution:narrow(1,i,1):sum(2)[{1,1}] == 0 then
            -- Select the first one (most common)
            distribution[{i,1}] = 1
        end
    end
    -- normalization (ie we do the MLE given only the 50 possibilities)
    distribution:cdiv(torch.expand(distribution:sum(2), N, 50))

    return distribution
end

In [5]:
-- Load the 'train' dataset of the file '<N>-grams.hdf5'.
--
-- N: gram size, used to build the file name.
-- Returns the value stored under the 'train' key of the file.
function get_train(N)
    local filename = N .. '-grams.hdf5'
    --print(filename)
    -- Locals: the handle and the tensor used to leak into the globals
    local file = hdf5.open(filename,'r')
    local train = file:all()['train']
    file:close()
    return train
end

In [6]:
-- Prediction with the MLE (with Laplace smoothing) and back-off to a
-- shorter context when the full context was never seen in training.
--
-- data:  2D tensor; columns 1..50 of each row are the candidate word
--        indexes, columns 51..M the context word indexes.
-- N:     gram size of the highest order model (count tables for sizes
--        1..N are loaded through get_train).
-- alpha: additive smoothing constant.
--
-- Returns an (N_data, 50) DoubleTensor, each row normalized over the 50
-- candidates.
function mle_proba_2(data, N, alpha)
    local N_data = data:size(1)
    local M = data:size(2)
    local distribution = torch.DoubleTensor(N_data, 50)

    -- Key 'index-...-index' built from columns first_col..last_col of a row
    -- (empty string when the range is empty).
    local function context_key(row, first_col, last_col)
        local parts = {}
        for j = first_col, last_col do
            parts[#parts + 1] = tostring(data[{row, j}])
        end
        return table.concat(parts, '-')
    end

    -- build_context_count_2 is not defined in the visible file; fall back
    -- to build_context_count, which produces the context-first layout used
    -- below.
    local build = build_context_count_2 or build_context_count

    -- Building the count matrix for each ngram size up to N, needed to
    -- reduce the context when it was never seen.
    local F_c_w_table = {}
    for i=1,N do
        F_c_w_table[i] = (build(get_train(i)))
    end

    for i=1, N_data do
        local gram_size = N
        local indexes = context_key(i, 51, M)
        -- case context never seen before: try the reduced contexts made of
        -- the last k-1 context words, for k = N-1 down to 1
        if F_c_w_table[N][indexes] == nil then
            for k = N-1,1,-1 do
                gram_size = k
                indexes = context_key(i, M - k + 2, M)
                if F_c_w_table[k][indexes] ~= nil then
                    break
                end
            end
        end

        -- Look up in the dictionary for the 50 possible ngrams asked
        for j=1, 50 do
            local indexN = data[{i,j}]
            local key = indexes
            if gram_size == 1 then
                -- Unigram table is keyed by the word itself (as a string)
                key = tostring(indexN)
            end
            local counts = F_c_w_table[gram_size][key]
            -- case context or word never seen: smoothing only
            if counts == nil or counts[indexN] == nil then
                distribution[{i,j}] = alpha
            else
                distribution[{i,j}] = counts[indexN] + alpha
            end
        end

        -- case uniform prediction (because no word present in current context)
        if distribution:narrow(1,i,1):sum(2)[{1,1}] == 50 * alpha then
            -- TODO: try with previous context
            -- Select the first one (most common)
            distribution[{i,1}] = 1 + alpha
        end
    end
    -- normalization (ie we do the MLE given only the 50 possibilities)
    distribution:cdiv(torch.expand(distribution:sum(2), N_data, 50))

    return distribution
end

In [7]:
-- Perplexity: exp of the average negative log-probability given to the
-- true word of each line.
--
-- distribution: predicted probabilities, indexed as distribution[{i, j}].
-- true_words:   (N_words to predict, 50) one-hot rows; the position of the
--               max of row i is taken as the true word of line i.
-- Returns the perplexity as a number.
function perplexity(distribution, true_words)
    local N = true_words:size(1)
    local log_likelihood = 0
    for i = 1,N do
        -- locals: the max/argmax pair used to leak as globals mm, aa
        local _, true_index = true_words[i]:max(1)
        log_likelihood = log_likelihood + math.log(distribution[{i, true_index[1]}])
    end
    return math.exp(- log_likelihood/N)
end

In [8]:
-- Test of the new MLE version (mle_proba_2) on bigrams with alpha = 1
distribution_mle_2_new = mle_proba_2(validation_2, 2, 1)

[string "-- Prediction with the MLE (with Laplace smoo..."]:4: attempt to index local 'data' (a nil value)
stack traceback:
	[string "-- Prediction with the MLE (with Laplace smoo..."]:4: in function 'mle_proba_2'
	[string "-- Test of new mle version..."]:2: in main chunk
	[C]: in function 'xpcall'
	...colasdrizard/torch/install/share/lua/5.1/itorch/main.lua:179: in function <...colasdrizard/torch/install/share/lua/5.1/itorch/main.lua:143>
	...colasdrizard/torch/install/share/lua/5.1/lzmq/poller.lua:75: in function 'poll'
	...asdrizard/torch/install/share/lua/5.1/lzmq/impl/loop.lua:307: in function 'poll'
	...asdrizard/torch/install/share/lua/5.1/lzmq/impl/loop.lua:325: in function 'sleep_ex'
	...asdrizard/torch/install/share/lua/5.1/lzmq/impl/loop.lua:370: in function 'start'
	...colasdrizard/torch/install/share/lua/5.1/itorch/main.lua:350: in main chunk
	[C]: in function 'require'
	(command line):1: in main chunk
	[C]: at 0x01075eebb0: 

In [34]:
-- Perplexity of the mle_proba_2 bigram prediction on the validation set
print('Result on alpha smoothing 2grams', perplexity(distribution_mle_2_new, validation_output))

Result on alpha smoothing 2grams	8.8674198826233	


In [88]:
-- Bigram MLE (mle_proba) with alpha = 1 and its validation perplexity
F_c_w, F_c, N_c = build_context_count(train_2)
distribution_mle_2 = mle_proba(validation_2, F_c_w, 1)
print('Result on alpha smoothing 2grams', perplexity(distribution_mle_2, validation_output))

Result on alpha smoothing 2grams	8.8680603647573	


In [38]:
-- Test of the new MLE version (mle_proba_2) on trigrams with alpha = 1.
-- The printed label now says 3grams (it was a copy-pasted '2grams').
distribution_mle_3_new = mle_proba_2(validation_3, 3, 1)
print('Result on alpha smoothing 3grams', perplexity(distribution_mle_3_new, validation_output))

1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


Result on alpha smoothing 2grams	17.059968522337	


In [10]:
-- Debug cell: build the global F_c_w_table for gram sizes 1 to 3.
-- NOTE(review): build_context_count_2 is not defined in the visible part of
-- this file — confirm it exists or whether build_context_count was meant.
F_c_w_table = {}
for i=1,3 do
    train = get_train(i)
    F_c_w, F_c, N_c = build_context_count_2(train)
    F_c_w_table[i] = F_c_w
end

1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


In [11]:
-- Inspect row 3 of validation_2
validation_2:narrow(1, 3, 1)

Columns 1 to 13
  500   305  8425  5919  4090  4981  8611  8244  4804  2009  9910   471     6

Columns 14 to 26
 9904  1198   514  1061  5049  6289  8585  1697  4648  3676  7371  1867  4857

Columns 27 to 39
 3924  6939  8431  3086  4288  3138  4486  3949  3730  8607  7310  9582  9873

Columns 40 to 51
 6426  7482  1215  6676  9535  1417  6268  7276  1861  4232  7448  3884
[torch.LongTensor of size 1x51]



In [92]:
-- Inspect the unigram entry stored under the key '6594'
F_c_w_table[1]['6594']

{
  6594 : 10
}


In [7]:
-- Trigram MLE (mle_proba) with alpha = 1
F_c_w, F_c, N_c = build_context_count(train_3)
distribution_mle_3 = mle_proba(validation_3, F_c_w, 1)

In [8]:
-- 4-gram MLE (mle_proba) with alpha = 1
F_c_w, F_c, N_c = build_context_count(train_4)
distribution_mle_4 = mle_proba(validation_4, F_c_w, 1)

In [11]:
-- Validation perplexity of the mle_proba predictions for gram sizes 2 to 4
print('Result on alpha smoothing 2grams', perplexity(distribution_mle_2, validation_output))
print('Result on alpha smoothing 3grams', perplexity(distribution_mle_3, validation_output))
print('Result on alpha smoothing 4grams', perplexity(distribution_mle_4, validation_output))

Result on alpha smoothing 2grams	8.8680603647573	


Result on alpha smoothing 3grams	21.513973503713	


Result on alpha smoothing 4grams	29.942969799245	


# Witten Bell Model

In [8]:
-- Interpolated Witten-Bell prediction for a single validation entry.
--
-- The model drops to the next lower order when the context was never seen,
-- or when none of the 50 candidates was seen in that context.
--
-- N:      gram size used for the prediction (the context has N-1 words)
-- entry:  1-row tensor; columns 1..50 are the candidate word indexes and
--         the trailing columns are the context words
-- F_c_w_table, F_c_table, N_c_table: per gram size count dictionaries
-- alpha:  additive smoothing, only used by the context-free (N == 1) model
--
-- Returns a tensor of 50 scores (normalized only in the N == 1 case).
function compute_wb_line(N, entry, F_c_w_table, F_c_table, N_c_table, alpha)
    local prediction = torch.zeros(50)

    -- Base case: smoothed unigram counts, normalized over the candidates
    if N == 1 then
        local unigrams = F_c_w_table[1]
        for j = 1, 50 do
            local word = entry[{1, j}]
            local word_entry = unigrams[tostring(word)]
            if word_entry ~= nil then
                prediction[j] = word_entry[word] + alpha
            else
                -- Word absent from the dictionary (case for <s>)
                prediction[j] = 0
            end
        end
        return prediction:div(prediction:sum(1)[1])
    end

    -- Context key made of the last N-1 columns of the entry
    local last_col = entry:size(2)
    local context = tostring(entry[{1, last_col}])
    for j = last_col - 1, last_col - N + 2, -1 do
        context = tostring(entry[{1, j}]) .. '-' .. context
    end

    -- Unseen context: answer with the lower order model
    local context_counts = F_c_w_table[N][context]
    if context_counts == nil then
        return compute_wb_line(N-1, entry, F_c_w_table, F_c_table, N_c_table, alpha)
    end

    -- Raw counts of the candidates in this context
    for j = 1, 50 do
        local seen = context_counts[entry[{1, j}]]
        if seen ~= nil then
            prediction[j] = seen
        end
    end

    -- No candidate seen in this context: answer with the lower order model
    if prediction:sum(1)[1] == 0 then
        return compute_wb_line(N-1, entry, F_c_w_table, F_c_table, N_c_table, alpha)
    end

    -- (F_c_w + N_c * p_lower) / (N_c + F_c)
    local lower = compute_wb_line(N-1, entry, F_c_w_table, F_c_table, N_c_table, alpha)
    lower:mul(N_c_table[N][context])
    prediction:add(lower)
    prediction:div(N_c_table[N][context] + F_c_table[N][context])
    -- TODO: no normalization here although only 50 words are scored;
    -- the caller normalizes each row afterwards.
    return prediction
end

In [9]:
-- Witten Bell: new version, computation done at once line by line
--
-- p_wb(w|c) = (F_c_w + N_c_. * p_wb(w|c'))/(N_c_. + F_c_.)
--
-- N:     gram size of the highest order model
-- data:  2D tensor of entries (one row per word to predict)
-- alpha: additive smoothing forwarded to the unigram model
--
-- Returns an (N_data, 50) tensor whose rows are normalized to sum to 1.
function distribution_proba_WB(N, data, alpha)
    local N_data = data:size(1)

    -- Building the count dictionaries for each ngram size from 1 to N.
    local F_c_w_table = {}
    local F_c_table = {}
    local N_c_table = {}
    for i=1,N do
        -- local: the training tensor used to leak as the global `train`
        local train = get_train(i)
        F_c_w_table[i], F_c_table[i], N_c_table[i] = build_context_count(train)
    end

    local distribution = torch.zeros(N_data, 50)
    for i=1,N_data do
        -- Compute Witten-Bell for the whole line i
        distribution:narrow(1, i, 1):copy(compute_wb_line(N, data:narrow(1,i,1), F_c_w_table, F_c_table, N_c_table, alpha))
    end
    -- Row-wise normalization over the 50 candidates
    distribution:cdiv(distribution:sum(2):expand(distribution:size(1), distribution:size(2)))
    return distribution
end

In [269]:
-- Debug: Witten-Bell scores of row 6 of validation_3 with gram size 2
t = compute_wb_line(2, validation_3:narrow(1,6,1), F_c_w_table, F_c_table, N_c_table, 1)

unseen words	


In [11]:
-- Witten-Bell bigram distribution on the validation set, alpha = 1
distribution_2 = distribution_proba_WB(2, validation_2, 1)

1-grams.hdf5	


2-grams.hdf5	


In [14]:
-- Alpha validation on the unigram model (gram size 1)
-- ==> Best value among the ones tried is 3.5
alphas = {0, 0.25, 0.5, 0.75, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 3.75, 4, 4.5, 5}

for i=1, #alphas do
    alpha = alphas[i]
    distribution = distribution_proba_WB(1, validation_2, alpha)
    print('Result on alpha smoothing 1grams, alpha '..alpha, perplexity(distribution, validation_output))
end

1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 0	6.023602585938	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 0.25	6.0169008937015	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 0.5	6.0114017902934	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 0.75	6.0068267370782	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 1.25	5.9997923223228	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 1.5	5.997120094739	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 1.75	5.9949125208341	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 2	5.9931153986828	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 2.25	5.9916848317703	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 2.5	5.990584520467	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 2.75	5.9897839308087	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 3	5.9892570123825	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 3.25	5.9889812736852	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 3.5	5.9889370981486	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 3.75	5.9891072269655	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 4	5.9894763604856	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 4.5	5.9907584281645	
1-grams.hdf5	


Result on alpha smoothing 1grams, alpha 5	5.9926896910852	


In [24]:
-- Witten-Bell without alpha (alpha = 0), gram sizes 1 to 4
distribution_1 = distribution_proba_WB(1, validation_2, 0)
print('Result on alpha smoothing 1grams', perplexity(distribution_1, validation_output))
distribution_2 = distribution_proba_WB(2, validation_2, 0)
print('Result on alpha smoothing 2grams', perplexity(distribution_2, validation_output))
distribution_3 = distribution_proba_WB(3, validation_3, 0)
print('Result on alpha smoothing 3grams', perplexity(distribution_3, validation_output))
distribution_4 = distribution_proba_WB(4, validation_4, 0)
print('Result on alpha smoothing 4grams', perplexity(distribution_4, validation_output))

1-grams.hdf5	


Result on alpha smoothing 1grams	6.023602585938	
1-grams.hdf5	


2-grams.hdf5	


Result on alpha smoothing 2grams	5.0433818281195	
1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


Result on alpha smoothing 3grams	4.5689460403288	
1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


4-grams.hdf5	


Result on alpha smoothing 4grams	4.4189330367685	


In [26]:
-- Witten-Bell with alpha = 1, gram sizes 1 to 4
distribution_1 = distribution_proba_WB(1, validation_2, 1)
print('Result on alpha smoothing 1grams', perplexity(distribution_1, validation_output))
distribution_2 = distribution_proba_WB(2, validation_2, 1)
print('Result on alpha smoothing 2grams', perplexity(distribution_2, validation_output))
distribution_3 = distribution_proba_WB(3, validation_3, 1)
print('Result on alpha smoothing 3grams', perplexity(distribution_3, validation_output))
distribution_4 = distribution_proba_WB(4, validation_4, 1)
print('Result on alpha smoothing 4grams', perplexity(distribution_4, validation_output))

1-grams.hdf5	


Result on alpha smoothing 1grams	6.0029978880708	
1-grams.hdf5	


2-grams.hdf5	


Result on alpha smoothing 2grams	5.026210590868	
1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


Result on alpha smoothing 3grams	4.5520551448742	
1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


4-grams.hdf5	


Result on alpha smoothing 4grams	4.4020577683054	


In [28]:
-- Witten-Bell with alpha = 1 on 5-grams
distribution_5 = distribution_proba_WB(5, validation_5, 1)
print('Result on alpha smoothing 5grams', perplexity(distribution_5, validation_output))

1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


4-grams.hdf5	


5-grams.hdf5	


Result on alpha smoothing 4grams	4.3572276127565	


In [32]:
-- Witten-Bell with alpha = 1 on 6-grams
distribution_6 = distribution_proba_WB(6, validation_6, 1)
print('Result on alpha smoothing 6grams', perplexity(distribution_6, validation_output))

1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


4-grams.hdf5	


5-grams.hdf5	


6-grams.hdf5	


Result on alpha smoothing 4grams	4.3336779743836	


In [15]:
-- Witten-Bell with alpha = 3.5 (the value selected by the unigram
-- perplexity validation), gram sizes 1 to 4
distribution_1 = distribution_proba_WB(1, validation_2, 3.5)
print('Result on alpha smoothing 1grams', perplexity(distribution_1, validation_output))
distribution_2 = distribution_proba_WB(2, validation_2, 3.5)
print('Result on alpha smoothing 2grams', perplexity(distribution_2, validation_output))
distribution_3 = distribution_proba_WB(3, validation_3, 3.5)
print('Result on alpha smoothing 3grams', perplexity(distribution_3, validation_output))
distribution_4 = distribution_proba_WB(4, validation_4, 3.5)
print('Result on alpha smoothing 4grams', perplexity(distribution_4, validation_output))

1-grams.hdf5	


Result on alpha smoothing 1grams	5.9889370981486	
1-grams.hdf5	


2-grams.hdf5	


Result on alpha smoothing 2grams	5.0114654001341	
1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


Result on alpha smoothing 3grams	4.5351174868265	
1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


4-grams.hdf5	


Result on alpha smoothing 4grams	4.3843212693213	


In [17]:
-- Witten-Bell with alpha = 3.5 on 5-grams and 6-grams
distribution_5 = distribution_proba_WB(5, validation_5, 3.5)
print('Result on alpha smoothing 5grams', perplexity(distribution_5, validation_output))
distribution_6 = distribution_proba_WB(6, validation_6, 3.5)
print('Result on alpha smoothing 6grams', perplexity(distribution_6, validation_output))

1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


4-grams.hdf5	


5-grams.hdf5	


Result on alpha smoothing 5grams	4.33902739378	
1-grams.hdf5	


2-grams.hdf5	


3-grams.hdf5	


4-grams.hdf5	


5-grams.hdf5	


6-grams.hdf5	


Result on alpha smoothing 6grams	4.3152478550508	


In [12]:
-- Witten-Bell with alpha = 3.5 on 6-grams
distribution_6 = distribution_proba_WB(6, validation_6, 3.5)
print('Result on alpha smoothing 6grams', perplexity(distribution_6, validation_output))

Result on alpha smoothing 6grams	4.0878099970876	


In [17]:
-- Witten-Bell with alpha = 5.5 on 6-grams
distribution_6_bis = distribution_proba_WB(6, validation_6, 5.5)
print('Result on alpha smoothing 6grams', perplexity(distribution_6_bis, validation_output))

Result on alpha smoothing 6grams	4.0815321835969	


In [20]:
-- Witten-Bell with alpha = 7 on 6-grams
distribution = distribution_proba_WB(6, validation_6, 7)
print('Result on alpha smoothing 6grams', perplexity(distribution, validation_output))

Result on alpha smoothing 6grams	4.0778034534132	


In [21]:
-- Witten-Bell with alpha = 8 on 6-grams
distribution = distribution_proba_WB(6, validation_6, 8)
print('Result on alpha smoothing 6grams', perplexity(distribution, validation_output))

Result on alpha smoothing 6grams	4.077354417045	


# Modified Kneser-Ney smoothing

In [13]:
-- Version tailored for modified Kneser-Ney
--
-- count_tensor: tensor in format (N_words, N + 1):
--   col1, ..., colN = indexes of the N-gram, colN+1 = N-gram count
-- K: number of separate count cases (need K > 1, usually K = 3)
--
-- Returns:
--   F_c_w:     {'index1-...-indexN-1': {indexN: count}}
--   F_c:       {'index1-...-indexN-1': sum of the counts in that context}
--   N_c_split: {k: {'index1-...-indexN-1': number of words whose count is k
--              in that context}}, where counts larger than K fall in k = K
--   D:         {k: discount for count case k}, k = 1..K, computed as
--              k - (k + 1) * Y * n_{k+1} / n_k with Y = n_1 / (n_1 + 2 n_2)
--              and n_k the number of N-grams with exactly k occurrences.
--              NOTE(review): n_k == 0 makes the division degenerate; that
--              case is not handled here.
function build_context_count_split(count_tensor, K)
    local F_c_w = {}
    local F_c = {}
    local N_c_split = {}
    for j=1,K do
        N_c_split[j] = {}
    end
    -- n_table[k]: number of N-grams with exactly k occurrences, k = 1..K+1
    local n_table = {}
    for j=1,K+1 do
        n_table[j] = 0
    end

    local N = count_tensor:size(1)
    local M = count_tensor:size(2)

    for i=1, N do
        local indexN = count_tensor[{i,M-1}]
        local count = count_tensor[{i, M}]

        -- build the key index1-...-indexN-1 (local: used to leak as a global)
        local indexes = tostring(count_tensor[{i,1}])
        for j=2, M - 2 do
            indexes = indexes .. '-' .. tostring(count_tensor[{i,j}])
        end

        -- Filling F_c_w
        if F_c_w[indexes] == nil then
            F_c_w[indexes] = {[indexN] = count}
        else
            F_c_w[indexes][indexN] = count
        end

        -- Count case of this N-gram: counts above K share the case K
        local key_N_c = count
        if count > K then
            key_N_c = K
        end

        -- Updating N_c_split
        if N_c_split[key_N_c][indexes] == nil then
            N_c_split[key_N_c][indexes] = 1
        else
            N_c_split[key_N_c][indexes] = 1 + N_c_split[key_N_c][indexes]
        end

        -- Updating F_c
        if F_c[indexes] == nil then
            F_c[indexes] = count
        else
            F_c[indexes] = count + F_c[indexes]
        end

        -- Updating n_table (only counts up to K + 1 are needed for D)
        if count <= K + 1 then
            n_table[count] = n_table[count] + 1
        end
    end

    -- Compute the D term from n_table
    local Y = n_table[1]/(n_table[1] + 2*n_table[2])
    local D = {}
    for k=1,K do
        D[k] = k - (1 + k)*Y*n_table[1 + k]/n_table[k]
    end
    return F_c_w, F_c, N_c_split, D
end

In [14]:
-- Modified Kneser-Ney prediction for a single validation entry.
--
-- The model jumps to the lower order model when the context was never seen
-- or when none of the 50 candidates was seen in that context.
--
-- N:      gram size used for the prediction (the context has N-1 words)
-- entry:  1-row tensor; columns 1..50 are the candidate word indexes and
--         the trailing columns are the context words
-- F_c_w_table, F_c_table, N_c_split_table, D_table: per gram size outputs
--         of build_context_count_split
-- alpha:  additive smoothing, only used by the context-free (N == 1) model
-- K:      number of count cases used for the discounts
--
-- Returns a tensor of 50 scores (normalized only in the N == 1 case).
function compute_mkn_line(N, entry, F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K)
    local prediction = torch.zeros(50)
    local indexN
    -- case where computation only on the prior
    if N == 1 then
        for j=1,50 do
            indexN = entry[{1, j}]
            -- Corner case when prediction on words not on the dict (case for <s>)
            if F_c_w_table[1][tostring(indexN)] == nil then
                prediction[j] = 0
            else
                prediction[j] = F_c_w_table[1][tostring(indexN)][indexN] + alpha
            end
        end
        -- Normalizing
        return prediction:div(prediction:sum(1)[1])
    else
        -- Context key made of the last N-1 columns of the entry
        local indexes = tostring(entry[{1, entry:size(2)}])
        for j=entry:size(2) - 1, entry:size(2) - 1 - (N-3), -1 do
            indexes = tostring(entry[{1, j}]) .. '-' .. indexes
        end
        -- check if context is unseen, otherwise go to next context
        if F_c_w_table[N][indexes] == nil then
            return compute_mkn_line(N-1, entry, F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K)
        end
        -- Current order level with modified absolute discounting per word
        for j=1,50 do
            indexN = entry[{1, j}]
            local count = F_c_w_table[N][indexes][indexN]
            -- case word seen
            if count ~= nil then
                -- Discount case: counts above K share the case K.
                -- local: key_N_c used to leak into the global environment
                local key_N_c = count
                if count > K then
                    key_N_c = K
                end
                prediction[j] = count - D_table[N][key_N_c]
            end
        end

        -- Check that at least one word was scored, otherwise go to next context
        if prediction:sum(1)[1] == 0 then
            return compute_mkn_line(N-1, entry, F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K)
        end

        -- Weight of the lower order model: sum over k of D_k * N_k(context)
        -- (the division by F_c is applied to the combined score below)
        local gamma = 0
        for k=1,K do
            if N_c_split_table[N][k][indexes] ~= nil then
                gamma = gamma + D_table[N][k]*N_c_split_table[N][k][indexes]
            end
        end
        if gamma == 0 then
            print('gamma error')
        end
        -- (discounted counts + gamma * p_lower) / F_c
        prediction:add(compute_mkn_line(N-1, entry, F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K):mul(gamma)):div(F_c_table[N][indexes])
        -- No normalization here: the caller normalizes each row at the end
        return prediction
    end
end

In [15]:
-- Modified Kneser Ney: computation done at once line by line
--
-- Each row is scored by compute_mkn_line as
--   (discounted F_c_w + gamma(c) * p(w|c')) / F_c
-- and then normalized over the 50 candidates.
--
-- N:     gram size of the highest order model
-- data:  2D tensor of entries (one row per word to predict)
-- alpha: additive smoothing forwarded to the unigram model
-- K:     number of count cases used for the discounts
--
-- Returns an (N_data, 50) tensor whose rows are normalized to sum to 1.
function distribution_proba_mKN(N, data, alpha, K)
    local N_data = data:size(1)

    -- Building the count dictionaries for each ngram size from 1 to N.
    local F_c_w_table = {}
    local F_c_table = {}
    local N_c_split_table = {}
    local D_table = {}
    for i=1,N do
        -- local: the training tensor used to leak as the global `train`
        local train = get_train(i)
        F_c_w_table[i], F_c_table[i], N_c_split_table[i], D_table[i] = build_context_count_split(train, K)
    end

    local distribution = torch.zeros(N_data, 50)
    for i=1,N_data do
        -- Compute modified Kneser-Ney for the whole line i
        distribution:narrow(1, i, 1):copy(compute_mkn_line(N, data:narrow(1,i,1), F_c_w_table, F_c_table, N_c_split_table, D_table, alpha, K))
    end
    -- Row-wise normalization over the 50 candidates
    distribution:cdiv(distribution:sum(2):expand(distribution:size(1), distribution:size(2)))
    return distribution
end

In [54]:
-- DEBUG: build the global modified Kneser-Ney tables for gram sizes 1 and 2
-- with K = 3 (n_table is only initialised here, it is not filled).
K = 3
F_c_w_table = {}
F_c_table = {}
N_c_split_table = {}
D_table = {}
n_table = {}
for i=1,2 do
    train = get_train(i)
    F_c_w_table[i], F_c_table[i], N_c_split_table[i], D_table[i] = build_context_count_split(train, K)
end

1-grams.hdf5	


{
  1 : 0.51724137931034
  2 : -0.77093596059113
  3 : -2.9586206896552
}
2-grams.hdf5	


{
  1 : 0.71157945361249
  2 : 1.0881976981224
  3 : 1.446606938822
}


In [83]:
-- Debug: modified Kneser-Ney scores of row 3 of validation_2 (alpha = 1, K = 3)
test = compute_mkn_line(2, validation_2:narrow(1, 3, 1), F_c_w_table, F_c_table, N_c_split_table, D_table, 1, 3)

unseen words	


In [109]:
-- Modified Kneser-Ney on 4-grams with alpha = 1 and K = 3
distribution_4 = distribution_proba_mKN(4, validation_4, 1, 3)

1-grams.hdf5	


{
  1 : 0.51724137931034
  2 : -0.77093596059113
  3 : -2.9586206896552
}
2-grams.hdf5	


{
  1 : 0.71157945361249
  2 : 1.0881976981224
  3 : 1.446606938822
}
3-grams.hdf5	


{
  1 : 0.84848390111054
  2 : 1.2170130335274
  3 : 1.3775282874792
}
4-grams.hdf5	


{
  1 : 0.92110201391759
  2 : 1.2968014992496
  3 : 1.2971783075115
}


In [117]:
-- Cross validation on alpha for modified Kneser-Ney (K = 3) on 2-grams and
-- 4-grams.

-- NOTE(review): the remark "Best value is 3.5" was carried over from the
-- Witten-Bell alpha search; re-check it against the results of this cell.
alphas = {0, 0.25, 0.5, 0.75, 1.25, 1.5, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 3.75, 4, 4.5, 5}
K = 3
for i=1, #alphas do
    alpha = alphas[i]
    distribution_2 = distribution_proba_mKN(2, validation_2, alpha, K)
    print('Result on alpha smoothing 2grams, alpha '..alpha, perplexity(distribution_2, validation_output))
    distribution_4 = distribution_proba_mKN(4, validation_4, alpha, K)
    print('Result on alpha smoothing 4grams, alpha '..alpha, perplexity(distribution_4, validation_output))
end

Result on alpha smoothing 2grams, alpha 0	5.1405210180739	


Result on alpha smoothing 4grams, alpha 0	4.3017358601582	


Result on alpha smoothing 2grams, alpha 0.25	5.1345908364747	


Result on alpha smoothing 4grams, alpha 0.25	4.2960411418246	


Result on alpha smoothing 2grams, alpha 0.5	5.1296366428649	


Result on alpha smoothing 4grams, alpha 0.5	4.2911559318662	


Result on alpha smoothing 2grams, alpha 0.75	5.1254257965317	


Result on alpha smoothing 4grams, alpha 0.75	4.2868861013792	


Result on alpha smoothing 2grams, alpha 1.25	5.1186904989754	


Result on alpha smoothing 4grams, alpha 1.25	4.2797397749426	


Result on alpha smoothing 2grams, alpha 1.5	5.1159912624387	


Result on alpha smoothing 4grams, alpha 1.5	4.2767180019646	


Result on alpha smoothing 2grams, alpha 1.75	5.1136572787308	


Result on alpha smoothing 4grams, alpha 1.75	4.2739971781502	


Result on alpha smoothing 2grams, alpha 2	5.1116447378085	


Result on alpha smoothing 4grams, alpha 2	4.2715410478424	


Result on alpha smoothing 2grams, alpha 2.25	5.1099183414695	


Result on alpha smoothing 4grams, alpha 2.25	4.269320445975	


Result on alpha smoothing 2grams, alpha 2.5	5.1084490354619	


Result on alpha smoothing 4grams, alpha 2.5	4.2673114027822	


Result on alpha smoothing 2grams, alpha 2.75	5.1072124819481	


Result on alpha smoothing 4grams, alpha 2.75	4.2654938680566	


Result on alpha smoothing 2grams, alpha 3	5.1061879941292	


Result on alpha smoothing 4grams, alpha 3	4.2638508220318	


Result on alpha smoothing 2grams, alpha 3.25	5.105357770893	


Result on alpha smoothing 4grams, alpha 3.25	4.2623676371534	


Result on alpha smoothing 2grams, alpha 3.5	5.10470633285	


Result on alpha smoothing 4grams, alpha 3.5	4.2610316081853	


Result on alpha smoothing 2grams, alpha 3.75	5.1042200975229	


Result on alpha smoothing 4grams, alpha 3.75	4.2598315985707	


Result on alpha smoothing 2grams, alpha 4	5.1038870531415	


Result on alpha smoothing 4grams, alpha 4	4.2587577691279	


Result on alpha smoothing 2grams, alpha 4.5	5.1036388679001	


Result on alpha smoothing 4grams, alpha 4.5	4.2569545548285	


Result on alpha smoothing 2grams, alpha 5	5.1038886350349	


Result on alpha smoothing 4grams, alpha 5	4.2555621716216	


In [121]:
-- Cross validation on K for modified Kneser-Ney on 4-grams, with alpha in
-- {3.5, 5.5, 7}.

-- NOTE(review): the remark "Alpha validation ==> Best value is 3.5" was
-- carried over from an earlier cell and does not describe this search.
Ks = {2, 3, 4, 5}
--alphas = {0, 2, 3.5, 5, 7}
for i=1, #Ks do
    K = Ks[i]
    distribution_4 = distribution_proba_mKN(4, validation_4, 3.5, K)
    print('Result on 4grams, alpha 3.5, K'..K, perplexity(distribution_4, validation_output))
    distribution_4_bis = distribution_proba_mKN(4, validation_4, 5.5, K)
    print('Result on 4grams, alpha 5.5, K'..K, perplexity(distribution_4_bis, validation_output))
    distribution_4_ter = distribution_proba_mKN(4, validation_4, 7, K)
    print('Result on 4grams, alpha 7, K'..K, perplexity(distribution_4_ter, validation_output))
end

Result on 4grams, alpha 3.5, K2	4.2193148722749	


Result on 4grams, alpha 5.5, K2	4.2125914837561	


Result on 4grams, alpha 7, K2	4.2110764358717	


Result on 4grams, alpha 3.5, K3	4.2610316081853	


Result on 4grams, alpha 5.5, K3	4.2545317703483	


Result on 4grams, alpha 7, K3	4.2532371481075	


Result on 4grams, alpha 3.5, K4	4.2868918144915	


Result on 4grams, alpha 5.5, K4	4.2805566409014	


Result on 4grams, alpha 7, K4	

4.2794163213392	


Result on 4grams, alpha 3.5, K5	4.3102393822349	


Result on 4grams, alpha 5.5, K5	4.3040497343289	


Result on 4grams, alpha 7, K5	4.3030462686139	


In [16]:
-- Modified Kneser-Ney on 6-grams with the best configuration found on
-- 4-grams (alpha = 7, K = 2)
distribution = distribution_proba_mKN(6, validation_6, 7, 2)
print('Result on 6grams, alpha 7, K 2', perplexity(distribution, validation_output))

Result on 6grams, alpha 7, K 2	4.2067088977711	


# Applying on test

In [23]:
-- Applying the 6-gram Witten-Bell model (alpha = 8) on the test entries
distribution_test = distribution_proba_WB(6, test_6, 8)

In [24]:
-- Saving the test prediction of the current best model under the dataset
-- name 'distribution' in the file 'pred_test_wb_fnorm_6'
myFile = hdf5.open('pred_test_wb_fnorm_6', 'w')
myFile:write('distribution', distribution_test)
myFile:close()

# Accuracy

In [12]:
-- Max along dimension 2 of validation_output: the second result holds the
-- position of the max of each row, used below as the true candidate index
m, true_output = validation_output:max(2)

In [13]:
-- Fraction of lines whose highest scored candidate is the true one.
--
-- pred:        predicted distribution; pred:max(2) gives the best candidate
--              position of each row
-- true_output: true candidate position of each row, read as
--              true_output[i][1]
-- Returns the accuracy as a number in [0, 1].
function compute_accuracy(pred, true_output)
    -- locals: max, argmax, acc and score used to leak as globals
    local _, argmax = pred:max(2)
    local n_rows = true_output:size(1)
    local hits = 0
    for i = 1, n_rows do
        if argmax[i][1] == true_output[i][1] then
            hits = hits + 1
        end
    end
    return hits/n_rows
end

In [14]:
-- Accuracy of distribution_2 against the true candidate positions
print('Result on alpha smoothing ', compute_accuracy(distribution_2, true_output))


Result on alpha smoothing 	0.59614243323442	
