In [1]:
require 'hdf5'

{
  H5Z_FILTER_CONFIG_ENCODE_ENABLED : 1
  H5F_ACC_RDWR : 1
  _getTorchType : function: 0x0ada4488
  H5F_OBJ_FILE : 1
  H5S_ALL : 0
  H5F_OBJ_GROUP : 4
  C : userdata: 0x0ada4290
  H5P_DEFAULT : 0
  _describeObject : function: 0x0ad9f388
  H5Z_FILTER_NBIT : 5
  _debugMode : false
 

 _getObjectType : function: 0x0ad9f348
  H5F_OBJ_ALL : 31
  _getObjectName : function: 0x0ada44a8
  version : 
    {
      1 : 1
      2 : 8
      3 : 15
    }
  H5Z_FILTER_SHUFFLE : 2
  HDF5Group : table: 0x0a9a18c8
  open : function: 0x0aa21570
  H5Z_FILTER_SZIP : 4
  H5F_OBJ_ATTR : 16
  H5Z_FILTER_FLETCHER32 : 3
  H5F_OBJ_DATATYPE : 8
  debugMode : function: 0x0aa214f0
  H5F_ACC_EXCL : 4
  H5Z_FILTER_NONE : 0
  _testUtils : 
    {
      deepAlmostEq : function: 0x0aa214b0
      withTmpDir : function: 0x0aa21410
    }
  _nativeTypeForTensorType : function: 0x0adaa498
  H5F_ACC_TRUNC : 2
  _config : 
    {
      HDF5_INCLUDE_PATH : /Users/nicolasdrizard/anaconda/include
      HDF5_LIBRARIES : /Users/nicolasdrizard/anaconda/lib/libhdf5.dylib;/Users/nicolasdrizard/anaconda/lib/libhdf5_hl.dylib;/Users/nicolasdrizard/anaconda/lib/libhdf5.dylib;/Users/nicolasdrizard/anaconda/lib/libz.dylib;/usr/lib/libdl.dylib;/usr/lib/libm.dylib
    }
  _loadObject : function: 0x0aa21550
  DataSetOptions 

692
      NATIVE_UINT_LEAST8 : 50331667
      COMPOUND : 6
      STD_REF_OBJ : 50331739
      NATIVE_UINT_FAST16 : 50331675
      NATIVE_B64 : 50331696
      STD_U16BE : 50331719
      STD_REF_DSETREG : 50331740
      NATIVE_B8 : 50331693
      STD_I32BE : 50331713
      IEEE_F64BE : 50331705
      NATIVE_FLOAT : 50331690
      NATIVE_UCHAR : 50331657
      STD_U32BE : 50331721
      OPAQUE : 5
      STD_B64BE : 50331731
      STD_U8LE : 50331716
      STD_I16BE : 50331711
      STD_B8BE : 50331725
      STRING : 3
      STD_I16LE : 50331710
      STD_U64BE : 50331723
    }
  HDF5File : table: 0x0aa129b8
  H5Z_FILTER_SCALEOFFSET : 6
  H5Z_FILTER_DEFLATE : 1
  H5F_ACC_CREAT : 16
  H5F_UNLIMITED : 18446744073709551615ULL
  _deflateAvailable : function: 0x0aa0a310
  _inDebugMode : function: 0x0aa21530
  H5F_OBJ_LOCAL : 32
  H5S_SELECT_SET : 0
  _datatypeName : function: 0x0ada2f60
  H5F_ACC_DEBUG : 8
  H5F_OBJ_DATASET : 2
}


In [2]:
-- Loading data: read every dataset of the feature file into one table,
-- then expose the entries used by the rest of the notebook as globals.
myFile = hdf5.open('../data/words_feature.hdf5', 'r')
data = myFile:all()
emission, transition = data.emission, data.transition
input_matrix_train = data.input_matrix_train
input_matrix_dev = data.input_matrix_dev
input_matrix_test = data.input_matrix_test
myFile:close()

In [3]:
-- Formatting as log-probability and smoothing the input.
--   matrix: 2D tensor; it is cloned, so the caller's tensor is left untouched
--   alpha:  constant added to every entry before normalising
-- Returns a new tensor in which every column has been divided by its own
-- sum (the sum runs over dimension 1) and then passed through log().
function format_matrix(matrix, alpha)
    local smoothed = matrix:clone():add(alpha)
    -- Column totals, repeated along dimension 1 to match the matrix shape
    local column_totals = smoothed:sum(1):expandAs(smoothed)
    return smoothed:cdiv(column_totals):log()
end
    
-- Log-scores of transition and emission for one timestep
-- (the vector y in the lecture notes).
--   observations: sequence of observation ids, indexed by timestep
--   i:            timestep for the computed score
--   emission:     indexed by observation id; the selected row has C entries
--   transition:   C x C tensor added to the tiled emission row
--   C:            number of classes
-- Returns a newly allocated C x C tensor.
function score_hmm(observations, i, emission, transition, C)
    local emission_row = emission[observations[i]]
    -- Turn the row into a C x 1 column and repeat it across the C columns
    local tiled_emission = emission_row:view(C, 1):expand(C, C)
    return tiled_emission + transition
end

-- Viterbi algorithm.
-- observations: a sequence of observations, represented as integers
-- logscore: the edge scoring function over classes and observations in a
--           history-based model. It is called as
--           logscore(observations, i, emission, transition, C) and must return
--           a C x C tensor whose rows index the current class and whose
--           columns index the previous class.
-- emission, transition: log-score tensors handed through to logscore;
--           emission is also indexed directly with observations[1], and the
--           number of classes C is taken from transition:size(2).
-- start_class: (optional) index of the class every sequence starts in.
--           Defaults to 8, the start-of-sentence class <t> that was
--           previously hard-coded.
-- Returns a tensor of size n holding the highest-scoring class per timestep.
function viterbi(observations, logscore, emission, transition, start_class)
    start_class = start_class or 8

    -- number of classes (kept local: it used to leak into the global scope)
    local C = transition:size(2)
    assert(start_class >= 1 and start_class <= C,
           'viterbi: start_class is outside 1..number of classes')
    local n = observations:size(1)

    -- Initial state distribution in log space: log(1) = 0 for the start
    -- class and log(0) = -inf for every other class.
    local initial = torch.zeros(C, 1)
    initial[{start_class, 1}] = 1
    initial:log()

    -- backpointer_table[i][c]: best previous class when class c is chosen at
    -- timestep i. Row 1 is never read, so it is left unset.
    local backpointer_table = torch.Tensor(n, C)

    -- first timestep
    -- the initial most likely paths are the initial state distribution;
    -- max over the singleton dimension only keeps the C x 1 shape and values
    local maxes = (initial + emission[observations[1]]):max(2)

    -- remaining timesteps ("forwarding" the maxes)
    for i = 2, n do
        -- precompute edge scores
        local edge_scores = logscore(observations, i, emission, transition, C)
        -- previous maxes are laid out along the columns (previous class)
        local scores = edge_scores + maxes:view(1, C):expand(C, C)

        -- best previous class for every current class
        local backpointers
        maxes, backpointers = scores:max(2)

        -- record
        backpointer_table[i] = backpointers
    end

    -- follow backpointers to recover max path
    local classes = torch.Tensor(n)
    local _, best_final = maxes:max(1)
    classes[n] = best_final[1][1]
    for i = n, 2, -1 do
        classes[i-1] = backpointer_table[{i, classes[i]}]
    end

    return classes
end

In [16]:
-- Evaluate a predicted sequence: precision and recall over the positive
-- classes (class 1 stands for the negative class).
--   predicted_classes, true_classes: sequences indexable from 1 to
--       predicted_classes:size(1), compared position by position
-- Prints, in this order, the number of true positives-class positions, the
-- number of predicted positive positions and the number of positions where a
-- positive class was predicted exactly right.
-- Returns precision, recall. A ratio whose denominator is zero is reported
-- as 0 instead of the NaN that 0/0 would produce.
function compute_score(predicted_classes, true_classes)
    local n = predicted_classes:size(1)
    local right_pred = 0
    local positive_true = 0
    local positive_pred = 0
    for i = 1, n do
        local predicted, truth = predicted_classes[i], true_classes[i]
        if predicted > 1 then
            positive_pred = positive_pred + 1
        end
        if truth > 1 then
            positive_true = positive_true + 1
            -- only an exact match on a positive class counts as correct
            if truth == predicted then
                right_pred = right_pred + 1
            end
        end
    end
    print(positive_true)
    print(positive_pred)
    print(right_pred)
    -- Guard the divisions: no positive prediction / no positive label
    local precision = 0
    if positive_pred > 0 then
        precision = right_pred / positive_pred
    end
    local recall = 0
    if positive_true > 0 then
        recall = right_pred / positive_true
    end
    return precision, recall
end
        
-- F1 score (harmonic mean of precision and recall) of a predicted sequence.
-- Delegates to compute_score, which also prints its raw counts.
-- Returns 0 when precision + recall is not strictly positive, instead of
-- letting 0/0 produce a NaN that would corrupt later max/argmax operations.
function f_score(predicted_classes, true_classes)
    local p, r = compute_score(predicted_classes, true_classes)
    -- Written as "not (sum > 0)" so that NaN inputs also take this branch
    if not (p + r > 0) then
        return 0
    end
    return 2 * p * r / (p + r)
end

In [289]:
-- Prediction pipeline on the training split, without smoothing (alpha = 0)
alpha = 0
-- column 3 of the input matrix feeds the decoder
observations = input_matrix_train:select(2, 3):clone()
emission_cleaned = format_matrix(emission, alpha)
transition_cleaned = format_matrix(transition, alpha)

-- Prediction
v_seq = viterbi(observations, score_hmm, emission_cleaned, transition_cleaned)

-- Evaluation against column 4 of the input matrix
true_classes = input_matrix_train:select(2, 4):clone()
p, r = compute_score(v_seq, true_classes)
f = f_score(v_seq, true_classes)
print('Train')
print('f_score', f)
print('precision', p)
print('recall',r)

Train	
f_score	0.96813997519096	
precision	0.96776088233375	
recall	0.9685193651623	


In [5]:
-- Prediction pipeline
--   observations: sequence handed to viterbi
--   emission, transition: raw model matrices; each is passed through
--       format_matrix with the given alpha (log and alpha smoothing)
-- Returns whatever viterbi returns when scored with score_hmm.
function predict(observations, emission, transition, alpha)
    -- Formatting model parameters with the shared smoothing constant
    local function smoothed_log(matrix)
        return format_matrix(matrix, alpha)
    end

    local log_emission = smoothed_log(emission)
    local log_transition = smoothed_log(transition)
    return viterbi(observations, score_hmm, log_emission, log_transition)
end

In [302]:
-- Evaluation: F-score on the dev split for each candidate smoothing constant
alphas = {0.25, 0.5, 0.75, 1, 1.25, 1.5,1.75, 2}
observations = input_matrix_dev:select(2, 3):clone()
true_classes = input_matrix_dev:select(2, 4):clone()
f_tensor = torch.zeros(#alphas)

for i, candidate in ipairs(alphas) do
    -- alpha and v_seq stay global, as later cells may read them
    alpha = candidate
    v_seq = predict(observations, emission, transition, alpha)
    f_tensor[i] = f_score(v_seq, true_classes)
end


In [305]:
-- Show every F-score, then the alpha at the position of the largest one
print(f_tensor)
m, a = torch.max(f_tensor, 1)
print(alphas[a[1]])

 0.7378
 0.7207
 0.7119
 0.7037
 0.6889
 0.7426
 0.7277
 0.7158
[torch.DoubleTensor of size 8]

1.5	


In [35]:
-- Decode the dev split with alpha = 1.5 and show the first 50 predictions
observations = input_matrix_dev:select(2, 3):clone()
v_seq = predict(observations, emission, transition, 1.5)
print(v_seq[{{1, 50}}])

 8
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 9
 8
 3
 1
 9
 8
 2
 2
 2
 2
 2
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
[torch.DoubleTensor of size 50]



In [17]:
-- F-score of the current v_seq against the current true_classes; the call
-- also prints the raw counts through compute_score.
f_score(v_seq, true_classes)

7016	
4719	
4357	
0.74256497656583	


In [8]:
-- Column 4 of the dev matrix as a flat tensor; show its first 30 entries
true_classes = input_matrix_dev:select(2, 4):clone()
print(true_classes[{{1, 30}}])

 8
 1
 1
 4
 1
 1
 1
 1
 1
 1
 1
 1
 9
 8
 3
 1
 9
 8
 5
 5
 1
 2
 2
 1
 1
 1
 1
 1
 1
 1
[torch.LongTensor of size 30]



# Prediction on test (Kaggle format)

In [9]:
-- Decode the test split (column 3 of the test matrix) with alpha = 1.5
observations_test = input_matrix_test:select(2, 3):clone()
v_seq_test = predict(observations_test, emission, transition, 1.5)

In [19]:
-- First 20 entries of the predicted test sequence (a view, nothing is copied)
v_seq_test:narrow(1,1,20)

 8
 1
 1
 1
 1
 1
 1
 1
 9
 8
 3
 1
 9
 8
 1
 1
 1
 1
 1
 1
[torch.DoubleTensor of size 20]



In [37]:
-- Saving predicted sequence on test
-- Opens the submission file in write mode and stores two datasets in it:
-- 'v_seq_test' (the test predictions) and 'v_seq_dev' (whatever v_seq holds
-- at the time this cell runs).
-- NOTE(review): v_seq is reassigned by several cells; presumably this expects
-- the dev predictions made with alpha = 1.5 — confirm the cell order first.
myFile = hdf5.open('../submission/v_seq_test_hmm', 'w')
myFile:write('v_seq_test', v_seq_test)
myFile:write('v_seq_dev', v_seq)
myFile:close()