In [1]:
import numpy as np
import argparse
import sys
import pickle
import h5py
import itertools

In [2]:
from helper import *

In [3]:
import preprocess

In [201]:
# Read the four pre-processed task-2 training datasets from the HDF5 archive
# and materialise each of them as a numpy array while the file is still open.
with h5py.File('../Data/preprocess/task2_train.hdf5','r') as hf:
    ans, questions, questions_sentences, sentences = [
        np.array(hf.get(dataset_name))
        for dataset_name in ('answers', 'questions',
                             'questions_sentences', 'sentences')
    ]

In [251]:
def sentence_relevance(story, ques, n = 1, stopwords = None):
    '''
    Select the sentences of a story that are relevant to a question,
    based on shared (non stop-) words.

    story: 2-D array of word indices, one row per sentence and one
        column per word position.
    ques: iterable of the word indices of the question.
    n: 1 to get only the best matching sentence; any other value to get
        every matching sentence plus the sentences that share a word
        with one of them (a single expansion step).
    stopwords: container of word indices ignored when matching. None
        (default) ignores nothing; the callers in this notebook pass a
        list that contains index 0.

    Output:
        n == 1: 0-based row index of the sentence with the most matches
            (np.argmax, so ties resolve to the first row).
        otherwise: sorted list of 1-based sentence numbers.
    '''
    if stopwords is None:
        stopwords = ()

    nsentence = story.shape[0]

    # relevance[i] = number of (story word, question word) matches in
    # sentence i, counted with multiplicity on both sides.
    relevance = np.zeros(nsentence)
    for w in ques:
        if w in stopwords:
            continue
        relevance += (story == w).sum(axis=1)

    if n == 1:
        return np.argmax(relevance)

    # Sentences matching the question directly are the seeds.
    seeds = np.flatnonzero(relevance)
    res = set(int(i) + 1 for i in seeds)
    for i in seeds:
        seed_words = set(w for w in story[i, :] if w not in stopwords)
        for ii in range(nsentence):
            if ii + 1 in res:
                continue
            # Add any sentence that shares a non stop-word with a seed.
            # Only the seeds are expanded, never the added sentences.
            if seed_words.intersection(story[ii, :]):
                res.add(ii + 1)

    return sorted(res)

In [247]:
def train_question_vector(questions, answers, aw_number, alpha=0.1):
    '''
    Estimate, for every distinct first word of a question, a smoothed
    and normalized distribution over the aw_number answer words.

    questions: sequence of questions; only element 0 of each is used.
    answers: sequence of answers; element 0 of each is the 1-based
        index of the answer word.
    aw_number: length of each distribution.
    alpha: initial (smoothing) value of every entry before counting.

    Output: dictionary {first question word: distribution summing to 1}
    '''
    questions_embeddings = {}
    for question, answer in zip(questions, answers):
        first_word = question[0]
        # Answer indices start at 1, vector positions at 0.
        answer_slot = answer[0] - 1
        if first_word in questions_embeddings:
            questions_embeddings[first_word][answer_slot] += 1
        else:
            smoothed = alpha*np.ones(aw_number)
            smoothed[answer_slot] += 1
            questions_embeddings[first_word] = smoothed

    # Turn the smoothed counts into probabilities (in place).
    for distribution in questions_embeddings.values():
        distribution /= np.sum(distribution)

    return questions_embeddings


def build_story_aw_distribution(facts, aw_number, alpha=0.1, decay=0.1):
    '''
    Compute the weighted count of answer words in the facts. A word at
    flattened position i contributes 1 + decay*i, so words appearing
    earlier in the facts weigh less than later ones.

    facts: array of word indices; it is flattened before counting.
    aw_number: number of answer words; indices 1..aw_number are counted.
    alpha: initial (smoothing) value of every entry.
    decay: extra weight per flattened position (0 gives plain counts).

    Output: normalized count vector of length aw_number
    '''
    count_vector = alpha*np.ones(aw_number)
    bow = facts.flatten()
    # range (not the Python-2-only xrange) keeps this runnable on both
    # Python 2 and Python 3.
    for i in range(len(bow)-1, -1, -1):
        b = bow[i]
        # check not padding and an answer word
        if b != 0 and b <= aw_number:
            # weighted count
            count_vector[b-1] += 1 + decay*i
    # Normalization
    count_vector /= np.sum(count_vector)

    return count_vector


def question_matching_feature(facts, question, aw_number, alpha=0.1):
    '''
    Indicator feature on the set of possible answer words to indicate
    if a question word is inside the same sentence as an answer word
    in the facts.

    facts: iterable of sentences, each an iterable of word indices
        (0 is skipped as padding).
    question: iterable of word indices (0 is skipped as padding).
    aw_number: number of answer words; indices 1..aw_number are answer
        words.
    alpha: initial (smoothing) value of every entry.

    Each answer word that co-occurs with a question word in at least one
    fact gets +1 exactly once, however many facts it co-occurs in.

    Output: normalized count vector.
    '''
    count_vector = alpha*np.ones(aw_number)
    question_set = set([q for q in question if q != 0])

    matched_answer_words = set()
    for fact in facts:
        fact_set = set([q for q in fact if q != 0])
        # Only facts sharing a word with the question contribute.
        if question_set.intersection(fact_set):
            matched_answer_words.update(
                w for w in fact_set if 0 < w <= aw_number)

    for w in matched_answer_words:
        # Answer indices start at 1, vector positions at 0.
        count_vector[w-1] += 1

    # Normalization
    count_vector /= np.sum(count_vector)

    return count_vector

In [234]:
# Load the 'task2_train' matrices and index mappings through the helper
# functions (star-imported from `helper`). This rebinds `sentences`,
# `questions` and `questions_sentences`, and defines `answers`.
sentences, questions, questions_sentences, answers = read_preprocessed_matrix_data('task2_train')
word2index, index2sentence, index2question = read_preprocessed_mapping('task2_train')

In [235]:
# Learn, for each first question word, a distribution over the 6 answer words.
questions_embeddings = train_question_vector(
    questions, answers, 6, alpha=0.1)

In [241]:
# Invert the vocabulary mapping (word -> index) into index -> word.
# .items() is used instead of the Python-2-only .iteritems() so the cell
# runs on both Python 2 and Python 3.
index2word = {index: word for word, index in word2index.items()}

In [245]:
# Sanity check of the inverted vocabulary: show the word stored under index 1.
index2word[1]

'garden'

In [255]:
# Accuracy of the combined model (question prior * story distribution)
# on the first n_eval questions.
n_eval = 1000
true_answers = answers.flatten()
ct = 0
for nq in range(n_eval):
    # Columns 0 and 1 of questions_sentences are used as the 1-based first
    # row and the inclusive last row of the story in `sentences`.
    start, stop = questions_sentences[nq, 0] - 1, questions_sentences[nq, 1]
    facts_ = sentences[start:stop, :]
    story_1, ques = facts_, questions[nq]
    # sentence_relevance returns 1-based sentence numbers for n != 1.
    relevant = np.array(sentence_relevance(facts_, ques, 2, [0, 20, 90]), dtype = int) - 1
    facts = facts_[relevant, :]
    f2 = np.log(build_story_aw_distribution(facts, 6, decay = 0))
    f1 = np.log(questions_embeddings[ques[0]])
    # Sum of logs = product of the two distributions, then renormalize.
    r = np.exp(f1+f2)
    r /= np.sum(r)
    # Answer indices start at 1.
    if true_answers[nq] == (np.argmax(r)+1):
        ct += 1
print(float(ct)/n_eval)

0.594


In [227]:
# Pick the first question for a step-by-step manual inspection.
nq = 0
# Columns 0 and 1 of questions_sentences are used as the 1-based first row
# and the inclusive last row of the story inside `sentences`.
story_1 = sentences[(questions_sentences[nq,0]-1):questions_sentences[nq,1],:]
ques = questions[nq]

In [229]:
# Compare the sentences selected by word overlap with the row stored in
# questions_sentences for this question. Single-argument print(...) calls
# give the same output on Python 2 and Python 3.
print("Relevant sentences: ")
print(sentence_relevance(story_1, ques, 2, [0, 20, 90]))
print("True relevant sentences: ")
print(questions_sentences[nq])

Relevant sentences: 
[ 0.  0.  1.  0.  0.  0.]
[1, 3, 5, 6]
True relevant sentences: 
[1 6 3 6 0]


In [171]:
# All sentences of the story belonging to question nq (same slice as story_1).
facts_ = sentences[questions_sentences[nq][0]-1: questions_sentences[nq][1], :]

In [198]:
# Display the rows of the story selected by sentence_relevance (its 1-based
# sentence numbers are shifted to 0-based row indices).
# NOTE(review): this cell passes stop-words [0, 21, 90] while the other calls
# use [0, 20, 90] - confirm which list is intended. n=3 takes the same branch
# as n=2, since sentence_relevance only distinguishes n == 1.
# facts = sentences[sentence_relevance(facts_, questions[nq], 3, [0, 21, 90])-1,:]
facts_[np.array(sentence_relevance(facts_, questions[nq], 3, [0, 21, 90]))-1,:]

array([[14, 19, 17, 16,  4,  0],
       [11, 15, 17, 16,  1,  0],
       [18, 15, 17, 16,  2,  0],
       [11, 15, 17, 16,  6,  0],
       [11, 15, 17, 16,  1,  0],
       [18, 15, 17, 16,  6,  0],
       [18, 19,  9, 17, 16,  4],
       [10, 13, 17, 16,  1,  0]])

In [173]:
# Log of the (undecayed) answer-word distribution of the selected facts.
# NOTE(review): `facts` is not assigned in the cells just above (the assignment
# there is commented out); it presumably comes from another cell - confirm.
f2 = np.log(build_story_aw_distribution(facts, 6, decay = 0))

In [174]:
# Display the story-based log-distribution.
f2

array([-2.77258872, -2.77258872, -0.37469345, -2.77258872, -2.77258872,
       -2.77258872])

In [175]:
# Log of the learned answer distribution for the first word of question 0.
# NOTE(review): the question index is hard-coded to 0 here instead of nq.
f1 = np.log(questions_embeddings[questions[0][0]])

In [176]:
# Display the question-based log-distribution.
f1

array([-1.87075336, -1.74299786, -1.79576508, -1.86428503, -1.75448564,
       -1.73164055])

In [177]:
# Adding the logs multiplies the two distributions; r itself stays
# unnormalized, only the printed value is renormalized to sum to 1.
r = np.exp(f1+f2)
print(r / np.sum(r))

[ 0.05789751  0.0657875   0.68646679  0.05827322  0.06503607  0.06653892]


In [178]:
# Answer indices start at 1, hence the +1 on the argmax position.
print("True: %i" % answers.flatten()[nq])
print("Predicted: %i" % (np.argmax(r)+1))

True: 3
Predicted: 3
