In [3]:
import glob
import os
import cPickle as pickle
from IPython.display import clear_output
import numpy as np

In [4]:
# data path
data_path = '../data/WikiTableQuestions/'
table_path = data_path + 'csv/'
question_path = data_path + 'data/'

# POS tagging
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [5]:
# TOOLS:
# SyntexNet
# How exactly the syntexnet works?
def syntexnet(sentence, tmp_file='/home/yuweilin/tmp/syntaxnet_temp.txt'):
    ''' - sentence: string
        @ return [root, words_info, hierarchy]
            - word_info: (word, 'PRON', 'WP', 'nsubj')
            - hierarchy: the next level
    '''
    
    bashCommand = 'echo \'' + sentence + '\' | ./syntaxnet/demo.test.sh >' + tmp_file
    cwd = os.getcwd()
    os.chdir('/home/yuweilin/models/syntaxnet/')
    os.system(bashCommand)
    os.chdir(cwd)
    with open(tmp_file, 'r') as f:
        lines = f.readlines()[:-1]
        root, word_info, hierarchy = -1, [(None, None, None, None)]*len(lines), [[] for _ in range(len(lines))]
        for line in lines:
            index, word, _, info1, info2, _, parent, info3, _, _ = line.split()
            index, parent = int(index)-1, int(parent)-1
            word_info[index] = (word, info1, info2, info3)
            if parent == -1:
                root = index
            else:
                hierarchy[parent].append(index)
    return [root, word_info, hierarchy]

In [6]:
def clean_text(s):
    return s.replace('\\n', ' ').replace('\n', '')
    
def read_table(filename, path=table_path, delimiter='\t'):
    ''' given the path and file name of certain table, return a dictionary with {head-title: [elements]} 
    '''
    result = {}
    with open(path+filename, 'r') as f:
        heads, head_index = f.readline().split(delimiter), []
        for head in heads:
            head = clean_text(head)
            result[head] = []
            head_index.append(head)
        
        for line in f:
            for head, ele in zip(head_index, line.split(delimiter)):
                ele = clean_text(ele)
                result[head].append(ele)
        
    return result

def read_questions(filename, path=question_path, delimiter='\t'):
    ''' given the path and file name of certain questions 
    '''
    with open(path+filename, 'r') as f:
        f.readline()
        result = []
        for line in f:
            result.append([clean_text(s) for s in line.split(delimiter)])
        return result

# Pre-process all the questions through SyntexNet

In [None]:
filenames = ['training.tsv', 'pristine-seen-tables.tsv', 'pristine-unseen-tables.tsv']
for filename in filenames:
    i, result = 0, []
    sub = read_questions(filename)
    size = len(sub)
    for id, question, table_id, answer in sub:
        i += 1
        if i%10==0: 
            clear_output()
            print 'Processing {0}/{2} question, {1}%'.format(i, 100.0*i/size, size)
        precessed_question = syntexnet(question)
        result.append([id, question, table_id, answer, precessed_question])
    with open('processed_'+filename, 'wb') as f:
        pickle.dump(result, f)

# Load glo-ve word embedding

In [16]:
word_vectors = {}
with open('/home/yuweilin/word-embedding/glove.840B.300d.txt', 'r') as f:
    i = 0
    for line in f:
        i += 1
        if i % 100000==0: 
            clear_output()
            print 'Processing {0}th word'.format(i)
        info = line.split()
        word_vectors[info[0]] = np.array([float(x) for x in info[1:]])

Processing 2100000th word


# Count of direct answers
Find out how many answers are directly provided in the tabel.

In [7]:
# Read all the tables and construct a set of existing words for each table
table_words = {}
for path in glob.glob(table_path+'*'):
    csv_path = path[-7:]
    for f in glob.glob(table_path+csv_path+'/*.tsv'):
        filename = f.split('/')[-1]
        context = 'csv/{0}/{1}csv'.format(csv_path, filename[:-3])
        table = read_table(csv_path + '/' + filename)
        words = set()
        for key, value in table.items():
            words |= set([key]+value)
        table_words[context] = words

In [8]:
# compare the answer of each question with the word set of corresponding table
def statsCountDirect(filename):
    has, hasNot = [], []
    for id, utterance, context, targetValue in read_questions(filename):
        if context not in table_words:
            print('Could not find table {0}'.format(context))
            continue
        if targetValue in table_words[context]:
            has.append(id)
        else:
            hasNot.append(id)
    print filename
    print '  Answer/target of {}% of questions can directly be found in the corresponding table.'.format(
        float(len(has))/(len(has)+len(hasNot))*100)
    print '  # of questions has directed answer = {0}'.format(len(has))
    print '  # of questions has no directed answer = {0}'.format(len(hasNot))
    return (has, hasNot)

In [9]:
_ = statsCountDirect('training.tsv')
_ = statsCountDirect('pristine-seen-tables.tsv')
_ = statsCountDirect('pristine-unseen-tables.tsv')

training.tsv
  Answer/target of 61.9912379876% of questions can directly be found in the corresponding table.
  # of questions has directed answer = 8773
  # of questions has no directed answer = 5379
pristine-seen-tables.tsv
  Answer/target of 61.1817924795% of questions can directly be found in the corresponding table.
  # of questions has directed answer = 2164
  # of questions has no directed answer = 1373
pristine-unseen-tables.tsv
  Answer/target of 61.97053407% of questions can directly be found in the corresponding table.
  # of questions has directed answer = 2692
  # of questions has no directed answer = 1652


# Count of direct answer with exact same 'title' from SyntexNet

In [12]:
# Read all the tables
all_tables = {}
for path in glob.glob(table_path+'*'):
    csv_path = path[-7:]
    for f in glob.glob(table_path+csv_path+'/*.tsv'):
        filename = f.split('/')[-1]
        context = 'csv/{0}/{1}csv'.format(csv_path, filename[:-3])
        table = read_table(csv_path + '/' + filename)
        all_tables[context] = table

In [18]:
data_questions = '../data/questions_syntaxnet/'
# Change it to return three output: w_word, index of w_word, index of keyword
def findTitle(syntaxnet_info):
    root, word_info, hierarchy = syntaxnet_info
    # find the wh- word
    wh_index = -1
    for index, word in enumerate(word_info):
        if word[2][0] == 'W':
            wh_index = index
            break
    if wh_index == -1:
        return None, wh_index, None
    # find the 'closest' noun
    frontiers, visited = [wh_index], set([wh_index])
    while len(frontiers)!=0:
        frontier = frontiers.pop()
        if word_info[frontier][2][0] == 'N':
            return word_info[frontier][0], wh_index, frontier
        # children
        for i in hierarchy[frontier]:
            if i not in visited:
                frontiers.insert(0, i)
                visited.add(i)
        # parent
        for i, h in enumerate(hierarchy):
            if frontier in h:
                if i not in visited:
                    frontiers.insert(0, i)
                    visited.add(i)
                break
    return None, wh_index, None

def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / np.sqrt(np.dot(v1,v1)*np.dot(v2,v2))

def findNearest(word, keys):
    if word == None:
        return ''
    if word.lower() not in word_vectors:
        return word
    max_sim, result = 0, ''
    v1 = word_vectors[word.lower()]
    for key in keys:
        v2 = np.zeros(v1.shape)
        v2 += sum([word_vectors[k.lower()] for k in key.split() if k.lower() in word_vectors])
        sim = cosine_similarity(v1, v2)
        max_sim, result = (max_sim, result) if sim <= max_sim else (sim, key)
    return result

In [21]:
def statsCountExactTitle(filename):
    has, hasNot, questions = [], [], None
    
    with open(data_questions+filename, 'rb') as f:
        questions = pickle.load(f)
    for id, utterance, context, targetValue, syntax in questions[5:]:
        title_question, _, _ = findTitle(syntax)
        
        title_table = []
        for title, elements in all_tables[context].items():
            if targetValue in elements:
                title_table += [title]
        nearest_title = findNearest(title_question, all_tables[context].keys())
        if nearest_title in title_table:
            has.append(id)
        else:
            #if len(title_table)!=0:
            #    print 'Question is: ' + utterance
            #    print '\tTitle from question is: {0}'.format(title_question)
            #    print '\tNearest title is: {0}'.format(nearest_title)
            #    print '\tTitles from table are:  {0}'.format(title_table)
            hasNot.append(id)
    print filename
    print '  Answer/target of {}% of questions can directly be found in the corresponding table.'.format(
        float(len(has))/(len(has)+len(hasNot))*100)
    print '  # of questions has directed answer = {0}'.format(len(has))
    print '  # of questions has no directed answer = {0}'.format(len(hasNot))
    return (has, hasNot)

# Find the answer (assume the column is already correct)

In [82]:
# Hsinya's algorithm for finding correct row
def cos_similarity(w1, w2, word_vectors=word_vectors):
    w1, w2 = w1.lower(), w2.lower()
    if w1==w2: return 1.0
    if w1 not in word_vectors or w2 not in word_vectors:
        return 0.0
    v1, v2 = word_vectors[w1], word_vectors[w2]
    return max(0.0, np.dot(v1,v2))/max(0.01, np.sqrt(np.dot(v1,v1)*np.dot(v2,v2)))

def findClue(syntaxnet_info, w_index, title_index):
    root, word_info, hierarchy = syntaxnet_info
    noun_list = []
    # Find all of the nouns in the sentence
    for index, word in enumerate(word_info):
        if word[2][0] == 'N' and index != title_index:
            noun_list.append(word[0])
    return noun_list
    
def findNearestCluePair(clue_list, col_list):
    cos_dist, clue_word, col_word = 0, '', ''
    for clue in clue_list:
        for column in col_list:
            dist_list = [0.0]
            for column_split in column.split():                   
                dist_list.append(cos_similarity(clue, column_split))
            if max(dist_list) > cos_dist:
                cos_dist, clue_word, col_word = max(dist_list), clue, column
    return cos_dist, clue_word, col_word

def findAns(clue, ans_column_word, clue_column_word, ans_table):
    ans_column = ans_table[ans_column_word]
    clue_column = ans_table[clue_column_word]
    max_similar, ans_index = 0, 0
    for index, clue_cell in enumerate(clue_column):
        cos_dist, _, cell_word = findNearestCluePair([clue], clue_cell.split())
        if cos_dist > max_similar and index < len(ans_column):
            max_similar, ans_index = cos_dist, index
    return ans_column[ans_index]

In [83]:
def statsCountExactCell(filename):
    # 3 categories, right answer, wrong answer, wrong column
    right, wrong, wrongColumn, questions = [], [], [], None   
    with open(data_questions+filename, 'rb') as f:
        questions = pickle.load(f)
    for id, utterance, context, targetValue, syntax in questions[5:]:
        # Find key word for what the question is asking for (column)
        title_question, wh_index, title_index= findTitle(syntax)        
        title_table = []
        for title, elements in all_tables[context].items():
            if targetValue in elements:
                title_table += [title]        
        nearest_title = findNearest(title_question, all_tables[context].keys())
        if nearest_title in title_table:
            clue_words = findClue(syntax, wh_index, title_index)
            table_cols = all_tables[context].keys()
            _, clue, clue_column = findNearestCluePair(clue_words, table_cols)
            if len(clue_words) == 0 or clue == '' or clue_column == '':
                wrong.append(id)
            else:
                answer = findAns(clue, nearest_title, clue_column, all_tables[context])
                if targetValue == answer:
                    right.append(id)
                else:
                    wrong.append(id)
        else:
            wrongColumn.append(id)
    print filename
    print '  {}% of questions with correct column have correct answer.'.format(float(len(right))/(len(right)+len(wrong))*100)
    print '  {}% of questions have correct column.'.format(float(len(right)+len(wrong))/(len(right)+len(wrong)+len(wrongColumn))*100)  
    print '  {}% of questions have correct answer.'.format(float(len(right))/(len(right)+len(wrong)+len(wrongColumn))*100)
    print '  Right answer: {}'.format(len(right))
    print '  Wrong answer: {}'.format(len(wrong))
    print '  Not the correct column: {}'.format(len(wrongColumn))
    return (right, wrong, wrongColumn)                                                         

In [84]:
_ = statsCountExactCell('training.pkl')
_ = statsCountExactCell('seen-tables.pkl')
_ = statsCountExactCell('unseen-tables.pkl')



training.pkl
  12.5868725869% of questions with correct column have correct answer.
  27.4616526472% of questions have correct column.
  3.45656322895% of questions have correct answer.
  Right answer: 489
  Wrong answer: 3396
  Not the correct column: 10262
seen-tables.pkl
  11.9072708114% of questions with correct column have correct answer.
  26.8686296716% of questions have correct column.
  3.1993204983% of questions have correct answer.
  Right answer: 113
  Wrong answer: 836
  Not the correct column: 2583
unseen-tables.pkl
  14.9026248942% of questions with correct column have correct answer.
  27.2182530537% of questions have correct column.
  4.05623415534% of questions have correct answer.
  Right answer: 176
  Wrong answer: 1005
  Not the correct column: 3158


In [None]:
def findAnswerAlan(filename):
    has, hasNot, questions = [], [], None
    
    with open(data_questions+filename, 'rb') as f:
        questions = pickle.load(f)
    for id, utterance, context, targetValue, syntax in questions[5:]:
        title_question = findTitle(syntax)
        
        title_table = []
        for title, elements in all_tables[context].items():
            if targetValue in elements:
                title_table += [title.lower()]
        
        nearest_title = findNearest(title_question, all_tables[context].keys())
        if nearest_title in title_table:
            # 1. syntax: root, word_info, hierarchy
            # 2. title_table
            # 3. nearest_title: the right column
            # 4. original table: all_tables[context] --> need context
            # 5. Goal: find the right row
            # 6. Goal: output the answer
            continue
        else:
            continue

In [1]:
_ = statsCountExactTitle('training.pkl')
_ = statsCountExactTitle('seen-tables.pkl')
_ = statsCountExactTitle('unseen-tables.pkl')

NameError: name 'statsCountExactTitle' is not defined

# Load SyntexNet parsed questions

In [18]:
with open('../data/questions_syntaxnet/seen-tables.pkl', 'rb') as f:
    data = pickle.load(f)

# Usage of Word Embedding 

In [24]:
v1 = word_vectors['cat']
v2 = word_vectors['earth']
v3 = word_vectors['dog']
def cosine(v1, v2):
    return np.dot(v1, v2)/np.sqrt(np.dot(v1, v1)*np.dot(v2, v2))
def distanceInverse(v1,v2):
    return 1.0/np.sqrt(np.sum([x**2 for x in v1-v2]))

print 'v1, v2:', cosine(v1, v2), distanceInverse(v1,v2)
print 'v1, v3:', cosine(v1, v3), distanceInverse(v1,v3)

v1, v2: 0.242447507192 0.122884366123
v1, v3: 0.801685509133 0.230863920704
