In [1]:
import DataImport
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
import numpy as np

In [2]:
# Flattens parsed examples into three parallel lists of strings suitable for a
# bag-of-words feature extractor.
#
# Input: examples, a sequence of examples, each a dict containing:
#   example['s']: a list of sentences (statements potentially helpful in
#                 solving the problem)
#   example['q']: the question asked
#   example['a']: the answer to the question
#
# Output: a tuple (X, Y, Q) of equal-length lists, where X[i] is the statements
# of example i joined with single spaces, Y[i] is its answer and Q[i] is its
# question.
#
# Side effect: prints a short preview (the example at index 1) when at least
# two examples are available.
def make_strings(examples):
    X = [" ".join(example['s']) for example in examples]
    Q = [example['q'] for example in examples]
    Y = [example['a'] for example in examples]
    # A single parenthesised argument prints identically on Python 2 and 3.
    print("Examples loaded: ")
    # The preview shows index 1, so skip it for empty or single-example input
    # instead of raising IndexError.
    if len(X) > 1:
        print("\t X[1] = %s\n\t Y[1] = %s\n\t Q[1] = %s" % (X[1], Y[1], Q[1]))
    return (X,Y,Q)

# Runs a fitted bag-of-words feature extractor feat_ex on the inputs X, Y and Q
# produced by make_strings.
#
# Returns (T, Y_t), where T is the horizontal stack of the supporting-statement
# (X) and question (Q) count matrices, and Y_t is a 1-D integer array holding,
# for each answer, the column index of its highest-count feature.
#
# NOTE(review): an answer that activates several columns (e.g. a multi-word
# answer under an n-gram extractor) is reduced to the first highest-count
# column only, so distinct answers may end up sharing a label -- confirm this
# is intended.
def get_features(feat_ex, X, Y, Q):
    X_t = feat_ex.transform(X)
    Q_t = feat_ex.transform(Q)
    # toarray() gives a plain ndarray, so argmax yields shape (n,) rather than
    # the (n, 1) np.matrix column that todense() produced.
    Y_t = feat_ex.transform(Y).toarray().argmax(axis=1)
    return (hstack([X_t, Q_t]), Y_t)

# Rewrites each example's answer in place, expanding the one-letter compass
# abbreviations (n, e, s, w) into full words joined by single spaces,
# e.g. 'n,w' -> 'north west'.
#
# Tokens may be separated by commas and/or whitespace, and tokens that are
# already full direction words are kept as they are, so running this twice on
# the same examples is harmless. Any other token raises KeyError.
# Returns None; the dicts in examples are modified directly.
def fix_directions(examples):
    directions = {'n':'north','e':'east','s':'south','w':'west'}
    full_words = set(directions.values())
    for example in examples:
        # Treat commas like spaces so 'n,w', 'n, w' and 'north west' all tokenize.
        tokens = example['a'].replace(',', ' ').split()
        expanded = []
        for token in tokens:
            if token in full_words:
                expanded.append(token)
            else:
                # Unknown tokens still fail loudly with KeyError.
                expanded.append(directions[token])
        example['a'] = " ".join(expanded)

In [3]:
# Configuration: directory holding the task files and the task to run.
datadir = "./data/"
tasknum = 19

# Supported task number -> file-name stem; the "_train.txt" / "_test.txt"
# suffix is appended when loading.
task_files = {
    1: "qa1_single-supporting-fact",
    5: "qa5_three-arg-relations",
    7: "qa7_counting",
    17: "qa17_positional-reasoning",
    19: "qa19_path-finding",
}
# Fail here with a clear message rather than with a NameError further down
# when tasknum is not one of the supported tasks.
if tasknum not in task_files:
    raise ValueError("Unsupported tasknum %r; expected one of %s"
                     % (tasknum, sorted(task_files)))

train_examples = DataImport.getdata(datadir + task_files[tasknum] + "_train.txt")
test_examples = DataImport.getdata(datadir + task_files[tasknum] + "_test.txt")

if tasknum == 19:
    # hack to replace directions with their actual words to fit bag of words model
    fix_directions(train_examples)
    fix_directions(test_examples)

# Bag-of-words extractor counting 1- to 4-word n-grams, plus the raw
# statement / answer / question strings for each split.
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 4))
X_tr, Y_tr, Q_tr = make_strings(train_examples)
X_te, Y_te, Q_te = make_strings(test_examples)

# The vocabulary is fitted on every string from both splits so that words
# appearing only in the test examples still get a feature column.
feature_extractor = vectorizer.fit(
    X_tr + X_te + Y_tr + Y_te + Q_tr + Q_te
)

Examples loaded: 
	 X[1] = The bedroom is west of the hallway. The office is east of the garden. The garden is north of the kitchen. The kitchen is north of the bathroom. The hallway is west of the garden.
	 Y[1] = north west
	 Q[1] = How do you go from the kitchen to the hallway?
Examples loaded: 
	 X[1] = The hallway is west of the bathroom. The office is south of the bedroom. The garden is north of the bedroom. The kitchen is east of the bedroom. The hallway is east of the kitchen.
	 Y[1] = east east
	 Q[1] = How do you go from the bedroom to the hallway?


In [None]:
# Obtain the featurized training inputs (stacked statement + question vectors)
# and the matching answer labels, then report the input matrix shape.
(t_train, y_train) = get_features(feature_extractor, X_tr, Y_tr, Q_tr)
# Parenthesised single-argument print: same output on Python 2, valid on Python 3.
print(t_train.shape)

In [None]:
# Fit a multinomial logistic-regression classifier on the training features
# and report its accuracy on the data it was trained on.
clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
clf.fit(t_train, y_train)
# Parenthesised single-argument print: same output on Python 2, valid on Python 3.
print("Train score: %f" % clf.score(t_train, y_train))

# Featurize the held-out split with the same extractor and report test accuracy.
(t_test, y_test) = get_features(feature_extractor, X_te, Y_te, Q_te)
print("Test score: %f" % clf.score(t_test, y_test))