In [309]:
import os
import sys
import time
import timeit
import numpy as np
import theano
import theano.tensor as T
import cPickle
import copy
from munkres import Munkres, print_matrix

In [2]:
# Input locations for the evaluation data and the pre-trained word vectors.
# NOTE(review): these are absolute paths on the original author's machine and
# have to be edited before the notebook can run anywhere else.
data_path = "/home/guillaume/Documents/CMU/cours/MT/git/sp2015.11-731/hw2/data"
# Directory that load_embeddings() joins with the vectors filename it is given.
embeddings_location_path = "/home/guillaume/Documents/CMU/project/datasets/vectors/"
# Gold labels file; each line is parsed as whitespace-separated integers.
train_gold_path = os.path.join(data_path, "train.gold")
# Tokenized sentences file; each line holds three sentences separated by
# " &#124; &#124; &#124; " (unpacked later as hyp1, hyp2, ref).
tok_sentences_path = os.path.join(data_path, "train-test.hyp1-hyp2-ref.tok")

In [3]:
train_gold = np.array([[int(x) for x in line.strip().split()] for line in open(train_gold_path, 'r')])
print "%i lines read from train gold file." % len(train_gold)

26208 lines read from train gold file.


In [4]:
sentences_tuples = [[s.split() for s in line.strip().lower().split(" &#124; &#124; &#124; ")] for line in open(tok_sentences_path, 'r')]
for s in sentences_tuples:
    assert len(s) == 3
print "%i lines read from tokenized sentences file." % len(sentences_tuples)

50339 lines read from tokenized sentences file.


In [5]:
vocabulary = set()
for s in sentences_tuples:
    for sub_sentence in s:
        for token in sub_sentence:
            vocabulary.add(token)
print "%i words in vocabulary." % len(vocabulary)

17344 words in vocabulary.


In [82]:
def load_embeddings(filename):
    embeddings = {}
    token_not_in_index = 0
    split_size = None
    for line in open(os.path.join(embeddings_location_path, filename), 'r'):
        temp = line.rstrip().split(" ")
        if split_size == None:
            split_size = len(temp)
        else:
            assert len(temp) == split_size
        token = temp[0].lower()
        if token not in vocabulary and token not in ["uuunkkk", "*unknown*"]:
            token_not_in_index += 1
            continue
        embeddings[token] = np.asarray([float(x) for x in temp[1:]]).astype(np.float32)
    print '%i / %i words embeddings of dimension %i loaded from "%s". %i were not in the vocabulary.' % \
        (len(embeddings), len(vocabulary), split_size - 1, filename, token_not_in_index)
    return embeddings

In [8]:
def cosine(a, b):
    """Return the cosine similarity of two numpy arrays of identical shape.

    Raises AssertionError when the shapes differ.
    """
    assert a.shape == b.shape
    dot_product = (a * b).sum()
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def w_cosine(a, b):
    """Return the cosine similarity between the embeddings of tokens a and b.

    Both tokens are looked up in the global ``embeddings`` dict, so a token
    missing from it raises KeyError.
    """
    vector_a = embeddings[a]
    vector_b = embeddings[b]
    return cosine(vector_a, vector_b)

In [85]:
penalty_coeff = 5
def tokens_to_matrix(sentence1, sentence2):
    """Build the position-penalised similarity matrix of two token lists.

    Tokens without an entry in the global ``embeddings`` are dropped (they
    are not mapped to an unknown-word vector). Cell (i, j) holds the cosine
    similarity of the i-th kept token of sentence1 and the j-th kept token of
    sentence2, multiplied by exp(-penalty_coeff * |i/n - j/m|), where n and m
    are the numbers of kept tokens.

    Returns:
        numpy array of shape (n, m).
    """
    known1 = [tok for tok in sentence1 if tok in embeddings]
    known2 = [tok for tok in sentence2 if tok in embeddings]
    rows, cols = len(known1), len(known2)
    matrix = np.zeros((rows, cols))
    for r, left in enumerate(known1):
        for c, right in enumerate(known2):
            # Distance between the relative positions of the two tokens.
            distortion = abs(float(r) / rows - float(c) / cols)
            matrix[r][c] = w_cosine(left, right)
            matrix[r][c] *= np.exp(-penalty_coeff * distortion)
    return matrix

In [12]:
def paraphrase_matrix(i):
    """Return the similarity matrix of the two sentences stored in train_lines[i].

    Each entry of ``train_lines`` is unpacked as (label, sentence1, sentence2);
    the label is ignored here.
    NOTE(review): ``train_lines`` is not defined anywhere in the visible part
    of this notebook - confirm where it is supposed to come from.
    """
    _label, first_sentence, second_sentence = train_lines[i]
    return tokens_to_matrix(first_sentence, second_sentence)

In [13]:
def list_to_matrix(l):
    """Return ``l`` as a numpy array (an existing array is returned as is)."""
    as_array = np.asarray(l)
    return as_array

def matrix_to_list(m):
    """Return a list holding one plain list per row of ``m``."""
    rows = []
    for row in m:
        rows.append(list(row))
    return rows

def pad_matrix(matrix):
    """Zero-pad a 2-D array so that it becomes square.

    A square input is returned unchanged (same object). Otherwise a new
    max(n, m) x max(n, m) array of zeros is created and the input is copied
    into its top-left corner.
    """
    rows, cols = matrix.shape
    if rows == cols:
        return matrix
    side = max(rows, cols)
    padded = np.zeros((side, side))
    padded[:rows, :cols] = matrix
    return padded

In [14]:
factor = 1000000
def best_matching(matrix):
    """Return the summed weight of the best one-to-one matching in ``matrix``.

    The weights are turned into integer costs and handed to Munkres:
      * if the smallest weight is negative, every weight is shifted by
        -2 * min so that all of them become positive;
      * weights are scaled by ``factor`` and truncated to int32;
      * each cost is ``sys.maxsize - scaled_weight`` so that minimising the
        cost maximises the weight.
    The returned total is the sum of the ORIGINAL (unshifted, unscaled)
    weights at the (row, column) pairs returned by Munkres.

    Args:
        matrix: 2-D numpy array of weights.

    Returns:
        The total weight as a float; 0.0 when the matrix has no element.
    """
    # An empty matrix (no rows or no columns) has nothing to match, and
    # np.min would raise ValueError on it, so answer directly.
    if matrix.size == 0:
        return 0.0
    lowest = np.min(matrix)
    shifted = matrix - 2 * lowest if lowest < 0 else matrix
    # .tolist() yields plain Python ints, so the subtraction from sys.maxsize
    # below is done with arbitrary-precision integers and does not depend on
    # NumPy's scalar promotion rules.
    profits = (shifted * factor).astype(np.int32).tolist()
    # We want the max sum, not the min: invert the profits into costs.
    cost_matrix = [[sys.maxsize - profit for profit in row] for row in profits]
    indexes = Munkres().compute(cost_matrix)
    total = 0.0
    for row, column in indexes:
        # `matrix` itself is never modified above, so it still holds the
        # original weights.
        total += matrix[row][column]
    return total

In [15]:
def similarity_score(sentence1, sentence2):
    """Return the length-normalised matching score of two token lists.

    The score is 2 * best_matching(similarity matrix) divided by the total
    number of tokens in both inputs. The denominator uses the full input
    lengths, including tokens that tokens_to_matrix drops for lack of an
    embedding.

    Returns:
        The normalised score; 0.0 when both sentences are empty.
    """
    total_length = len(sentence1) + len(sentence2)
    # With two empty sentences there is nothing to compare and the
    # normalisation below would divide by zero.
    if total_length == 0:
        return 0.0
    matrix = tokens_to_matrix(sentence1, sentence2)
    return best_matching(matrix) * 2 / total_length

In [182]:
def compute_accuracy(scores):
    """Return the fraction of entries of ``scores`` equal to the gold labels.

    ``scores`` is compared position by position with the global
    ``train_gold``; the number of matches is divided by the length of the
    shorter of the two. Each comparison is a plain ``==`` against one row of
    ``train_gold``, so the result has whatever shape that comparison has.
    """
    nb_correct = sum(gold == predicted for gold, predicted in zip(train_gold, scores))
    nb_compared = min(len(train_gold), len(scores))
    return 1. * nb_correct / nb_compared

In [99]:
def compute_scores(how=None):
    scores = []
    if how == None:
        how = len(sentences_tuples)
    for i in xrange(how):
        hyp1, hyp2, ref = sentences_tuples[i]
        new_score = 0
        score1 = similarity_score(hyp1, ref)
        score2 = similarity_score(hyp2, ref)
        if score1 > score2:
            new_score = -1
        elif score1 < score2:
            new_score = 1
        #if new_score == 0:
        #    print i
        if i % 50 == 0 and i != 0:
            print ".",
        if i == 2000:
            break
        scores.append(new_score)
    return scores

In [116]:
from joblib import Parallel, delayed
import multiprocessing
# Number of parallel jobs, passed as n_jobs to joblib.Parallel further down.
# NOTE(review): hard-coded; `multiprocessing` is imported above but not used
# in the visible cells - confirm whether cpu_count() was intended.
num_cores = 8

In [240]:
def compute_score_sentence_parallel(i):
    """Score both hypotheses of example ``i`` against its reference.

    Meant to be called once per example through joblib. Scoring is
    best-effort: if similarity_score raises for a hypothesis, that
    hypothesis gets a score of 0 instead of aborting the whole run.

    Args:
        i: index into the global ``sentences_tuples``.

    Returns:
        tuple (score1, score2) for hyp1 and hyp2 respectively.
    """
    hyp1, hyp2, ref = sentences_tuples[i]
    pair_scores = []
    for hypothesis in (hyp1, hyp2):
        try:
            pair_scores.append(similarity_score(hypothesis, ref))
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit can still stop a long job.
        except Exception:
            pair_scores.append(0)
    return tuple(pair_scores)

In [241]:
# Configurations to evaluate: embeddings filename -> list of penalty
# coefficients to try with that file. The commented-out lines are earlier
# configurations.
#new_tests = {"glove.6B.300d.txt":[3], "google_300.txt":[2, 3], "huang_2012_vectors.txt":[3, 4], "levy-2014.txt":[4], "data8_A_100038words_1000vectors.txt":[4]}
#new_tests = {"glove.6B.300d.txt":[3], "huang_2012_vectors.txt":[3, 4], "data8_A_100038words_1000vectors.txt":[4]}
new_tests = {"google_300.txt":[0.75]}

In [242]:
# Run the matching-based scorer for every (embeddings file, penalty
# coefficient) configuration listed in new_tests.
# NOTE(review): `scores` is reassigned on every inner iteration, so only the
# result of the last configuration is still available after the loops.
for vectors_file, liste in new_tests.items():
    # Rebinds the module-level `embeddings` read by tokens_to_matrix and w_cosine.
    embeddings = load_embeddings(vectors_file)
    # The loop variable rebinds the module-level `penalty_coeff` read by
    # tokens_to_matrix.
    # NOTE(review): this relies on the joblib workers seeing the current
    # values of these globals - confirm for the backend in use.
    for penalty_coeff in liste:
        # One (score1, score2) tuple per entry of sentences_tuples.
        scores = Parallel(n_jobs=num_cores)(delayed(compute_score_sentence_parallel)(i) for i in xrange(len(sentences_tuples)))
    print ""

12591 / 17344 words embeddings of dimension 300 loaded from "google_300.txt". 65862 were not in the vocabulary.
0.75 [ 0.          0.00011447]



In [183]:
# NOTE(review): compute_accuracy compares each entry of `scores` with a gold
# label using ==, whereas the Parallel cell above stores (score1, score2)
# tuples in `scores`. The recorded execution counts are also out of order
# (In [183] here, In [242] for the cell above), so the recorded output was
# presumably produced from a different `scores` value - confirm before
# relying on it.
compute_accuracy(scores)

array([ 0.54620726])

In [190]:
# Persist the matching scores. The file is opened in binary mode, as the
# pickle documentation recommends, and the 'with' block makes sure it is
# flushed and closed once the dump is complete.
with open("/home/guillaume/Documents/CMU/cours/MT/git/sp2015.11-731/hw2/scores.pkl", "wb") as scores_file:
    cPickle.dump(scores, scores_file)

In [226]:
# NOTE(review): this import sits mid-notebook rather than in the first import cell.
import kenlm
# Language model whose score() method is applied to each hypothesis below.
# NOTE(review): absolute path on the original author's machine.
model = kenlm.LanguageModel('/home/guillaume/Documents/CMU/project/datasets/rcv1.tok.1M.model')

In [271]:
# Score every hypothesis with the language model; tokens are re-joined with
# single spaces before being passed to model.score(). The reference is unused.
hyp1_scores, hyp2_scores = [], []
for hyp1, hyp2, ref in sentences_tuples:
    hyp1_scores.append(model.score(" ".join(hyp1)))
    hyp2_scores.append(model.score(" ".join(hyp2)))

In [272]:
# Pair the two language-model scores of each example as a (hyp1, hyp2) tuple.
language_model_scores = list(zip(np.array(hyp1_scores), np.array(hyp2_scores)))

In [335]:
def merge_scores(matching_scores, lm_scores, alpha):
    """Interpolate two score sources and turn each example into a label.

    For every example the two hypotheses get the combined score
    ``(1 - alpha) * matching + alpha * lm``. The label is -1 when hyp1 has
    the higher combined score, 1 when hyp2 has it, and 0 on a tie.

    The number of labels follows the inputs themselves (the shorter of the
    two sequences) instead of the global ``sentences_tuples``, so the
    function can be used on any aligned pair of score lists.

    Args:
        matching_scores: sequence of (score_hyp1, score_hyp2) pairs.
        lm_scores: sequence of (score_hyp1, score_hyp2) pairs, aligned with
            ``matching_scores``.
        alpha: weight of the language-model score.

    Returns:
        list of labels in {-1, 0, 1}.
    """
    final_scores = []
    for matching_pair, lm_pair in zip(matching_scores, lm_scores):
        score1 = (1 - alpha) * matching_pair[0] + alpha * lm_pair[0]
        score2 = (1 - alpha) * matching_pair[1] + alpha * lm_pair[1]
        if score1 > score2:
            new_score = -1
        elif score2 > score1:
            new_score = 1
        else:
            new_score = 0
        final_scores.append(new_score)
    return final_scores

In [336]:
def combine_scores(matching_scores, lm_scores, nb_attempts):
    for alpha in np.arange(0, 0.00001, 0.00001 / nb_attempts):
        final_scores = merge_scores(matching_scores, lm_scores, alpha)
        print alpha, compute_accuracy(final_scores)

In [338]:
# Sweep the interpolation weight alpha with nb_attempts=100 and print the
# accuracy obtained for each value.
combine_scores(scores, language_model_scores, 100)

0.0 [ 0.54677961]
1e-07 [ 0.54716117]
2e-07 [ 0.54716117]
3e-07 [ 0.54716117]
4e-07 [ 0.54716117]
5e-07 [ 0.54716117]
6e-07 [ 0.54716117]
7e-07 [ 0.54716117]
8e-07 [ 0.54716117]
9e-07 [ 0.54716117]
1e-06 [ 0.54716117]
1.1e-06 [ 0.54716117]
1.2e-06 [ 0.54716117]
1.3e-06 [ 0.54716117]
1.4e-06 [ 0.54716117]
1.5e-06 [ 0.54716117]
1.6e-06 [ 0.54719933]
1.7e-06 [ 0.54719933]
1.8e-06 [ 0.54719933]
1.9e-06 [ 0.54719933]
2e-06 [ 0.54719933]
2.1e-06 [ 0.54719933]
2.2e-06 [ 0.54719933]
2.3e-06 [ 0.54719933]
2.4e-06 [ 0.54719933]
2.5e-06 [ 0.54719933]
2.6e-06 [ 0.54723748]
2.7e-06 [ 0.54723748]
2.8e-06 [ 0.54723748]
2.9e-06 [ 0.54723748]
3e-06 [ 0.54719933]
3.1e-06 [ 0.54719933]
3.2e-06 [ 0.54719933]
3.3e-06 [ 0.54719933]
3.4e-06 [ 0.54719933]
3.5e-06 [ 0.54719933]
3.6e-06 [ 0.54719933]
3.7e-06 [ 0.54719933]
3.8e-06 [ 0.54719933]
3.9e-06 [ 0.54719933]
4e-06 [ 0.54719933]
4.1e-06 [ 0.54719933]
4.2e-06 [ 0.54719933]
4.3e-06 [ 0.54719933]
4.4e-06 [ 0.54719933]
4.5e-06 [ 0.54719933]
4.6e-06 [ 0.547199

In [339]:
# alpha=2.8e-06 lies in the range (2.6e-06 to 2.9e-06) showing the highest
# accuracy, 0.54723748, in the visible part of the sweep output above.
final_scores = merge_scores(scores, language_model_scores, 2.8e-06)

In [341]:
# Write the final labels, one per line (no newline after the last one).
with open("/home/guillaume/Documents/CMU/cours/MT/git/sp2015.11-731/hw2/output.txt", "w") as output_file:
    output_file.write("\n".join(str(label) for label in final_scores))