# Translation Models for Question Retrieval

### Loading dependencies

In [3]:
import os
import logging
from gensim import corpora
import pandas as pd
import numpy as np

#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # logging

### Loading Stopwords

In [4]:
path = '/roaming/fkunnema/goeievraag/exp_similarity/stopwords.txt'
with open(path) as f:
    stopwords = [word.lower().strip() for word in f.read().split()]

### Loading Goeievraag corpus

In [5]:
goeievraag_path = '/roaming/fkunnema/goeievraag'  # goeievraag path
# loading tokenized and 'stopworded' questions, and lowercase them.
train_question_path = os.path.join(goeievraag_path, 'exp_similarity', 'train_questions.tok.txt')
with open(train_question_path) as f:
    doc = f.read()

original_train_questions, train_questions = [], []
for question in doc.split('\n')[:-1]:
    original_question = [word for word in question.lower().split()]
    question = [word for word in original_question if word not in stopwords]
    if len(question) > 0:
        original_train_questions.append(original_question)
        train_questions.append(question)
print('Number of train questions: ', str(len(train_questions)))

path = os.path.join(goeievraag_path, 'exp_similarity', 'seed_questions.tok.txt')
with open(path) as f:
    doc = f.read()

original_test_questions, test_questions = [], []
for question in doc.split('\n')[:-1]:
    original_question = [word for word in question.lower().split()]
    question = [word for word in original_question if word not in stopwords]
    if len(question) > 0:
        original_test_questions.append(original_question)
        test_questions.append(question)
print('Number of test questions: ', str(len(test_questions)))

Number of train questions:  497054
Number of test questions:  100


### Computing and saving dictionary

In [6]:
vocabulary = corpora.Dictionary(train_questions)
#vocabulary.save('goeievraag.dict')

### Background language model 
P(w | C)

In [7]:
# question unigrams P(w | C)
tokens = []
for question in train_questions:
    for token in question:
        tokens.append(token)

vocablen = len(vocabulary)
Q_len = float(len(tokens))
aux_w_Q = vocabulary.doc2bow(tokens)
aux_w_Q = dict([(vocabulary[w[0]], (w[1]+1.0)/(Q_len+vocablen)) for w in aux_w_Q])

w_Q = {}
for w in aux_w_Q:
    if w[0] not in w_Q:
        w_Q[w[0]] = {}
    w_Q[w[0]][w] = aux_w_Q[w]

# with open('prob_w_Q.txt', 'w') as f:
#     for w, prob in w_Q:
#         f.write(' '.join([w, str(prob), '\n']))     

In [9]:
w_Q['m']['muis']

0.00014471164341808902

### Lexical probabilities
P(w | t) with indexing

In [10]:
path = '/home/tcastrof/DiscoSumo/translation/model/lex.f2e'
with open(path) as f:
    doc = list(map(lambda x: x.split(), f.read().split('\n')))
    
t2w = {}
for row in doc[:-1]:
    t = row[0]
    if t[0] not in t2w:
        t2w[t[0]] = {}
    if t not in t2w[t[0]]:
        t2w[t[0]][t] = {}
    
    w = row[1]
    if w[0] not in t2w[t[0]][t]:
        t2w[t[0]][t][w[0]] = {}
    
    prob = float(row[2])
    t2w[t[0]][t][w[0]][w] = prob

# Base

In [12]:
class Base():
    def __init__(self, training, prob_w_C, vocablen, alpha):
        self.training = training
        self.prob_w_C = prob_w_C
        self.vocablen = vocablen
        self.alpha = alpha

    def score(self, query, question, w_Cs=[]):
        raise NotImplementedError("Please Implement this method")

    def rank(self, query, n=10):
        ranking = []

        w_Cs = []
        for w in query:
            w = w.lower()
            try:
                w_C = self.prob_w_C[w[0]][w]
            except:
                w_C = 1.0 / self.vocablen
            w_Cs.append(w_C)

        for i, question in enumerate(self.training):
            if i % 1000 == 0:
                print('Question: ', str(i), ' '.join(question), sep="\t")
            prob = self.score(query, question, w_Cs)
            ranking.append((i, prob))

        ranking = sorted(ranking, key=lambda x: x[1], reverse=True)
        return ranking[:n]

# Language Model

In [13]:
class LM(Base):
    def __init__(self, training, prob_w_C, vocablen, alpha):
        Base.__init__(self, training, prob_w_C, vocablen, alpha)

    def score(self, query, question, w_Cs=[]):
        Q = pd.Series(question)
        Q_count = Q.count()
        prob = 0.0
        for i, w in enumerate(query):
            w = w.lower()
            try:
                w_Q = float(Q[Q == w].count()) / Q_count
            except:
                w_Q = 0.0

            if len(w_Cs) == 0:
                try:
                    w_C = self.prob_w_C[w[0]][w]
                except:
                    w_C = 1.0 / self.vocablen
            else:
                w_C = w_Cs[i]

            prob += np.log(((1-self.alpha) * w_Q) + (self.alpha * w_C))
        return prob

In [14]:
training = ['i like bananas', 'i like apples', 'i hate aubergine']
training = map(lambda s: s.split(), training)

prob_w_C = {
    'i': 0.1,
    'like': 0.2,
    'bananas': 0.15,
    'apples': 0.15,
    'hate': 0.2,
    'aubergine': 0.1,
    'pineapple': 0.1
}

alpha = 0.2

lm = LM(training, prob_w_C, len(vocabulary), alpha)
lm.rank('i hate pineapple'.split())

Question: 	0	i like bananas


[(2, -16.3987280303173), (0, -28.83220048974483), (1, -28.83220048974483)]

# Translation Model

In [15]:
class TRM(Base):
    def __init__(self, training, prob_w_C, prob_w_t, vocablen, alpha):
        Base.__init__(self, training, prob_w_C, vocablen, alpha) 
        self.prob_w_t = prob_w_t
        
    def score(self, query, question, w_Cs=[]):
        Q = pd.Series(question)
        Q_count = Q.count()
        
        t_Qs = []
        for t in question:
            t = t.lower()
            t_Q = float(Q[Q == t].count()) / Q_count
            t_Qs.append(t_Q)
        
        prob = 0.0
        for i, w in enumerate(query):
            w = w.lower()
            if len(w_Cs) == 0:
                try:
                    w_C = self.prob_w_C[w[0]][w]
                except:
                    w_C = 1.0 / self.vocablen
            else:
                w_C = w_Cs[i]
            
            w_Q = 0.0
            for j, t in enumerate(question):
                t = t.lower()
                try:
                    w_t = prob_w_t[t[0]][t][w[0]][w]
                except:
                    w_t = 0.0
                
                t_Q = t_Qs[j]
                w_Q += (w_t * t_Q)
            prob += np.log(((1-self.alpha) * w_Q) + (self.alpha * w_C))
        return prob

# Translation-based Language Model

In [19]:
class TRLM(Base):
    def __init__(self, training, prob_w_C, prob_w_t, vocablen, alpha, sigma):
        Base.__init__(self, training, prob_w_C, vocablen, alpha) 
        self.prob_w_t = prob_w_t
        self.sigma = sigma
    
    def score_all(self, query, question, w_Cs=[]):
        Q = pd.Series(question)
        Q_count = Q.count()
        
        t_Qs = []
        for t in question:
            t = t.lower()
            t_Q = float(Q[Q == t].count()) / Q_count
            t_Qs.append(t_Q)
        
        lmprob, trmprob, trlmprob = 0.0, 0.0, 0.0
        for i, w in enumerate(query):
            w = w.lower()
            if len(w_Cs) == 0:
                try:
                    w_C = self.prob_w_C[w[0]][w]
                except:
                    w_C = 1.0 / self.vocablen
            else:
                w_C = w_Cs[i]
            
            ml_w_Q = float(Q[Q == w].count()) / Q_count
            mx_w_Q = 0.0
            for j, t in enumerate(question):
                t = t.lower()
                try:
                    w_t = self.prob_w_t[t[0]][t][w[0]][w]
                except:
                    w_t = 0.0
                print(t, w, str(w_t))
                
                t_Q = t_Qs[j]
                mx_w_Q += (w_t * t_Q)
            w_Q = (self.sigma * mx_w_Q) + ((1-self.sigma) * ml_w_Q)
            lmprob += np.log(((1-self.alpha) * ml_w_Q) + (self.alpha * w_C))
            trmprob += np.log(((1-self.alpha) * mx_w_Q) + (self.alpha * w_C))
            trlmprob += np.log(((1-self.alpha) * w_Q) + (self.alpha * w_C))
        return lmprob, trmprob, trlmprob
    
    def rank_all(self, query, n=10):
        lmrank, trmrank, trlmrank = [], [], []
        score_times = []

        w_Cs = []
        for w in query:
            w = w.lower()
            try:
                w_C = self.prob_w_C[w[0]][w]
            except:
                w_C = 1.0 / self.vocablen
            w_Cs.append(w_C)

        for i, question in enumerate(self.training):
            lmprob, trmprob, trlmprob, score_time  = self.score_all(query, question, w_Cs)
            score_times.append(score_time)
            lmrank.append((i, lmprob))
            trmrank.append((i, trmprob))
            trlmrank.append((i, trlmprob))
            if i % 1000 == 0:
                print('Train question: ', str(i), 'Score time: ', str(np.mean(score_times)), 'Total time: ', str(sum(score_times)), sep="\t")

        lmrank = sorted(lmrank, key=lambda x: x[1], reverse=True)
        trmrank = sorted(trmrank, key=lambda x: x[1], reverse=True)
        trlmrank = sorted(trlmrank, key=lambda x: x[1], reverse=True)
        return lmrank[:n], trmrank[:n], trlmrank[:n]
    
    def score(self, query, question, w_Cs=[]):
        Q = pd.Series(question)
        Q_count = Q.count()
        
        t_Qs = []
        for t in question:
            t = t.lower()
            t_Q = float(Q[Q == t].count()) / Q_count
            t_Qs.append(t_Q)
        
        prob = 0.0
        for i, w in enumerate(query):
            w = w.lower()
            if len(w_Cs) == 0:
                try:
                    w_C = self.prob_w_C[w[0]][w]
                except:
                    w_C = 0.0
            else:
                w_C = w_Cs[i]
            
            ml_w_Q = float(Q[Q == w].count()) / Q_count
            mx_w_Q = 0.0
            for j, t in enumerate(question):
                t = t.lower()
                try:
                    w_t = self.prob_w_t[t[0]][t][w[0]][w]
                except:
                    w_t = 0.0
                
                t_Q = t_Qs[j]
                mx_w_Q += (w_t * t_Q)
            w_Q = (self.sigma * mx_w_Q) + ((1-self.sigma) * ml_w_Q)
            prob += np.log(((1-self.alpha) * w_Q) + (self.alpha * w_C))
        return prob

In [20]:
tq1 = 'wat zijn voortekenen van een psychose'.split()
tq2 = 'wat is een begripsvraag wat is een toepassingsvraag wat is een kennisvraag'.split()
trm = TRLM([tq1], w_Q, t2w, len(vocabulary), 0.5, 0.3)

query = 'wat is een toxische psychose'.split()
print(trm.score_all(query, tq1))
# print(trm.score(query, tq2))
# print(trm.score(tq2, tq1))
# print(trm.score(tq2, tq2))

wat wat 0.0308996
zijn wat 0.006492
voortekenen wat 4.2e-06
van wat 0.011798
een wat 0.1310603
psychose wat 0.0
wat is 0.0016978
zijn is 0.0163889
voortekenen is 0.0
van is 0.0102445
een is 0.0308229
psychose is 0.0
wat een 0.0004342
zijn een 0.0001112
voortekenen een 0.0
van een 0.0372428
een een 0.2483838
psychose een 1.47e-05
wat toxische 0.0
zijn toxische 0.0
voortekenen toxische 0.0
van toxische 0.0
een toxische 0.0
psychose toxische 0.0
wat psychose 0.035533
zijn psychose 0.0
voortekenen psychose 0.0
van psychose 0.0
een psychose 0.0
psychose psychose 0.1167513
(-34.58531746865747, -31.904206279546965, -29.077728157957758)


## Script