# Translation Models for Question Retrieval

### Loading dependencies

In [3]:
import os
import logging
from gensim import corpora
import collections
import pandas as pd
import numpy as np

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # logging

### Loading Goeievraag corpus

In [4]:
goeievraag_path = '/roaming/fkunnema/goeievraag' # goeievraag path
# loading tokenized and 'stopworded' questions, and lowercase them.
train_question_path = os.path.join(goeievraag_path, 'exp_similarity', 'unseeded_questions.tok.txt')
with open(train_question_path) as f:
    doc = f.read()

questions = [question.lower().split() for question in doc.split('\n')[:-1]]
print('Number of train questions: ', str(len(questions)))

train_answer_path = os.path.join(goeievraag_path, 'exp_similarity', 'unseeded_answers.tok.txt')
with open(train_answer_path) as f:
    doc = f.read()

answers = [question.lower().split() for question in doc.split('\n')[:-1]]
print('Number of train answers: ', str(len(answers)))

Number of train questions:  509605
Number of train answers:  509605


### Computing and saving dictionary

In [5]:
dictionary = corpora.Dictionary(questions)
#dictionary.save('goeievraag.dict')

2018-08-01 15:18:06,824 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-08-01 15:18:07,116 : INFO : adding document #10000 to Dictionary(16241 unique tokens: ['geschatte', 'verkochte', 'yong', 'geitenkaas', 'bloemenwater']...)
2018-08-01 15:18:07,426 : INFO : adding document #20000 to Dictionary(25276 unique tokens: ['verkochte', 'yong', 'ml', 'gv-accounts', 'bloemenwater']...)
2018-08-01 15:18:07,743 : INFO : adding document #30000 to Dictionary(32844 unique tokens: ['verkochte', 'yong', 'ml', 'gv-accounts', 'bloemenwater']...)
2018-08-01 15:18:08,031 : INFO : adding document #40000 to Dictionary(39080 unique tokens: ['verkochte', 'yong', 'ml', 'gv-accounts', 'bloemenwater']...)
2018-08-01 15:18:08,317 : INFO : adding document #50000 to Dictionary(44782 unique tokens: ['ml', 'smiling', 'names', 'mbo', 'inrit']...)
2018-08-01 15:18:08,599 : INFO : adding document #60000 to Dictionary(50108 unique tokens: ['ml', 'smiling', 'names', 'mbo', 'inrit']...)
2018-08-01 15:1

In [5]:
# question unigrams P(w | C)
tokens = []
for question in questions:
    for token in question:
        tokens.append(token)

Q_len = float(len(tokens))
w_Q = dictionary.doc2bow(tokens)
w_Q = [(dictionary[w[0]], w[1]/Q_len) for w in w_Q]

with open('prob_w_Q.txt', 'w') as f:
    for w, prob in w_Q:
        f.write(' '.join([w, str(prob), '\n']))     

# Base

In [12]:
class Base():
    def __init__(self, training, prob_w_C, alpha):
        self.training = training
        self.prob_w_C = prob_w_C
        self.alpha = alpha
    
    def score(self, query, question):
        raise NotImplementedError("Please Implement this method")
    
    def rank(self, query, n=10):
        ranking = []
        for question in self.training:
            prob = self.score(query, question)
            ranking.append((question, prob))
        
        ranking = sorted(ranking, key=lambda x: x[0], reverse=True)
        return ranking[:n]

# Language Model

In [13]:
class LM(Base):
    def __init__(self, training, prob_w_C, alpha):
        Base.__init__(self, training, prob_w_C, alpha) 
        
    def score(self, query, question):
        Q = pd.Series(question)
        prob = 0.0
        for w in query:
            w_Q = float(Q[Q == w].count()) / Q.count()

            try:
                w_C = self.prob_w_C[w.lower()]
            except:
                w_C = 0.0
            
            prob += np.log(((1-alpha) * w_Q) + (alpha * w_C))
        return prob

In [14]:
training = ['i like bananas', 'i like apples', 'i hate aubergine']
training = map(lambda s: s.split(), training)

prob_w_C = {
    'i': 0.1,
    'like': 0.2,
    'bananas': 0.15,
    'apples': 0.15,
    'hate': 0.2,
    'aubergine': 0.1,
    'pineapple': 0.1
}

alpha = 0.2

lm = LM(training, prob_w_C, alpha)
lm.rank('i hate pineapple'.split())

[(['i', 'like', 'bananas'], -8.38033400869904),
 (['i', 'like', 'apples'], -8.38033400869904),
 (['i', 'hate', 'aubergine'], -6.343452081438)]

# Translation Model

In [10]:
class TM(Base):
    def __init(self, training, prob_w_C, prob_w_t, alpha):
        Base.__init__(self, training, prob_w_C, alpha) 
        self.prob_w_t = prob_w_t
        
    def score(query, question):
        Q = pd.Series(question)
        prob = 0.0
        for w in query:
            try:
                w_C = self.prob_w_C[w.lower()]
            except:
                w_C = 0.0
            
            w_Q = 0.0
            for t in question:
                try:
                    w_t = prob_w_t[w][t]
                except:
                    w_t = 0.0
                
                t_Q = float(Q[Q == t].count()) / Q.count()
                w_Q += (w_t * t_Q)
            prob += np.log(((1-alpha) * w_Q) + (alpha * w_C))
        return prob

# Translation-based Language Model

In [15]:
class TRLM(Base):
    def __init__(self, training, prob_w_C, prob_w_t, alpha, sigma):
        Base.__init__(self, training, prob_w_C, alpha) 
        self.prob_w_t = prob_w_t
        self.sigma = sigma
    
    def score(self, query, question):
        Q = pd.Series(question)
        prob = 0.0
        for w in query:
            try:
                w_C = self.prob_w_C[w.lower()]
            except:
                w_C = 0.0
            
            ml_w_Q = float(Q[Q == w].count()) / Q.count()
            mx_w_Q = 0.0
            for t in question:
                try:
                    w_t = prob_w_t[w][t]
                except:
                    w_t = 0.0
                
                t_Q = float(Q[Q == t].count()) / Q.count()
                mx_w_Q += (w_t * t_Q)
            w_Q = (sigma * mx_w_Q) + ((1-sigma) * ml_w_Q)
            prob += np.log(((1-alpha) * w_Q) + (alpha * w_C))
        return prob