In [11]:
!export PYTHONPATH=/home/manishav/addons/site-packages/:/home/manishav/

In [12]:
import matchzoo as mz
import numpy as np
import matplotlib.pyplot as plt
import random

##### Fetch the dataset

In [13]:
#!wget https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv

In [14]:
#!wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz

In [15]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

In [16]:
##### Load the embeddings
#!unzip glove.6B.zip

In [17]:
import matchzoo as mz
import pandas as pd

from sklearn.utils import shuffle
print(mz.__version__)
from scipy.spatial.distance import cosine
from matchzoo.metrics import Precision


2.2.0


##### Define functions and metrics


In [18]:
"""Precision for ranking."""
import numpy as np

from matchzoo.engine.base_metric import BaseMetric, sort_and_couple


### Add 

class PrecisionCheck(BaseMetric):
    """Precision metric."""

    ALIAS = 'precision'

    def __init__(self, k: int = 1, threshold: float = 0.):
        """
        :class:`PrecisionMetric` constructor.
        :param k: Number of results to consider.
        :param threshold: the label threshold of relevance degree.
        """
        self._k = k
        self._threshold = threshold

    def __repr__(self) -> str:
        """:return: Formated string representation of the metric."""
        return f"{self.ALIAS}@{self._k}({self._threshold})"

    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        #print('number of results', y_true.shape)
        """
        Calculate precision@k.
        Example:
            >>> y_true = [0, 0, 0, 1]
            >>> y_pred = [0.2, 0.4, 0.3, 0.1]
            >>> Precision(k=1)(y_true, y_pred)
            0.0
            >>> Precision(k=2)(y_true, y_pred)
            0.0
            >>> Precision(k=4)(y_true, y_pred)
            0.25
            >>> Precision(k=5)(y_true, y_pred)
            0.2
        :param y_true: The ground true label of each document.
        :param y_pred: The predicted scores of each document.
        :return: Precision @ k
        :raises: ValueError: len(r) must be >= k.
        """
        if self._k <= 0:
            raise ValueError(f"k must be greater than 0."
                             f"{self._k} received.")
        # sorted list of pairs.
        coupled_pair = sort_and_couple(y_true, y_pred)
        
        precision = 0.0
        
        for idx, (label, score) in enumerate(coupled_pair):
            
            #if idx < 10:
            #    print('[',label, score, ']', )
            if idx >= self._k:
                break
            if label > self._threshold:
                precision += 1.
        return precision / self._k
    
    
"""Recall for ranking."""
import numpy as np

from matchzoo.engine.base_metric import BaseMetric, sort_and_couple


class RecallCheck(BaseMetric):
    """Recall metric."""

    ALIAS = 'recall'

    def __init__(self, k: int = 1, threshold: float = 0.):
        """
        :class:`RecallMetric` constructor.
        :param k: Number of results to consider.
        :param threshold: the label threshold of relevance degree.
        """
        self._k = k
        self._threshold = threshold

    def __repr__(self) -> str:
        """:return: Formated string representation of the metric."""
        return f"{self.ALIAS}@{self._k}({self._threshold})"

    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        if self._k == 5:
            print('number of results', y_true.shape, (y_true > self._threshold ).sum())
        """
        Calculate Recall@k.
        Example:
            >>> y_true = [0, 0, 0, 1]
            >>> y_pred = [0.2, 0.4, 0.3, 0.1]
            >>> Recall(k=1)(y_true, y_pred)
            0.0
            >>> Recall(k=2)(y_true, y_pred)
            0.0
            >>> Recall(k=4)(y_true, y_pred)
            1.0
            >>> Recall(k=5)(y_true, y_pred)
            1.0
        :param y_true: The ground true label of each document.
        :param y_pred: The predicted scores of each document.
        :return: Recall @ k
        :raises: ValueError: len(r) must be >= k.
        """
        if self._k <= 0:
            raise ValueError(f"k must be greater than 0."
                             f"{self._k} received.")
        # sorted list of pairs.
        coupled_pair = sort_and_couple(y_true, y_pred)
        
        recall = 0.0
        rel_docs = (y_true > self._threshold ).sum() * 1.0
        for idx, (label, score) in enumerate(coupled_pair):
            
            if idx < 5 and self._k == 5:
                print('[',label, score, ']', )
            if idx >= self._k:
                break
            if label > self._threshold:
                recall += 1.
        return recall / rel_docs

In [19]:

"""Matchzoo toolkit for token embedding."""

import csv
import typing

import numpy as np
import pandas as pd


class Embedding(object):
    """
    Embedding class.
    Examples::
        >>> import matchzoo as mz
        >>> train_raw = mz.datasets.toy.load_data()
        >>> pp = mz.preprocessors.NaivePreprocessor()
        >>> train = pp.fit_transform(train_raw, verbose=0)
        >>> vocab_unit = mz.build_vocab_unit(train, verbose=0)
        >>> term_index = vocab_unit.state['term_index']
        >>> embed_path = mz.datasets.embeddings.EMBED_RANK
    To load from a file:
        >>> embedding = mz.embedding.load_from_file(embed_path)
        >>> matrix = embedding.build_matrix(term_index)
        >>> matrix.shape[0] == len(term_index) + 1
        True
    To build your own:
        >>> data = pd.DataFrame(data=[[0, 1], [2, 3]], index=['A', 'B'])
        >>> embedding = mz.Embedding(data)
        >>> matrix = embedding.build_matrix({'A': 2, 'B': 1})
        >>> matrix.shape == (3, 2)
        True
    """

    def __init__(self, data: pd.DataFrame):
        """
        Embedding.
        :param data: DataFrame to use as term to vector mapping.
        """
        self._data = data

    @property
    def input_dim(self) -> int:
        """:return Embedding input dimension."""
        return self._data.shape[0]

    @property
    def output_dim(self) -> int:
        """:return Embedding output dimension."""
        return self._data.shape[1]

    def build_matrix(
        self,
        term_index: typing.Union[
            dict, mz.preprocessors.units.Vocabulary.TermIndex],
        initializer=lambda: np.random.uniform(-0.2, 0.2)
    ) -> np.ndarray:
        """
        Build a matrix using `term_index`.
        :param term_index: A `dict` or `TermIndex` to build with.
        :param initializer: A callable that returns a default value for missing
            terms in data. (default: a random uniform distribution in range)
            `(-0.2, 0.2)`).
        :return: A matrix.
        """
        input_dim = len(term_index) + 1
        
        print('Embedding to matrix, input & output', input_dim, self.output_dim)
        
        matrix = np.empty((input_dim, self.output_dim))
        for index in np.ndindex(*matrix.shape):
            matrix[index] = initializer()

        #valid_keys = set(self._data.index)
        terms_to_find = term_index.keys()
        valid_data_frame = self._data[self._data.index[-len(self._data):].isin(terms_to_find)]
        
        nf_count = 0
        total = 0
        
        for term, values in valid_data_frame.iterrows():
            if total % 50000==0:
                print('Words completed', total)
            total +=1
                
            try:
                matrix[term_index[term]] = values
            except Exception as ex:
                nf_count+=1
                print('Word not found', term)
                    
        print('Words not found in embedding ', nf_count)
        
        return matrix


def load_from_file(file_path: str, mode: str = 'word2vec') -> Embedding:
    """
    Load embedding from `file_path`.
    :param file_path: Path to file.
    :param mode: Embedding file format mode, one of 'word2vec' or 'glove'.
        (default: 'word2vec')
    :return: An :class:`matchzoo.embedding.Embedding` instance.
    """
    if mode == 'word2vec':
        data = pd.read_csv(file_path,
                           sep=" ",
                           index_col=0,
                           header=None,
                           skiprows=1)
        print(data.head())
    elif mode == 'glove':
        data = pd.read_csv(file_path,
                           sep=" ",
                           index_col=0,
                           header=None,
                           quoting=csv.QUOTE_NONE)
    else:
        raise TypeError(f"{mode} is not a supported embedding type."
                        f"`word2vec` or `glove` expected.")
    return Embedding(data)





In [20]:
import ast
slist = ast.literal_eval(open('msmarco_data/long_stop_words','r').read())

stop_words = list( set(['are', 'get','you','dont','want','take','have','need','let','your','their','theirs',\
              'still','these','that','could','should', 'would', 'with','does','this','used','make',\
              'makes','made','takes','take','those','when','without','more','becasue','there','aren',\
              'keep','seem','seems','wont','shouldn','shouldnt','only','than','know','every','also',\
              'brand','become','most','other','others','meant','thing','things','happens','anything',\
              'gets','sets','both','bring','then','goes','some','someone','see','article','redirect',\
              'sent','into','about','what','where','give','going','like','look','looks','having','other',\
              'gives','give','given','uses','used','through','though','very','doesn','many','even','mine',\
              'myself','always', 'self','currently','along','else','comes','come','came','likes','like',\
              'because','can','the', 'an', 'to', 'and', 'from', 'for', 'we', 'you', 'i', 'so','such',\
               'a', 'at', 'b', 'be', 'in', 'of', 'on', 'was', 'is','been','while','will','they','them']) | \
                set(slist))
print(len(stop_words))

193


In [21]:
#! tar -xvzf msmarco_data/collectionandqueries.tar.gz

##### Define paths

In [22]:
path='/home/manishav/irexplain/src/main/python/deep_explain/msmarco_data/'
collection_path = path+ 'collection/'
embedding_path =  path+ 'embedding/'

#### Load queries

In [23]:
import re
query_list = []
match_words = r'define|what|when'

for line in open(collection_path+'queries.train.tsv','r'):
    split = line.strip().split('\t')
    
    match = re.search(match_words,split[1])
    if not match:
        query_list.append({'qid':split[0], 'query': split[1].strip()})

In [24]:
query_frame = pd.DataFrame(query_list)

In [25]:
query_frame.head()

Unnamed: 0,qid,query
0,510633,tattoo fixers how much does it cost
1,278900,how many cars enter the la jolla concours d' e...
2,303205,how much can i contribute to nondeductible ira
3,492875,sanitizer temperature
4,54528,blood clots in urine after menopause


In [26]:
query_frame.shape

(478643, 2)

#### load qrels

In [27]:
qrels = []
for line in open(collection_path+'qrels.train.tsv','r'):
    split = line.strip().split('\t')
    qrels.append({'qid':split[0], 'pid': split[2], 'rel':split[3] })
qrel_frame = pd.DataFrame(qrels) 

In [28]:
qrel_frame.head()

Unnamed: 0,qid,pid,rel
0,1185869,0,1
1,1185868,16,1
2,597651,49,1
3,403613,60,1
4,1183785,389,1


In [29]:
qrel_frame['qid'].nunique()

502939

In [30]:
qrel_frame['pid'].nunique()

516472

In [31]:
qrel_frame.shape

(532761, 3)

In [32]:
#! wget https://msmarco.blob.core.windows.net/msmarcoranking/qidpidtriples.train.full.tar.gz

##### Load the negative triples.

In [33]:
neg_rows = []
query_count = {}
count = 0
total_queries = 0

test_query ={ }

for line in open('qidpidtriples.train.full.tar','r'):
    split = line.split('\t')
    qid = split[0]
    
    if qid not in query_count:
        query_count[qid] = 0
        total_queries+=1
        if random.random() > 0.999:
            test_query[qid] = 0
        
        # add the query to test with some prob
    
    if qid not in test_query and query_count[qid]< 20:
        neg_rows.append({'qid': qid, 'pid':  split[2].strip(), 'rel':0})
        query_count[qid]+=1
        count+=1 
        
    elif qid in test_query and query_count[qid] < 1000:
        neg_rows.append({'qid': qid, 'pid':  split[2].strip(), 'rel':0})
        query_count[qid]+=1
        test_query[qid] +=1
        count+=1
        
    if count % 5000000 == 0:
        print(count, len(test_query))
        
    #if count > 10000000:
    #    break

1000000 258
2000000 267
3000000 268
4000000 268
5000000 268
6000000 268
6000000 268
6000000 268
6000000 268
6000000 268


In [34]:
total_queries

327721

In [35]:
neg_frame = pd.DataFrame(neg_rows)

In [36]:
qrel_frame = pd.concat([qrel_frame[qrel_frame['qid'].isin(neg_frame['qid'].unique().tolist())], neg_frame])
qrel_frame.drop_duplicates(inplace=True)
qrel_frame.head()

Unnamed: 0,qid,pid,rel
0,1185869,0,1
1,1185868,16,1
3,403613,60,1
7,645590,944,1
9,186154,1160,1


In [37]:
qrel_frame.tail()

Unnamed: 0,qid,pid,rel
6723391,908904,8610436,0
6723392,296496,713532,0
6723393,774829,7112242,0
6723394,840632,7024083,0
6723395,558656,7798884,0


In [38]:
qrel_frame.shape

(7059101, 3)

In [40]:
qid_pid_rel_frame = pd.merge(query_frame, qrel_frame, left_on='qid', right_on='qid', suffixes=('', '_y'))

In [41]:
qid_pid_rel_frame.shape

(3971101, 4)

In [42]:
qid_pid_rel_frame['rel'].value_counts()

0    3775066
1     196035
Name: rel, dtype: int64

In [43]:
qid_pid_rel_frame['rel'] = qid_pid_rel_frame['rel'].apply(int)

In [44]:
qrel_frame[qrel_frame['qid'].isin(query_frame['qid'].tolist())].shape

(3971101, 3)

##### Load documents 

In [45]:
pid_set = set(qid_pid_rel_frame['pid'].tolist())

paragraphs = []
for line in open(collection_path+'collection.tsv','r'):
    split = line.strip().split('\t')
    if split[0] in pid_set:
        paragraphs.append({'pid': split[0],\
                           'paragraph': split[1].strip()})
        
para_frame = pd.DataFrame(paragraphs) 

In [46]:
para_frame.shape

(1638556, 2)

In [47]:
qid_pid_para_rel_frame = pd.merge(qid_pid_rel_frame, para_frame, left_on='pid', right_on='pid', suffixes=('', '_y'))

In [48]:
qid_pid_para_rel_frame.head()

Unnamed: 0,qid,query,pid,rel,paragraph
0,303205,how much can i contribute to nondeductible ira,6487240,1,Nondeductible IRA Contributions. For a traditi...
1,303205,how much can i contribute to nondeductible ira,821461,0,Just because you can contribute to a 401(k) pl...
2,11658,adjusted gross income definition,821461,0,Just because you can contribute to a 401(k) pl...
3,68667,can i contribute to a roth ira without earned ...,821461,0,Just because you can contribute to a 401(k) pl...
4,398833,irs pre-tax traditional ira contribution limits,821461,0,Just because you can contribute to a 401(k) pl...


In [49]:
del(para_frame)

In [50]:
qid_pid_para_rel_frame[qid_pid_para_rel_frame['rel'] == '1'].shape

  result = method(y)


(0, 5)

#### Prepare train and test_frame for the model training

In [52]:
from sklearn.utils import shuffle
import random 
# sample 10K queries
query_list = qid_pid_para_rel_frame['qid'].drop_duplicates().tolist()
print(len(query_list))
test_queries = list(test_query.keys())
train_queries = random.sample(list(set(query_list) - set(test_queries)), 50000)

final_train_frame = shuffle(qid_pid_para_rel_frame[qid_pid_para_rel_frame['qid'].isin(train_queries)][['query','paragraph','rel']])

183200


In [53]:
final_train_frame = final_train_frame.reset_index(drop=True)
final_train_frame.columns=['text_left','text_right','label']

In [54]:
final_train_frame.head()

Unnamed: 0,text_left,text_right,label
0,how many calories are burned walking 10 000 steps,"Calories, Fat, Protein, Fiber, & Carbs In Pane...",0
1,how are northern lights formed chemistry expla...,Fort McMurray can be found on the 56th paralle...,0
2,illinois state revenue tax,Minnesota State Tax Information. Looking for M...,0
3,most likely awards for students,Iggy Azalea Wins Top Rap Artist At The Billboa...,0
4,can you use wax paper instead of parchment,Wash Wax ALL has also been found to be excelle...,0


In [55]:
### sample queries for test (should not overlap with train)
final_test_frame = shuffle(qid_pid_para_rel_frame[qid_pid_para_rel_frame['qid'].isin(test_queries)]\
                            [['query','paragraph','rel']])

In [56]:
final_test_frame = final_test_frame.reset_index(drop=True)
final_test_frame.columns=['text_left','text_right','label']
final_test_frame.head()

Unnamed: 0,text_left,text_right,label
0,who dna is the transforming principle from hea...,Norovirus was once known as Norwalk virus and ...,0
1,who is tim lincecum playing for in 2016,The lineup consisted of Jim (Joe Buck) on voca...,0
2,nc propane cost,Propane-Prices.com was created to help all hom...,0
3,can i use global entry card for tsa precheck,Applicants will log into the account and pay t...,0
4,how long does chicken last after thawed,Directions. 1 Place a Cast Iron Frying Pan in...,0


In [57]:
train_raw =  mz.pack(final_train_frame)# mz.datasets.toy.load_data(stage='train', task=task)
test_raw =  mz.pack(final_test_frame)#mz.datasets.toy.load_data(stage='test', task=task)

In [58]:
train_raw.left.head()

Unnamed: 0_level_0,text_left
id_left,Unnamed: 1_level_1
L-0,how many calories are burned walking 10 000 steps
L-1,how are northern lights formed chemistry expla...
L-2,illinois state revenue tax
L-3,most likely awards for students
L-4,can you use wax paper instead of parchment


#### Preprocess the text

In [59]:
preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=15,
             fixed_length_right=30,
             filter_mode='idf',
             filter_low_freq=2,
             filter_low_freq=2,
             remove_stop_words=True)


In [60]:
preprocessor.fit(train_raw)

Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval: 100%|██████████| 50000/50000 [00:16<00:00, 3102.60it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval => StopRemoval: 100%|██████████| 703665/703665 [18:08<00:00, 646.50it/s]  
Processing text_right with append: 100%|██████████| 703665/703665 [00:01<00:00, 448330.59it/s]
Building FrequencyFilter from a datapack.: 100%|██████████| 703665/703665 [00:18<00:00, 38972.38it/s]
Processing text_right with transform: 100%|██████████| 703665/703665 [00:23<00:00, 29635.34it/s]
Processing text_left with extend: 100%|██████████| 50000/50000 [00:00<00:00, 427762.36it/s]
Processing text_right with extend: 100%|██████████| 703665/703665 [00:02<00:00, 328270.65it/s]
Building Vocabulary from a datapack.: 100%|██████████| 24042325/24042325 [00:13<00:00, 1810828.14it/s]


<matchzoo.preprocessors.basic_preprocessor.BasicPreprocessor at 0x7fd1e94a5390>

In [61]:
len(preprocessor.context['vocab_unit'].state['term_index'])

582696

In [63]:
train_processed = preprocessor.transform(train_raw,verbose=0)

In [64]:
test_processed = preprocessor.transform(test_raw,verbose=0)

In [65]:
vocab_unit = preprocessor.context['vocab_unit']
print('Orig Text:', train_processed.left.loc['L-0']['text_left'])
sequence = train_processed.left.loc['L-0']['text_left']
print('Transformed Indices:', sequence)
print('Transformed Indices Meaning:',
      '_'.join([vocab_unit.state['index_term'][i] for i in sequence]))

Orig Text: [529130, 546991, 329046, 219779, 207729, 100714, 49393, 0, 0, 0, 0, 0, 0, 0, 0]
Transformed Indices: [529130, 546991, 329046, 219779, 207729, 100714, 49393, 0, 0, 0, 0, 0, 0, 0, 0]
Transformed Indices Meaning: many_calories_burned_walking_10_000_steps_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>


##### Define the ranking task

In [66]:

task = mz.tasks.Ranking()
task.metrics = [PrecisionCheck(k=3, threshold=0),
    PrecisionCheck(k=5, threshold=0),
    RecallCheck(k=3, threshold=0),
    RecallCheck(k=5, threshold=0),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=3, threshold=0),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=5, threshold=0),
]
print(task)

Ranking Task


In [None]:
! ls {embedding_path}

In [67]:
#train_generator = mz.PairDataGenerator(train_processed, num_dup=1, num_neg=4, batch_size=32, shuffle=True)
#len(train_generator)
word_embeddings = load_from_file(embedding_path+'glove.6B.50d.txt', mode= 'glove')

In [None]:
#history = model.fit_generator(train_generator, epochs=20, callbacks=[evaluate], workers=5, use_multiprocessing=False)

In [68]:
model = mz.models.DRMMTKS()
model.params['task'] = task
model.params['embedding_input_dim'] =  len(preprocessor.context['vocab_unit'].state['term_index']) +1 
model.params['embedding_output_dim'] = word_embeddings.output_dim
#model.params['embedding_output_dim'] = 11

model.params['top_k'] = 10
model.params['mlp_num_layers'] = 2
model.params['mlp_num_units'] = 20
model.params['mlp_num_fan_out'] = 10
model.params['mlp_activation_func'] = 'tanh'
model.params['optimizer'] = 'adadelta'
#model.params['mlp_num_units'] = mlp_units

model.params['embedding_trainable'] = True

model.guess_and_fill_missing_params(verbose=1)

model.params.update(preprocessor.context)
print('Model param',model.params)

model.build()
#model.load_embedding_matrix(embedding_matrix)
model.compile()
print('Model complete',model.params.completed())
model.backend.summary()

Model param model_class                   <class 'matchzoo.models.drmmtks.DRMMTKS'>
input_shapes                  [(15,), (30,)]
task                          Ranking Task
optimizer                     adadelta
with_embedding                True
embedding_input_dim           582696
embedding_output_dim          50
embedding_trainable           True
with_multi_layer_perceptron   True
mlp_num_units                 20
mlp_num_layers                2
mlp_num_fan_out               10
mlp_activation_func           tanh
mask_value                    -1
top_k                         10
Model complete True
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_left (InputLayer)          (None, 15)           0                                            
_________________________________________________________________________________

In [None]:
train_generator = mz.DataGenerator(
                train_processed, mode='pair',num_dup=1, num_neg=10,batch_size=100
)
print('training data batches:', len(train_generator))
history = model.fit_generator(train_generator, epochs=5, \
                                  workers=4, use_multiprocessing=True)

In [71]:
x, y = train_processed.unpack()
test_x, test_y = test_processed.unpack()

In [75]:
model.fit(x, y, batch_size=1000, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7fd0a610be80>

In [73]:
'''
    pred_x, pred_y = test_processed[:].unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y, batch_size=len(pred_x))
    ?evaluate
'''

In [None]:
model.evaluate(test_x, test_y)

In [104]:
test_x

{'id_left': array(['L-0', 'L-1', 'L-2', ..., 'L-117', 'L-31', 'L-87'], dtype='<U5'),
 'text_left': array([[299379, 490362, 294996, ...,      0,      0,      0],
        [ 27720, 309675, 203865, ...,      0,      0,      0],
        [105424, 170437, 320625, ...,      0,      0,      0],
        ...,
        [550372, 490816, 478216, ...,      0,      0,      0],
        [346282, 367283, 451831, ...,      0,      0,      0],
        [393456, 511072,  80487, ...,      0,      0,      0]]),
 'length_left': array([11,  4,  3, ...,  4,  7,  4]),
 'id_right': array(['R-0', 'R-1', 'R-2', ..., 'R-121631', 'R-121632', 'R-121633'],
       dtype='<U8'),
 'text_right': array([[ 99882, 408547, 242273, ...,      0,      0,      0],
        [ 90299, 495356, 148956, ...,  75093,      0,      0],
        [464449, 391651,  82043, ..., 170437, 280762,  96984],
        ...,
        [319298, 188291, 524349, ..., 498453, 177608,      0],
        [ 88915, 261839, 562739, ..., 132953, 467108,  89296],
        [

In [113]:
test_x['text_right'].shape

(127632, 30)

In [111]:
test_y.shape

(127632, 1)

In [103]:
#test_predict = model.predict(test_x)

In [112]:
test_processed.left.loc['L-0']['text_left']

[299379,
 490362,
 294996,
 167647,
 514817,
 436877,
 146667,
 181003,
 390819,
 514817,
 1,
 0,
 0,
 0,
 0]

In [177]:
def run_model_with_topK(model, test_x, test_predict, test_processed, \
                        preprocessor, topk, doc_score_file, word_score_file):
    
    
    qid_did_score = {}
    vocab_unit = preprocessor.context['vocab_unit']
    for left_id, right_id, score in zip(test_x['id_left'], test_x['id_right'], test_predict):
        if left_id not in qid_did_score:
            qid_did_score[left_id] = {}
        qid_did_score[left_id][right_id] = score
    
    # get top 100
    doc_score_rows = []
    word_score_rows = []
    
    query_text_mapping = {}
    doc_text_mapping = {}
    
    for qid, dscore in qid_did_score.items():
        top_docs = sorted(dscore.items(), key = lambda x: x[1], reverse=True)
        qtext = ' '.join([vocab_unit.state['index_term'][i] for i in \
                          test_processed.left.loc[qid]['text_left']])
        query_text_mapping[qid] = qtext
        covered_tokens = {}
        for sentry in top_docs[:100]:
            doc_token_id = test_processed.right.loc[sentry[0]]['text_right']
            dtext = ' '.join([vocab_unit.state['index_term'][i] for i in doc_token_id])
            doc_text_mapping[sentry[0]] = dtext
            wid_left=[]
            wid_right=[]
            wtext_left=[]
            wtext_right=[]
            
            tid = 0
            right_len = len(doc_token_id)
            for token_id in doc_token_id:
                if token_id not in covered_tokens:
                    wid_left.append(qid)
                    wtext_left.append(test_processed.left.loc[qid]['text_left'])
                    wid_right.append('R-'+str(tid))
                    right_vector = np.zeros(right_len)
                    right_vector[0] = token_id
                    wtext_right.append(right_vector)
                    tid+=1
                    covered_tokens[token_id] = 0.0
                covered_tokens[token_id] += 1.0
            
            if len(wtext_right) > 0:
                word_test_object = {}
                word_test_object['text_left'] = np.array(wtext_left)
                word_test_object['text_right'] = np.array(wtext_right)
                #word_test_object['id_left'] = np.array(word_test_object['id_left'])
                #word_test_object['id_right'] = np.array(word_test_object['id_right'])
                
                wpred_scores = model.predict(word_test_object)
                for token_vector, score in zip(word_test_object['text_right'], wpred_scores):
                    word = vocab_unit.state['index_term'][token_vector[0]]
                    
                    if word!='<PAD>':
                        word_score_rows.append({'qid': qid, 'pid': sentry[0], 'word': word,\
                                            'wscore': score[0]})
            
            doc_score_rows.append({'qid': qid, 'pid': sentry[0], 'score': sentry[1][0]})
            
    pd.DataFrame(doc_score_rows).to_csv(doc_score_file,header=True, sep='\t', index=False)
    pd.DataFrame(word_score_rows).to_csv(word_score_file,header=True, sep='\t', index=False)
    del(doc_score_rows)
    
    
        
    qfile = open('query_id_mapping.csv','w')
    for qid, qtext in query_text_mapping.items():
        qfile.write(qid+'\t'+qtext+'\n')
    qfile.close()
    
    dfile = open('doc_id_mapping.csv','w')
    for did, dtext in doc_text_mapping.items():
        dfile.write(did+'\t'+dtext+'\n')
    dfile.close()
    

    # qid, query text, doc id , doc score
    

In [178]:
run_model_with_topK(model, test_x, test_predict, test_processed, preprocessor, \
                    100, 'drmm_doc_scores_top100.csv','drmm_word_scores_top100.csv')

* Remove the queries from test that have less than X documents. 
* MRR
* Add attention to the first layer. (use the queries to explain)
* Check attention after histogram layer. (use the queries to explain)
* unit of explanation: word/histogram 