In [40]:
import matchzoo as mz
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


##### Fetch the dataset

In [3]:
!wget https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv

--2019-11-07 16:45:28--  https://msmarco.blob.core.windows.net/msmarcoranking/qrels.train.tsv
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 40.112.152.16
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10589532 (10M) [text/tab-separated-values]
Saving to: ‘qrels.train.tsv’


2019-11-07 16:45:29 (18.8 MB/s) - ‘qrels.train.tsv’ saved [10589532/10589532]



In [4]:
!wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz

--2019-11-07 16:45:48--  https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 40.112.152.16
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1057717952 (1009M) [application/gzip]
Saving to: ‘collectionandqueries.tar.gz’


2019-11-07 16:46:46 (17.6 MB/s) - ‘collectionandqueries.tar.gz’ saved [1057717952/1057717952]



In [5]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2019-11-07 17:02:20--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2019-11-07 17:02:20--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2019-11-07 17:02:20--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2019-1

In [6]:
##### Load the embeddings
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [8]:
import matchzoo as mz
import pandas as pd

from sklearn.utils import shuffle
print(mz.__version__)
from scipy.spatial.distance import cosine
from matchzoo.metrics import Precision


2.2.0


##### Define functions and metrics


In [9]:
"""Precision for ranking."""
import numpy as np

from matchzoo.engine.base_metric import BaseMetric, sort_and_couple


class PrecisionCheck(BaseMetric):
    """Precision metric."""

    ALIAS = 'precision'

    def __init__(self, k: int = 1, threshold: float = 0.):
        """
        :class:`PrecisionMetric` constructor.
        :param k: Number of results to consider.
        :param threshold: the label threshold of relevance degree.
        """
        self._k = k
        self._threshold = threshold

    def __repr__(self) -> str:
        """:return: Formated string representation of the metric."""
        return f"{self.ALIAS}@{self._k}({self._threshold})"

    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        #print('number of results', y_true.shape)
        """
        Calculate precision@k.
        Example:
            >>> y_true = [0, 0, 0, 1]
            >>> y_pred = [0.2, 0.4, 0.3, 0.1]
            >>> Precision(k=1)(y_true, y_pred)
            0.0
            >>> Precision(k=2)(y_true, y_pred)
            0.0
            >>> Precision(k=4)(y_true, y_pred)
            0.25
            >>> Precision(k=5)(y_true, y_pred)
            0.2
        :param y_true: The ground true label of each document.
        :param y_pred: The predicted scores of each document.
        :return: Precision @ k
        :raises: ValueError: len(r) must be >= k.
        """
        if self._k <= 0:
            raise ValueError(f"k must be greater than 0."
                             f"{self._k} received.")
        # sorted list of pairs.
        coupled_pair = sort_and_couple(y_true, y_pred)
        
        precision = 0.0
        
        for idx, (label, score) in enumerate(coupled_pair):
            
            #if idx < 10:
            #    print('[',label, score, ']', )
            if idx >= self._k:
                break
            if label > self._threshold:
                precision += 1.
        return precision / self._k
    
    
"""Recall for ranking."""
import numpy as np

from matchzoo.engine.base_metric import BaseMetric, sort_and_couple


class RecallCheck(BaseMetric):
    """Recall metric."""

    ALIAS = 'recall'

    def __init__(self, k: int = 1, threshold: float = 0.):
        """
        :class:`RecallMetric` constructor.
        :param k: Number of results to consider.
        :param threshold: the label threshold of relevance degree.
        """
        self._k = k
        self._threshold = threshold

    def __repr__(self) -> str:
        """:return: Formated string representation of the metric."""
        return f"{self.ALIAS}@{self._k}({self._threshold})"

    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        if self._k == 5:
            print('number of results', y_true.shape, (y_true > self._threshold ).sum())
        """
        Calculate Recall@k.
        Example:
            >>> y_true = [0, 0, 0, 1]
            >>> y_pred = [0.2, 0.4, 0.3, 0.1]
            >>> Recall(k=1)(y_true, y_pred)
            0.0
            >>> Recall(k=2)(y_true, y_pred)
            0.0
            >>> Recall(k=4)(y_true, y_pred)
            1.0
            >>> Recall(k=5)(y_true, y_pred)
            1.0
        :param y_true: The ground true label of each document.
        :param y_pred: The predicted scores of each document.
        :return: Recall @ k
        :raises: ValueError: len(r) must be >= k.
        """
        if self._k <= 0:
            raise ValueError(f"k must be greater than 0."
                             f"{self._k} received.")
        # sorted list of pairs.
        coupled_pair = sort_and_couple(y_true, y_pred)
        
        recall = 0.0
        rel_docs = (y_true > self._threshold ).sum() * 1.0
        for idx, (label, score) in enumerate(coupled_pair):
            
            if idx < 5 and self._k == 5:
                print('[',label, score, ']', )
            if idx >= self._k:
                break
            if label > self._threshold:
                recall += 1.
        return recall / rel_docs

In [10]:

"""Matchzoo toolkit for token embedding."""

import csv
import typing

import numpy as np
import pandas as pd


class Embedding(object):
    """
    Embedding class.
    Examples::
        >>> import matchzoo as mz
        >>> train_raw = mz.datasets.toy.load_data()
        >>> pp = mz.preprocessors.NaivePreprocessor()
        >>> train = pp.fit_transform(train_raw, verbose=0)
        >>> vocab_unit = mz.build_vocab_unit(train, verbose=0)
        >>> term_index = vocab_unit.state['term_index']
        >>> embed_path = mz.datasets.embeddings.EMBED_RANK
    To load from a file:
        >>> embedding = mz.embedding.load_from_file(embed_path)
        >>> matrix = embedding.build_matrix(term_index)
        >>> matrix.shape[0] == len(term_index) + 1
        True
    To build your own:
        >>> data = pd.DataFrame(data=[[0, 1], [2, 3]], index=['A', 'B'])
        >>> embedding = mz.Embedding(data)
        >>> matrix = embedding.build_matrix({'A': 2, 'B': 1})
        >>> matrix.shape == (3, 2)
        True
    """

    def __init__(self, data: pd.DataFrame):
        """
        Embedding.
        :param data: DataFrame to use as term to vector mapping.
        """
        self._data = data

    @property
    def input_dim(self) -> int:
        """:return Embedding input dimension."""
        return self._data.shape[0]

    @property
    def output_dim(self) -> int:
        """:return Embedding output dimension."""
        return self._data.shape[1]

    def build_matrix(
        self,
        term_index: typing.Union[
            dict, mz.preprocessors.units.Vocabulary.TermIndex],
        initializer=lambda: np.random.uniform(-0.2, 0.2)
    ) -> np.ndarray:
        """
        Build a matrix using `term_index`.
        :param term_index: A `dict` or `TermIndex` to build with.
        :param initializer: A callable that returns a default value for missing
            terms in data. (default: a random uniform distribution in range)
            `(-0.2, 0.2)`).
        :return: A matrix.
        """
        input_dim = len(term_index) + 1
        
        print('Embedding to matrix, input & output', input_dim, self.output_dim)
        
        matrix = np.empty((input_dim, self.output_dim))
        for index in np.ndindex(*matrix.shape):
            matrix[index] = initializer()

        #valid_keys = set(self._data.index)
        terms_to_find = term_index.keys()
        valid_data_frame = self._data[self._data.index[-len(self._data):].isin(terms_to_find)]
        
        nf_count = 0
        total = 0
        
        for term, values in valid_data_frame.iterrows():
            if total % 50000==0:
                print('Words completed', total)
            total +=1
                
            try:
                matrix[term_index[term]] = values
            except Exception as ex:
                nf_count+=1
                print('Word not found', term)
                    
        print('Words not found in embedding ', nf_count)
        
        return matrix


def load_from_file(file_path: str, mode: str = 'word2vec') -> Embedding:
    """
    Load embedding from `file_path`.
    :param file_path: Path to file.
    :param mode: Embedding file format mode, one of 'word2vec' or 'glove'.
        (default: 'word2vec')
    :return: An :class:`matchzoo.embedding.Embedding` instance.
    """
    if mode == 'word2vec':
        data = pd.read_csv(file_path,
                           sep=" ",
                           index_col=0,
                           header=None,
                           skiprows=1)
        print(data.head())
    elif mode == 'glove':
        data = pd.read_csv(file_path,
                           sep=" ",
                           index_col=0,
                           header=None,
                           quoting=csv.QUOTE_NONE)
    else:
        raise TypeError(f"{mode} is not a supported embedding type."
                        f"`word2vec` or `glove` expected.")
    return Embedding(data)





In [15]:
import ast
slist = ast.literal_eval(open('msmarco_data/long_stop_words','r').read())

stop_words = list( set(['are', 'get','you','dont','want','take','have','need','let','your','their','theirs',\
              'still','these','that','could','should', 'would', 'with','does','this','used','make',\
              'makes','made','takes','take','those','when','without','more','becasue','there','aren',\
              'keep','seem','seems','wont','shouldn','shouldnt','only','than','know','every','also',\
              'brand','become','most','other','others','meant','thing','things','happens','anything',\
              'gets','sets','both','bring','then','goes','some','someone','see','article','redirect',\
              'sent','into','about','what','where','give','going','like','look','looks','having','other',\
              'gives','give','given','uses','used','through','though','very','doesn','many','even','mine',\
              'myself','always', 'self','currently','along','else','comes','come','came','likes','like',\
              'because','can','the', 'an', 'to', 'and', 'from', 'for', 'we', 'you', 'i', 'so','such',\
               'a', 'at', 'b', 'be', 'in', 'of', 'on', 'was', 'is','been','while','will','they','them']) | \
                set(slist))
print(len(stop_words))

193


In [20]:
! tar -xvzf msmarco_data/collectionandqueries.tar.gz

collection.tsv
qrels.dev.small.tsv
qrels.train.tsv
queries.dev.small.tsv
queries.dev.tsv
queries.eval.small.tsv
queries.eval.tsv
queries.train.tsv


In [21]:
! head -n 1 collection.tsv

0	The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.


In [25]:
!head -n 5 qrels.train.tsv

1185869	0	0	1
1185868	0	16	1
597651	0	49	1
403613	0	60	1
1183785	0	389	1


In [26]:
! head -n 5 queries.train.tsv

121352	define extreme
634306	what does chattel mean on credit history
920825	what was the great leap forward brainly
510633	tattoo fixers how much does it cost
737889	what is decentralization process.


##### Define paths

In [34]:
path='/home/manishav/irexplain/src/main/python/deep_explain/msmarco_data/'
collection_path = path+ 'collection/'
embedding_path =  path+ 'embedding/'

#### Load queries

In [58]:
import re
query_list = []
match_words = r'define|what|when'

for line in open(collection_path+'queries.train.tsv','r'):
    split = line.strip().split('\t')
    
    match = re.search(match_words,split[1])
    if not match:
        query_list.append({'qid':split[0], 'query': split[1].strip()})

In [59]:
query_frame = pd.DataFrame(query_list)

In [60]:
query_frame.head()

Unnamed: 0,qid,query
0,510633,tattoo fixers how much does it cost
1,278900,how many cars enter the la jolla concours d' e...
2,303205,how much can i contribute to nondeductible ira
3,492875,sanitizer temperature
4,54528,blood clots in urine after menopause


In [61]:
query_frame.shape

(478643, 2)

#### load qrels

In [47]:
qrels = []
for line in open(collection_path+'qrels.train.tsv','r'):
    split = line.strip().split('\t')
    qrels.append({'qid':split[0], 'pid': split[2], 'rel':split[3] })
qrel_frame = pd.DataFrame(qrels) 

In [48]:
qrel_frame.head()

Unnamed: 0,qid,pid,rel
0,1185869,0,1
1,1185868,16,1
2,597651,49,1
3,403613,60,1
4,1183785,389,1


In [50]:
qid_pid_rel_frame = pd.merge(query_frame, qrel_frame, left_on='qid', right_on='qid', suffixes=('', '_y'))

In [52]:
qid_pid_rel_frame.shape

(295737, 4)

In [55]:
qid_pid_rel_frame.columns

Index(['qid', 'query', 'pid', 'rel'], dtype='object')

In [56]:
qrel_frame[qrel_frame['qid'].isin(query_frame['qid'].tolist())].shape

(295737, 3)

##### Load documents 

In [62]:
pid_set = set(qid_pid_rel_frame['pid'].tolist())

paragraphs = []
for line in open(collection_path+'collection.tsv','r'):
    split = line.strip().split('\t')
    if split[0] in pid_set:
        paragraphs.append({'pid': split[0],\
                           'paragraph': split[1].strip()})
        
para_frame = pd.DataFrame(paragraphs) 

In [63]:
para_frame.shape

(288274, 2)

In [64]:
qid_pid_para_rel_frame = pd.merge(qid_pid_rel_frame, para_frame, left_on='pid', right_on='pid', suffixes=('', '_y'))

In [65]:
qid_pid_para_rel_frame.head()

Unnamed: 0,qid,query,pid,rel,paragraph
0,510633,tattoo fixers how much does it cost,1879754,1,1 Most tattoo artists charge an hourly rate th...
1,303205,how much can i contribute to nondeductible ira,6487240,1,Nondeductible IRA Contributions. For a traditi...
2,492875,sanitizer temperature,1147449,1,Chlorine sanitizing solutions should be at a m...
3,54528,blood clots in urine after menopause,2984158,1,WebMD Symptom Checker helps you find the most ...
4,507001,symptoms of an enlarged heart in dogs,556790,1,A canine enlarged heart is a literal stage of ...


In [66]:
del(para_frame)

In [70]:
qid_pid_para_rel_frame[qid_pid_para_rel_frame['rel'] == '1'].shape

(295737, 5)

#### Prepare train and test_frame for the model training

In [80]:
from sklearn.utils import shuffle

final_train_frame = shuffle(qid_pid_para_rel_frame[['query','paragraph','rel']].sample(50000))

In [83]:
final_train_frame = final_train_frame.reset_index(drop=True)
final_train_frame.columns=['text_left','text_right','label']

In [84]:
final_train_frame.head()

Unnamed: 0,text_left,text_right,label
0,how long does a dollar bill remain in circulation,The study cited a comparison between the lifes...,1
1,where are oil refineries located,"Currently, the world's largest oil refinery is...",1
2,who plays caleb brewster in turn,"He is joined by Jamie Bell, who plays the role...",1
3,weather in makkah,The following are extreme weather events in Me...,1
4,is project manager salary,The average salary for project manager jobs is...,1


In [85]:
### sample queries for test (should not overlap with train)
train_query_list = final_train_frame['text_left'].tolist()
final_test_frame = shuffle(qid_pid_para_rel_frame[~qid_pid_para_rel_frame['query'].isin(train_query_list)]\
                            [['query','paragraph','rel']].sample(5000))

In [87]:
final_test_frame = final_test_frame.reset_index(drop=True)
final_test_frame.columns=['text_left','text_right','label']
final_test_frame.head()

Unnamed: 0,text_left,text_right,label
0,menopause dryness symptoms,Women who need relief from other significant s...,1
1,how much does brain mri mra,Average Cost of Brain MRI The average cost of ...,1
2,windows installed cost,The cost to Install Replacement Windows starts...,1
3,distance raleigh to greensboro,"Driving distance from Raleigh, NC to Greensbor...",1
4,how long does it take for my package to arrive...,"Using First Class Mail International, deliveri...",1


In [88]:
train_raw =  mz.pack(final_train_frame)# mz.datasets.toy.load_data(stage='train', task=task)
test_raw =  mz.pack(final_test_frame)#mz.datasets.toy.load_data(stage='test', task=task)

In [89]:
train_raw.left.head()

Unnamed: 0_level_0,text_left
id_left,Unnamed: 1_level_1
L-0,how long does a dollar bill remain in circulation
L-1,where are oil refineries located
L-2,who plays caleb brewster in turn
L-3,weather in makkah
L-4,is project manager salary


#### Preprocess the text

In [99]:
preprocessor = mz.preprocessors.BasicPreprocessor(fixed_length_left=15,
             fixed_length_right=30,
             filter_mode='idf',
             filter_low_freq=2,
             filter_high_freq=1000, remove_stop_words=True)


In [None]:
preprocessor.fit(train_raw)

In [101]:
len(preprocessor.context['vocab_unit'].state['term_index'])

123332

In [102]:
train_processed = preprocessor.transform(train_raw,verbose=0)
test_processed = preprocessor.transform(test_raw,verbose=0)

In [103]:
vocab_unit = preprocessor.context['vocab_unit']
print('Orig Text:', train_processed.left.loc['L-0']['text_left'])
sequence = train_processed.left.loc['L-0']['text_left']
print('Transformed Indices:', sequence)
print('Transformed Indices Meaning:',
      '_'.join([vocab_unit.state['index_term'][i] for i in sequence]))

Orig Text: [64792, 50115, 114579, 80830, 68510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Transformed Indices: [64792, 50115, 114579, 80830, 68510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Transformed Indices Meaning: long_dollar_bill_remain_circulation_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>_<PAD>


##### Define the ranking task

In [116]:

task = mz.tasks.Ranking()
task.metrics = [PrecisionCheck(k=3, threshold=0),
    PrecisionCheck(k=5, threshold=0),
    RecallCheck(k=3, threshold=0),
    RecallCheck(k=5, threshold=0),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=3, threshold=0),
    mz.metrics.NormalizedDiscountedCumulativeGain(k=5, threshold=0),
]
print(task)

Ranking Task


In [110]:
pred_x, pred_y = test_processed[:].unpack()
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=pred_x, y=pred_y, batch_size=len(pred_x))

In [None]:
evaluate

In [114]:
! ls {embedding_path}

glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip
glove.6B.200d.txt  glove.6B.50d.txt


In [115]:
#train_generator = mz.PairDataGenerator(train_processed, num_dup=1, num_neg=4, batch_size=32, shuffle=True)
#len(train_generator)
word_embeddings = load_from_file(embedding_path+'glove.6B.50d.txt', mode= 'glove')

In [None]:
#history = model.fit_generator(train_generator, epochs=20, callbacks=[evaluate], workers=5, use_multiprocessing=False)

In [119]:
model = mz.models.DRMMTKS()
model.params['task'] = task
model.params['embedding_input_dim'] =  len(preprocessor.context['vocab_unit'].state['term_index']) +1 
model.params['embedding_output_dim'] = word_embeddings.output_dim
#model.params['embedding_output_dim'] = 11

model.params['top_k'] = 10
model.params['mlp_num_layers'] = num_layers
model.params['mlp_num_units'] = 20
model.params['mlp_num_fan_out'] = 10
model.params['mlp_activation_func'] = 'tanh'
model.params['optimizer'] = 'adadelta'
#model.params['mlp_num_units'] = mlp_units

model.params['embedding_trainable'] = True

model.guess_and_fill_missing_params(verbose=1)

model.params.update(preprocessor.context)
print('Model param',model.params)

model.build()
#model.load_embedding_matrix(embedding_matrix)
model.compile()
print('Model complete',model.params.completed())
model.backend.summary()

Model param model_class                   <class 'matchzoo.models.drmmtks.DRMMTKS'>
input_shapes                  [(15,), (30,)]
task                          Ranking Task
optimizer                     adadelta
with_embedding                True
embedding_input_dim           123332
embedding_output_dim          50
embedding_trainable           True
with_multi_layer_perceptron   True
mlp_num_units                 20
mlp_num_layers                2
mlp_num_fan_out               10
mlp_activation_func           tanh
mask_value                    -1
top_k                         10
Model complete True
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_left (InputLayer)          (None, 15)           0                                            
_________________________________________________________________________________

In [120]:
x, y = train_processed.unpack()
test_x, test_y = test_processed.unpack()

In [121]:
model.fit(x, y, batch_size=1000, epochs=7)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.callbacks.History at 0x7f15474d1748>

In [123]:
model.evaluate(x, y)

TypeError: '>' not supported between instances of 'numpy.str_' and 'int'

In [None]:
!wget https://msmarco.blob.core.windows.net/msmarcoranking/qidpidtriples.train.full.tar.gz

--2019-11-07 20:23:09--  https://msmarco.blob.core.windows.net/msmarcoranking/qidpidtriples.train.full.tar.gz
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 40.112.152.16
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|40.112.152.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2633557579 (2.5G) [application/octet-stream]
Saving to: ‘qidpidtriples.train.full.tar.gz’
