In [1]:
from tqdm import tqdm

In [2]:
from datasets import load_dataset

ds = load_dataset("microsoft/ms_marco", "v2.1")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds.keys()

dict_keys(['validation', 'train', 'test'])

In [4]:
ds['train'].column_names

['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers']

In [5]:
ds['train'][0]

{'answers': ['The immediate impact of the success of the manhattan project was the only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.'],
 'passages': {'is_selected': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'passage_text': ['The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
   'The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science.',
   'Essay on The Manhattan Project - The Manhattan Project The Manhattan Project was to see if making an atomic bomb possible. The success of th

In [6]:
ds['train'][0]['passages']['passage_text']

['The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.',
 'The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science.',
 'Essay on The Manhattan Project - The Manhattan Project The Manhattan Project was to see if making an atomic bomb possible. The success of this project would forever change the world forever making it known that something this powerful can be manmade.',
 'The Manhattan Project was the name for a project conducted during World War II, to develop the first atomic bomb. It refers specifically to the period of the project from 194 … 2-1946 under the control of the U.S. Army Corps of Engin

In [7]:
len(ds['train'])

808731

In [8]:
train = ds['train']
valid = ds['validation']
test = ds['test']
print(len(train), len(valid), len(test))


808731 101093 101092


In [9]:
import random
#random seed
random.seed(42)

In [10]:
ds['train'][1]['passages']['url']

['https://www.justice.gov/ovw/file/926101/download',
 'https://quizlet.com/1128245/criminal-justice-exam-1-flash-cards/',
 'http://restorativejustice.org/restorative-justice/about-restorative-justice/tutorial-intro-to-restorative-justice/',
 'https://www.ojjdp.gov/pubs/implementing/accountability.html',
 'http://www.westerncriminology.org/documents/WCR/v01n1/Umbreit/Umbreit.html',
 'https://www.sciencedirect.com/science/article/pii/B9781455731398000030',
 'https://en.wikipedia.org/wiki/Restorative_justice',
 'http://www.adrac.org.au/adr-mapping/criminal-justice-and-adr',
 'https://www.mediate.com/articles/kirschnersbl20180126.cfm',
 'https://www.sciencedirect.com/science/article/pii/B978145572599100014X']

In [45]:
train_triples = []
for i in tqdm(range(0,len(train)//10)):
    query = ds['train'][i]['query']
    for k, passage in enumerate(ds['train'][i]['passages']['passage_text']):
        # print(passage)
        sample = {}
        sample['query'] = query
        sample['positive'] = passage
        sample['positive_url'] = ds['train'][i]['passages']['url'][k]
        while True:
            random_ind = random.randint(0, len(ds['train'])-1)
            if random_ind != i:
                break

        negatives = ds['train'][random_ind]['passages']['passage_text']
        #make random selection of these passages
        sample['negative'] = random.choice(negatives)
        sample['negative_url'] = ds['train'][random_ind]['passages']['url'][negatives.index(sample['negative'])]
        train_triples.append(sample)

100%|██████████| 80873/80873 [02:15<00:00, 596.69it/s]


In [46]:
#save the train triples
import json
json.dump(train_triples, open('train_triples_0_10.json', 'w'))

In [47]:
json.dump(train_triples[:5], open('train_triples_sample.json', 'w'))

In [None]:
#validation triples
valid_triples = []
for i in tqdm(range(len(valid)//3)):
    query = ds['validation'][i]['query']
    for k, passage in enumerate(ds['validation'][i]['passages']['passage_text']):
        # print(passage)
        sample = {}
        sample['query'] = query
        sample['positive'] = passage
        sample['positive_url'] = ds['validation'][i]['passages']['url'][k]
        while True:
            random_ind = random.randint(0, len(ds['validation'])-1)
            if random_ind != i:
                break

        negatives = ds['validation'][random_ind]['passages']['passage_text']
        #make random selection of these passages
        sample['negative'] = random.choice(negatives)
        sample['negative_url'] = ds['validation'][random_ind]['passages']['url'][negatives.index(sample['negative'])]
        valid_triples.append(sample)

## Make the vocab

In [11]:
from utils import tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yuliagoryachev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
tokenize(")what was the immediate impact of the success of the manhattan project?", {})

['immediate', 'impact', 'success', 'manhattan', 'project']

In [13]:
queries_t = []
for i in tqdm(range(0,len(train))):
    queries_t.append(train[i]['query'])

for i in tqdm(range(0,len(valid))):
    queries_t.append(valid[i]['query'])

for i in tqdm(range(0,len(test))):
    queries_t.append(test[i]['query'])

queries_t = list(set(queries_t))
queries_t = ' '.join(queries_t)
queries_words = tokenize(queries_t, {})

100%|██████████| 808731/808731 [00:40<00:00, 20187.79it/s]
100%|██████████| 101093/101093 [00:04<00:00, 20473.30it/s]
100%|██████████| 101092/101092 [00:04<00:00, 21340.94it/s]


In [17]:
#dump all words to a file
import json
json.dump(list(set(queries_words)), open('queries_words.json', 'w'))

In [18]:
len(set(queries_words))

152407

In [19]:
passages_words = []
for i in tqdm(range(0,len(train))):
    passages = ' '.join((train[i]['passages']['passage_text']))
    w = tokenize(passages, {})
    passages_words.extend(w)

for i in tqdm(range(0,len(valid))):
    passages = ' '.join((train[i]['passages']['passage_text']))
    w = tokenize(passages, {})
    passages_words.extend(w)

for i in tqdm(range(0,len(test))):
    passages = ' '.join((train[i]['passages']['passage_text']))
    w = tokenize(passages, {})
    passages_words.extend(w)

  0%|          | 0/808731 [00:00<?, ?it/s]

100%|██████████| 808731/808731 [07:11<00:00, 1873.66it/s]
100%|██████████| 101093/101093 [00:53<00:00, 1882.03it/s]
100%|██████████| 101092/101092 [00:51<00:00, 1956.04it/s]


In [20]:
#dump all words to a file
json.dump(list(set(passages_words)), open('passages_words.json', 'w'))

In [2]:
import json

qwords = json.load(open('queries_words.json'))

In [3]:
pwords = json.load(open('passages_words.json'))

In [4]:
len(pwords)

1134700

In [5]:
all_words = list(set(qwords + pwords))

In [6]:
print(len(all_words))

1149755


In [7]:
import gensim.downloader as api

model = api.load("word2vec-google-news-300")

In [8]:
word_to_ids = {v: i+1 for i,v in enumerate(all_words)}
word_to_ids['<unk>'] = 0

In [9]:
idx_to_word = {v: k for k, v in word_to_ids.items()}

In [10]:
import numpy as np
embeds = {}
for v, ind in word_to_ids.items():
    if v in model:
        embeds[ind] = model[v]
    else:
        embeds[ind] = np.zeros(300)

In [11]:
#save all
import joblib

joblib.dump(embeds, 'embeds.pkl')
joblib.dump(word_to_ids, 'word_to_ids.pkl')
joblib.dump(idx_to_word, 'idx_to_word.pkl')

['idx_to_word.pkl']