In [65]:
%pwd

'/home/heeh/Projects/da'

### Filtering I (BM25 Retrieval)

In [102]:
import sys
import json
import requests
################################################################################
# Filtering
################################################################################
pretty = lambda x : json.dumps(x, indent=2, sort_keys=True)
solr_select = 'http://localhost:8983/solr/depcc/select?q='
train_path  = 'data/02-acl-arc/train.jsonl'
with open(train_path, 'r') as train_file:
    json_lines = []
    lines = train_file.readlines()
    for line in lines:
        j = json.loads(line)
        json_lines.append(j)
N = len(json_lines)
print(N)
j = json_lines[0]
query = j['text'].replace(' ', '+')
rp_retrieval = requests.get(solr_select + query).json()
cc_docs = (rp_retrieval['response']['docs'])
print('Number of retrieved documents: %d' % len(cc_docs))
cc_doc0 = json.loads(cc_docs[0]['_src_'])
#print(cc_doc0)
################################################################################

1688
Number of retrieved documents: 10


### Segmentation (Documents -> Passages)

In [67]:
################################################################################
# Segmentation
################################################################################
val_url = cc_doc0['url']
print(val_url)
val_s3 = cc_doc0['s3']
print(val_s3)
cc_words = cc_doc0['text'].split()
cc_psgs = []
num_psgs = len(cc_words) // 100
print('Number of passages in document 0: %d ' % num_psgs)
for i in range(num_psgs):
    start = i * 100
    end   = -1 if (i == num_psgs-1) else (i+1) * 100
    psg = ' '.join(cc_words[start:end])
    dict = {'doc_id' : val_url, 'doc_text'  : psg,  'title': ''  }
    cc_psgs.append(dict)

http://technokoopa.deviantart.com/art/Dragoon-class-Destroyer-448332152
s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2016-07/segments/1454701156520.89/warc/CC-MAIN-20160205193916-00243-ip-10-236-182-209.ec2.internal.warc.gz
Number of passages in document 0: 54 


### Encoding

In [92]:
################################################################################
# Encoding
################################################################################

import csv
import subprocess

# Encode CC
# f_out = 'emb/cc_sample.tsv'
# with open(f_out, 'w') as output_file:
#     dw = csv.DictWriter(output_file, cc_psgs[0].keys(), delimiter='\t')
#     for psg in cc_psgs:
#         dw.writerow(psg)
# subprocess.call(['sh', 'emb/generate_embedding.sh ' + 'cc_sample'])

# Encode train set
num_train = len(json_lines)
train_psgs = []
for i in range(num_train):
    train_dict = {'doc_id': str(i), 'doc_text': json_lines[i]['text'], 'title': ''}
    train_psgs.append(train_dict)
#print(train_psgs[0])

f_train_out = 'emb/train_sample.tsv'
with open(f_train_out, 'w') as output_file:
    dw = csv.DictWriter(output_file, train_psgs[0].keys(), delimiter='\t')
    for tp in train_psgs:
        dw.writerow(tp)
subprocess.call(['sh', 'emb/generate_embedding.sh ' + 'train_sample'])

127

### Nearest Neighbors

In [1]:
################################################################################
# Nearest Neighbor (FAISS)
################################################################################
#print(emb[0])
import numpy as np
import faiss

# Read CC embeddings (DATABASE)
cc_embeddings = np.load('emb/cc_sample_0.pkl', allow_pickle=True)
print(cc_embeddings[0][1].shape)  # Dimension of the embedding
nb = len(cc_embeddings) # database size
d = cc_embeddings[0][1].size
print(nb,d)
xb = np.zeros((nb,d), dtype='float32')
for i in range(nb):
    xb[i] = cc_embeddings[i][1]
print('Number of CC passages: %d' % nb)

# Read train embeddings (QUERY)
train_embeddings = np.load('emb/train_sample_0.pkl', allow_pickle=True)
print(train_embeddings[0][1].shape)  # Dimension of the embedding
nq = len(train_embeddings[:10]) # database size
d = train_embeddings[0][1].size
print(nq,d)
xq = np.zeros((nq,d), dtype='float32')
for i in range(nq):
    xq[i] = train_embeddings[i][1]

print('Number of train passages: %d' % nq)
print(xq)


index = faiss.IndexFlatL2(d)   # build the index

print('trained? %r' % index.is_trained)
index.add(xb)                  # add vectors to the index
print('Total number of indexed CC passages: ', index.ntotal)
print()
print('Using an indentical CC set')
k = 4                          # we want to see 4 nearest neighbors
D, I = index.search(xb[:5], k) # sanity check
print('4 nearest neighbors')
print(I)
print()

print('distances(sanity check)')
print(D)
print()
print('Using the query(train set)')
D, I = index.search(xq, k)     # actual search
print('4 nearest neighbors')
print(I[:5])                   # neighbors of the 5 first queries

(768,)
54 768
Number of CC passages: 54
(768,)
10 768
Number of train passages: 10
[[-8.7804133e-01  5.2036103e-02 -6.7804471e-02 ... -5.1974082e-01
  -2.2222115e-01 -2.3715140e-02]
 [-7.5261265e-01 -1.5401670e-01  3.3758375e-01 ... -7.0269495e-01
  -6.7983367e-02  2.2842372e-03]
 [-4.3047079e-01  7.1437828e-02 -3.3563085e-02 ... -5.2040094e-01
  -7.8418620e-02  1.7523374e-01]
 ...
 [-1.9698891e-01  2.3213519e-01  3.3778602e-01 ... -4.7779170e-01
  -1.2492716e-04 -3.6839178e-01]
 [-3.2815686e-01  2.1539052e-01  2.5606854e-02 ... -3.9178729e-01
  -1.1939787e-01  2.3145039e-01]
 [-3.6438248e-01  4.4869799e-03  4.0559188e-01 ... -4.8237056e-01
  -5.3450578e-01 -1.0792198e-01]]
trained? True
Total number of indexed CC passages:  54

Using an indentical CC set
4 nearest neighbors
[[ 0 13  4 24]
 [ 1  2 13 40]
 [ 2  1 40 18]
 [ 3 40 31 18]
 [ 4  0 40 13]]

distances(sanity check)
[[ 0.       36.441696 37.077488 41.76454 ]
 [ 0.       36.791702 39.25956  39.92917 ]
 [ 0.       36.791702 39.89