In [1]:
%pwd
!pip install requests



### Filtering I (BM25 Retrieval)

In [2]:
import sys
import json
import requests
import time
start_time = time.time()
################################################################################
# Filtering
################################################################################
pretty = lambda x : json.dumps(x, indent=2, sort_keys=True)
solr_select = 'http://localhost:8983/solr/depcc-small/select?q='
#train_path  = 'data/04-hyper/train.jsonl'
train_path  = 'data/02-acl-arc/train.jsonl'
#train_path  = 'data/07-imdb/train.jsonl'
with open(train_path, 'r') as train_file:
    json_lines = []
    lines = train_file.readlines()
    for line in lines:
        j = json.loads(line)
        json_lines.append(j)
N = len(json_lines)
print(N)

# Query with training example
j = json_lines[0]
query = j['text'].replace(' ', '+')
#query = ""
#with open('data/02-acl-arc/lda_union.txt', 'r') as file:
#    query = file.read()
print(query)
#print(len(query.split()))
#sys.exit()
rp_retrieval = requests.get(solr_select + query).json()
cc_docs = (rp_retrieval['response']['docs'])
print('Number of retrieved documents: %d' % len(cc_docs))
cc_doc0 = json.loads(cc_docs[0]['_src_'])
cc_doc100 = ""
for i in range(10):
    cc_doc100 += json.loads(cc_docs[i]['_src_'])['text']
print(cc_doc100)
print("--- %s seconds ---" % (time.time() - start_time))
################################################################################


1688
Thus+,+over+the+past+few+years+,+along+with+advances+in+the+use+of+learning+and+statistical+methods+for+acquisition+of+full+parsers+(+Collins+,+1997+;+Charniak+,+1997a+;+Charniak+,+1997b+;+Ratnaparkhi+,+1997+)+,+significant+progress+has+been+made+on+the+use+of+statistical+learning+methods+to+recognize+shallow+parsing+patterns+syntactic+phrases+or+words+that+participate+in+a+syntactic+relationship+(+Church+,+1988+;+Ramshaw+and+Marcus+,+1995+;+Argamon+et+al.+,+1998+;+Cardie+and+Pierce+,+1998+;+Munoz+et+al.+,+1999+;+Punyakanok+and+Roth+,+2001+;+Buchholz+et+al.+,+1999+;+Tjong+Kim+Sang+and+Buchholz+,+2000+)+.
Number of retrieved documents: 10
Over the past fifteen years there has been significant progress in the field of statistical parsing . Much of the work has focussed on supervised methods , where by ' ' supervised ' ' we mean that the training data consists of sentences and their associated syntactic trees ( for example , Charniak 1997 , Collins 1999 , Roark and Johnson 1999 ) . T

### Segmentation by sentences(Documents -> Passages)

In [4]:
#!python -m spacy download en_core_web_sm
import spacy

nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")
doc = nlp(cc_doc100)
cc_psgs = []
psg = ''
num_tokens = 0
for sent in doc.sents:
    if num_tokens < 100:
        psg += sent.text
        num_tokens += len(sent)
    else:
#        print(num_tokens)
        cc_psgs.append({'doc_id' : '', 'doc_text'  : psg,  'title': ''  })
        num_tokens = 0
        psg = ''
#print(len(cc_psgs))

### Encoder

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
def encode(X):
    encoder = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.3)
    encoder.fit(X)
    print("Dimension: ", len(encoder.vocabulary_))
    embedding = encoder.transform(X).toarray()
    return embedding


In [2]:
import csv
train_psgs = []
cc_psgs = []
with open("emb/train_sample.tsv") as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        train_psgs.append(row[1])


with open("emb/cc_sample.tsv") as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        cc_psgs.append(row[1])


MAX_TR_PSGS = len(train_psgs)
MAX_CC_PSGS = len(cc_psgs)
print('train psg length %d' % MAX_TR_PSGS)
print('cc psg length %d' % MAX_CC_PSGS)

train psg length 1688
cc psg length 54





### Sparse Encoding on train+CC

In [18]:
emb = encode(train_psgs + cc_psgs)
print(emb.shape)
train_embeddings = emb[:1688]
cc_embeddings = emb[1688:]

print(train_embeddings.shape)
print(cc_embeddings.shape)

Dimension:  26613
(1742, 26613)
(1688, 26613)
(54, 26613)


### Nearest Neighbors

In [19]:
################################################################################
# Nearest Neighbor (FAISS)
################################################################################
#print(emb[0])
import numpy as np
import faiss

# Read CC embeddings (DATABASE)
#cc_embeddings = np.load('emb/cc_sample_0.pkl', allow_pickle=True)
nb = cc_embeddings.shape[0] # database size
d =  cc_embeddings.shape[1]
print(nb,d)
xb = np.array(cc_embeddings, dtype='float32')
print('Number of CC passages: %d' % nb)

# Read train embeddings (QUERY)
#train_embeddings = np.load('emb/train_sample_0.pkl', allow_pickle=True)
nq = 10 # query size
d = train_embeddings.shape[1]
print(nq,d)
xq = np.array(train_embeddings, dtype='float32')
print('Number of train passages: %d' % nq)



index = faiss.IndexFlatL2(d)   # build the index

print('trained? %r' % index.is_trained)
index.add(xb)                  # add vectors to the index
print('Total number of indexed CC passages: ', index.ntotal)
print()
print('Using an indentical CC set')
k = nb                          # we want to see 4 nearest neighbors
D, I = index.search(xb[:5], k) # sanity check
print('================================================================')
print('4 nearest neighbors')
print(I)
print()
print('distances(sanity check)')
print(D)
print()


print('===============================================================')
print('Using the query(train set)')
D, I = index.search(xq, k)     # actual search
print('4 nearest neighbors')
print(I[:5])                   # neighbors of the 5 first queries
print('\ndistances')
print(D)
print()



54 26613
Number of CC passages: 54
10 26613
Number of train passages: 10
trained? True
Total number of indexed CC passages:  54

Using an indentical CC set
4 nearest neighbors
[[ 0 12  4  6  2 40  5 10 23  3  9 35 24 43 31 18 49 36 14 30 22 28 38 13
   7 32 53 46 34 21 15  1 41 19 26 48 52 16 17 42 39 33 11 27 20 25  8 37
  45 50 47 51 29 44]
 [ 1  2 48  3 39 14  5 46  9 31 16 19 28 40  0 41 32 36 35 20  4  6 22 50
  27 37 53  7 24 33 23 26 47 15  8 25 30 18 34 17 38 29 21 11 45 12 51 52
  43 13 49 10 42 44]
 [ 2 41  1  0 48 39  3 53  5 49 46 12 22  4 33 28 14 32 23 18 45 36 19  9
   6 24 10 35 21 15 17 11 31 27 34 38 20 26 13 16 52 44 40 42 51 43 37 47
  50  7 25 29  8 30]
 [ 3 40 44 39 41 10  9  1  2 14 23  5 31 37  0 36  4 28 19 17 35 32 33 20
  22 42 27 13 26  7 53 24 30 12 46 18 43 16 48 11 15 52 25 45 47 38  8 34
  49 51 21  6 50 29]
 [ 4  0 40 23 11 10  3  9  2 41 13 32 36 15 12  5 19 16 24 43 26 35 31  1
  28 18 20 17 49 34 27 38 14 33 47 39  8 46 48 53 37 52 45 21 51 50  6 29


In [26]:
import textwrap
print(I.shape)
print(train_psgs[0])
print()
print('CLOSEST passages in CC:')

for i in range(4):
    print('-------------------------------------------------------------')
    print('Closest %d' % i)
    closest = I[0][i]
    print(textwrap.fill(cc_psgs[closest],80))

print('------------------------------------------------------------')
print('...')

for i in range(MAX_CC_PSGS-4, MAX_CC_PSGS):
    print('-------------------------------------------------------------')
    print('Farthest %d' % i)
    farthest = I[0][i]
    print(textwrap.fill(cc_psgs[farthest],80))


(1688, 54)
Thus , over the past few years , along with advances in the use of learning and statistical methods for acquisition of full parsers ( Collins , 1997 ; Charniak , 1997a ; Charniak , 1997b ; Ratnaparkhi , 1997 ) , significant progress has been made on the use of statistical learning methods to recognize shallow parsing patterns syntactic phrases or words that participate in a syntactic relationship ( Church , 1988 ; Ramshaw and Marcus , 1995 ; Argamon et al. , 1998 ; Cardie and Pierce , 1998 ; Munoz et al. , 1999 ; Punyakanok and Roth , 2001 ; Buchholz et al. , 1999 ; Tjong Kim Sang and Buchholz , 2000 ) .

CLOSEST passages in CC:
-------------------------------------------------------------
Closest 0
words themselves . [ 9 ] These are syntactic qualities since each of these
arguments bears a direct syntactic relation to their head as much as they hold a
semantic place within the underlying argument structure . In order to extract
this kind of subcategorization and selectional