-
Notifications
You must be signed in to change notification settings - Fork 9
/
TrecIndexing.py
51 lines (40 loc) · 1.46 KB
/
TrecIndexing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from elasticsearch import Elasticsearch
import pandas as pd
import os
class TrecIndexing:
    """Index article files into Elasticsearch and read articles from a collection.

    ``doIndex`` walks a data directory and sends each file to the ``bm25``
    index; ``getDocument`` looks one article up by PMC id in ``self.coll``
    and returns its title, abstract and body text.
    """

    def __init__(self, coll=None, host='localhost', port=9200):
        """Create the Elasticsearch client and remember the article collection.

        Args:
            coll: Optional collection object used by ``getDocument``. It must
                offer ``find(query)`` returning an iterator of documents
                (presumably a MongoDB collection -- not shown in this file,
                confirm against the caller).
            host: Elasticsearch host name; defaults to the original
                hard-coded ``'localhost'``.
            port: Elasticsearch port; defaults to the original ``9200``.
        """
        self.es = Elasticsearch([{'host': host, 'port': port}])
        # getDocument() reads self.coll, but the constructor never assigned
        # it, so the attribute is now always present (None until provided).
        self.coll = coll

    @staticmethod
    def _joinParagraphs(sections):
        """Concatenate each section's 'paragraphs' value, each preceded by a newline."""
        return ''.join(
            '\n' + entry['paragraphs']
            for entry in sections
            if 'paragraphs' in entry
        )

    def getDocument(self, pmcid):
        """Fetch one article by PMC id and return ``(title, abstract, body)``.

        Args:
            pmcid: Article identifier; it is converted with ``str()`` before
                being matched against the ``articleMeta.pmcid`` field.

        Returns:
            A 3-tuple ``(title, abstract, body)``. ``abstract`` and ``body``
            are built from the ``'paragraphs'`` value of every section that
            has one, each prefixed with a newline; ``abstract`` is ``""`` when
            ``abstractText`` has no ``'sectionList'`` key.

        Raises:
            AttributeError: if no collection has been configured.
            StopIteration: if the query yields no document.
        """
        coll = getattr(self, 'coll', None)
        if coll is None:
            raise AttributeError(
                "TrecIndexing.coll is not set; pass coll=... to the "
                "constructor (or assign it) before calling getDocument()"
            )

        # Only the first match is used.
        res = next(coll.find({"articleMeta.pmcid": str(pmcid)}))
        meta = res['articleMeta']
        content = res['articleContent']

        title = meta['title']

        abstract = ""
        if 'sectionList' in meta['abstractText']:
            abstract = self._joinParagraphs(meta['abstractText']['sectionList'])

        # NOTE(review): assumes every 'paragraphs' value is a string -- a
        # list would raise TypeError here; confirm against the stored schema.
        body = self._joinParagraphs(content['sectionList'])
        return (title, abstract, body)

    def doIndex(self, data_dir='data'):
        """Index the files of the first sub-directory of ``data_dir``.

        Each file is sent to the ``bm25`` index as an ``article`` whose id is
        the file name up to its first ``'.'`` and whose body is
        ``{'pmcid': <id>, 'document': <file contents>}``. One progress line
        ``<created> <position> / <file count>`` is printed per file.

        Args:
            data_dir: Directory whose sub-directories hold the article files;
                defaults to the original hard-coded ``'data'``.
        """
        for d in os.listdir(data_dir):
            subdir = os.path.join(data_dir, d)
            if not os.path.isdir(subdir):
                # Stray files at the top level are skipped instead of
                # crashing the nested listdir().
                continue

            # List the directory once and keep regular files only.
            names = [
                f for f in os.listdir(subdir)
                if os.path.isfile(os.path.join(subdir, f))
            ]
            total = len(names)

            for i, f in enumerate(names):
                # The context manager closes the handle even if read() fails.
                with open(os.path.join(subdir, f)) as handle:
                    document = handle.read()
                pmcid = f.split('.')[0]
                docin = {'pmcid': pmcid,
                         'document': document
                         }
                res = self.es.index(index='bm25', doc_type='article', id=pmcid, body=docin)
                # Single formatted string: prints identically on Python 2 and 3,
                # and reports the file count rather than the whole listing.
                print('%s %d / %d' % (res['created'], i, total))

            # NOTE(review): stops after the first sub-directory. The original
            # indentation of this `break` was not recoverable; it is kept at
            # outer-loop level because the per-file counter only makes sense
            # if the inner loop runs to completion. Confirm the intent.
            break
# Script driver: build the indexer (its constructor creates the Elasticsearch
# client) and run the indexing pass straight away.
# NOTE: these statements sit at module level with no `__name__` guard, so they
# also execute whenever this file is imported.
t = TrecIndexing()
t.doIndex()