-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractdblppapers.py
84 lines (68 loc) · 2.27 KB
/
extractdblppapers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import xml.sax
import msgpack
# Names of the record sub-elements whose text content is collected.
elems = {"title", "author", "year", "booktitle", "journal", "crossref"}


class ABContentHandler(xml.sax.ContentHandler):
    """SAX handler that collects publication records from a DBLP XML file.

    Element depth drives the state machine: depth 1 is the document root,
    depth 2 is one publication record, and depth 3 is a field of that
    record.  Only fields named in ``elems`` are captured.

    A record is kept when it has at least one author, a title and a year
    with ``min_year <= year < max_year``.  ``article`` records carrying a
    non-empty ``publtype`` attribute are ignored.

    Args:
        min_year: First publication year to keep (inclusive).
        max_year: Year at which to stop keeping records (exclusive).

    Attributes:
        all: List of ``(authors, title, venue, year)`` tuples, where
            ``authors`` is a list of strings, ``venue`` is the first
            ``booktitle`` (or else ``journal``) value or ``None``, and
            ``year`` is an int.
        count: Number of records appended to ``all`` so far.
    """

    def __init__(self, min_year=2008, max_year=2015):
        xml.sax.ContentHandler.__init__(self)
        self.min_year = min_year
        self.max_year = max_year
        self.depth = 0          # current element nesting depth
        self.structure = None   # fields of the record being read, or None
        self.tag = None         # name of the depth-3 field being captured
        self.all = []
        self.count = 0
        self.content = ""       # text accumulated for the current field

    def startElement(self, name, attrs):
        """Track depth; open a new record or start capturing a field."""
        self.depth += 1
        if self.depth == 2:
            # A record whose fields should be ignored is marked with None.
            if name == "article" and attrs.get("publtype"):
                self.structure = None
            else:
                self.structure = {}
        elif self.depth == 3:
            if self.structure is not None and name in elems:
                self.structure.setdefault(name, [])
                self.tag = name

    def endElement(self, name):
        """Close a captured field, or finish the record at depth 1."""
        self.depth -= 1
        if self.depth == 2 and self.tag is not None:
            # Only the close tag of the depth-3 field itself ends the
            # capture.  Closing an element nested inside the field (for
            # example <i> within a title) must not cut the text short.
            if self.content != "":
                self.structure[self.tag].append(self.content)
            self.content = ""
            self.tag = None
        elif self.depth == 1:
            self._finish_record()

    def characters(self, content):
        """Accumulate text while a field is being captured."""
        if self.tag:
            self.content += content

    def _finish_record(self):
        """Append the record that just closed to ``self.all`` if it qualifies."""
        record = self.structure
        if record is None:
            return
        authors = record.get("author")
        titles = record.get("title")
        years = record.get("year")
        # Skip incomplete records instead of failing on a missing key or
        # an empty value list.
        if not authors or not titles or not years:
            return
        try:
            year = int(years[0])
        except ValueError:
            return
        if not self.min_year <= year < self.max_year:
            return
        venues = record.get("booktitle") or record.get("journal")
        venue = venues[0] if venues else None
        # (authors, title, venue, year)
        self.all.append((authors, titles[0], venue, year))
        # NOTE(review): counts kept records; the nesting of this counter was
        # reconstructed -- confirm it was not meant to count every record.
        self.count += 1
        if self.count % 10000 == 0:
            # Progress indicator; the parenthesised form runs on Python 2 and 3.
            print(self.count)
if __name__ == "__main__":
    # Parse the DBLP dump and write the collected records as MessagePack.
    data = ABContentHandler()
    # Binary mode lets the XML parser apply the encoding declared in the
    # document rather than the platform's default text decoding.
    with open("data/dblp.xml", "rb") as source:
        xml.sax.parse(source, data)
    packed_data = msgpack.packb(data.all, use_bin_type=True)
    # open() replaces the Python 2-only file() builtin, and the with-block
    # ensures the output is flushed and closed.
    with open("data/allfiles.dat", "wb") as out:
        out.write(packed_data)