/
Makefile
40 lines (34 loc) · 2.21 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Default goal: build every derived TSV file under out/.
# `all` names a command rather than a file, so it is declared phony; without
# this, a stray file called `all` in the working directory would make the
# goal look permanently up to date and nothing would be built.
.PHONY: all
all: out/id_text.tsv out/citingId_citedId.tsv out/id_author_title_venue_year.tsv
# Download the AAN 2013 release tarball.
# aanrelease2013.tar.gz is 292 MB, md5: 7d10b490e75b8a22b673173e10fbcc18
#
# -f makes curl exit non-zero on an HTTP error instead of saving the server's
# error page as the tarball; -S still prints the error although -s hides the
# progress meter; -L follows redirects.
# The download goes to a temporary name and is renamed only once curl has
# succeeded, so an interrupted or failed transfer never leaves behind a
# truncated file that Make would treat as up to date.
aanrelease2013.tar.gz:
	curl -fsSL -o $@.tmp http://clair.eecs.umich.edu/aan/downloads/aanrelease2013.tar.gz
	mv -f $@.tmp $@
# Extract the original tarball, which contains a single directory, aan/.
# The tarball is an order-only prerequisite: it must exist before extraction,
# but its timestamp never forces a re-extraction.
#
# Only `aan` carries the tar recipe. Listing several targets on one ordinary
# rule header defines one independent rule per target, so the extraction could
# otherwise run once per path (and concurrently under `make -j`).
aan: | aanrelease2013.tar.gz
	tar -xzf $|

# These paths are all produced by the single extraction above; they only need
# `aan` to have been extracted first and have no commands of their own.
aan/papers_text aan/release/2013/acl.txt aan/release/2013/acl-metadata.txt: | aan ;
# Per-paper alternative to out/id_text.tsv: the text is cleaned up in the same
# way, but one output file is written for each input file.
# To build every paper this way, run:
#   find aan/papers_text -name '???-????.txt' -size +1 | sed s%aan/papers_text%out/txt% | xargs make
#
# The first tr turns every run of characters other than alphanumerics and \037
# into a single space; the second collapses runs of whitespace into one space.
out/txt/%: aan/papers_text/%
	@mkdir -p $(dir $@)
	tr -Cs "[:alnum:]\037" ' ' <$< | tr -s "[:space:]" ' ' >$@
# Flatten all non-empty papers in papers_text/ and concatenate them into a
# single file, running ./print_id_text.sh once per paper.
# Issues with the input:
# - totally empty papers can cause problems later on (e.g., P02-1046.txt);
#   `-size +1` keeps only files larger than one 512-byte block, which drops them
# - there are a lot of other files besides paper texts (e.g., W12-3714.body,
#   collaboration_network.txt), hence the '???-????.txt' name filter
# - the content is all over the place, weird whitespace, weird characters
#
# The output directory is created first (a fresh checkout may have no out/),
# and the result is written to a temporary name and renamed only on success so
# a failed run cannot leave a truncated file that looks up to date.
out/id_text.tsv: aan/papers_text
	@mkdir -p $(@D)
	find $< -name '???-????.txt' -size +1 | xargs -n 1 ./print_id_text.sh >$@.tmp
	mv -f $@.tmp $@
# Convert the citation list into a two-column, tab-separated file.
# acl.txt uses ' ==> ' to separate the two IDs, which seems arbitrary.
# The first item generally has a more recent year than the second, so we
# know the "left ==> right" notation means "citing ==> cited".
# Otherwise it's just ASCII, so that's nice.
#
# The output directory is created first (a fresh checkout may have no out/),
# and the result is written to a temporary name and renamed only on success so
# a failed run cannot leave a truncated file that looks up to date.
out/citingId_citedId.tsv: aan/release/2013/acl.txt
	@mkdir -p $(@D)
	<$< awk -F ' ==> ' '{print $$1"\t"$$2}' >$@.tmp
	mv -f $@.tmp $@
# Convert the paper metadata into a TSV via ./print_id_author_title_venue_year.py.
# acl-metadata.txt provides some nice metadata, but with an atrocious
# hodgepodge of encodings:
# - it uses HTML entities for some non-ASCII characters, and ISO-8859-2 for others
# - some of its HTML entity references aren't HTML-spec compliant
# - it uses HTML entities not only to encode accents, but also to encode
#   broken 'mojibaked' accents
# ftfy from https://github.com/LuminosoInsight/python-ftfy is brilliant.
# Otherwise, the format is pretty straightforward, though some of the key-val
# pairs span multiple lines.
#
# The output directory is created first (a fresh checkout may have no out/),
# and the result is written to a temporary name and renamed only on success so
# a crashing script cannot leave an empty or truncated file that looks up to date.
out/id_author_title_venue_year.tsv: aan/release/2013/acl-metadata.txt
	@mkdir -p $(@D)
	<$< ./print_id_author_title_venue_year.py >$@.tmp
	mv -f $@.tmp $@