pipeline.py
#!/usr/bin/python
from utils import sub_lists, flatten, hashable
import filtering
import ranking
import similarity
import simplification
from mocs_config import GRAPHVIZ_PARAMS
from subprocess import Popen, PIPE
from re import sub, search
from collections import Counter
from chunking import STOP_WORDS
from nltk.tokenize import word_tokenize
from status import set_status
debug = False
USE_SFDP_FOR_LAYOUT = False
# a regular expression to extract width and height from the svg, and then
# eliminate these attributes
SVG_DIMENSION_REPLACEMENT = ('<svg width="(.*)pt" height="(.*)pt"', '<svg')

class TermExtraction:
    '''hacky enum for use by extract_terms'''
    def phrases(document):
        return document.terms_list()

    def all_words(document):
        return [[word] for word in word_tokenize(document.title.lower())
                if word not in STOP_WORDS]

    def words_from_phrases(document):
        return [[word] for word in flatten(document.terms_list())
                if word not in STOP_WORDS]

    Phrases, AllWords, WordsFromPhrases = range(3)
    names = ['phrases', 'all_words', 'words_from_phrases']
    functions = [phrases, all_words, words_from_phrases]

def extract_terms(documents, term_type):
    return map(TermExtraction.functions[term_type], documents)
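
# A hedged usage sketch: `docs` is assumed to be an iterable of document
# objects exposing `terms_list()` and `title`, which is what the extractors
# above require.
#     doc_terms = extract_terms(docs, TermExtraction.Phrases)
#     # doc_terms[i] is the list of term tuples extracted from docs[i]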

def calculate_heatmap_values(heatmap_terms, graph_terms, model=None):
    """return a dictionary mapping term -> intensity for the terms in
    heatmap_terms (an iterable of term tuples) that also appear in the
    set graph_terms; if graph_terms is empty, all terms are counted"""
    term_counts = Counter()
    for term in heatmap_terms:
        if (not graph_terms) or (hashable(term) in graph_terms):
            term_counts[hashable(term)] += 1
    return term_counts
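
# A hedged example with made-up terms (assumes `hashable` simply converts
# term lists to tuples): only terms that appear in graph_terms are counted.
#     graph = set([('machine', 'learning')])
#     terms = [('machine', 'learning'), ('machine', 'learning'), ('ai',)]
#     calculate_heatmap_values(terms, graph)
#     # -> Counter({('machine', 'learning'): 2})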
ranking_fns = [ranking.tfidf, ranking.cnc_bigrams, ranking.cnc_unigrams, ranking.tf]
ranking_fn_names = ['TF/ICF', 'C-Value', 'C-Value with Unigrams', 'Term Frequency']

def call_rank(ranking_index, flattened, n_large, start_words=[], model=None):
    """ranking_index: 0 = TF/ICF; 1 = C-value; 2 = C-value with unigrams; 3 = term frequency"""
    ranking_fn = ranking_fns[ranking_index]
    ranking_fn_name = ranking_fn_names[ranking_index]
    set_status('ranking with %s' % ranking_fn_name, model=model)
    if debug:
        print 'ranking with %s' % ranking_fn_name
    scored_phrases, phrase_frequencies = ranking_fn(flattened)
    set_status('ordering', model=model)
    if debug:
        print 'ordering'
    ordered_phrases = sorted(scored_phrases.iteritems(),
                             key=lambda p: p[1], reverse=True)
    if debug:
        print 'mapping'
    ranked_phrases = [p[0] for p in ordered_phrases]
    if debug:
        print 'trimming large'
    large_phrases = ranked_phrases[:n_large]
    if start_words:
        if debug:
            print 'looking for start words', start_words
        found_start_words = []
        for start_word in start_words:
            matches = (ranked_phrase for ranked_phrase in ranked_phrases
                       if start_word in sub_lists(ranked_phrase, proper=False))
            try:
                phrase = matches.next()
                if phrase not in large_phrases:
                    found_start_words.append(phrase)
            except StopIteration:
                if debug:
                    print 'start word %s not found' % (start_word,)
        if debug:
            print 'found start words', found_start_words
        top_phrases = found_start_words + large_phrases
    else:
        top_phrases = large_phrases
    filtered_frequencies = dict((phrase, freq) for (phrase, freq)
                                in phrase_frequencies.items()
                                if phrase in top_phrases)
    return top_phrases, filtered_frequencies, scored_phrases
call_rank.functions = ranking_fns
call_rank.default = ranking_fns.index(ranking.cnc_bigrams)
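
# A hedged usage sketch: `flattened` is assumed to be the output of
# flatten(structured_nps), as in map_representation below. call_rank.default
# selects the C-value ranker.
#     top_phrases, frequencies, scores = call_rank(call_rank.default, flattened, 1000)
#     # top_phrases is a list of term tuples, highest-scoring first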
similarity_fns = [similarity.lsa, similarity.jaccard_full, similarity.jaccard_partial, similarity.distributional_js]
similarity_fn_names = ['LSA', 'Jaccard Coefficient', 'Partial Match Jaccard Coefficient', 'Distributional JS']

def call_similarity(similarity_index, structured_nps, phrases, model=None, status_callback=None):
    """
    similarity_index: 0 = LSA (w/ cosine similarity); 1 = Jaccard; 2 = Jaccard (partial match); 3 = distributional similarity (w/ Jensen-Shannon divergence)
    """
    similarity_fn = similarity_fns[similarity_index]
    similarity_fn_name = similarity_fn_names[similarity_index]
    set_status('calculating similarity with %s' % similarity_fn_name, model=model)
    sim_matrix, phrases = similarity_fn(structured_nps, phrases, status_callback=status_callback)
    return sim_matrix, phrases
call_similarity.functions = similarity_fns
call_similarity.default = similarity_fns.index(similarity.jaccard_partial)
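
# A hedged usage sketch, reusing the phrases ranked above; the exact
# structure of sim_matrix is determined by the chosen similarity module.
#     sim_matrix, phrase_lookups = call_similarity(
#         call_similarity.default, structured_nps, top_phrases)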
filtering_fns = [filtering.top, filtering.pull_lesser, filtering.hybrid]
filtering_fn_names = ['Top Terms Only', 'Pull Lesser Terms', 'Hybrid']

def call_filter(filter_index, sim_matrix, phrases, top_limit_override=None, model=None):
    """
    filter_index: 0 = top terms only; 1 = pull in lesser terms; 2 = hybrid (take top and fill w/ lesser)
    """
    filtering_fn = filtering_fns[filter_index]
    filtering_fn_name = filtering_fn_names[filter_index]
    set_status('filtering and getting pairwise with %s' % filtering_fn_name, model=model)
    if top_limit_override:
        phrase_pairs = filtering_fn(sim_matrix, phrases, top_limit=top_limit_override)
    else:
        phrase_pairs = filtering_fn(sim_matrix, phrases)
    return phrase_pairs
call_filter.functions = filtering_fns
call_filter.default = filtering_fns.index(filtering.pull_lesser)
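
# A hedged usage sketch: prune the pairwise similarities down to the term
# pairs that will become edges in the map.
#     phrase_pairs = call_filter(call_filter.default, sim_matrix, phrase_lookups)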

def function_help(calling_function):
    """can be called on the call_* functions to get a list of the different algorithms they can use"""
    return '\n'.join([str(index) + ':' + str(fn)
                      for index, fn in enumerate(calling_function.functions)])
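
# For example, function_help(call_filter) returns something like:
#     0:<function top at 0x...>
#     1:<function pull_lesser at 0x...>
#     2:<function hybrid at 0x...>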

def graphviz_command(sfdp='sfdp', gvmap='gvmap', gvpr='gvpr', labels_path='map/viz/labels.gvpr',
                     neato='neato', file_format='svg'):
    layout_engine = sfdp if USE_SFDP_FOR_LAYOUT else neato
    return ("%s -c -f %s | %s -Goverlap=prism -Goutputorder=edgesfirst -Gsize=60,60! "
            "| %s -e -s -4 | %s -Gforcelabels=false -Ecolor=grey -Gsize=60,60! -n2 -T%s"
            % (gvpr, labels_path, layout_engine, gvmap, neato, file_format))

def strip_dimensions(svg):
    """having width and height attributes as well as a viewBox will cause
    OpenLayers to not display the svg properly, so we strip those attributes out"""
    match_re, replacement = SVG_DIMENSION_REPLACEMENT
    try:
        width, height = map(float, search(match_re, svg).groups())
    except Exception:
        width, height = 0.0, 0.0
    return sub(match_re, replacement, svg, count=1), width, height
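
# A hedged sketch: for an svg whose root tag begins
#     <svg width="540pt" height="360pt" ...
# this returns the svg with those attributes stripped, plus the parsed
# dimensions:
#     stripped_svg, width, height = strip_dimensions(svg)
#     # width == 540.0, height == 360.0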

def map_representation(structured_nps, start_words=None, ranking_algorithm=1,
                       similarity_algorithm=2, filtering_algorithm=1,
                       number_of_terms=1000, simplify_terms=False, model=None,
                       data_dump_path=None):
    """returns a pairwise similarity dictionary for the map and the set of
    terms in the map. A heatmap can be calculated separately and then
    overlaid. The dictionary representation still needs to be converted to
    dot file format before rendering."""
    flattened = flatten(structured_nps)
    set_status('ranking terms', model=model)
    if start_words is not None:
        # start_words should be a list like ["machine learning", "artificial intelligence"]
        start_words = [tuple(s.split()) for s in start_words]
        ranked_phrases, phrase_frequencies, scored_phrases = call_rank(
            ranking_algorithm, flattened, number_of_terms, start_words=start_words, model=model)
    else:
        ranked_phrases, phrase_frequencies, scored_phrases = call_rank(
            ranking_algorithm, flattened, number_of_terms, model=model)
    if simplify_terms:
        structured_nps = simplification.term_replacement(structured_nps, ranked_phrases)
    set_status('calculating similarity', model=model)
    sim_matrix, phrase_lookups = call_similarity(
        similarity_algorithm, structured_nps, ranked_phrases, model=model,
        status_callback=lambda s: set_status(s, model=model))
    if data_dump_path:
        import pickle
        from os.path import join

        def prefix_path(rel):
            return join(data_dump_path, rel)
        with open(prefix_path('sim_matrix.pickle'), 'w') as f:
            pickle.dump(sim_matrix, f)
        with open(prefix_path('phrase_lookups.pickle'), 'w') as f:
            pickle.dump(phrase_lookups, f)
        with open(prefix_path('phrase_frequencies.pickle'), 'w') as f:
            pickle.dump(phrase_frequencies, f)
    phrase_pairs = call_filter(filtering_algorithm, sim_matrix, phrase_lookups, model=model)
    normed = similarity.similarity_dict_to_distance(phrase_pairs)
    # build set of terms in graph
    graph_terms = set()
    for term, lst in normed.items():
        graph_terms.add(term)
        graph_terms.update(term for term, val in lst)
    return normed, graph_terms, phrase_frequencies, phrase_pairs, scored_phrases
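
# A hedged usage sketch: `structured_nps` is assumed to be the structured
# noun-phrase output of the chunking stage.
#     normed, graph_terms, frequencies, pairs, scores = map_representation(
#         structured_nps, start_words=["machine learning"])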

def call_graphviz(map_string, file_format='svg', model=None):
    """map_string should be a string in the dot file format, which the
    graphviz pipeline will be called on. Output is in format file_format"""
    set_status('drawing graph', model=model)
    gv_command = graphviz_command(file_format=file_format, **GRAPHVIZ_PARAMS)
    # debugging aid: show the PATH visible to the shell that runs graphviz
    proc = Popen('echo $PATH', stdout=PIPE, shell=True)
    print "path:", proc.communicate(input='')[0]
    proc = Popen(gv_command, stdout=PIPE, stdin=PIPE, stderr=PIPE, shell=True)
    map_out, map_err = proc.communicate(input=map_string)
    print "return code:", proc.returncode
    if map_err:
        print map_err
    return map_out
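
# A hedged end-to-end sketch. `to_dot` is a hypothetical helper (not defined
# in this module) that serializes the distance dictionary returned by
# map_representation into the dot format call_graphviz expects.
#     dot_string = to_dot(normed, frequencies)
#     svg = call_graphviz(dot_string, file_format='svg')
#     svg, width, height = strip_dimensions(svg)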