-
Notifications
You must be signed in to change notification settings - Fork 1
/
impTermExtraction.py
84 lines (65 loc) · 2.84 KB
/
impTermExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
'''
Course: Search Engine Architecture
Project: Cluster-Labeling
File: impTermExtraction.py
Job: Extracting representative labels of clusters
'''
import pickle, math, numpy as np, os
from collections import Counter
def naive_weighing(options, X, cluster, indexes):
    """Weigh every indexed term per cluster and over the whole collection.

    A term's weight in a cluster is ``cdf * ctf * idf`` where:
      ctf - summed term frequency over the cluster's documents, divided by
            the number of documents assigned to that cluster,
      cdf - ``log(1 + n)`` with ``n`` the number of documents in the cluster
            that have a tf entry for the term,
      idf - inverse document frequency from ``indexes['idf']``.

    Parameters:
        options: namespace with ``num_clusters``, ``save_intermediate`` and,
            when saving, ``intermediate_out_directory``.
        X: document matrix; only ``X.shape[0]`` is used (plus
            ``X[i].toarray()`` when ``cluster`` has no ``labels_``).
        cluster: fitted clustering model exposing either ``labels_`` or
            ``predict``.
        indexes: dict with ``'idf'`` (term -> idf) and ``'tf'``
            (term -> {doc_id -> tf}).

    Returns:
        dict with ``'weighed_terms'`` (term -> {cluster_id -> weight}) and
        ``'collection_weights'`` (term -> weight summed over all clusters).
    """
    # Recover a per-document cluster assignment: reuse the fitted labels if
    # present, otherwise predict each row individually.
    labels = []
    if hasattr(cluster, 'labels_'):
        for i in range(X.shape[0]):
            labels.append([cluster.labels_[i]])
    else:
        for i in range(X.shape[0]):
            labels.append(np.array(cluster.predict(X[i].toarray())))
    cluster_counts = Counter(item for sublist in labels for item in sublist)
    print(cluster_counts)
    term_cluster_weights = {}
    term_collection_weights = {}
    for term in indexes['idf'].keys():
        idf = indexes['idf'][term]
        # Note: the original also summed the collection-wide tf here, but
        # that value was never used — removed as dead code.
        term_cluster_weights[term] = {}
        for cluster_id in range(options.num_clusters):
            tf_cluster = [indexes['tf'][term][t] for t in indexes['tf'][term]
                          if cluster_id in labels[t]]
            size = cluster_counts[cluster_id]
            # Guard against empty clusters: previously an empty cluster
            # (count 0) raised ZeroDivisionError.
            ctf = sum(tf_cluster) / float(size) if size else 0.0
            cdf = math.log(1 + len(tf_cluster))
            term_cluster_weights[term][cluster_id] = cdf * ctf * idf
        term_collection_weights[term] = sum(term_cluster_weights[term].values())
    result = {'weighed_terms': term_cluster_weights,
              'collection_weights': term_collection_weights}
    if options.save_intermediate:
        # Close the dump file deterministically instead of leaking the handle.
        with open(os.path.join(options.intermediate_out_directory,
                               'term_weights_naive.pkl'), 'wb') as fh:
            pickle.dump(result, fh)
    return result
def JSD(options, X, cluster, indexes):
    """Re-weigh terms with a Jensen–Shannon-style divergence.

    For every term and cluster, compares the naive cluster weight P against
    the collection-level weight Q through the midpoint M = (P + Q) / 2 and
    scores the term by 0.5 * (P*log(1 + P/M) + Q*log(1 + Q/M)).

    NOTE(review): the ``1 +`` inside the logs deviates from the textbook
    JSD formulation (which uses ``log(P/M)``) — kept as-is since it appears
    intentional (it keeps the log finite when P or Q is 0); confirm with
    the original authors.

    Parameters mirror :func:`naive_weighing`, which is called internally.

    Returns:
        dict with ``'weighed_terms'`` (term -> {cluster_id -> JSD score}).
    """
    naive_weights = naive_weighing(options, X, cluster, indexes)
    cluster_weights = naive_weights['weighed_terms']
    collection_weights = naive_weights['collection_weights']
    # Renamed from ``JSD`` — the original local dict shadowed the function.
    jsd_scores = {}
    for term in cluster_weights.keys():
        jsd_scores[term] = {}
        for cluster_id in range(options.num_clusters):
            P = cluster_weights[term][cluster_id]
            Q = collection_weights[term]
            M = 0.5 * (P + Q)
            if M > 0:
                D_p_m = P * math.log(1 + (P / M))
                # Bug fix: the Q-divergence term previously reused P / M
                # (copy-paste error); it must compare Q against M.
                D_q_m = Q * math.log(1 + (Q / M))
                jsd_scores[term][cluster_id] = 0.5 * (D_p_m + D_q_m)
            else:
                jsd_scores[term][cluster_id] = 0
    if options.save_intermediate:
        with open(os.path.join(options.intermediate_out_directory,
                               'term_weights_JSD.pkl'), 'wb') as fh:
            pickle.dump({'weighed_terms': jsd_scores}, fh)
    return {'weighed_terms': jsd_scores}
def get_important_terms(options, weighed_terms):
    """Pick the top-scoring terms for each cluster.

    Parameters:
        options: namespace with ``num_clusters``, ``num_important_words``,
            ``save_intermediate`` and, when saving,
            ``intermediate_out_directory``.
        weighed_terms: term -> {cluster_id -> weight} mapping.

    Returns:
        list indexed by cluster id; each entry is a list of up to
        ``num_important_words`` ``(term, weight)`` pairs, best first.
    """
    ranked = []
    for cluster_id in range(options.num_clusters):
        # Score every term for this cluster, best weight first.
        scored = sorted(
            ((term, scores[cluster_id]) for term, scores in weighed_terms.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        ranked.append(scored[:options.num_important_words])
    if options.save_intermediate:
        with open(os.path.join(options.intermediate_out_directory,
                               'important_words.pkl'), 'wb') as fh:
            pickle.dump(ranked, fh)
    return ranked