-
Notifications
You must be signed in to change notification settings - Fork 2
/
keyword_extract.py
62 lines (53 loc) · 1.82 KB
/
keyword_extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from configuration.taxonomy import taxonomy_keywords
from keywords.taxonomy_stats import mean_value, tp
from utils.hyperparameters import *
from configuration.config import *
from utils.utils import normalize
from crawling.textPreprocessor import *
from keywords.keyword_appearance import keyword_appearance
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import numpy as np
def cos_sim(w1, w2):
    """Return the cosine similarity between the embeddings of two words.

    Both words are first converted with ``tp.to_TAG`` and then used as keys
    into ``tp.w2v``. Each looked-up vector is passed through ``normalize``
    and reshaped to a single row before being handed to scikit-learn's
    ``cosine_similarity``.

    Args:
        w1: First word (converted with ``tp.to_TAG`` before lookup).
        w2: Second word (converted with ``tp.to_TAG`` before lookup).

    Returns:
        The similarity score, or the sentinel ``-10`` when the lookup or the
        similarity computation fails (presumably because a word is missing
        from ``tp.w2v`` -- confirm against the embedding store).

    Note:
        Errors raised by ``tp.to_TAG`` itself are not caught here and
        propagate to the caller.
    """
    w1 = tp.to_TAG(w1)
    w2 = tp.to_TAG(w2)
    try:
        # cosine_similarity expects 2-D inputs, hence the (1, -1) reshape.
        vec1 = np.reshape(normalize(tp.w2v[w1]), (1, -1))
        vec2 = np.reshape(normalize(tp.w2v[w2]), (1, -1))
        cos = cosine_similarity(vec1, vec2)[0][0]
    except Exception:
        # Best-effort: any failure maps to the sentinel. ``Exception`` (not a
        # bare ``except``) so KeyboardInterrupt/SystemExit still propagate.
        cos = -10
    return cos
def is_new_keyword(w, verbose=False):
    """Decide whether a word is similar enough to the taxonomy to be a keyword.

    The word is compared with every key of ``taxonomy_keywords`` via
    ``cos_sim``; pairs for which ``cos_sim`` returns its ``-10`` failure
    sentinel are ignored. The word qualifies when the mean of the remaining
    similarities is strictly greater than ``mean_value``.

    Args:
        w: Candidate word.
        verbose: When true, print the mean similarity that is compared
            against ``mean_value``.

    Returns:
        ``True`` if the mean similarity exceeds ``mean_value``; ``False``
        otherwise, including when ``tp.to_TAG(w)`` raises.
    """
    try:
        w_tag = tp.to_TAG(w)
    except Exception:
        # The word cannot be tagged, so it cannot be scored.
        return False

    # NOTE(review): w_tag and the tagged keyword are passed through
    # tp.to_TAG again inside cos_sim -- this assumes to_TAG is idempotent;
    # confirm.
    similarities = []
    for keyword in taxonomy_keywords:
        cos = cos_sim(w_tag, tp.to_TAG(keyword))
        if cos == -10:
            continue
        similarities.append(cos)

    # With no usable similarity the score falls back to 0.0, which is still
    # compared against mean_value (it may be negative), not treated as False.
    score = np.mean(similarities) if similarities else 0.0
    if verbose:
        print(score)
    return bool(score > mean_value)
if __name__ == "__main__":
    # Script entry point: scan every document body stored for ``domain``,
    # collect the words that is_new_keyword accepts and that are not already
    # taxonomy keywords, pickle them, then run keyword_appearance().

    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load pickles produced by this project.
    with open(f"./files/{domain}.pickle", "rb") as fp:
        dd = pickle.load(fp)

    print(f"mean_value: {mean_value}")

    new_keywords = {}
    words_seen = set()
    for body in dd.values():
        for w in body.lower().split():
            # Score each distinct word only once across all documents.
            if w in words_seen:
                continue
            words_seen.add(w)
            # Cheap membership tests first; is_new_keyword compares the word
            # against every taxonomy keyword and is the expensive part.
            if w in new_keywords or w in taxonomy_keywords:
                continue
            if not is_new_keyword(w):
                continue
            new_keywords[w] = 0

    print(new_keywords.keys())

    with open(f"./files/new_keywords_{domain}.pickle", "wb") as handle:
        # pickle.dump returns None, so its result must not be assigned back
        # to new_keywords.
        pickle.dump(new_keywords, handle)

    # Runs after the output file has been closed.
    keyword_appearance()