-
Notifications
You must be signed in to change notification settings - Fork 1
/
translate.py.orig
92 lines (78 loc) · 2.64 KB
/
translate.py.orig
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import re
import urllib
import urllib.request
from collections import Counter
from pprint import pprint

# from pymongo import MongoClient
import nltk
import numpy as np
from bs4 import BeautifulSoup
from bson.objectid import ObjectId
from googletrans import Translator
from nltk.tokenize import word_tokenize, sent_tokenize
from scipy import spatial
# Reference phrases that compare() counts keywords from. In the code visible
# here nothing appends to this list; only the disabled trans_values() below did.
words = []
# Disabled MongoDB / googletrans pipeline that used to fill `words` with
# translated document values. It depends on the commented-out pymongo import.
# NOTE(review): translate() ignores its doc_id argument and looks up a
# hard-coded ObjectId - confirm before re-enabling.
# def trans_values(obj, translator):
#     for value in obj:
#         if isinstance(value, dict):
#             value = trans_values(value.values(), translator)
#         elif isinstance(value, list):
#             value = trans_values(value, translator)
#         else:
#             words.append(translator.translate(str(value), dest="en").text)
#
#
# def translate(host, port, doc_id):
#     translator = Translator()
#
#     client = MongoClient(host, port)
#     db = client['test']
#     collection = db['testcol']
#     document = collection.find_one({'_id': ObjectId(str('5eed22a7b7da9028b46cfd2b'))})
#
#     trans_values(document['09_03_02_01'].values(), translator)
def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address to open; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even if reading or decoding fails.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf8")
# Matches maximal runs of word characters; everything else (including '.')
# acts as a separator.
_TOKEN_RE = re.compile(r"\w+")


def _keyword_counts(lines):
    """Count lower-cased word tokens longer than two characters in *lines*."""
    return Counter(
        token
        for line in lines
        for token in _TOKEN_RE.findall(line.lower())
        if len(token) > 2
    )


def compare(link, reference=None):
    """Score how similar the text of a web page is to a set of reference lines.

    The page at *link* is downloaded, reduced to its non-empty text lines, and
    both the page and the reference lines are turned into keyword-frequency
    counts. Tokens that occur more than five times on either side form two
    count vectors, and the result is their cosine similarity.

    Args:
        link: URL of the page to download and analyse.
        reference: Iterable of strings to compare the page against. When
            omitted, the module-level ``words`` list is used.

    Returns:
        ``1 - cosine distance`` of the two count vectors. Returns ``0.0`` when
        no token passes the frequency threshold or when either vector is all
        zeros, because cosine similarity is undefined in those cases.
    """
    if reference is None:
        reference = words

    text = BeautifulSoup(get_html(link), "lxml").get_text()
    # Strip first, then filter, so whitespace-only lines are dropped too.
    prepared = [line for line in (raw.strip() for raw in text.splitlines()) if line]
    pprint(prepared)

    # Compare keyword frequencies of the reference text and the page.
    reference_counts = _keyword_counts(reference)
    page_counts = _keyword_counts(prepared)

    # Counter returns 0 for missing keys, so tokens seen on one side only
    # still contribute a (count, 0) or (0, count) pair.
    pairs = [
        (reference_counts[token], page_counts[token])
        for token in reference_counts.keys() | page_counts.keys()
        if reference_counts[token] > 5 or page_counts[token] > 5
    ]
    if not pairs:
        return 0.0

    reference_vector, page_vector = np.array(pairs).T
    # A zero vector has no direction; avoid the undefined 0/0 cosine.
    if not reference_vector.any() or not page_vector.any():
        return 0.0
    return 1 - spatial.distance.cosine(reference_vector, page_vector)
if __name__ == "__main__":
    # translated = translate('168.63.61.94', 27017, '5eed22a7b7da9028b46cfd2b')
    # Print the similarity score so running the script yields a visible result.
    print(compare("http://www.python.org"))