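"""Article model for the crawler: download, parse, validate, and save
articles, plus a helper that parses batches of articles in parallel."""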
import concurrent.futures as futures  # for parsing articles in parallel processes
import time

import requests

import db
from config import config
from utils import articleQa, articleParser
from utils.articleParser import clean


def good(val):
    return val and len(val) > 0


def fixUrl(url):
    # Protocol-relative URLs ('//example.com/...') need a scheme before they can be fetched.
    if url.startswith('//'):
        url = 'http:' + url
    return url


class Article(object):

    def __init__(self, guid='', title='', url='', html='', timestamp=None, source='', feed='', content=''):
        self.guid = guid
        self.title = clean(title)
        self.url = fixUrl(clean(url))
        self.timestamp = timestamp
        self.source = clean(source)
        self.feed = clean(feed)
        self.content = clean(content)
        self.keywords = []
        self.html = clean(html)

    def isDuplicate(self):
        return articleQa.isDuplicate(self)

    def downloadArticle(self):
        try:
            response = requests.get(self.url, timeout=5)
        except Exception as e:
            print('Could not download the article: %s' % self.url)
            print(e)
            return False
        self.url = response.url  # Could have changed during redirects.
        self.html = response.text
        return True

    def parseArticle(self):
        articleParser.parseArticle(self)

    def isValid(self):
        """Check if the article has enough data to be considered "crawled"."""
        if not good(self.guid):
            return False
        if not good(self.title):
            return False
        if not good(self.url):
            return False
        if self.timestamp is None:
            return False
        if not good(self.source):
            return False
        if not good(self.feed):
            return False
        if not good(self.content):
            return False
        return True

    def save(self):
        """Save the article to the database.

        This method will print an error if the article is not valid.
        Return True if the article is a duplicate and False if it is original.
        """
        dupID = articleQa.isDuplicate(self)
        if not self.isValid():
            print('Article from source: %s feed: %s was invalid' % (self.source, self.feed))
        elif dupID is not None:  # This is a duplicate of something, so just update the content.
            db.updateArticle(dupID, self)
        else:  # Write the full article.
            db.insertArticle(self.guid, self)
        return dupID is not None


def _parse(article):
    startTime = time.time()
    article.parseArticle()
    print('[PARSE %ss] %s' % (round(time.time() - startTime, 3), article.url))
    # No reason to keep the raw HTML after parsing,
    # and deleting it here makes interprocess pickling much faster.
    article.html = None
    return article


def parseArticles(articles, maxWorkers=config['parseWorkers']):
    print('\nParsing %d article(s) with %d process(es).' % (len(articles), maxWorkers))
    with futures.ProcessPoolExecutor(max_workers=maxWorkers) as executor:
        articlesFutures = executor.map(_parse, articles)
        # Force the futures generator to give us all of the articles back.
        articles = [article for article in articlesFutures]
    return articles
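

# Minimal usage sketch (not part of the original module); the guid, url, and
# other field values below are hypothetical. The __main__ guard matters here:
# parseArticles uses ProcessPoolExecutor, which re-imports this module in its
# worker processes.
if __name__ == '__main__':
    article = Article(guid='example-guid', title='Example story',
                      url='//example.com/story', timestamp=time.time(),
                      source='Example Source', feed='example-feed')
    if article.downloadArticle():
        parsed = parseArticles([article])
        for a in parsed:
            a.save()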