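# crawler.py
# Crawl the feeds that are due for an update: download and parse each feed,
# save the valid articles it yields, and log a summary of the run.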
from datetime import datetime
from feed import Feed, downloadFeeds, parseFeeds, downloadArticlesInFeeds
from utils import ip
from article import Article, parseArticles
import db, time
from config import config
startTime = datetime.utcnow()
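# Aggregation pipeline that picks the feeds to crawl this run: active feeds
# whose time since the last crawl exceeds their crawl frequency, sorted with
# the most overdue first and capped at config['feedsNum'].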
match = {'$match': {'active': True}}
project = {'$project': {'milliSecondsUntilRedo': {'$subtract': [{'$subtract': [startTime, '$lastCrawl']}, {'$multiply': [1000, '$crawlFreq']}]}, 'feed': 1, 'stamp': 1, 'lastCrawl': 1, 'active': 1}} # subtracting a datetime and a number of seconds doesn't work, so crawlFreq (seconds) is converted to milliseconds
match2 = {'$match': {'milliSecondsUntilRedo': {'$gte': 0}}} # only keep the feeds that are due to be crawled again
sort = {'$sort': {'milliSecondsUntilRedo': -1}} # most overdue feeds first
limit = {'$limit': config['feedsNum']} # we only get the 150 most pressing sources :)
feedList = db.aggregateFeeds([match, project, match2, sort, limit])
newArticlesCount = 0
duplicateArticlesCount = 0
validArticlesCount = 0
feedsCount = 0
i = 0
newArticles = []
batchSize = config['batchSize']
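# Process the selected feeds in batches of batchSize: download and parse each
# batch, store the articles that pass validation, and update each feed's state.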
while i < len(feedList):
    tempList = feedList[i:(i + batchSize)]
    feeds = [Feed(url=feed['feed'], stamp=feed.get('stamp', None)) for feed in tempList]
    feeds = downloadFeeds(feeds)
    feeds = parseFeeds(feeds)
    feeds = downloadArticlesInFeeds(feeds)
    newArticles = []
    for feed in feeds:
        newArticles.extend(feed.articles)
    newArticles = parseArticles(newArticles)
    validArticles = [article for article in newArticles if article.isValid()]
    duplicateArticlesC = [article.save() for article in validArticles].count(True)
    for feed in feeds:
        print('%s => +%d' % (feed.url, len(feed.articles)))
        feed.save()
    i += batchSize
    newArticlesCount += len(newArticles)
    duplicateArticlesCount += duplicateArticlesC
    validArticlesCount += len(validArticles)
    feedsCount += len(feeds)
endTime = datetime.utcnow()
runTime = round((endTime - startTime).total_seconds(), 2)
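# Persist a summary of this run (timing, host IP, feed and article counts).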
db.log({
    'startTime': startTime,
    'runTime': runTime,
    'ip': ip.get_ip_address(),
    'feeds': feedsCount,
    'newArticles': newArticlesCount,
    'duplicateArticles': duplicateArticlesCount,
    'validArticles': validArticlesCount
})
print("--- %s seconds ---" % runTime)