-
Notifications
You must be signed in to change notification settings - Fork 1
/
item.py
100 lines (83 loc) · 3.04 KB
/
item.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import arrow
import bleach
import hashlib
from datetime import datetime, timedelta
class Item(object):
    """Wrap one feed entry with identity, timing and presentation helpers.

    ``item`` is a mapping read via ``.get()`` and ``[]``; the keys used here
    are ``guid``, ``title``, ``description``, ``link``, ``comments`` and the
    ``*_parsed`` time tuples.
    NOTE(review): the key names suggest a feedparser entry -- confirm
    against the caller.

    Two items are equal (and hash equally) when their ``fingerprint`` values
    match, so instances can be de-duplicated through sets and dict keys.
    """

    def __init__(self, item):
        """Store *item* and record the current UTC time as ``created``."""
        self.item = item
        self.created = arrow.utcnow()

    def __eq__(self, other):
        """Compare by fingerprint; defer to Python for non-``Item`` operands."""
        if not isinstance(other, Item):
            # Returning NotImplemented (instead of touching
            # ``other.fingerprint``) keeps ``item == None`` and similar
            # comparisons from raising AttributeError.
            return NotImplemented
        return self.fingerprint == other.fingerprint

    def __ne__(self, other):
        """Inverse of ``__eq__`` (spelled out for Python 2 compatibility)."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        """Hash on the fingerprint so equal items share a hash."""
        return hash(self.fingerprint)

    def clean_text(self, text, limit=280, suffix=u'\u2026'):
        """Strip all markup from *text* and shorten it to about *limit* chars.

        Text no longer than *limit* after cleaning is returned unchanged.
        Longer text is cut at *limit*, backed up to the last space so that a
        word is not split, and *suffix* is appended (the result may therefore
        be ``len(suffix)`` characters longer than the cut text).
        """
        cleaned = bleach.clean(text, tags=[], strip=True).strip()
        if len(cleaned) <= limit:
            return cleaned

        truncated = cleaned[:limit].strip()
        cut = truncated.rfind(' ')
        if cut != -1:
            truncated = truncated[:cut]
        # When there is no space at all, keep the whole slice: slicing with
        # the -1 returned by rfind would silently drop the last character.
        return truncated + suffix

    @property
    def info(self):
        """Return a dict describing the item for presentation.

        Always contains ``timestamp`` (string form of ``self.timestamp``, or
        of 1970-01-01 when that is ``None``) and ``guid`` (``''`` if absent).
        ``title``/``body`` are filled from the cleaned title and description:
        a description alone is promoted to the title, and a body identical to
        the title is blanked. When the entry has neither title nor
        description, both keys are omitted. ``link`` and ``comments`` are
        included only when present and truthy.
        """
        obj = {
            'timestamp': str(self.timestamp or arrow.Arrow(1970, 1, 1)),
            'guid': self.item.get('guid', ''),
        }

        title = self.item.get('title')
        description = self.item.get('description')
        if title and description:
            obj['title'] = self.clean_text(title)
            obj['body'] = self.clean_text(description)
            if obj['title'] == obj['body']:
                obj['body'] = ''
        elif description:
            obj['title'] = self.clean_text(description)
            obj['body'] = ''
        elif title:
            obj['title'] = self.clean_text(title)
            obj['body'] = ''

        for key in ('link', 'comments'):
            value = self.item.get(key)
            if value:
                obj[key] = value
        return obj

    @property
    def delay(self):
        """
        Return a timedelta representing the delay between when the item
        appeared in the feed and when it was first seen.

        A zero timedelta is returned when ``timestamp`` is ``None``.
        """
        reported = self.timestamp
        if reported is None:
            return timedelta(seconds=0)
        return self.created - reported

    @property
    def timestamp_provided(self):
        """
        Return True if there was a provided timestamp for the item.

        Implemented as ``timestamp != created``, so it is also True when the
        provided timestamp was rejected as pre-2000 (``timestamp`` is None).
        """
        return self.timestamp != self.created

    @property
    def timestamp(self):
        """Return the item's reported time, ``created``, or ``None``.

        The ``published_parsed``, ``updated_parsed`` and ``created_parsed``
        entries are tried in that order, using the first six fields of each:

        * a value before 2000-01-01 is treated as bogus -> ``None``;
        * a value earlier than ``created`` is returned;
        * otherwise (missing, malformed, or not earlier than ``created``)
          the next key is tried, falling back to ``created``.
        """
        for key in ('published_parsed', 'updated_parsed', 'created_parsed'):
            parsed = self.item.get(key)
            if not parsed:
                continue
            try:
                naive = datetime(*parsed[:6])
            except (TypeError, ValueError):
                # Malformed tuple from the feed (e.g. an out-of-range
                # field): skip this key rather than crash the caller.
                continue
            reported_timestamp = arrow.get(naive)
            if reported_timestamp < arrow.Arrow(2000, 1, 1):
                # If pre-2000, consider it bogus
                return None
            if reported_timestamp < self.created:
                return reported_timestamp
        return self.created

    @property
    def fingerprint(self):
        """Return a stable identity for the item.

        The ``guid`` is used when it is present and truthy; otherwise the
        SHA-1 hex digest of the UTF-8 encoded ``title + link``.
        """
        guid = self.item.get('guid')
        if guid:
            return guid
        # ``or ''`` also covers keys that are present but None, which would
        # otherwise make ``''.join`` raise TypeError.
        parts = [
            self.item.get('title') or '',
            self.item.get('link') or '',
        ]
        s = ''.join(parts).encode('utf-8', 'ignore')
        return hashlib.sha1(s).hexdigest()