checker.py
# -*- coding: utf-8 -*-
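
"""Core copyvio-check logic for the copyvios web tool (Python 2).

Runs copyright-violation checks on wiki pages through EarwigBot and caches
the results in a SQL database.
"""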
from datetime import datetime
from hashlib import sha256
from urlparse import urlparse

from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

from .misc import Query, get_db
from .sites import get_site
from .turnitin import search_turnitin

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

T_POSSIBLE = 0.4
T_SUSPECT = 0.75

def _coerce_bool(val):
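    """Interpret a query parameter as a boolean.

    Missing/empty values, "0", and "false" count as false; any other
    non-empty value counts as true.
    """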
    return val and val not in ("0", "false")

def do_check(query=None):
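    """Main entry point: validate the query and run the requested check.

    Normalizes the language/project fields, resolves the wiki site, and
    delegates to _get_results(). Returns the (possibly new) Query object.
    """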
    if not query:
        query = Query()
    if query.lang:
        query.lang = query.orig_lang = query.lang.lower()
        if "::" in query.lang:
            query.lang, query.name = query.lang.split("::", 1)
    if query.project:
        query.project = query.project.lower()

    query.submitted = query.project and query.lang and (query.title or query.oldid)
    if query.submitted:
        query.site = get_site(query)
        if query.site:
            _get_results(query, follow=not _coerce_bool(query.noredirect))
    return query

def _get_results(query, follow=True):
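    """Resolve the target page, then run the requested check against it.

    Fills in query.result on success; sets query.error (or returns silently
    for missing pages) on failure. A redirect is followed at most once.
    """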
    if query.oldid:
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return
        if page.is_redirect and follow:
            try:
                query.title = page.get_redirect_target()
            except exceptions.RedirectError:
                pass  # Something's wrong. Continue checking the original page.
            else:
                query.redirected_from = page
                return _get_results(query, follow=False)
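
    # No explicit action: compare against a given URL, otherwise run a search.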
    if not query.action:
        query.action = "compare" if query.url else "search"

    if query.action == "search":
        conn = get_db()
        use_engine = 0 if query.use_engine in ("0", "false") else 1
        use_links = 0 if query.use_links in ("0", "false") else 1
        use_turnitin = 1 if query.turnitin in ("1", "true") else 0
        if not use_engine and not use_links and not use_turnitin:
            query.error = "no search method"
            return

        # Handle the turnitin check
        if use_turnitin:
            query.turnitin_result = search_turnitin(page.title, query.lang)

        # Handle the copyvio check
        mode = "{0}:{1}:".format(use_engine, use_links)
        if not _coerce_bool(query.nocache):
            query.result = _get_cached_results(
                page, conn, mode, _coerce_bool(query.noskip))
        if not query.result:
            try:
                query.result = page.copyvio_check(
                    min_confidence=T_SUSPECT, max_queries=8, max_time=45,
                    no_searches=not use_engine, no_links=not use_links,
                    short_circuit=not query.noskip)
            except exceptions.SearchQueryError as exc:
                query.error = "search error"
                query.exception = exc
                return
            query.result.cached = False
            _cache_result(page, query.result, conn, mode)

    elif query.action == "compare":
        if not query.url:
            query.error = "no URL"
            return
        scheme = urlparse(query.url).scheme
        if not scheme and query.url[0] not in ":/":
            query.url = "http://" + query.url
        elif scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
                                      max_time=30)
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 30 else "no data"
            return
        query.result = result
        query.result.cached = False

    else:
        query.error = "bad action"

def _get_page_by_revid(site, revid):
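    """Return a Page object loaded from a specific revision ID, or None.

    EarwigBot pages don't natively represent old revisions, so the API
    response is injected through the page's private loader methods.
    """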
    res = site.api_query(action="query", prop="info|revisions", revids=revid,
                         rvprop="content|timestamp", inprop="protection|url")
    try:
        page_data = res["query"]["pages"].values()[0]
        title = page_data["title"]
        page_data["revisions"][0]["*"]  # Only need to check that these exist
        page_data["revisions"][0]["timestamp"]
    except (KeyError, IndexError):  # Bad revid or missing page data
        return
    page = site.get_page(title)
    # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
    # dirty hack to make this work:
    page._load_attributes(res)
    page._load_content(res)
    return page

def _get_cached_results(page, conn, mode, noskip):
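    """Try to serve a result from the cache instead of running a new check.

    Expired rows are purged first. The top cached source is re-verified with
    a fresh comparison; if its confidence has drifted, the cache is ignored.
    Returns a CopyvioCheckResult, or None if no usable cache entry exists.
    """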
    query1 = """DELETE FROM cache
                WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
    query2 = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
                FROM cache
                WHERE cache_id = ?"""
    query3 = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                FROM cache_data
                WHERE cdata_cache_id = ?"""
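    # The cache key is a hash of the check mode plus the page's current text,
    # so any edit to the page invalidates its cached entry.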
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (cache_id,))
        results = cursor.fetchall()
        if not results:
            return None
        cache_time, queries, check_time, possible_miss = results[0]
        if possible_miss and noskip:
            return None
        cursor.execute(query3, (cache_id,))
        data = cursor.fetchall()

    if not data:  # TODO: do something less hacky for this edge case
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
        result = CopyvioCheckResult(False, [], queries, check_time,
                                    article_chain, possible_miss)
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
        return result

    url, confidence, skipped, excluded = data.pop(0)
    if skipped:  # Should be impossible: data must be bad; run a new check
        return None
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - confidence) >= 0.0001:
        return None

    for url, confidence, skipped, excluded in data:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        source.excluded = bool(excluded)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.possible_miss = possible_miss
    result.cached = True
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result

def _format_date(cache_time):
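    """Return the age of a cache timestamp as a rough phrase, e.g. "2 hours".

    Ages round down to whole units (integer division under Python 2).
    """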
    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    total_seconds = diff.days * 86400 + diff.seconds
    if total_seconds > 3600:
        return formatter(total_seconds / 3600, "hour")
    if total_seconds > 60:
        return formatter(total_seconds / 60, "minute")
    return formatter(total_seconds, "second")

def _cache_result(page, result, conn, mode):
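    """Store a fresh check result in the cache, replacing any older entry.

    The DELETE and both INSERTs run inside one transaction, so concurrent
    readers never see a partially written entry.
    """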
    query1 = "DELETE FROM cache WHERE cache_id = ?"
    query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?, ?)"
    query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?, ?)"
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    data = [(cache_id, source.url[:1024], source.confidence, source.skipped,
             source.excluded)
            for source in result.sources]
    with conn.cursor() as cursor:
        cursor.execute("START TRANSACTION")
        cursor.execute(query1, (cache_id,))
        cursor.execute(query2, (cache_id, result.queries, result.time,
                                result.possible_miss))
        cursor.executemany(query3, data)
        cursor.execute("COMMIT")