Skip to content

Commit

Permalink
using core, whitelist editors
Browse files Browse the repository at this point in the history
  • Loading branch information
eranroz committed Jun 12, 2015
1 parent 52da126 commit 5922972
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 109 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
heWiki Replace bot is a python script for site wide replacements in Hebrew Wikipedia.

See:
https://he.wikipedia.org/wiki/%D7%95%D7%99%D7%A7%D7%99%D7%A4%D7%93%D7%99%D7%94:%D7%91%D7%95%D7%98/%D7%91%D7%95%D7%98_%D7%94%D7%97%D7%9C%D7%A4%D7%95%D7%AA
https://he.wikipedia.org/wiki/ויקיפדיה:בוט/בוט_החלפות

The script is based on pywikipediabot.
The script is based on pywikibot core.

Install
=======
* install pywikipedia.
see: http://www.mediawiki.org/wiki/Manual:Pywikipediabot/Installation
Important: make sure both the pywikibot core directory and its scripts directory are in your PYTHONPATH.
* Download the latest database dump:
http://dumps.wikimedia.org/hewiki/
( hewiki-YYYYMMDD-pages-articles.xml )
* place the hewiki-ReplaceBot directory within pywikipedia directory

Use
=======
Expand Down
210 changes: 112 additions & 98 deletions hewikiReplacebot.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

"""
ReplaceRobotHe is an extension of ReplaceRobot.
It is used in Hebrew Wikipedia for doing common replacements according to definitions in a wiki page
Expand All @@ -22,78 +25,83 @@
# (C) Eran Roz
# Distributed under the terms of the MIT license.
#
import sys, re, time, codecs, datetime, string
import replaceConfig
sys.path.insert(1, '..')
import wikipedia as pywikibot
import pagegenerators, replace, editarticle
import re

import pywikibot
from pywikibot import i18n
import webbrowser
import pywikibot.pagegenerators

try:
import replace
except ImportError:
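# Normally both the scripts directory and pywikibot core are in PYTHONPATH; if they are not,
# add the scripts directory (located relative to the pywikibot package) and import from there.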
import os
import sys
sys.path.append(os.path.abspath(os.path.join(pywikibot.__file__, os.pardir, os.pardir, 'scripts')))
import scripts.replace as replace

import replaceConfig
NO_BOT_REGEX = re.compile(replaceConfig.nobotRgx)


class XmlDumpReplacePageGeneratorHe(replace.XmlDumpReplacePageGenerator):
def __init__(self, replaceDic, xmlFilename, xmlStart, exceptions):
self.replaceDict=replaceDic
replace.XmlDumpReplacePageGenerator.__init__(self, xmlFilename, xmlStart, replaceDic.values(), exceptions)
def __init__(self, replace_dict, xml_filename, xml_start, exceptions, site):
self.replace_dict = replace_dict
replace.XmlDumpReplacePageGenerator.__init__(self, xml_filename, xml_start, replace_dict.values(),
exceptions, site)

def isTextExcepted(self, text):
"""
This is a hack: the replacements are changed HERE,
assuimg this check is called before replacement for each page
assuming this check is called before replacement for each page
"""
self.replacements=list(getReplacements(self.replaceDict,text))
return replace.XmlDumpReplacePageGenerator.isTextExcepted(self,text)

self.replacements = list(getReplacements(self.replace_dict, text))
return super(XmlDumpReplacePageGeneratorHe, self).isTextExcepted(text)


class HeWikiReplacement(replace.Replacement):
    """A replacement whose ``old`` pattern is always treated as a regular expression.

    Thin wrapper around ``replace.Replacement`` that forces ``use_regex=True``.
    """

    def __init__(self, old, new, exceptions=None):
        """Create a regex replacement.

        :param old: pattern to search for; forwarded to the base class as a regex.
        :param new: replacement text forwarded to the base class.
        :param exceptions: optional exceptions forwarded unchanged to the base
            class (``None`` by default).
        """
        super(HeWikiReplacement, self).__init__(
            old, new, use_regex=True, exceptions=exceptions)


class ReplaceRobotHe(replace.ReplaceRobot):
""" Robot for common replacement in Hebrew Wikipedia according to known replace page """
def __init__(self,gen, replaceDict, exceptions, editSummary):
self.replaceDict=replaceDict #replacement dictionary
self.summaryPrefix=editSummary
acceptall=True
allowoverlap=False
recursive=False
add_cat=None
sleep=None
titlefile=None
excoutfile =None

def __init__(self, gen, replace_dict, exceptions, edit_summary, site=None):
self.replaceDict = replace_dict # replacement dictionary
self.summaryPrefix = edit_summary
acceptall = True
allowoverlap = False
recursive = False
add_cat = None
sleep = None
replace.ReplaceRobot.__init__(self, gen, self.replaceDict.values(), exceptions, acceptall,
allowoverlap, recursive, add_cat, sleep, editSummary,
titlefile, excoutfile)
allowoverlap, recursive, add_cat, sleep, edit_summary,
site)

""" override regular do replacements by removing disabled replaments according to template,
""" override regular do replacements by removing disabled replacements according to template,
then the method is the same as in the superclass, but with a specific summary """
def doReplacements(self, original_text):

def apply_replacements(self, original_text, applied):
"""
Returns the text which is generated by applying all replacements to
the given text.
"""
self.replacements=list(getReplacements(self.replaceDict,original_text))
actucalReplacements=[]
last_new = original_text
new_text = original_text
exceptions = []
if "inside-tags" in self.exceptions:
exceptions += self.exceptions['inside-tags']
if "inside" in self.exceptions:
exceptions += self.exceptions['inside']
for old, new in self.replacements:
if self.sleep is not None:
time.sleep(self.sleep)
new_text = pywikibot.replaceExcept(new_text, old, new, exceptions,
allowoverlap=self.allowoverlap)
if new_text!=last_new:
actucalReplacements.append(new.strip())
last_new=new_text
self.editSummary= self.summaryPrefix + string.join(actucalReplacements,', ')
return new_text

def getReplacements(replaceDict, text):

self.replacements = list(getReplacements(self.replaceDict, original_text))
return super(ReplaceRobotHe, self).apply_replacements(original_text, applied)

def generate_summary(self, applied_replacements):
    """Build the edit summary for the replacements that were applied.

    :param applied_replacements: iterable of replacement objects; the ``new``
        attribute of each one is read.
    :return: ``self.summaryPrefix`` followed by the whitespace-stripped ``new``
        texts joined with ``', '``.
    """
    actual_replacements = [rep.new.strip() for rep in applied_replacements]
    return self.summaryPrefix + ', '.join(actual_replacements)


def getReplacements(replace_dict, text):
"""
filters disabled replacements from dictionary
"""
disabled=re.findall(replaceConfig.nobotRgx,text)
#print "disabled replacements:" + str(disabled)
for repId,repRgx in replaceDict.iteritems():
disabled = NO_BOT_REGEX.findall(text)
for repId, repRgx in replace_dict.items():
if repId not in disabled:
yield repRgx

Expand All @@ -103,33 +111,41 @@ def fillReplementsDict():
"""
site = pywikibot.getSite()
page = pywikibot.Page(site, replaceConfig.replacementsPage)
text=page.get()
replaceDict=dict()
exceptReplace=list()
for x in re.findall("\\|([0-9]+)\n\\|<nowiki>(.*)</nowiki>\n\\|<nowiki>(.*)</nowiki>\n\\|(?:<nowiki>)?(.*?)(?:\n|</nowiki>)", text):
text = page.get()
replaceDict = dict()
if page.lastNonBotUser() not in replaceConfig.whitelist_editors:
raise Exception('Non authorized user edited the replace list. Please verify')

for x in re.findall(
"\\|([0-9]+)\n\\|<nowiki>(.*)</nowiki>\n\\|<nowiki>(.*)</nowiki>\n\\|(?:<nowiki>)?(.*?)(?:\n|</nowiki>)",
text):
try:
#compile the regex to check if it is support by python
re.compile(x[1])
replaceDict[x[0]] = (x[1],re.sub('\\$([0-9])','\\\\\\1',x[2]))
if x[3]!='':
exceptReplace.append(x[3])
# compile the regex to check that it is supported by Python

if x[3] == '':
replacement = HeWikiReplacement(x[1], re.sub('\\$([0-9])', '\\\\\\1', x[2]))
else:
replacement = HeWikiReplacement(x[1], re.sub('\\$([0-9])', '\\\\\\1', x[2]), {'inside': [x[3]]})
replacement.compile(use_regex=True, flags=re.UNICODE)
replaceDict[x[0]] = replacement
except:
#some regexs are written for c# and are ignored by this bot
# some regexes are written for C# and are ignored by this bot
pywikibot.output('Non supported replacement. ID: %s' % x[0])
pass
return (replaceDict, exceptReplace)
return replaceDict


def main(*args):
pywikibot.output('Starting hewiki-replacebot')
editSummary=replaceConfig.defaultSummary
xmlFilename=None
for arg in pywikibot.handleArgs(*args):
editSummary = replaceConfig.defaultSummary
xmlFilename = None
xmlStart = None
for arg in pywikibot.handle_args(*args):
if arg.startswith('-summary:'):
editSummary = arg[9:]
elif arg.startswith('-xmlstart'):
if len(arg) == 9:
xmlStart = pywikibot.input(
u'Please enter the dumped article to start with:')
xmlStart = pywikibot.input('Please enter the dumped article to start with:')
else:
xmlStart = arg[10:]
elif arg.startswith('-xml'):
Expand All @@ -138,45 +154,43 @@ def main(*args):
else:
xmlFilename = arg[5:]

if xmlFilename==None:
if xmlFilename is None:
pywikibot.output('no xml dump specified. please fill -xml and the xml file to be used')
return
replaceDict,exceptReplace=fillReplementsDict()
try:
xmlStart
except NameError:
xmlStart = None
replaceDict = fillReplementsDict()

safeTemplates=replaceConfig.safeTemplates
#add external links templates
genFactory = pagegenerators.GeneratorFactory()
safe_templates = replaceConfig.safeTemplates
# add external links templates
site = pywikibot.Site()
for safeCategory in replaceConfig.safeTemplatesCategories:
citeTemplates=genFactory.getCategoryGen(safeCategory,-1, True)
citeTemplates=[page.title(withNamespace=False) for page in citeTemplates]
safeTemplates+=citeTemplates
fileUsageRgx=re.compile(replaceConfig.fileUsageRgx,re.I)
yiRgx=re.compile('\[\[yi:.*?\]\]')
safeTemplatesRgx=re.compile(u'\{\{('+string.join(safeTemplates,u'|')+').*?\}\}',re.I)
cite_templates = pywikibot.Category(site, safeCategory).articles(namespaces=10, recurse=True)
cite_templates = [page.title(withNamespace=False) for page in cite_templates]
safe_templates += cite_templates

file_usage_rgx = re.compile(replaceConfig.fileUsageRgx, re.I)
yiRgx = re.compile('\[\[yi:.*?\]\]')
safeTemplatesRgx = re.compile('\{\{(' + '|'.join(safe_templates, ) + ').*?\}\}', re.I)
exceptions = {
'title': [],
'text-contains': [re.compile(replaceConfig.redirectRgx,re.I)],
'inside': [fileUsageRgx,safeTemplatesRgx, re.compile(u'('+string.join(exceptReplace,u'|')+')'),yiRgx],
'inside-tags': ['nowiki','math','comment','pre','source','hyperlink','gallery'],
'title': [],
'text-contains': [re.compile(replaceConfig.redirectRgx, re.I)],
'inside': [file_usage_rgx, safeTemplatesRgx, yiRgx],
'inside-tags': ['nowiki', 'math', 'comment', 'pre', 'source', 'hyperlink', 'gallery'],
'require-title': [],
}
gen = XmlDumpReplacePageGeneratorHe(replaceDict, xmlFilename, xmlStart, exceptions)
genFactory.namespaces=replaceConfig.namespaces
#For debugging pupose, uncomment it to work on specific page
#pages = [pywikibot.Page(pywikibot.getSite(), PageTitle)
# for PageTitle in [u'PAGENAME']]
#gen = iter(pages)
#end of specific page
maxquerysize=60
gen = genFactory.getCombinedGenerator(gen)
gen = pagegenerators.PreloadingGenerator(gen,pageNumber=maxquerysize)

# avoid searching in other namespaces in the xml
exceptions_with_title_ns = dict(exceptions)
exceptions_with_title_ns['title'] = [re.compile('^'+re.escape(ns_name)+':') for ns_index, ns
in site.namespaces.items() if ns_index not in replaceConfig.namespaces
for ns_name in ns]
gen = XmlDumpReplacePageGeneratorHe(replaceDict, xmlFilename, xmlStart, exceptions_with_title_ns, site)
gen_factory = pywikibot.pagegenerators.GeneratorFactory()
gen = gen_factory.getCombinedGenerator(gen)
gen = pywikibot.pagegenerators.NamespaceFilterPageGenerator(gen, replaceConfig.namespaces, site)
gen = pywikibot.pagegenerators.PreloadingGenerator(gen)
pywikibot.output('starting replace')
bot=ReplaceRobotHe(gen,replaceDict,exceptions,editSummary)
bot = ReplaceRobotHe(gen, replaceDict, exceptions, editSummary, site=site)
site.login()
bot.run()
pywikibot.output('finished all replacements')

Expand Down
31 changes: 23 additions & 8 deletions replaceConfig.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,24 @@
# -*- coding: utf-8 -*-
replacementsPage=u'ויקיפדיה:בוט/בוט החלפות/רשימת החלפות נוכחית'
defaultSummary=u'בוט החלפות: '
safeTemplates=[u'ציטוט',u'ציטוטון',u'חלונית',u'מסגרת',u'הדגשה',u'קמץ קטן']
safeTemplatesCategories=[u'תבניות קישורים חיצוניים', u'תבניות ציטוט']
nobotRgx=u"\{\{ללא בוט\|([0-9]+)\}\}"
fileUsageRgx=u'\[\[(File|Image|תמונה|קובץ)\s*:\s*.*?[\|\]]'
redirectRgx=u'#\s*(הפניה|REDIRECT)\s*\[\['
namespaces=[0,10,14,100]
from __future__ import unicode_literals

"""
Configuration file for hewikiReplaceBot
-replacementsPage list of replacements maintained by the community
-defaultSummary prefix for summary.
-safeTemplates templates that the bot must ignore
-safeTemplatesCategories list of categories of templates the bot must ignore
-nobotRgx regex for extracting replacements ids the bot shouldn't execute in the specific page
-fileUsageRgx Regex for file usage
-redirectRgx Regex to identify redirects
-namespaces List of namespaces the bot is allowed to work on
-whitelist_editors List of editors allowed to edit the replacementsPage
"""
replacementsPage = 'ויקיפדיה:בוט/בוט החלפות/רשימת החלפות נוכחית'
defaultSummary = '[[וק:הח|בוט החלפות]]: '
safeTemplates = ['ציטוט', 'ציטוטון', 'חלונית', 'מסגרת', 'הדגשה', 'קמץ קטן']
safeTemplatesCategories = ['תבניות קישורים חיצוניים', 'תבניות ציטוט']
nobotRgx = "\{\{ללא בוט\|([0-9]+)\}\}"
fileUsageRgx = '\[\[(File|Image|תמונה|קובץ)\s*:\s*.*?[\|\]]'
redirectRgx = '#\s*(הפניה|REDIRECT)\s*\[\['
namespaces = [0, 10, 14, 100]
whitelist_editors = ['ערן', 'Matanya']

0 comments on commit 5922972

Please sign in to comment.