-
Notifications
You must be signed in to change notification settings - Fork 0
/
eeclassify.py
94 lines (94 loc) · 4.2 KB
/
eeclassify.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import csv
import argparse
from dateutil import parser as dateparser
from collections import Counter,defaultdict
# Requires unidecode from PyPI
import unidecode
# Command-line interface: two required positional arguments, each the path of
# a CSV file produced by an earlier script (eeprocess.py, then eeanalyze.py).
parser = argparse.ArgumentParser()
for _dest, _help in (
    ("eeprocessed", "the .csv output file (from eeprocess.py) to read"),
    ("eeanalysis", "the .csv output file (from eeanalyze.py) to read"),
):
    parser.add_argument(_dest, help=_help)
args = parser.parse_args()
# main and talk map article names to the timestamps of their first main and
# talk edits respectively (columns 1 and 2 of the eeprocess CSV, keyed by
# column 0).
main = {}
talk = {}
# reasons and comments map article names to the reason and comment reported
# by eeanalyze (columns 1 and 2 of the eeanalyze CSV, keyed by column 0).
reasons = {}
comments = {}

# Read in CSVs. The files are opened with newline="" as the csv module
# requires, so that newlines embedded in quoted fields are interpreted by the
# reader rather than by the file object.
with open(args.eeprocessed, newline="") as fin:
    reader = csv.reader(fin)
    # Skip the header; the default stops an empty file raising StopIteration.
    next(reader, None)
    # Read in main and talk dicts
    for row in reader:
        main[row[0]] = row[1]
        talk[row[0]] = row[2]

with open(args.eeanalysis, newline="") as fin:
    reader = csv.reader(fin)
    # Skip the header; the default stops an empty file raising StopIteration.
    next(reader, None)
    # Read in reasons, filtering out revision deletion based on the comment
    # field (I'm sure there's a better way, but the log_deleted field in the
    # db which determines if a deletion is page or revision doesn't properly
    # exist for all deletes or mwxml doesn't see it in all cases)
    for row in reader:
        if "rd1" not in row[2].lower():
            reasons[row[0]] = row[1]
            comments[row[0]] = row[2]

print(
    "Read {} main, {} talk, and {} articles that were automatically analyzed "
    "(not counting revision deletions; they are false positives for my "
    "purposes).".format(len(main), len(talk), len(reasons))
)
# Fix misclassified copyvios: when the comment mentions a copyright marker
# but the reason does not already contain "copyright", wrap the existing
# reason as "copyright (<old reason>)".
# NOTE(review): "g12" and "a8" are presumably deletion-criterion codes; they
# are matched as plain substrings of the lower-cased comment -- confirm that
# this cannot hit unrelated text.
_COPYVIO_MARKERS = ("copyright", "copyvio", "g12", "a8")
for article, reason in reasons.items():
    lowered = comments[article].lower()
    if "copyright" in reason:
        continue
    if any(marker in lowered for marker in _COPYVIO_MARKERS):
        # Only the value of an existing key changes, so mutating the dict
        # while iterating over it is safe.
        reasons[article] = "copyright (" + reason + ")"
# Classify articles affected by the Great Oops (15:52, 25 February 2002 UTC)
# and UseMod keep pages: an article qualifies when BOTH its first main edit
# and its first talk edit are at or before the cutoff.
# NOTE(review): datetime.timestamp() treats a naive datetime as local time;
# confirm the CSV timestamps carry a UTC offset, otherwise the comparison
# shifts with the machine's timezone.
_GREAT_OOPS_EPOCH = 1014652320  # the cutoff above, as Unix seconds
for title, first_main in main.items():
    if dateparser.parse(first_main).timestamp() > _GREAT_OOPS_EPOCH:
        continue
    if dateparser.parse(talk[title]).timestamp() > _GREAT_OOPS_EPOCH:
        continue
    reasons[title] = "great oops"
# Every "great oops" entry gets an empty comment, replacing any comment that
# was read from the analysis file.
for title, reason in reasons.items():
    if reason == "great oops":
        comments[title] = ""
# Find split histories (pages with identical names except caps and
# diacritics). Titles are folded with unidecode + lower(); any folded title
# shared by more than one article marks a split group.
acounter = Counter(unidecode.unidecode(a).lower() for a in main)
# A set rather than a list, so the membership tests on splitkeys are O(1)
# instead of a linear scan per article.
splitkeys = {k for k, v in acounter.items() if v > 1}
# splithist maps a folded title to a list of (first-main-edit epoch, article)
# pairs. A list of pairs is used instead of a dict keyed by timestamp so that
# two variants first edited in the same second cannot overwrite each other
# (which would leave a single entry and make the lookup of the second
# variant fail with IndexError).
splithist = defaultdict(list)
for a, ts in main.items():
    k = unidecode.unidecode(a).lower()
    if k in splitkeys:
        splithist[k].append((dateparser.parse(ts).timestamp(), a))
for variants in splithist.values():
    # Sort by timestamp (ties broken by article name). Every group has at
    # least two members because its folded title was counted more than once.
    variants.sort()
    (_, earliest), (_, second) = variants[0], variants[1]
    # The earliest variant is recorded as split from the next-earliest one.
    reasons[earliest] = "split from " + second
    comments[earliest] = ""
# Add unknowns: any article present in main that still has no reason is
# labelled "unknown"; existing reasons are left untouched.
for title in main:
    reasons.setdefault(title, "unknown")
# Every "unknown" entry gets an empty comment.
for title, reason in reasons.items():
    if reason == "unknown":
        comments[title] = ""
# Write eefinal.csv: one row per classified article, sorted by article name.
print("Writing eefinal.csv...")
# newline="" is required by the csv module for files handed to csv.writer;
# without it, platforms that translate "\n" on write emit "\r\r\n" row
# endings (an apparent blank line after every row).
with open("eefinal.csv", "w", newline="") as cam:
    writer = csv.writer(cam)
    writer.writerow(("article", "first main", "first talk", "reason", "comment"))
    for a in sorted(reasons):
        # Skip "unknown" articles whose folded title belongs to a split
        # history group.
        if reasons[a] == "unknown" and unidecode.unidecode(a).lower() in splitkeys:
            continue
        # NOTE(review): main[a] / talk[a] raise KeyError if the analysis file
        # names an article absent from the processed file -- confirm that
        # cannot happen upstream.
        writer.writerow((a, main[a], talk[a], reasons[a], comments[a]))
print("CSV written. Generating stats...")
# Tally the final reasons by category.
copyvios = 0
copymoves = 0
talkmoves = 0
histsplits = 0
oopses = 0
unknowns = 0
# Local set so the per-article membership test is O(1) whatever container
# splitkeys is.
_split_titles = set(splitkeys)
for a, r in reasons.items():
    if r == "copyright":
        copyvios += 1
    elif r.startswith("copyright ("):
        copymoves += 1
    elif r.startswith("move from"):
        talkmoves += 1
    elif r.startswith("split from"):
        histsplits += 1
    elif r == "great oops":
        oopses += 1
    elif r == "unknown":
        # Count only the unknowns that were actually written to eefinal.csv:
        # the writer skips unknown articles that belong to a split-history
        # group. Subtracting histsplits from a raw unknown count is only an
        # approximation: it is off whenever a group has more than two
        # variants, or its later variant already has a reason.
        if unidecode.unidecode(a).lower() not in _split_titles:
            unknowns += 1
print(str(copyvios) + " articles were copyright violations.")
print(str(copymoves) + " articles were copyright violations, but a new page was moved over the violating material.")
print(str(talkmoves) + " articles were likely moved by cut and paste, while their talk pages were moved properly.")
print(str(histsplits) + " articles have split history, with differences in capitalization or diacritics in the title.")
print(str(oopses) + " articles were affected by the Great Oops or UseMod keep pages.")
print(str(unknowns) + " articles could not be automatically analyzed.")
print("Done!")