In [62]:
import numpy as np
import pandas
import re
from bokeh.plotting import figure
from bokeh.io import output_notebook,output_file, show,install_notebook_hook
from bokeh.models import ColumnDataSource, Select,LabelSet, HoverTool, DatetimeAxis, TapTool, CustomJS, BoxZoomTool, PanTool
from bokeh.models import WheelZoomTool, UndoTool, RedoTool, ResetTool, ZoomInTool, ZoomOutTool, Axis, Text, Circle, MultiLine
import nltk
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans,DBSCAN
import pprint
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bokeh.models.widgets import DataTable, TableColumn, Div, RangeSlider, Slider, Select
from bokeh.layouts import Row,widgetbox, Column
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.palettes import Category10
import math
from collections import Counter
import networkx as nx
from bokeh.models.graphs import from_networkx
import pprint
# Node colours used by the network plots in this notebook:
# red for the 'negative' faction, green for the 'positive' faction,
# grey for users that have no faction.
red = "#C91E17"
green = "#17C957"
grey = "#877C77"

In [10]:
def parseEditLine(line, editId):
    """Turn one line of the saved page history into a record.

    The line is split on parentheses; the timestamp and user name, the
    minor-edit marker, the page length and the comment are read from fixed
    token positions.

    Parameters
    ----------
    line : str
        One raw history line.
    editId : int
        Sequential ID to store with the record.

    Returns
    -------
    list
        [ID, timestamp, user, minorEdit, pageLength, comment, entireEdit]

    Raises
    ------
    IndexError
        If the line has fewer parenthesised parts than expected.
    """
    tokens = re.split(r'\(|\)', line)
    timestampAndName = tokens[4].split(' ')
    name = timestampAndName[-2]
    timestamp = pandas.to_datetime(" ".join(timestampAndName[0:-2]))
    minorEdit = tokens[6] == ' m '
    pageLength = tokens[7].split(' ')[0].replace(',', '')
    if pageLength.isdigit():
        pageLength = int(pageLength)
        comment = tokens[9] if len(tokens) >= 10 else ""
    else:
        # No byte count on this line: the token at this position is the comment.
        pageLength = -1
        comment = tokens[7]
    return [editId, timestamp, name, minorEdit, pageLength, comment, line]


# Read the history file; the context manager closes the handle again.
with open('data/Paraiso Edits.txt', 'r', errors='ignore') as file:
    lines = file.readlines()

# Skip the first three lines of the file; they are not parsed as edits.
lines = lines[3:]

# Build all records first and create the frame once instead of growing it
# row by row. IDs start at 1 and follow the order of the file.
records = [parseEditLine(line, editId) for editId, line in enumerate(lines, start=1)]
data = pandas.DataFrame(
    records,
    columns=['ID', 'timestamp', 'user', 'minorEdit', 'pageLength', 'comment', 'entireEdit'],
)
        


In [11]:
# Order the edits chronologically. Timestamps only have minute resolution, so
# ties are broken by ID in descending order: IDs are assigned in file order and
# the file lists newer edits first, so the higher ID is the earlier edit.
data = data.sort_values(['timestamp', 'ID'], ascending=[True, False])

# Walk the edits once, remembering the ID of the preceding edit and the last
# known page length. Edits without a page length (-1) get a diff of 0 and do
# not update the running page length.
previousEdits = []
editDiffs = []
previousPageLength = 0
previousEdit = -1
for editId, length in zip(data['ID'], data['pageLength']):
    previousEdits.append(previousEdit)
    previousEdit = editId
    if length == -1:
        editDiffs.append(0)
    else:
        editDiffs.append(length - previousPageLength)
        previousPageLength = length

# Assign both columns in one step instead of one cell at a time.
data['previousEdit'] = previousEdits
data['editDiff'] = editDiffs

In [12]:
# Absolute size of every edit. `Series.iteritems()` no longer exists in
# pandas 2.x, so the values are taken directly from the column.
editLengths = data['editDiff'].abs().tolist()

# How often each edit size occurs: index = size in bytes, value = count.
counter = pandas.Series(Counter(editLengths))
top = list(counter.values)
left = counter.index

p1 = figure(title="Edit Size Distribution")
p1.circle(y=top, x=left)
p1.xaxis[0].formatter.use_scientific = False
p1.xaxis.axis_label = 'Bytes'
p1.yaxis.axis_label = 'Count'
output_notebook()
show(p1)

We can see that there is a gap in the size distribution. Therefore, we will pick all edits larger than 60000 bytes for our first network.

In [13]:
# Edits whose absolute size change exceeds 60000 bytes.
bigEdits = data[data['editDiff'].abs() > 60000]
bigEdits

Unnamed: 0,ID,timestamp,user,minorEdit,pageLength,comment,entireEdit,previousEdit,editDiff
1008,1009,2006-08-11 16:42:00,Gerardmd,False,99907,Clean up refs a little,"# (cur) (last) 16:42, 11 August 2006 Gerardmd ...",-1.0,99907.0
916,917,2006-08-25 17:04:00,Alejandrosanchez,True,20,?Replaced page with 'Paraiso is BULL',"# (cur) (last) 17:04, 25 August 2006 Alejandro...",918.0,-101047.0
915,916,2006-08-25 17:04:00,BakBOT,False,101067,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 17:04, 25 August 2006 BakBOT (T...",917.0,101047.0
875,876,2006-08-27 20:16:00,201.226.51.x,False,68,?Replaced page with 'Paraiso believes that hum...,"# (cur) (last) 20:16, 27 August 2006 201.226.5...",877.0,-100174.0
874,875,2006-08-27 20:17:00,BakBOT,False,100242,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 20:17, 27 August 2006 BakBOT (T...",876.0,100174.0
846,847,2006-08-28 21:20:00,Honoratas,False,69,?Replaced page with 'Paraiso: A made up religi...,"# (cur) (last) 21:20, 28 August 2006 Honoratas...",848.0,-93138.0
845,846,2006-08-28 21:21:00,Alano,True,93207,Reverted edits by Honoratas,"# (cur) (last) 21:21, 28 August 2006 Alano (Ta...",847.0,93138.0
844,845,2006-08-28 21:25:00,Honoratas,False,103,?Replaced page with '{{ParaisoSeries}} '''Para...,"# (cur) (last) 21:25, 28 August 2006 Honoratas...",846.0,-93104.0
842,843,2006-08-28 21:37:00,Ginebra,False,93207,,"# (cur) (last) 21:37, 28 August 2006 Ginebra (...",844.0,93088.0
830,831,2006-08-30 02:56:00,71.59.210.x,False,17,?Replaced page with 'DEFINITION : CULT',"# (cur) (last) 02:56, 30 August 2006 71.59.210...",832.0,-93189.0


I remove the first row, the edit by user **Gerardmd** since it is the initial edit and therefore is not a case of vandalism or reverting vandalism.

In [14]:
# Drop the initial edit, i.e. the one without a preceding edit
# (previousEdit == -1). Filtering on the value instead of slicing off the
# first row keeps this cell idempotent: re-running it removes nothing more.
bigEdits = bigEdits[bigEdits['previousEdit'] != -1]
bigEdits

Unnamed: 0,ID,timestamp,user,minorEdit,pageLength,comment,entireEdit,previousEdit,editDiff
916,917,2006-08-25 17:04:00,Alejandrosanchez,True,20,?Replaced page with 'Paraiso is BULL',"# (cur) (last) 17:04, 25 August 2006 Alejandro...",918.0,-101047.0
915,916,2006-08-25 17:04:00,BakBOT,False,101067,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 17:04, 25 August 2006 BakBOT (T...",917.0,101047.0
875,876,2006-08-27 20:16:00,201.226.51.x,False,68,?Replaced page with 'Paraiso believes that hum...,"# (cur) (last) 20:16, 27 August 2006 201.226.5...",877.0,-100174.0
874,875,2006-08-27 20:17:00,BakBOT,False,100242,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 20:17, 27 August 2006 BakBOT (T...",876.0,100174.0
846,847,2006-08-28 21:20:00,Honoratas,False,69,?Replaced page with 'Paraiso: A made up religi...,"# (cur) (last) 21:20, 28 August 2006 Honoratas...",848.0,-93138.0
845,846,2006-08-28 21:21:00,Alano,True,93207,Reverted edits by Honoratas,"# (cur) (last) 21:21, 28 August 2006 Alano (Ta...",847.0,93138.0
844,845,2006-08-28 21:25:00,Honoratas,False,103,?Replaced page with '{{ParaisoSeries}} '''Para...,"# (cur) (last) 21:25, 28 August 2006 Honoratas...",846.0,-93104.0
842,843,2006-08-28 21:37:00,Ginebra,False,93207,,"# (cur) (last) 21:37, 28 August 2006 Ginebra (...",844.0,93088.0
830,831,2006-08-30 02:56:00,71.59.210.x,False,17,?Replaced page with 'DEFINITION : CULT',"# (cur) (last) 02:56, 30 August 2006 71.59.210...",832.0,-93189.0
829,830,2006-08-30 02:57:00,BakBOT,False,93206,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 02:57, 30 August 2006 BakBOT (T...",831.0,93189.0


We will add a `faction` column to the bigEdits data frame, set to positive or negative depending on the sign of editDiff.

In [15]:
# Leave out the bot's edits and take an explicit copy, so that adding a column
# below modifies this frame only (replaces the removed `is_copy` attribute).
bigEdits = bigEdits[bigEdits['user'] != 'BakBOT'].copy()

# Edits that shrink the page are 'negative', all others 'positive'.
bigEdits['faction'] = np.where(bigEdits['editDiff'] < 0, 'negative', 'positive')

bigEdits

Unnamed: 0,ID,timestamp,user,minorEdit,pageLength,comment,entireEdit,previousEdit,editDiff,faction
916,917,2006-08-25 17:04:00,Alejandrosanchez,True,20,?Replaced page with 'Paraiso is BULL',"# (cur) (last) 17:04, 25 August 2006 Alejandro...",918.0,-101047.0,negative
875,876,2006-08-27 20:16:00,201.226.51.x,False,68,?Replaced page with 'Paraiso believes that hum...,"# (cur) (last) 20:16, 27 August 2006 201.226.5...",877.0,-100174.0,negative
846,847,2006-08-28 21:20:00,Honoratas,False,69,?Replaced page with 'Paraiso: A made up religi...,"# (cur) (last) 21:20, 28 August 2006 Honoratas...",848.0,-93138.0,negative
845,846,2006-08-28 21:21:00,Alano,True,93207,Reverted edits by Honoratas,"# (cur) (last) 21:21, 28 August 2006 Alano (Ta...",847.0,93138.0,positive
844,845,2006-08-28 21:25:00,Honoratas,False,103,?Replaced page with '{{ParaisoSeries}} '''Para...,"# (cur) (last) 21:25, 28 August 2006 Honoratas...",846.0,-93104.0,negative
842,843,2006-08-28 21:37:00,Ginebra,False,93207,,"# (cur) (last) 21:37, 28 August 2006 Ginebra (...",844.0,93088.0,positive
830,831,2006-08-30 02:56:00,71.59.210.x,False,17,?Replaced page with 'DEFINITION : CULT',"# (cur) (last) 02:56, 30 August 2006 71.59.210...",832.0,-93189.0,negative
807,808,2006-08-31 06:23:00,204.52.215.x,False,209,?Replaced page with '{{ParaisoSeries}}'''Parai...,"# (cur) (last) 06:23, 31 August 2006 204.52.21...",809.0,-91435.0,negative
802,803,2006-08-31 12:59:00,75.179.21.x,False,38,?Replaced page with 'is stupid you idiots',"# (cur) (last) 12:59, 31 August 2006 75.179.21...",802.0,-91606.0,negative
799,800,2006-08-31 13:01:00,Hispa,True,91644,Reverted 1 edit by 75.179.21.x identified as v...,"# (cur) (last) 13:01, 31 August 2006 Hispa (Ta...",801.0,91606.0,positive


In [331]:
users = set(bigEdits['user'])

# Colour and size are kept per node name and turned into lists in node order
# at the end, so they always line up with the nodes of the graph.
nodeColor = {'negative': red, 'positive': green}
nodeSize = {'negative': 20, 'positive': 20}

graph = nx.Graph()
graph.add_node('negative')
graph.add_node('positive')

# Attach every user to the hub of the faction of their first big edit.
for user in users:
    graph.add_node(user)
    userRows = bigEdits.loc[bigEdits['user'] == user]
    faction = list(userRows['faction'])[0]
    hub = 'negative' if faction == 'negative' else 'positive'
    nodeColor[user] = red if hub == 'negative' else green
    nodeSize[user] = 10
    graph.add_edge(user, hub, weight=1)

graph.add_edge('negative', 'positive', weight=-500)

# Link the author of each positive edit to the author of the big edit before it.
# The very first row has no predecessor, so no edge (and no placeholder node)
# is created for it.
prevUser = None
for _, bigEdit in bigEdits.iterrows():
    if bigEdit['faction'] == 'positive' and prevUser is not None:
        graph.add_edge(bigEdit['user'], prevUser, weight=-10)
    prevUser = bigEdit['user']

nodeOrder = list(graph.nodes())
fillColor = [nodeColor.get(node, grey) for node in nodeOrder]
size = [nodeSize.get(node, 10) for node in nodeOrder]

hover = HoverTool(tooltips = [("User", "@index")])
plot = figure(title="Wiki Edits Network", x_range=(-5,5), y_range=(-5,5),tools=[hover])
plotGraph = from_networkx(graph, nx.spring_layout, scale=5, center=(0,0))
# NOTE(review): `add()` registers the new columns with the data source; writing
# straight into `.data[...]` appears to be what the E-1001 (BAD_COLUMN_NAME)
# validation error reported for this cell was about - confirm on the target
# Bokeh version.
nodeSource = plotGraph.node_renderer.data_source
nodeSource.add(fillColor, 'fill_color')
nodeSource.add(size, 'size')
plotGraph.node_renderer.glyph = Circle(size='size',fill_color='fill_color',line_color='fill_color')
plot.renderers.append(plotGraph)
show(plot)

ERROR:C:\ProgramData\Anaconda3\lib\site-packages\bokeh\core\validation\check.py:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color, size [renderer: GlyphRenderer(id='e62553ca-4f83-4474-80ae-320b1582f9b5', ...)]


Get all reverts, undos and user mentions:

In [332]:
# Distinct user names in a fixed (sorted) order. The iteration order of a set
# of strings can differ between kernel sessions, and later cells stop at the
# first user name that matches a comment, so a stable order keeps their
# results reproducible.
usernames = sorted(set(data['user']))
len(usernames)

387

In [333]:
# Edits whose absolute size change is at most 60000 bytes.
smallEdits = data[data['editDiff'].abs() <= 60000]

# Empty frames sharing the columns of `data`, one per conflict group.
conflictRows, conflictRowsUserMentions, conflictRowsNoUserMentions = (
    pandas.DataFrame(columns=list(data.columns)) for _ in range(3)
)
def findWholeWord(w):
    """Return a search function that finds `w` as a whole word, ignoring case.

    `w` is taken literally: it is escaped before being placed in the pattern,
    so names such as '71.59.210.x' only match themselves and characters like
    '(' or '+' cannot break the pattern. "Whole word" means that `w` is neither
    preceded nor followed by a word character, which also works when `w`
    itself starts or ends with punctuation.

    Parameters
    ----------
    w : str
        The word or name to look for.

    Returns
    -------
    callable
        The bound `search` method of the compiled pattern; calling it with a
        string returns a match object (group 1 is the matched text) or None.
    """
    pattern = r'(?<!\w)({0})(?!\w)'.format(re.escape(w))
    return re.compile(pattern, flags=re.IGNORECASE).search


# Compile every search once instead of once per row and user.
userSearches = [findWholeWord(user) for user in usernames]
revertSearches = [findWholeWord(word) for word in ("rv", "rv.", "undid")]

# True where the comment mentions at least one user name as a whole word.
mentionsUser = smallEdits['comment'].apply(
    lambda comment: any(search(comment) for search in userSearches)
).astype(bool)
# True where the comment contains one of the revert/undo keywords.
looksLikeRevert = smallEdits['comment'].apply(
    lambda comment: any(search(comment) for search in revertSearches)
).astype(bool)

# Select whole rows with boolean masks (no `iteritems`, no cell-by-cell
# copying). A row that mentions a user always goes to the "user mentions"
# group; the keyword check only applies to the remaining rows.
conflictRowsUserMentions = smallEdits[mentionsUser].reset_index(drop=True)
conflictRowsNoUserMentions = smallEdits[~mentionsUser & looksLikeRevert].reset_index(drop=True)
conflictRows = smallEdits[mentionsUser | looksLikeRevert].reset_index(drop=True)

print(len(conflictRowsUserMentions),len(conflictRowsNoUserMentions))


228 22


In [334]:
# Preview the first rows of the collected conflict edits.
conflictRows.head()

Unnamed: 0,ID,timestamp,user,minorEdit,pageLength,comment,entireEdit,previousEdit,editDiff
0,1005,2006-08-12 01:47:00,Gracia,False,99909,rv - vandalism,"# (cur) (last) 01:47, 12 August 2006 Gracia (T...",1006,-53
1,1003,2006-08-14 02:30:00,Savanna,False,99890,Handling DailosTamanca's objection to my earli...,"# (cur) (last) 02:30, 14 August 2006 Savanna (...",1004,-17
2,994,2006-08-14 16:44:00,VictoriaV,True,100699,Reverted to revision xxxxxxxxx by Savanna; pri...,"# (cur) (last) 16:44, 14 August 2006 VictoriaV...",995,-493
3,988,2006-08-14 19:06:00,Sara,False,100700,rv. pp. 112 - 130,"# (cur) (last) 19:06, 14 August 2006 Sara (Tal...",989,141
4,986,2006-08-14 19:53:00,Agustin,True,100700,Undid revision xxxxxxxxx by KesheR,"# (cur) (last) 19:53, 14 August 2006 Agustin (...",987,-77


In [335]:
def modify_doc(doc):
    """Build the conflict-edit browser inside the Bokeh document `doc`.

    The layout is a group selector above a row holding a table of conflict
    edits (comment and user) and a detail panel. Selecting a table row fills
    the panel with that edit and with the edit that preceded it; tokens of the
    comment that contain a user name are highlighted.

    Reads the notebook globals `data`, `usernames`, `conflictRows`,
    `conflictRowsUserMentions` and `conflictRowsNoUserMentions`.

    Parameters
    ----------
    doc : bokeh Document
        The document the layout is added to.
    """
    import html  # local import: only used to escape text placed into the Div

    def sourceData(frame):
        """Return the columns used by the app as plain lists.

        Lists make `source.data[column][i]` positional, whatever index
        `frame` carries.
        """
        return dict(
            ID=frame['ID'].tolist(),
            comment=frame['comment'].tolist(),
            user=frame['user'].tolist(),
            pageLength=frame['pageLength'].tolist(),
            editDiff=frame['editDiff'].tolist(),
            previousEdit=frame['previousEdit'].tolist(),
        )

    def detailLine(label, value):
        """Format one '<b>label:</b>value' paragraph of the detail panel."""
        return "<p style='padding-left:100px;'><b>" + label + ":</b>" + str(value) + "</p>"

    def update(attr, old, new):
        """Selection callback: show the selected edit and its predecessor."""
        indices = source.selected['1d']['indices']
        if len(indices) == 0:
            # Selection was cleared; there is nothing to show.
            return
        selected = indices[0]
        comment = source.data['comment'][selected]
        ID = source.data['ID'][selected]
        selectedUser = source.data['user'][selected]
        pageLength = source.data['pageLength'][selected]
        editDiff = source.data['editDiff'][selected]
        previousEditIndex = source.data['previousEdit'][selected]

        previousEdit = data.loc[data['ID'] == previousEditIndex]
        if len(previousEdit) == 1:
            previousComment = previousEdit['comment'].item()
            previousPageLength = previousEdit['pageLength'].item()
            previousEditDiff = previousEdit['editDiff'].item()
        else:
            # No preceding edit found (e.g. previousEdit is -1): leave blank.
            previousComment = previousPageLength = previousEditDiff = ""

        # Comments are free text from the edit history, so every token is
        # escaped before it is embedded in the panel's HTML.
        editedComment = ""
        for token in comment.split(" "):
            escaped = html.escape(token)
            if any(user in token for user in usernames):
                editedComment += " <span style='background:#FFD6E1'>" + escaped + "</span>"
            else:
                editedComment += " " + escaped

        div.text = (
            detailLine("ID", ID)
            + detailLine("User", html.escape(str(selectedUser)))
            + detailLine("Comment", editedComment)
            + detailLine("Page Length", pageLength)
            + detailLine("Edit Diff", editDiff)
            + detailLine("Previous comment", html.escape(str(previousComment)))
            + detailLine("Previous Page Length", previousPageLength)
            + detailLine("Previous Edit Diff", previousEditDiff)
        )

    def updateTable(attr, old, new):
        """Group selector callback: load the chosen group into the table."""
        if groupSelect.value == 'All':
            frame = conflictRows
        elif groupSelect.value == 'User Mentions':
            frame = conflictRowsUserMentions
        else:
            frame = conflictRowsNoUserMentions
        source.data = sourceData(frame)
        # The table is rebuilt after the data has been swapped.
        layout.children[1].children[0] = create_table()

    source = ColumnDataSource(data=sourceData(conflictRows))
    source.on_change('selected', update)

    groupSelect = Select(title="Select Group:", value="All", options=["All", "User Mentions", "No User Mentions"])
    groupSelect.on_change('value', updateTable)

    def create_table():
        """Create the comment/user table bound to `source`."""
        columns = [
            TableColumn(field="comment", title="Comment"),
            TableColumn(field="user", title="User"),
        ]
        return DataTable(source=source, columns=columns, selectable=True, width= 500)

    div = Div(text="", width=400, height=100)
    table = create_table()
    layout = Column(groupSelect, Row(table, div))
    doc.add_root(layout)

# Wrap the document builder in a Bokeh application and show it in the notebook.
handler = FunctionHandler(modify_doc)
app = Application(handler)
output_notebook()
show(app, notebook_url='localhost:8888')


We ignore the following comments: in most cases the edit was reverted twice, so the second revert cancels the first, and in the last case we are not able to find the edit that the comment refers to.

In [336]:
# Show the full history lines of the edits that are left out below.
ignoredComments = data[data['ID'].isin([785,784,530,534,488,489,336,260,80])]
for edit in ignoredComments['entireEdit'].tolist():
    print(edit)


# (cur) (last) 01:26, 1 September 2006 Sara (Talk | contribs) (92,102 bytes) (rv. the new source doesn't even mention Paraiso and rather contradicts the claim)

# (cur) (last) 01:36, 1 September 2006 Sara (Talk | contribs) (92,025 bytes) (rv myself. dammit it does in a table ):)

# (cur) (last) 05:55, 16 September 2006 Sara (Talk | contribs) (97,261 bytes) (rv Sofia's last edit wich he labeled with "Pope statement" but deleted conclusions from the German Government including a source)

# (cur) (last) 09:01, 16 September 2006 Sara (Talk | contribs) (97,555 bytes) (rv myself, didn't see that this was part of the lead. sorry)

# (cur) (last) 02:28, 21 September 2006 Sara (Talk | contribs) (97,869 bytes) (Undid revision xxxxxxxxx by RyogaNica (talk) don't mark edits as minor if you delete source)

# (cur) (last) 02:40, 21 September 2006 Sara (Talk | contribs) (97,692 bytes) (rv. myself I see now, you doubled it before... ok ): , not my day)

# (cur) (last) 11:24, 26 October 2006 Sara (Talk

In [337]:
# Keep every conflict row whose ID is not in the ignored list.
conflictRows = conflictRows[~conflictRows['ID'].isin([785,784,530,534,488,489,336,260,80])]
len(conflictRows)

241

Extract Features from the conflict rows:

In [338]:
# Word-level TF-IDF over the conflict comments, with English stop words removed.
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    min_df=1,
    max_df=1.0,
)
tfidf_result = vectorizer.fit_transform(conflictRows['comment'].tolist())

def getScores(vectorizer, tfidf_result):
    """Return (feature, total score) pairs, highest total first.

    The total of a feature is the sum of its column in `tfidf_result`.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing `get_feature_names_out()` (current
        scikit-learn) or `get_feature_names()` (older releases).
    tfidf_result : matrix-like
        Document-term matrix whose columns match the vectorizer's features;
        must support `.sum(axis=0)`.

    Returns
    -------
    list of tuple
        `(feature, score)` sorted by descending score; features are plain
        `str` objects.
    """
    # `get_feature_names` was removed from scikit-learn; prefer its successor
    # and fall back for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    allFeatures = [str(name) for name in names]
    totals = np.asarray(tfidf_result.sum(axis=0)).ravel()
    return sorted(zip(allFeatures, totals), key=lambda pair: pair[1], reverse=True)


featureScores = getScores(vectorizer, tfidf_result)

# Keep features that are not made of digits only and whose total exceeds 1.
selectedFeatures = [
    score
    for score in featureScores
    if not re.match("^[0-9]*$", score[0]) and score[1] > 1
]

# Feature names for the selector, followed by the 'All' entry.
featureList = [feature[0] for feature in selectedFeatures]
featureList.append('All')

selectedFeatures

[('revision', 25.568453708277822),
 ('xxxxxxxxx', 24.672251895798293),
 ('undid', 23.862400062033075),
 ('reverted', 18.383691658125684),
 ('vandalism', 13.684601899147584),
 ('edits', 13.154243502006647),
 ('edit', 12.314267541131485),
 ('using', 11.901982947569813),
 ('tw', 11.325941522690446),
 ('identified', 8.7770058972665321),
 ('victoriav', 8.7530420317851636),
 ('rv', 8.1011022668811972),
 ('rm99', 4.8680566475870881),
 ('agustin', 4.2954009657324805),
 ('possible', 4.2691088216823081),
 ('reverting', 4.024733487245963),
 ('pov', 3.8731577413426188),
 ('contributions', 3.8113401918807703),
 ('special', 3.8113401918807703),
 ('amado', 3.0030056744242795),
 ('version', 2.9948042057579771),
 ('socorro', 2.930540827768934),
 ('bakbot', 2.7621260055109973),
 ('ryoganica', 2.6227165377775772),
 ('dailostamanca', 2.6177344834938494),
 ('gustava', 2.5636502859893557),
 ('edemir', 2.2394931682500472),
 ('wp', 2.2353337181221238),
 ('pushing', 1.9843827341997005),
 ('unsourced', 1.948012

In [339]:
def modify_doc2(doc):
    """Build the feature-based conflict-edit browser inside `doc`.

    The layout is a feature selector above a row holding a table of conflict
    edits (comment and user) and a detail panel. Choosing a feature limits the
    table to comments containing it as a whole word. Selecting a table row
    shows the edit in the panel, with user names and features highlighted.

    Reads the notebook globals `usernames`, `featureList`, `conflictRows` and
    `findWholeWord`.

    Parameters
    ----------
    doc : bokeh Document
        The document the layout is added to.
    """
    import html  # local import: only used to escape text placed into the Div

    def sourceData(frame):
        """Return the columns used by the app as plain lists.

        Lists make `source.data[column][i]` positional, whatever index
        `frame` carries (`conflictRows` may have gaps in its index).
        """
        return dict(
            ID=frame['ID'].tolist(),
            comment=frame['comment'].tolist(),
            user=frame['user'].tolist(),
            pageLength=frame['pageLength'].tolist(),
            editDiff=frame['editDiff'].tolist(),
            previousEdit=frame['previousEdit'].tolist(),
        )

    def detailLine(label, value):
        """Format one '<b>label:</b>value' paragraph of the detail panel."""
        return "<p style='padding-left:100px;'><b>" + label + ":</b>" + str(value) + "</p>"

    def highlight(token):
        """Return the escaped token, wrapped in a coloured span if it matches.

        User names win over features; the currently selected feature gets its
        own colour.
        """
        escaped = html.escape(token)
        if any(user in token for user in usernames):
            return "<span style='background:#FFD6E1'>" + escaped + "</span>"
        loweredToken = token.lower()
        for feature in featureList:
            if feature in loweredToken:
                if feature == featureSelect.value:
                    return "<span style='background:#9EFFB7;'>" + escaped + "</span>"
                return "<span style='background:#B0E8FF'>" + escaped + "</span>"
        return escaped

    def update(attr, old, new):
        """Selection callback: show the selected edit in the detail panel."""
        indices = source.selected['1d']['indices']
        if len(indices) == 0:
            # Selection was cleared; there is nothing to show.
            return
        selected = indices[0]
        comment = source.data['comment'][selected]
        ID = source.data['ID'][selected]
        selectedUser = source.data['user'][selected]
        pageLength = source.data['pageLength'][selected]
        editDiff = source.data['editDiff'][selected]

        editedComment = "".join(" " + highlight(token) for token in comment.split(" "))

        div.text = (
            detailLine("ID", ID)
            + detailLine("User", html.escape(str(selectedUser)))
            + detailLine("Comment", editedComment)
            + detailLine("Page Length", pageLength)
            + detailLine("Edit Diff", editDiff)
        )

    def updateTable(attr, old, new):
        """Feature selector callback: load the matching rows into the table."""
        if featureSelect.value == 'All':
            frame = conflictRows
        else:
            # Select whole rows with a boolean mask instead of copying cells.
            search = findWholeWord(featureSelect.value)
            containsFeature = conflictRows['comment'].apply(
                lambda comment: bool(search(comment))
            ).astype(bool)
            frame = conflictRows[containsFeature]
        source.data = sourceData(frame)
        # The table is rebuilt after the data has been swapped.
        layout.children[1].children[0] = create_table()

    source = ColumnDataSource(data=sourceData(conflictRows))
    source.on_change('selected', update)

    featureSelect = Select(title="Select Feature:", value="All", options=featureList)
    featureSelect.on_change('value', updateTable)

    def create_table():
        """Create the comment/user table bound to `source`."""
        columns = [
            TableColumn(field="comment", title="Comment"),
            TableColumn(field="user", title="User"),
        ]
        return DataTable(source=source, columns=columns, selectable=True, width= 500)

    div = Div(text="", width=400, height=100)
    table = create_table()
    layout = Column(featureSelect, Row(table, div))
    doc.add_root(layout)

# Wrap the document builder in a Bokeh application and show it in the notebook.
handler2 = FunctionHandler(modify_doc2)
app2 = Application(handler2)
output_notebook()
show(app2, notebook_url='localhost:8888')

### Weight Calculation
#### Word : Undid

In [340]:
# Conflict rows whose comment contains 'undid' or 'undo' as a whole word.
# Rows are selected with a boolean mask: `Series.iteritems()` is gone from
# pandas 2.x and copying cell by cell is slow.
undoSearches = [findWholeWord('undid'), findWholeWord('undo')]
isUndo = conflictRows['comment'].apply(
    lambda comment: any(search(comment) for search in undoSearches)
).astype(bool)
undidRows = conflictRows[isUndo].reset_index(drop=True)

len(undidRows)

            

85

In [341]:
# Print the 'undid' comments in which more than one token contains a user name.
for _, undidRow in undidRows.iterrows():
    mentionCount = sum(
        1
        for token in undidRow['comment'].split(" ")
        if any(user in token for user in usernames)
    )
    if mentionCount > 1:
        print(undidRow['comment'])

Undid revision xxxxxxxxx by VictoriaV reverted link butchered by VictoriaV


In [342]:
edges = []
def addEdge(user1,user2, weight):
    """Add `weight` to the edge between two users, creating it if needed.

    Edges live in the module-level list `edges` as `[user, user, weight]`
    and are undirected: (user1, user2) and (user2, user1) are the same edge.
    """
    bothDirections = ((user1, user2), (user2, user1))
    matched = False
    for existing in edges:
        if (existing[0], existing[1]) in bothDirections:
            existing[2] += weight
            matched = True
    if not matched:
        edges.append([user1, user2, weight])
        

In [343]:
# Start from an empty edge list so that re-running this cell does not add the
# same weights a second time.
del edges[:]

# One whole-word search per user, compiled once. Whole-word matching keeps a
# short name from matching inside a longer one and agrees with how user
# mentions were detected when the conflict rows were collected.
userSearches = [(user, findWholeWord(user)) for user in usernames]
povSearch = findWholeWord('pov')


def firstMentionedUser(comment):
    """Return the first user named in `comment` (tokens left to right), or None."""
    for token in comment.split(" "):
        for user, search in userSearches:
            if search(token):
                return user
    return None


for _, undidRow in undidRows.iterrows():
    comment = undidRow['comment']
    target = firstMentionedUser(comment)
    if target is None:
        # No user named in the comment: the undo is taken to be aimed at the
        # author of the preceding edit.
        previousEdit = data.loc[data['ID'] == undidRow['previousEdit']]
        if len(previousEdit) != 1:
            # No preceding edit to attribute the undo to.
            continue
        target = previousEdit['user'].item()
    # An undo weighs -7; one more point is taken off when the comment mentions 'pov'.
    weight = -8 if povSearch(comment) else -7
    addEdge(undidRow['user'], target, weight)

edges

[['Agustin', 'KesheR', -7],
 ['Sara', 'VictoriaV', -7],
 ['VictoriaV', 'Agustin', -50],
 ['Savanna', 'Agustin', -21],
 ['DailosTamanca', 'VictoriaV', -14],
 ['Savanna', 'DailosTamanca', -7],
 ['80.175.28.x', '124.186.233.x', -7],
 ['Salvadora', 'o10101100', -7],
 ['Curcio', 'o10101100', -7],
 ['Ricarda', '86.135.211.x', -7],
 ['Agustin', '65.91.29.x', -7],
 ['Jfreyre', '65.95.124.x', -7],
 ['Sara', '86.16.191.x', -7],
 ['Curcio', '70.178.167.x', -7],
 ['Sara', '88.91.134.x', -7],
 ['Sara', '68.101.65.x', -7],
 ['79.65.65.x', '152.10.189.x', -7],
 ['Ricarda', '69.74.29.x', -7],
 ['Kurrop', 'Alejo', -14],
 ['Eddiejr00', 'Hortencia', -7],
 ['Sara', '81.156.43.x', -7],
 ['DailosTamanca', 'Ria', -7],
 ['Amado', '90.196.227.x', -7],
 ['Sara', 'Guillebelloc', -7],
 ['Solana', '216.73.77.x', -7],
 ['Edemir', '170.35.208.x', -7],
 ['Alfredo', '82.152.249.x', -14],
 ['DailosTamanca', '82.152.249.x', -7],
 ['AlbertoMaria', 'Ria', -7],
 ['Edemir', '64.4.113.x', -7],
 ['81.132.254.x', 'Gerarda', -7

In [345]:
# Colour and size are kept per node name and turned into lists in node order
# at the end, so they always line up with the nodes of the graph.
nodeColor = {'negative': red, 'positive': green}
nodeSize = {'negative': 20, 'positive': 20}

graph = nx.Graph()
graph.add_node('negative')
graph.add_node('positive')
graph.add_edge('positive', 'negative', weight=-500 )

# Every author of a big edit plus every user that appears in an edge.
users = list(bigEdits['user'])
for edge in edges:
    users.append(edge[0])
    users.append(edge[1])
users = list(set(users))

for user in users:
    graph.add_node(user)
    nodeSize[user] = 10
    userRows = bigEdits.loc[bigEdits['user'] == user]
    if len(userRows):
        # Authors of big edits are tied to the hub of their first big edit's faction.
        faction = list(userRows['faction'])[0]
        hub = 'negative' if faction == 'negative' else 'positive'
        nodeColor[user] = red if hub == 'negative' else green
        graph.add_edge(user, hub, weight=10)
    else:
        nodeColor[user] = grey

# Link the author of each positive edit to the author of the big edit before it.
# The very first row has no predecessor, so no edge (and no placeholder node)
# is created for it.
prevUser = None
for _, bigEdit in bigEdits.iterrows():
    if bigEdit['faction'] == 'positive' and prevUser is not None:
        graph.add_edge(bigEdit['user'], prevUser, weight=-10)
    prevUser = bigEdit['user']

for edge in edges:
    graph.add_edge(edge[0],edge[1],weight = edge[2])

nodeOrder = list(graph.nodes())
fillColor = [nodeColor.get(node, grey) for node in nodeOrder]
size = [nodeSize.get(node, 10) for node in nodeOrder]

hover = HoverTool(tooltips = [("User", "@index")])

plot = figure(title="Wiki Edits Network with 'Undid' edits", x_range=(-5,5), y_range=(-5,5),tools=[hover,TapTool(),
    PanTool(),BoxZoomTool(),ResetTool()])
plotGraph = from_networkx(graph, nx.spring_layout, scale=5, center=(0,0))
# NOTE(review): `add()` registers the new columns with the data source; writing
# straight into `.data[...]` appears to be what the E-1001 (BAD_COLUMN_NAME)
# validation error reported for this cell was about - confirm on the target
# Bokeh version.
nodeSource = plotGraph.node_renderer.data_source
nodeSource.add(fillColor, 'fill_color')
nodeSource.add(size, 'size')
plotGraph.node_renderer.glyph = Circle(size='size',fill_color='fill_color',line_color='fill_color')
plotGraph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=1)
plot.renderers.append(plotGraph)
show(plot)

ERROR:C:\ProgramData\Anaconda3\lib\site-packages\bokeh\core\validation\check.py:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color, size [renderer: GlyphRenderer(id='a1efea7b-dcec-489e-8a8c-3313cf66e7cc', ...)]


#### Word : Reverted/Revert

In [346]:
# Conflict rows whose comment contains one of the revert keywords as a whole
# word. Rows are selected with a boolean mask: `Series.iteritems()` is gone
# from pandas 2.x and copying cell by cell is slow.
revertWordSearches = [
    findWholeWord(word)
    for word in ('reverted', 'revert', 'rv', 'rvt', 'reverting')
]
isRevert = conflictRows['comment'].apply(
    lambda comment: any(search(comment) for search in revertWordSearches)
).astype(bool)
revertRows = conflictRows[isRevert].reset_index(drop=True)

len(revertRows)

143

In [347]:
# Print the revert comments whose first token neither contains 'revert' nor
# starts with 'rv'.
for _, revertRow in revertRows.iterrows():
    firstToken = revertRow['comment'].split(" ")[0].lower()
    if not ('revert' in firstToken or firstToken[0:2] == 'rv'):
        print(revertRow['comment'])

partly reverted RyogaNica. ... not all "critical" sites are personal sites
BOT - Reverted edits by Gregoria {information} to revision #xxxxxxxxx by "AlbertoMaria".
BOT - Reverted edits by 71.136.71.x {vandalism} to revision #xxxxxxxxx by "Arate".
missed some of 66.66.125.x?'s unsourced POV-pushing earlier. Rv back to Agustin's version.
Automatically reverting possible vandalism by 74.212.7.x to last good revision by Sarita
Undid revision xxxxxxxxx by VictoriaV reverted link butchered by VictoriaV
BOT - Reverted edits by 86.41.68.x {possible vandalism} to revision #xxxxxxxxx by "Airunp".
BOT - Reverted edits by McGonagal {possible vandalism} to revision #xxxxxxxxx by "Ismaelsmr".


In [348]:
# Translate every revert comment into weighted edges via addEdge:
#   * a negative edge from the reverting user to the user(s) being reverted
#   * a +2 edge from the reverting user to the user whose version is restored
# Relies on `usernames`, `data`, `findWholeWord` and `addEdge` from earlier cells.

# Compiled once instead of on every token.
wholeNumber = re.compile("^[0-9]+$")


def parseRevertComment(comment):
    """Extract (againstUser, forUser, editsReverted) from a revert comment.

    againstUser   -- known username mentioned as the one being reverted ('' if none)
    forUser       -- known username mentioned after the word 'to' ('' if none)
    editsReverted -- N from an "N edit(s)" phrase after the revert keyword (0 if none)

    Usernames are matched by case-insensitive substring against each
    space-separated token, so punctuation glued to a name does not matter.
    """
    tokens = re.split(" ", comment)
    revertFound = False
    toFound = False
    editsReverted = 0
    forUser = ''
    againstUser = ''
    for index, token in enumerate(tokens):
        lowered = token.lower()
        if(not revertFound):
            # FIX: this used to read `'rv' != lowered[0:2]`, which is true for
            # almost every token, so the very first word was taken as the
            # revert keyword and the branch below was effectively unreachable.
            if('revert' in lowered or 'rv' == lowered[0:2]):
                revertFound = True
            else:
                # A user named before the keyword is the one being reverted;
                # when several match, the last one wins.
                for user in usernames:
                    if(user.lower() in lowered):
                        againstUser = user
        elif('to' == token):
            toFound = True
        elif(wholeNumber.match(token)):
            # Only a number directly followed by "edit..." is a revert count.
            if(index < len(tokens) - 1 and 'edit' in tokens[index+1].lower()):
                editsReverted = int(token)
        else:
            for user in usernames:
                if(user.lower() in lowered):
                    if(toFound and forUser == ''):
                        forUser = user
                    elif(againstUser == ''):
                        againstUser = user
    return againstUser, forUser, editsReverted


def addRevertPenalties(actor, target, comment, multiplier=1):
    """Add the negative edges for `actor` reverting `target`.

    Always adds -7; adds a further -3 / -1 / -1 when the comment contains
    'vandalism' / 'pov' / 'unsourced' as whole words. Every weight is scaled
    by `multiplier` (the number of edits reverted).
    """
    if(findWholeWord('vandalism')(comment)):
        addEdge(actor, target, -3 * multiplier)
    if(findWholeWord('pov')(comment)):
        addEdge(actor, target, -1 * multiplier)
    if(findWholeWord('unsourced')(comment)):
        addEdge(actor, target, -1 * multiplier)
    addEdge(actor, target, -7 * multiplier)


# NOTE(review): the lookups below treat ID + 1 as the edit immediately before
# the current one (IDs appear to grow towards older edits) - confirm against
# the loading cell. `.item()` raises if the looked-up ID is missing from `data`.
for _, revert in revertRows.iterrows():
    comment = revert['comment']
    actor = revert['user']
    againstUser, forUser, editsReverted = parseRevertComment(comment)

    if(againstUser == ''):
        if(forUser == ''):
            # No names in the comment: blame the edit at ID + 1 and, if the page
            # length is back to what it was at ID + 2, credit that author.
            toEdit = data.loc[data['ID'] == (revert['ID'] + 2)]
            removedEdit = data.loc[data['ID'] == (revert['ID'] + 1)]
            addRevertPenalties(actor, removedEdit['user'].item(), comment)
            if(revert['pageLength'] == toEdit['pageLength'].item()):
                addEdge(actor, toEdit['user'].item(), 2)
        else:
            # Only the restored author is named: penalise everyone who edited
            # between this revert and that author's matching-length edit.
            previousCommitsByForUser = data.loc[(data['ID'] > revert['ID']) & (data['user'] == forUser) & (data['pageLength'] == revert['pageLength'])]
            # min() is NaN when nothing matches, which leaves the selection empty.
            previousCommitIndex = previousCommitsByForUser['ID'].min()
            commitsInBetween = data.loc[(data['ID'] > revert['ID']) & (data['ID'] < previousCommitIndex)]
            for user in list(commitsInBetween['user']):
                addRevertPenalties(actor, user, comment)
            addEdge(actor, forUser, 2)
    elif(forUser == ''):
        if(editsReverted != 0):
            # "Reverted N edits by X": the restored version is N edits further back.
            toEdit = data.loc[data['ID'] == (revert['ID'] + 1 + editsReverted)]
            forUser = toEdit['user'].item()
            addRevertPenalties(actor, againstUser, comment, editsReverted)
            addEdge(actor, forUser, 2)
        else:
            toEdit = data.loc[data['ID'] == (revert['ID'] + 2)]
            # A "good faith" revert partly offsets the penalty.
            if('good faith' in comment):
                addEdge(actor, againstUser, 3)
            addRevertPenalties(actor, againstUser, comment)
            if(revert['pageLength'] == toEdit['pageLength'].item()):
                addEdge(actor, toEdit['user'].item(), 2)
    else:
        # Both users are named in the comment itself.
        multiplier = editsReverted if editsReverted != 0 else 1
        addRevertPenalties(actor, againstUser, comment, multiplier)
        addEdge(actor, forUser, 2)


In [349]:
# Rebuild the user network from `bigEdits` and the edges collected so far
# (presumably accumulated by addEdge in the cells above), then draw it.
graph = nx.Graph()

# Appearance is stored per node name and only turned into columns once the
# renderer exists, so the columns always line up with the renderer's node order.
nodeColor = {'negative': red, 'positive': green}
nodeSize = {'negative': 20, 'positive': 20}

# Two anchor nodes, one per faction, joined by a strongly negative edge.
graph.add_node('negative')
graph.add_node('positive')
graph.add_edge('positive', 'negative', weight=-500 )

# Every user that made a big edit or appears at either end of an edge.
users = list(bigEdits['user'])
for edge in edges:
    users.append(edge[0])
    users.append(edge[1])
users = list(set(users))

for user in users:
    graph.add_node(user)
    nodeSize[user] = 10
    userRows = bigEdits.loc[bigEdits['user'] == user]
    if(len(userRows)):
        # Users with a big edit are tied to their faction's anchor node.
        faction = list(userRows['faction'])[0]
        if(faction  == 'negative'):
            nodeColor[user] = red
            graph.add_edge(user, 'negative', weight=10 )
        else:
            nodeColor[user] = green
            graph.add_edge(user, 'positive', weight=10 )
    else:
        nodeColor[user] = grey

# A 'positive' big edit gets a -10 edge to the author of the preceding big edit.
# FIX: the placeholder used to be the string 'none', which add_edge turned into
# a real graph node (with no colour/size entry) whenever the first row was
# 'positive'. The first row has no predecessor, so it is skipped instead.
prevUser = None
for row in bigEdits.iterrows():
    if(row[1]['faction'] == 'positive' and prevUser is not None):
        graph.add_edge(row[1]['user'], prevUser, weight=-10 )
    prevUser = row[1]['user']

# Comment-derived edges go last; add_edge overwrites the weight of an existing pair.
for edge in edges:
    graph.add_edge(edge[0],edge[1],weight = edge[2])

hover = HoverTool(tooltips = [("User", "@index")])

# FIX: the title was copied unchanged from the 'Undid' plot; this figure is
# drawn after the revert edges have been added as well.
plot = figure(title="Wiki Edits Network with 'Undid' and 'Revert' edits", x_range=(-5,5), y_range=(-5,5),tools=[hover,TapTool(),
    PanTool(),BoxZoomTool(),ResetTool()])
plotGraph = from_networkx(graph, nx.spring_layout, scale=5, center=(0,0))

# Build the columns in the order the renderer stores its nodes.
nodeSource = plotGraph.node_renderer.data_source
nodeOrder = list(nodeSource.data['index'])
fillColor = [nodeColor.get(node, grey) for node in nodeOrder]
size = [nodeSize.get(node, 10) for node in nodeOrder]
# FIX: add() registers the columns with the data source; assigning into
# `.data[...]` appears to be what triggered the E-1001 (BAD_COLUMN_NAME)
# message logged for this cell.
nodeSource.add(fillColor, 'fill_color')
nodeSource.add(size, 'size')

plotGraph.node_renderer.glyph = Circle(size='size',fill_color='fill_color',line_color='fill_color')
plotGraph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=1)
plot.renderers.append(plotGraph)
show(plot)

ERROR:C:\ProgramData\Anaconda3\lib\site-packages\bokeh\core\validation\check.py:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color, size [renderer: GlyphRenderer(id='a26db7be-0b51-4bd1-9080-a7ef14cb030e', ...)]


#### Word: Vandalism

In [350]:
# IDs already handled as reverts; also used by the POV / unsourced cells below.
revertIds = list(revertRows['ID'])

# Conflict edits that mention 'vandalism' as a whole word and were not already
# counted as reverts. A single boolean mask replaces the row-by-row copy that
# relied on Series.iteritems (removed in pandas 2.0).
# astype(bool) keeps the mask boolean even when conflictRows is empty.
mentionsVandalism = conflictRows['comment'].apply(
    lambda comment: bool(findWholeWord('vandalism')(comment))).astype(bool)
alreadyReverted = conflictRows['ID'].isin(revertIds)
# reset_index gives the 0..n-1 index the original construction produced.
vandalismRows = conflictRows[mentionsVandalism & ~alreadyReverted].reset_index(drop=True)

len(vandalismRows)

2

In [351]:
# Show the matched comments in full.
vandalismRows['comment'].tolist()

['restored article to edit before vandalism/edit-warring by 66.66.125.x',
 'Removed blatant vandalism from lead Pablo6213: "Paraiso is a lie......"']

In [352]:
# Every known username that appears (case-insensitively, as a substring) in a
# word of a vandalism comment gets a -10 edge from the comment's author.
for _, edit in vandalismRows.iterrows():
    for word in edit['comment'].split(" "):
        wordLower = word.lower()
        for user in usernames:
            if(user.lower() in wordLower):
                addEdge(edit['user'],user,-10)

#### Word: Pov

In [353]:
# IDs already handled in the 'Undid' section.
undidIds = list(undidRows['ID'])

# Conflict edits that mention 'pov' as a whole word and were not already
# counted as reverts or undos. A single boolean mask replaces the row-by-row
# copy that relied on Series.iteritems (removed in pandas 2.0).
# astype(bool) keeps the mask boolean even when conflictRows is empty.
mentionsPov = conflictRows['comment'].apply(
    lambda comment: bool(findWholeWord('pov')(comment))).astype(bool)
alreadyHandled = conflictRows['ID'].isin(revertIds) | conflictRows['ID'].isin(undidIds)
# reset_index gives the 0..n-1 index the original construction produced.
povRows = conflictRows[mentionsPov & ~alreadyHandled].reset_index(drop=True)

len(povRows)

1

In [354]:
# Show the matched comments in full.
povRows['comment'].tolist()

['Stop your POV pushing Rm99!!!!']

In [355]:
# Every known username that appears (case-insensitively, as a substring) in a
# word of a POV comment gets a -9 edge from the comment's author.
for _, edit in povRows.iterrows():
    for word in edit['comment'].split(" "):
        wordLower = word.lower()
        for user in usernames:
            if(user.lower() in wordLower):
                addEdge(edit['user'],user,-9)

#### Word: Unsourced

In [356]:
# Conflict edits that mention 'unsourced' as a whole word and were not already
# counted as reverts. A single boolean mask replaces the row-by-row copy that
# relied on Series.iteritems (removed in pandas 2.0).
# astype(bool) keeps the mask boolean even when conflictRows is empty.
mentionsUnsourced = conflictRows['comment'].apply(
    lambda comment: bool(findWholeWord('unsourced')(comment))).astype(bool)
# reset_index gives the 0..n-1 index the original construction produced.
unsourcedRows = conflictRows[mentionsUnsourced & ~conflictRows['ID'].isin(revertIds)].reset_index(drop=True)

len(unsourcedRows)

0

In [357]:
# Rebuild the user network from `bigEdits` and the edges collected so far
# (presumably accumulated by addEdge in the cells above), then draw it.
graph = nx.Graph()

# Appearance is stored per node name and only turned into columns once the
# renderer exists, so the columns always line up with the renderer's node order.
nodeColor = {'negative': red, 'positive': green}
nodeSize = {'negative': 20, 'positive': 20}

# Two anchor nodes, one per faction, joined by a strongly negative edge.
graph.add_node('negative')
graph.add_node('positive')
graph.add_edge('positive', 'negative', weight=-500 )

# Every user that made a big edit or appears at either end of an edge.
users = list(bigEdits['user'])
for edge in edges:
    users.append(edge[0])
    users.append(edge[1])
users = list(set(users))

for user in users:
    graph.add_node(user)
    nodeSize[user] = 10
    userRows = bigEdits.loc[bigEdits['user'] == user]
    if(len(userRows)):
        # Users with a big edit are tied to their faction's anchor node.
        faction = list(userRows['faction'])[0]
        if(faction  == 'negative'):
            nodeColor[user] = red
            graph.add_edge(user, 'negative', weight=10 )
        else:
            nodeColor[user] = green
            graph.add_edge(user, 'positive', weight=10 )
    else:
        nodeColor[user] = grey

# A 'positive' big edit gets a -10 edge to the author of the preceding big edit.
# FIX: the placeholder used to be the string 'none', which add_edge turned into
# a real graph node (with no colour/size entry) whenever the first row was
# 'positive'. The first row has no predecessor, so it is skipped instead.
prevUser = None
for row in bigEdits.iterrows():
    if(row[1]['faction'] == 'positive' and prevUser is not None):
        graph.add_edge(row[1]['user'], prevUser, weight=-10 )
    prevUser = row[1]['user']

# Comment-derived edges go last; add_edge overwrites the weight of an existing pair.
for edge in edges:
    graph.add_edge(edge[0],edge[1],weight = edge[2])

hover = HoverTool(tooltips = [("User", "@index")])

# FIX: the title was copied unchanged from the 'Undid' plot; this figure is
# drawn after the revert, vandalism and POV edges have been added as well.
plot = figure(title="Wiki Edits Network with 'Undid', 'Revert', 'Vandalism' and 'POV' edits", x_range=(-5,5), y_range=(-5,5),tools=[hover,TapTool(),
    PanTool(),BoxZoomTool(),ResetTool()])
plotGraph = from_networkx(graph, nx.spring_layout, scale=5, center=(0,0))

# Build the columns in the order the renderer stores its nodes.
nodeSource = plotGraph.node_renderer.data_source
nodeOrder = list(nodeSource.data['index'])
fillColor = [nodeColor.get(node, grey) for node in nodeOrder]
size = [nodeSize.get(node, 10) for node in nodeOrder]
# FIX: add() registers the columns with the data source; assigning into
# `.data[...]` appears to be what triggered the E-1001 (BAD_COLUMN_NAME)
# message logged for this cell.
nodeSource.add(fillColor, 'fill_color')
nodeSource.add(size, 'size')

plotGraph.node_renderer.glyph = Circle(size='size',fill_color='fill_color',line_color='fill_color')
plotGraph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=1)
plot.renderers.append(plotGraph)
show(plot)

ERROR:C:\ProgramData\Anaconda3\lib\site-packages\bokeh\core\validation\check.py:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color, size [renderer: GlyphRenderer(id='a5cd1c0a-a46d-4ce9-8a1f-1998e92aa599', ...)]
