In [48]:
import numpy as np
import pandas
import re
from bokeh.plotting import figure
from bokeh.io import output_notebook,output_file, show,install_notebook_hook
from bokeh.models import ColumnDataSource, Select,LabelSet, HoverTool, DatetimeAxis, TapTool, CustomJS, BoxZoomTool, PanTool
from bokeh.models import WheelZoomTool, UndoTool, RedoTool, ResetTool, ZoomInTool, ZoomOutTool, Axis, Text, Circle, MultiLine
import nltk
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans,DBSCAN
import pprint
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bokeh.models.widgets import DataTable, TableColumn, Div, RangeSlider, Slider, Select
from bokeh.layouts import Row,widgetbox, Column
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.palettes import Category10
import math
from collections import Counter
import networkx as nx
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges, EdgesAndLinkedNodes
import copy

In [49]:
# Hex colours used to fill the network nodes drawn further down.
red = "#C91E17"    # users whose faction is 'negative'
green = "#17C957"  # users whose faction is 'positive'
grey = "#877C77"   # users that have no row in bigEdits

def getDefaultTools():
    "create a fresh set of the standard Bokeh tools (one new instance of each per call)"
    toolClasses = (BoxZoomTool, PanTool, WheelZoomTool, UndoTool, RedoTool, ResetTool, ZoomInTool)
    return [toolClass() for toolClass in toolClasses]

def findWholeWord(w, literal=False):
    """find whole word in text

    Builds a case-insensitive pattern for ``w`` surrounded by word boundaries
    and returns the compiled pattern's bound ``search`` method, so the result
    is used as ``findWholeWord(word)(text)`` and gives a match object or None.

    w       -- the word to look for. By default it is inserted into the
               regular expression verbatim, so metacharacters keep their
               regex meaning (the dot in "rv." matches any character, and an
               unbalanced "(" raises re.error).
    literal -- when True, ``w`` is escaped first so it can only match itself;
               use this for arbitrary text such as user names or IP addresses.
    """
    word = re.escape(w) if literal else w
    return re.compile(r'\b({0})\b'.format(word), flags=re.IGNORECASE).search

def appendRow(row, dataframe):
    """add row to dataframe

    row       -- an (index, Series) pair as produced by DataFrame.iterrows();
                 only the Series part is used.
    dataframe -- the frame to extend in place. The new row is written under
                 the label len(dataframe), so the frame is expected to be
                 indexed 0..n-1 (true for frames filled only through this
                 helper).
    """
    dataframeLength = len(dataframe)
    # Series.items() replaces iteritems(), which was removed in pandas 2.0
    for key, value in row[1].items():
        dataframe.loc[dataframeLength, key] = value
        
# accounts identified as bots; later cells drop their rows and skip edges that involve them
bots = ['BakBOT','SilviaBot','Chkbot']
    

In [50]:
# Parse the exported page history into one row per edit.
# Every history line is split on parentheses; the token positions used below
# follow the layout
#   "# (cur) (last) <time>, <date> <user> (Talk | contribs) [m] (<n> bytes) (<comment>)"
# NOTE(review): only the text up to the first parenthesis inside a comment is
# kept, because the comment itself is split on "(" and ")" as well.
with open('data/Paraiso Edits.txt', 'r', errors='ignore') as file:
    lines = file.readlines()
# skip the three header lines at the top of the export
lines = lines[3:]

columns = ['ID','timestamp','user','minorEdit','pageLength','comment','entireEdit']
# collect the rows first and build the frame once; growing a DataFrame with
# .loc one row at a time is quadratic
records = []
for line in lines:
    if(not line.strip()):
        # tolerate blank lines instead of failing on the token lookups below
        continue
    entireEdit = line
    tokens = re.split(r'\(|\)', line)
    # tokens[4] is " <time>, <date> <user> ": the user is the last non-empty
    # field and everything before it is the timestamp
    timestampAndName = tokens[4].split(' ')
    name = timestampAndName[-2]
    timestamp = pandas.to_datetime(" ".join(timestampAndName[0:-2]))
    # the text between "(Talk | contribs)" and the size is " m " for minor edits
    m = (tokens[6] == ' m ')
    pageLength = tokens[7].split(' ')[0].replace(',','')
    if(pageLength.isdigit()):
        pageLength = int(pageLength)
        comment = tokens[9] if len(tokens) >= 10 else ""
    else:
        # no "<n> bytes" group on this line: the token is the comment itself
        pageLength = -1
        comment = tokens[7]
    # IDs are 1-based and follow the order of the lines in the file
    records.append([len(records) + 1, timestamp, name, m, pageLength, comment, entireEdit])

data = pandas.DataFrame(records, columns=columns)

print('total Rows:',len(data))

total Rows: 1009


In [51]:
# Sort the edits chronologically (oldest first). Timestamps only have minute
# resolution, so edits made within the same minute are tied; IDs follow the
# order of the lines in the file, which lists the newest edit first, so the
# tie is broken by ID in descending order. Sorting on the timestamp alone
# leaves tied edits in arbitrary order and produces wrong previousEdit /
# editDiff values for them.
data = data.sort_values(['timestamp','ID'], ascending=[True, False])

# Walk the history once, remembering the last edit ID and the last known page
# length. Rows without a page length (-1) get a diff of 0 and do not update
# the remembered length.
previousPageLength = 0
previousEdit = -1   # marker for "no earlier edit"
previousEdits = []
editDiffs = []
for editId, pageLength in zip(data['ID'], data['pageLength']):
    previousEdits.append(previousEdit)
    previousEdit = editId
    if(pageLength == -1):
        editDiffs.append(0)
    else:
        editDiffs.append(pageLength - previousPageLength)
        previousPageLength = pageLength

# assign whole columns at once instead of one .loc write per cell
data['previousEdit'] = previousEdits
data['editDiff'] = editDiffs

In [52]:
# absolute size of every edit in bytes
# (Series.iteritems() was removed in pandas 2.0, so the column is used directly)
editLengths = data['editDiff'].abs().tolist()

# number of edits observed for each distinct size
counter = pandas.Series(Counter(editLengths))
top = list(counter.values)
left = counter.index
p1 = figure(title="Edit Size Distribution")
p1.circle(y=top, x=left)
# show byte counts as plain numbers rather than in scientific notation
p1.xaxis[0].formatter.use_scientific = False
p1.xaxis.axis_label = 'Bytes'
p1.yaxis.axis_label = 'Count'
output_notebook()
show(p1)

We can see that there is a gap in the size distribution. Therefore, we will pick all edits larger than 60000 bytes for our first network.

In [53]:
# edits that changed the page size by more than 60000 bytes in either direction
isBigEdit = data['editDiff'].abs() > 60000
bigEdits = data.loc[isBigEdit]
bigEdits

Unnamed: 0,ID,timestamp,user,minorEdit,pageLength,comment,entireEdit,previousEdit,editDiff
1008,1009,2006-08-11 16:42:00,Gerardmd,False,99907,Clean up refs a little,"# (cur) (last) 16:42, 11 August 2006 Gerardmd ...",-1.0,99907.0
916,917,2006-08-25 17:04:00,Alejandrosanchez,True,20,?Replaced page with 'Paraiso is BULL',"# (cur) (last) 17:04, 25 August 2006 Alejandro...",918.0,-101047.0
915,916,2006-08-25 17:04:00,BakBOT,False,101067,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 17:04, 25 August 2006 BakBOT (T...",917.0,101047.0
875,876,2006-08-27 20:16:00,201.226.51.x,False,68,?Replaced page with 'Paraiso believes that hum...,"# (cur) (last) 20:16, 27 August 2006 201.226.5...",877.0,-100174.0
874,875,2006-08-27 20:17:00,BakBOT,False,100242,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 20:17, 27 August 2006 BakBOT (T...",876.0,100174.0
846,847,2006-08-28 21:20:00,Honoratas,False,69,?Replaced page with 'Paraiso: A made up religi...,"# (cur) (last) 21:20, 28 August 2006 Honoratas...",848.0,-93138.0
845,846,2006-08-28 21:21:00,Alano,True,93207,Reverted edits by Honoratas,"# (cur) (last) 21:21, 28 August 2006 Alano (Ta...",847.0,93138.0
844,845,2006-08-28 21:25:00,Honoratas,False,103,?Replaced page with '{{ParaisoSeries}} '''Para...,"# (cur) (last) 21:25, 28 August 2006 Honoratas...",846.0,-93104.0
842,843,2006-08-28 21:37:00,Ginebra,False,93207,,"# (cur) (last) 21:37, 28 August 2006 Ginebra (...",844.0,93088.0
830,831,2006-08-30 02:56:00,71.59.210.x,False,17,?Replaced page with 'DEFINITION : CULT',"# (cur) (last) 02:56, 30 August 2006 71.59.210...",832.0,-93189.0


I remove the first row, the edit by user **Gerardmd**, since it is the initial edit and is therefore neither a case of vandalism nor a revert of vandalism.

In [54]:
# Drop the initial edit of the page, i.e. the one without a predecessor
# (previousEdit == -1). Filtering on that marker instead of slicing off the
# first row keeps the cell idempotent: running it again removes nothing more.
bigEdits = bigEdits.loc[bigEdits['previousEdit'] != -1]
bigEdits

Unnamed: 0,ID,timestamp,user,minorEdit,pageLength,comment,entireEdit,previousEdit,editDiff
916,917,2006-08-25 17:04:00,Alejandrosanchez,True,20,?Replaced page with 'Paraiso is BULL',"# (cur) (last) 17:04, 25 August 2006 Alejandro...",918.0,-101047.0
915,916,2006-08-25 17:04:00,BakBOT,False,101067,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 17:04, 25 August 2006 BakBOT (T...",917.0,101047.0
875,876,2006-08-27 20:16:00,201.226.51.x,False,68,?Replaced page with 'Paraiso believes that hum...,"# (cur) (last) 20:16, 27 August 2006 201.226.5...",877.0,-100174.0
874,875,2006-08-27 20:17:00,BakBOT,False,100242,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 20:17, 27 August 2006 BakBOT (T...",876.0,100174.0
846,847,2006-08-28 21:20:00,Honoratas,False,69,?Replaced page with 'Paraiso: A made up religi...,"# (cur) (last) 21:20, 28 August 2006 Honoratas...",848.0,-93138.0
845,846,2006-08-28 21:21:00,Alano,True,93207,Reverted edits by Honoratas,"# (cur) (last) 21:21, 28 August 2006 Alano (Ta...",847.0,93138.0
844,845,2006-08-28 21:25:00,Honoratas,False,103,?Replaced page with '{{ParaisoSeries}} '''Para...,"# (cur) (last) 21:25, 28 August 2006 Honoratas...",846.0,-93104.0
842,843,2006-08-28 21:37:00,Ginebra,False,93207,,"# (cur) (last) 21:37, 28 August 2006 Ginebra (...",844.0,93088.0
830,831,2006-08-30 02:56:00,71.59.210.x,False,17,?Replaced page with 'DEFINITION : CULT',"# (cur) (last) 02:56, 30 August 2006 71.59.210...",832.0,-93189.0
829,830,2006-08-30 02:57:00,BakBOT,False,93206,Reverting possible vandalism by Special:Contri...,"# (cur) (last) 02:57, 30 August 2006 BakBOT (T...",831.0,93189.0


We will add a `faction` column to the bigEdits data frame, set to positive or negative depending on the sign of editDiff.

In [55]:
# Leave out the reverts made by BakBOT. The explicit copy makes bigEdits an
# independent frame, so the new column below is written without relying on the
# is_copy attribute hack.
bigEdits = bigEdits[bigEdits.user != 'BakBOT'].copy()

# edits that shrank the page are 'negative', all others 'positive'
bigEdits['faction'] = np.where(bigEdits['editDiff'] < 0, 'negative', 'positive')

bigEdits

Unnamed: 0,ID,timestamp,user,minorEdit,pageLength,comment,entireEdit,previousEdit,editDiff,faction
916,917,2006-08-25 17:04:00,Alejandrosanchez,True,20,?Replaced page with 'Paraiso is BULL',"# (cur) (last) 17:04, 25 August 2006 Alejandro...",918.0,-101047.0,negative
875,876,2006-08-27 20:16:00,201.226.51.x,False,68,?Replaced page with 'Paraiso believes that hum...,"# (cur) (last) 20:16, 27 August 2006 201.226.5...",877.0,-100174.0,negative
846,847,2006-08-28 21:20:00,Honoratas,False,69,?Replaced page with 'Paraiso: A made up religi...,"# (cur) (last) 21:20, 28 August 2006 Honoratas...",848.0,-93138.0,negative
845,846,2006-08-28 21:21:00,Alano,True,93207,Reverted edits by Honoratas,"# (cur) (last) 21:21, 28 August 2006 Alano (Ta...",847.0,93138.0,positive
844,845,2006-08-28 21:25:00,Honoratas,False,103,?Replaced page with '{{ParaisoSeries}} '''Para...,"# (cur) (last) 21:25, 28 August 2006 Honoratas...",846.0,-93104.0,negative
842,843,2006-08-28 21:37:00,Ginebra,False,93207,,"# (cur) (last) 21:37, 28 August 2006 Ginebra (...",844.0,93088.0,positive
830,831,2006-08-30 02:56:00,71.59.210.x,False,17,?Replaced page with 'DEFINITION : CULT',"# (cur) (last) 02:56, 30 August 2006 71.59.210...",832.0,-93189.0,negative
807,808,2006-08-31 06:23:00,204.52.215.x,False,209,?Replaced page with '{{ParaisoSeries}}'''Parai...,"# (cur) (last) 06:23, 31 August 2006 204.52.21...",809.0,-91435.0,negative
802,803,2006-08-31 12:59:00,75.179.21.x,False,38,?Replaced page with 'is stupid you idiots',"# (cur) (last) 12:59, 31 August 2006 75.179.21...",802.0,-91606.0,negative
799,800,2006-08-31 13:01:00,Hispa,True,91644,Reverted 1 edit by 75.179.21.x identified as v...,"# (cur) (last) 13:01, 31 August 2006 Hispa (Ta...",801.0,91606.0,positive


In [56]:
# Network of the large edits.
#  - every user with a large edit is a node; a shared hub node called
#    'negative' is linked (weight 1) to each user of the 'negative' faction
#  - every 'positive' edit adds an edge of weight -10 between its author and
#    the author of the large edit listed just before it
nodeColor = {'negative': red}
nodeSize = {'negative': 20}
graph = nx.Graph()
graph.add_node('negative')

bigEditUsers = set(list(bigEdits['user']))
for user in bigEditUsers:
    graph.add_node(user)
    userRows = bigEdits.loc[bigEdits['user'] == user]
    # a user's faction is taken from their first large edit
    faction = list(userRows['faction'])[0]
    nodeSize[user] = 10
    if(faction == 'negative'):
        nodeColor[user] = red
        graph.add_edge(user, 'negative', weight=1)
    else:
        nodeColor[user] = green

# None means "no earlier large edit"; a string sentinel here would be added to
# the graph as an extra, unstyled node if the first row were 'positive'
prevUser = None
for _, row in bigEdits.iterrows():
    if(row['faction'] == 'positive' and prevUser is not None):
        graph.add_edge(row['user'], prevUser, weight=-10)
    prevUser = row['user']

# Build the per-node columns from the graph's own node order so they always
# have one entry per node.
# NOTE(review): assumes from_networkx lists nodes in graph.nodes() order.
fillColor = [nodeColor[node] for node in graph.nodes()]
size = [nodeSize[node] for node in graph.nodes()]

hover = HoverTool(tooltips = [("User", "@index")])
plot = figure(title="Wiki Edits Network", x_range=(-5,5), y_range=(-5,5),tools=[hover] + getDefaultTools())
plotGraph = from_networkx(graph, nx.spring_layout, scale=5, center=(0,0))
# Register the columns through ColumnDataSource.add rather than writing into
# .data directly.
# NOTE(review): the E-1001 BAD_COLUMN_NAME error this cell used to print
# suggests that columns written straight into .data were not registered with
# the data source on the Bokeh version in use -- confirm after re-running.
nodeSource = plotGraph.node_renderer.data_source
nodeSource.add(fillColor, 'fill_color')
nodeSource.add(size, 'size')
plotGraph.node_renderer.glyph = Circle(size='size',fill_color='fill_color',line_color='fill_color')
plot.renderers.append(plotGraph)
show(plot)

ERROR:C:\ProgramData\Anaconda3\lib\site-packages\bokeh\core\validation\check.py:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color, size [renderer: GlyphRenderer(id='64bc6430-4025-48a0-930a-cc9c526d288b', ...)]


Get all reverts, undos and user mentions:

In [57]:
# distinct user names that appear anywhere in the edit history
distinctUsers = set(data['user'])
usernames = list(distinctUsers)
len(usernames)

387

In [58]:
# Among the edits that are not "big", collect the ones that look like part of
# a conflict: comments that mention another user, or that contain a revert or
# undo keyword.
smallEdits = data.loc[abs(data['editDiff'])<=60000]
conflictRows = pandas.DataFrame(columns=list(data.columns.values))
conflictRowsUserMentions = pandas.DataFrame(columns=list(data.columns.values))
conflictRowsNoUserMentions = pandas.DataFrame(columns=list(data.columns.values))

# Build the matchers once instead of once per row. User names are escaped
# because findWholeWord treats its argument as a regular expression and names
# such as '201.226.51.x' contain metacharacters that must match literally.
userMatchers = [findWholeWord(re.escape(user)) for user in usernames]
# "rv." is deliberately left as a regex: its dot matches any character, so it
# also picks up three-letter variants of "rv"
revertMatchers = [findWholeWord(word) for word in ("rv", "rv.", "undid")]

for row in smallEdits.iterrows():
    comment = row[1]['comment']
    if(any(matcher(comment) for matcher in userMatchers)):
        appendRow(row, conflictRows)
        appendRow(row, conflictRowsUserMentions)
    elif(any(matcher(comment) for matcher in revertMatchers)):
        appendRow(row, conflictRows)
        appendRow(row, conflictRowsNoUserMentions)

print(len(conflictRowsUserMentions),len(conflictRowsNoUserMentions))

228 22


In [59]:
# preview the first conflict rows
conflictRows.head()

Unnamed: 0,ID,timestamp,user,minorEdit,pageLength,comment,entireEdit,previousEdit,editDiff
0,1005,2006-08-12 01:47:00,Gracia,False,99909,rv - vandalism,"# (cur) (last) 01:47, 12 August 2006 Gracia (T...",1006,-53
1,1003,2006-08-14 02:30:00,Savanna,False,99890,Handling DailosTamanca's objection to my earli...,"# (cur) (last) 02:30, 14 August 2006 Savanna (...",1004,-17
2,994,2006-08-14 16:44:00,VictoriaV,True,100699,Reverted to revision xxxxxxxxx by Savanna; pri...,"# (cur) (last) 16:44, 14 August 2006 VictoriaV...",995,-493
3,988,2006-08-14 19:06:00,Sara,False,100700,rv. pp. 112 - 130,"# (cur) (last) 19:06, 14 August 2006 Sara (Tal...",989,141
4,986,2006-08-14 19:53:00,Agustin,True,100700,Undid revision xxxxxxxxx by KesheR,"# (cur) (last) 19:53, 14 August 2006 Agustin (...",987,-77


In [60]:
# show the raw history line of every conflict row that contains "bot" (any case)
for rawEdit in conflictRows['entireEdit']:
    if('bot' in rawEdit.lower()):
        print(rawEdit)

# (cur) (last) 02:30, 14 August 2006 Savanna (Talk | contribs) (99,890 bytes) (Handling DailosTamanca's objection to my earlier edit. Including citations I deleted and combining into one phrase presenting both viewpoints.)

# (cur) (last) 04:34, 15 August 2006 Agustin (Talk | contribs) m (100,849 bytes) (Undid revision xxxxxxxxx by VictoriaV (talk) My changes remove POV from both sides and you must be way over 3RR for this section of the ar)

# (cur) (last) 00:07, 26 August 2006 Rosario (Talk | contribs) m (101,067 bytes) (Reverted edits by 64.25.200.x (talk) to last revision (xxxxxxxxx) by BakBOT using VP)

# (cur) (last) 03:19, 31 August 2006 BakBOT (Talk | contribs) (91,645 bytes) (Reverting possible vandalism by Special:Contributions/24.168.142.x (see here). If this is a mistake, report it. Thanks, BakBOT. (Bot))

# (cur) (last) 03:26, 31 August 2006 Rosalinda (Talk | contribs) m (91,645 bytes) (Reverted 1 edit by 24.168.142.x identified as vandalism to last revision by BakBOT. usi

Identified bots are:
1. BakBOT
2. SilviaBot
3. Chkbot

In [61]:
# drop every conflict row that was authored by one of the identified bots
conflictRows = conflictRows.loc[~conflictRows['user'].isin(bots)]
len(conflictRows)

234

In [62]:
def modify_doc(doc):
    """Build the Bokeh document used to browse the conflict edits.

    The layout is a group selector above a row holding a table of conflict
    comments and a detail panel. Selecting a table row fills the panel with
    that edit and the edit that preceded it; changing the selector swaps the
    rows shown in the table.

    doc -- the Bokeh document the layout is added to as a root.
    """
    import html  # standard library; the comments are raw wiki text and are escaped before being embedded in HTML

    def columnsOf(frame):
        "the columns of a conflict frame that the data source exposes"
        return dict(
            ID = frame['ID'],
            comment=frame['comment'],
            user=frame['user'],
            pageLength= frame['pageLength'],
            editDiff = frame['editDiff'],
            previousEdit = frame['previousEdit']
        )

    def selectedValue(column, position):
        "value of a source column at a positional index, whether the column is held as a Series or an array"
        return np.asarray(source.data[column])[position]

    def paragraph(label, value):
        "one indented '<b>label</b>value' line of the detail panel"
        return "<p style='padding-left:100px;'><b>" + label + "</b>" + value + "</p>"

    def highlightUsers(comment):
        "escape the comment and wrap every token that contains a user name in a highlighted span"
        editedComment = ""
        for token in re.split(" ", comment):
            safeToken = html.escape(token)
            if(any(user in token for user in usernames)):
                editedComment = editedComment + " <span style='background:#FFD6E1'>" + safeToken + "</span>"
            else:
                editedComment = editedComment + " " + safeToken
        return editedComment

    def update(attr, old, new):
        "refresh the detail panel for the first selected table row"
        selectedIndices = source.selected['1d']['indices']
        if(len(selectedIndices) == 0):
            # the selection was cleared; keep the panel as it is
            return
        selectedIndex = selectedIndices[0]
        previousEditIndex = selectedValue('previousEdit', selectedIndex)
        previousEdit = data.loc[data['ID'] == previousEditIndex]

        parts = [
            paragraph("ID:", str(selectedValue('ID', selectedIndex))),
            paragraph("User:", html.escape(str(selectedValue('user', selectedIndex)))),
            paragraph("Comment:", highlightUsers(str(selectedValue('comment', selectedIndex)))),
            paragraph("Page Length:", str(selectedValue('pageLength', selectedIndex))),
            # this line used to repeat the page length instead of the diff
            paragraph("Edit Diff:", str(selectedValue('editDiff', selectedIndex))),
        ]
        # the first edit of the history has previousEdit == -1 and therefore
        # no matching row; only describe the previous edit when it exists
        if(len(previousEdit) == 1):
            parts.extend([
                paragraph("Previous comment:", html.escape(str(previousEdit['comment'].item()))),
                paragraph("Previous Page Length:", str(previousEdit['pageLength'].item())),
                paragraph("Previous Edit Diff:", str(previousEdit['editDiff'].item())),
            ])
        layout.children[1].children[1] = Div(text="".join(parts), width=400, height=100)

    def updateTable(attr,old,new):
        "show the rows of the group chosen in the selector"
        if(groupSelect.value == 'All'):
            selectedDF = conflictRows
        elif(groupSelect.value == 'User Mentions'):
            selectedDF = conflictRowsUserMentions
        else:
            selectedDF = conflictRowsNoUserMentions

        source.data = columnsOf(selectedDF)
        # the table is rebuilt so that it reflects the replaced data
        layout.children[1].children[0] = create_table()

    source = ColumnDataSource(data = columnsOf(conflictRows))
    source.on_change('selected', update)

    groupSelect = Select(title="Select Group:", value="All", options=["All", "User Mentions", "No User Mentions"])
    groupSelect.on_change('value',updateTable)

    def create_table():
        "table listing the comment and user of every row in the data source"
        columns = [
            TableColumn(field="comment", title="Comment"),
            TableColumn(field="user", title="User"),
        ]
        data_table = DataTable(source=source, columns=columns, selectable=True, width= 500)
        return data_table

    div = Div(text="",width=400, height=100)
    table = create_table()
    layout = Column(groupSelect, Row(table,div))
    doc.add_root(layout)

# Wrap modify_doc in a Bokeh application and embed it in the notebook.
# NOTE(review): notebook_url is hard-coded; it presumably has to match the
# address this notebook is served from -- confirm before running elsewhere.
output_notebook()    
handler = FunctionHandler(modify_doc)
app = Application(handler)
show(app,notebook_url='localhost:8888')


We ignore the following comments because they are reverted twice, thus cancelling the previous revert; in the last case, we were not able to find the edit being referred to.

In [63]:
# IDs of the edits that are left out of the conflict analysis
ignoredIds = [785,784,530,534,488,489,336,260,80]
isIgnored = data['ID'].isin(ignoredIds)
ignoredComments = data.loc[isIgnored]
# print the raw history line of each ignored edit
for edit in ignoredComments['entireEdit'].tolist():
    print(edit)


# (cur) (last) 01:26, 1 September 2006 Sara (Talk | contribs) (92,102 bytes) (rv. the new source doesn't even mention Paraiso and rather contradicts the claim)

# (cur) (last) 01:36, 1 September 2006 Sara (Talk | contribs) (92,025 bytes) (rv myself. dammit it does in a table ):)

# (cur) (last) 05:55, 16 September 2006 Sara (Talk | contribs) (97,261 bytes) (rv Sofia's last edit wich he labeled with "Pope statement" but deleted conclusions from the German Government including a source)

# (cur) (last) 09:01, 16 September 2006 Sara (Talk | contribs) (97,555 bytes) (rv myself, didn't see that this was part of the lead. sorry)

# (cur) (last) 02:28, 21 September 2006 Sara (Talk | contribs) (97,869 bytes) (Undid revision xxxxxxxxx by RyogaNica (talk) don't mark edits as minor if you delete source)

# (cur) (last) 02:40, 21 September 2006 Sara (Talk | contribs) (97,692 bytes) (rv. myself I see now, you doubled it before... ok ): , not my day)

# (cur) (last) 11:24, 26 October 2006 Sara (Talk

In [64]:
# remove the edits listed in ignoredIds from the conflict rows
conflictRows = conflictRows.loc[~conflictRows['ID'].isin(ignoredIds)]
len(conflictRows)

225

Extract Features from the conflict rows:

In [65]:
# word-level TF-IDF over the conflict comments with English stop words removed;
# min_df=1 and max_df=1.0 apply no document-frequency cut-off
conflictComments = list(conflictRows['comment'])
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english', min_df=1, max_df=1.0)
tfidf_result = vectorizer.fit_transform(conflictComments)

def getScores(vectorizer, tfidf_result):
    """Rank the vectorizer's features by their summed TF-IDF score.

    vectorizer   -- a fitted vectorizer exposing get_feature_names_out()
                    or, on older scikit-learn releases, get_feature_names().
    tfidf_result -- the document-term matrix returned for the same fit; its
                    columns are summed over all documents.

    Returns a list of (feature, score) pairs, highest score first.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only where it does not exist yet
    if(hasattr(vectorizer, 'get_feature_names_out')):
        allFeatures = vectorizer.get_feature_names_out()
    else:
        allFeatures = vectorizer.get_feature_names()
    # sum(axis=0) on a sparse matrix gives a 1 x n matrix; flatten it to a vector
    totals = np.asarray(tfidf_result.sum(axis=0)).ravel()
    sorted_scores = sorted(zip(allFeatures, totals), key=lambda x: x[1], reverse=True)
    return sorted_scores


featureScores = getScores(vectorizer, tfidf_result)

# keep the features that are not made of digits only and whose summed score is above 1
digitsOnly = re.compile("^[0-9]*$")
selectedFeatures = [
    score for score in featureScores
    if not digitsOnly.match(score[0]) and score[1] > 1
]

# feature names offered by the selector, followed by the catch-all entry
featureList = [feature[0] for feature in selectedFeatures]
featureList.append('All')

len(selectedFeatures)

54

In [66]:
def modify_doc2(doc):
    """Build the Bokeh document used to browse conflict edits by feature.

    The layout is a feature selector above a row holding a table of conflict
    comments and a detail panel. Choosing a feature restricts the table to
    the comments containing it as a whole word; selecting a table row shows
    that edit in the panel with user names and features highlighted.

    doc -- the Bokeh document the layout is added to as a root.
    """
    import html  # standard library; the comments are raw wiki text and are escaped before being embedded in HTML

    def columnsOf(frame):
        "the columns of a conflict frame that the data source exposes"
        return dict(
            ID = frame['ID'],
            comment=frame['comment'],
            user=frame['user'],
            pageLength= frame['pageLength'],
            editDiff = frame['editDiff'],
            previousEdit = frame['previousEdit']
        )

    def selectedValue(column, position):
        "value of a source column at a positional index, whether the column is held as a Series or an array"
        return np.asarray(source.data[column])[position]

    def paragraph(label, value):
        "one indented '<b>label</b>value' line of the detail panel"
        return "<p style='padding-left:100px;'><b>" + label + "</b>" + value + "</p>"

    def highlightComment(comment):
        """escape the comment and colour its tokens

        Tokens containing a user name are pink; otherwise the first feature
        found in the lower-cased token decides: green when it is the feature
        currently selected, blue for any other feature.
        """
        editedComment = ""
        for token in re.split(" ", comment):
            safeToken = html.escape(token)
            background = None
            if(any(user in token for user in usernames)):
                background = "#FFD6E1"
            else:
                loweredToken = token.lower()
                for feature in featureList:
                    if(feature in loweredToken):
                        background = "#9EFFB7" if feature == featureSelect.value else "#B0E8FF"
                        break
            if(background is None):
                editedComment = editedComment + " " + safeToken
            else:
                editedComment = editedComment + " <span style='background:" + background + "'>" + safeToken + "</span>"
        return editedComment

    def update(attr, old, new):
        "refresh the detail panel for the first selected table row"
        selectedIndices = source.selected['1d']['indices']
        if(len(selectedIndices) == 0):
            # the selection was cleared; keep the panel as it is
            return
        selectedIndex = selectedIndices[0]
        parts = [
            paragraph("ID:", str(selectedValue('ID', selectedIndex))),
            paragraph("User:", html.escape(str(selectedValue('user', selectedIndex)))),
            paragraph("Comment:", highlightComment(str(selectedValue('comment', selectedIndex)))),
            paragraph("Page Length:", str(selectedValue('pageLength', selectedIndex))),
            paragraph("Edit Diff:", str(selectedValue('editDiff', selectedIndex))),
        ]
        layout.children[1].children[1] = Div(text="".join(parts), width=400, height=100)

    def updateTable(attr,old,new):
        "show all conflict rows, or only those whose comment contains the selected feature"
        if(featureSelect.value == 'All'):
            selectedRows = conflictRows
        else:
            # one boolean mask instead of copying matching rows cell by cell
            # (the previous loop also relied on Series.iteritems(), removed in pandas 2.0)
            matcher = findWholeWord(featureSelect.value)
            mask = conflictRows['comment'].apply(lambda comment: bool(matcher(comment))).astype(bool)
            selectedRows = conflictRows.loc[mask].reset_index(drop=True)
        source.data = columnsOf(selectedRows)
        # the table is rebuilt so that it reflects the replaced data
        layout.children[1].children[0] = create_table()

    source = ColumnDataSource(data = columnsOf(conflictRows))
    source.on_change('selected', update)

    featureSelect = Select(title="Select Feature:", value="All", options=featureList)
    featureSelect.on_change('value',updateTable)

    def create_table():
        "table listing the comment and user of every row in the data source"
        columns = [
            TableColumn(field="comment", title="Comment"),
            TableColumn(field="user", title="User"),
        ]
        data_table = DataTable(source=source, columns=columns, selectable=True, width= 500)
        return data_table

    div = Div(text="",width=400, height=100)
    table = create_table()
    layout = Column(featureSelect, Row(table,div))
    doc.add_root(layout)

# Wrap modify_doc2 in a Bokeh application and embed it in the notebook.
# NOTE(review): notebook_url is hard-coded; it presumably has to match the
# address this notebook is served from -- confirm before running elsewhere.
output_notebook()    
handler2 = FunctionHandler(modify_doc2)
app2 = Application(handler2)
show(app2,notebook_url='localhost:8888')

### Weight Calculation
#### Word : Undid/Undo

In [67]:
# Amount added to the edge between two users for each kind of evidence:
#   'revert'     - an undo/revert of the other user's edit
#   'accusation' - the comment mentions "pov" (or "unsourced" for reverts)
#   'vandalism'  - the revert comment mentions "vandalism"
#   'for'        - the user whose version a revert restores
#   'goodFaith'  - the revert comment contains "good faith"
#   'node'       - only referenced by commented-out code in the graph cells
weights = {'revert':-3,'accusation':-1,'node':10,'vandalism':-2, 'for':5, 'goodFaith':3}

In [68]:
# conflict rows whose comment contains "undid" or "undo" as a whole word
undidRows = pandas.DataFrame(columns=list(conflictRows.columns.values))
undoWords = ('undid', 'undo')
for row in conflictRows.iterrows():
    comment = row[1]['comment']
    if(any(findWholeWord(word)(comment) for word in undoWords)):
        appendRow(row, undidRows)

len(undidRows)

85

In [69]:
print('Rows with more than one username mentions: ')
for comment in undidRows['comment']:
    # count the tokens that contain at least one known user name
    mentionCount = sum(
        1 for token in re.split(" ", comment)
        if any(user in token for user in usernames)
    )
    if(mentionCount > 1):
        print(comment)

Rows with more than one username mentions: 
Undid revision xxxxxxxxx by VictoriaV reverted link butchered by VictoriaV


In [70]:
edges = []
def addEdge(user1,user2, weight,edit=None):
    """Accumulate a weighted, undirected edge between two users in `edges`.

    Each entry of `edges` is [userA, userB, totalWeight, [edits...]]. When an
    entry for the pair already exists (in either order) the weight is added
    to it and `edit` is appended to its list; otherwise a new entry is
    created. Nothing happens when either user is listed in `bots`.
    """
    if(user1 in bots or user2 in bots):
        return
    samePair = ((user1, user2), (user2, user1))
    existing = [known for known in edges if (known[0], known[1]) in samePair]
    for known in existing:
        known[2] += weight
        known[3].append(edit)
    if(not existing):
        edges.append([user1, user2, weight, [edit]])
        

In [71]:
# Turn every undo into a 'revert' edge from its author to the user it is aimed at.
for index,row in undidRows.iterrows():
    # the target is the first user name found inside the comment, scanning the
    # tokens left to right (case-insensitive substring match)
    mentioned = next(
        (user
         for token in re.split(" ",row['comment'])
         for user in usernames
         if user.lower() in token.lower()),
        None
    )
    if(mentioned is not None):
        target = mentioned
    else:
        # nobody is named: the undo is aimed at the author of the preceding edit
        previousEdit = data.loc[data['ID'] == row['previousEdit']]
        target = previousEdit['user'].item()
    addEdge(row['user'],target,weights['revert'],undidRows.iloc[index])
    # a comment that mentions POV additionally counts as an accusation
    if(findWholeWord('pov')(row['comment'])):
        addEdge(row['user'],target,weights['accusation'])


In [72]:
# Network of the large edits combined with the accumulated 'Undid' edges.
graph = nx.Graph()
nodeColor = {}

# every user with a large edit or taking part in an edge becomes a node
users = list(bigEdits['user'])
for edge in edges:
    users.append(edge[0])
    users.append(edge[1])
users = list(set(users))
for user in users:
    if(user in bots):
        continue
    graph.add_node(user)
    userRows = bigEdits.loc[bigEdits['user'] == user]
    if(len(userRows)):
        # coloured by the faction of the user's first large edit
        faction = list(userRows['faction'])[0]
        nodeColor[user] = red if faction == 'negative' else green
    else:
        nodeColor[user] = grey

# None means "no earlier large edit"; a string sentinel here would be added to
# the graph as an extra, unstyled node if the first row were 'positive'. Bots
# are skipped as well so that add_edge cannot re-create a node left out above.
prevUser = None
for _, row in bigEdits.iterrows():
    if(row['faction'] == 'positive' and prevUser is not None
       and row['user'] not in bots and prevUser not in bots):
        graph.add_edge(row['user'], prevUser, weight=weights['revert'])
    prevUser = row['user']

for edge in edges:
    graph.add_edge(edge[0],edge[1],weight = edge[2])

# Build the per-node columns from the graph's own node order so they always
# have one entry per node.
# NOTE(review): assumes from_networkx lists nodes in graph.nodes() order.
fillColor = [nodeColor.get(node, grey) for node in graph.nodes()]
size = [10] * graph.number_of_nodes()

hover = HoverTool(tooltips = [("User", "@index")])

plot = figure(title="Wiki Edits Network with 'Undid' edits", x_range=(-5,5), y_range=(-5,5),tools=[hover,TapTool(),
    PanTool(),BoxZoomTool(),ResetTool()])
plotGraph = from_networkx(graph, nx.spring_layout, scale=4, center=(0,0))
# Register the columns through ColumnDataSource.add rather than writing into
# .data directly.
# NOTE(review): the E-1001 BAD_COLUMN_NAME error this cell used to print
# suggests that columns written straight into .data were not registered with
# the data source on the Bokeh version in use -- confirm after re-running.
nodeSource = plotGraph.node_renderer.data_source
nodeSource.add(fillColor, 'fill_color')
nodeSource.add(size, 'size')
plotGraph.node_renderer.glyph = Circle(size='size',fill_color='fill_color',line_color='fill_color')
plotGraph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=1)
plot.renderers.append(plotGraph)
show(plot)

ERROR:C:\ProgramData\Anaconda3\lib\site-packages\bokeh\core\validation\check.py:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color, size [renderer: GlyphRenderer(id='3842dc3b-6194-4e16-9c5c-be4b62bac949', ...)]


#### Word : Reverted/Revert

In [73]:
# conflict rows whose comment contains a revert keyword but not "undid"
revertRows = pandas.DataFrame(columns=list(conflictRows.columns.values))
revertWords = ('reverted', 'revert', 'rv', 'rvt', 'reverting')
for row in conflictRows.iterrows():
    comment = row[1]['comment']
    mentionsRevert = any(findWholeWord(word)(comment) for word in revertWords)
    if(mentionsRevert and not findWholeWord('undid')(comment)):
        appendRow(row,revertRows)

len(revertRows)

126

In [74]:
print('comments that do not start with the word revert or rv:')
for comment in revertRows['comment']:
    firstToken = re.split(" ",comment)[0].lower()
    startsWithRevert = 'revert' in firstToken or firstToken.startswith('rv')
    if(not startsWithRevert):
        print(comment)

comments that do not start with the word revert or rv:
partly reverted RyogaNica. ... not all "critical" sites are personal sites
missed some of 66.66.125.x?'s unsourced POV-pushing earlier. Rv back to Agustin's version.


In [75]:
def addComplaintEdges(row, targetUser, multiplier=1):
    "add vandalism / accusation edges from the reverting user to targetUser when the comment uses those words"
    if(findWholeWord('vandalism')(row['comment'])):
        addEdge(row['user'],targetUser,weights['vandalism'] * multiplier)
    if(findWholeWord('pov')(row['comment']) or findWholeWord('unsourced')(row['comment'])):
        addEdge(row['user'],targetUser,weights['accusation'] * multiplier)

# a token that consists only of digits, e.g. the "3" in "reverted 3 edits"
wholeNumber = re.compile("^[0-9]+$")

# For every revert comment work out
#   againstUser   - the user whose edit(s) were reverted
#   forUser       - the user whose version was restored (named after the word "to")
#   editsReverted - the number in front of "edit(s)", 0 when the comment gives none
# and translate that into edges with addEdge.
for index, row in revertRows.iterrows():
    tokens = re.split(" ",row['comment'])
    revertFound = False
    toFound = False
    editsReverted = 0
    forUser=''
    againstUser = ''
    for tokenIndex,token in enumerate(tokens):
        loweredToken = token.lower()
        if(not revertFound):
            # The revert keyword is any token containing "revert" or starting with "rv".
            # (This used to test `'rv' != ...`, which flagged almost every first token.)
            if('revert' in loweredToken or 'rv' == loweredToken[0:2]):
                revertFound = True
            else:
                # a username mentioned before the revert keyword is the reverted user
                for user in usernames:
                    if(user.lower() in loweredToken):
                        againstUser = user
        else:
            if('to' == token):
                toFound = True
            elif(wholeNumber.match(token)):
                # only count the number when the next token talks about edits
                if(tokenIndex < len(tokens) - 1):
                    if('edit' in tokens[tokenIndex+1].lower()):
                        editsReverted = int(token)
            else:
                for user in usernames:
                    if(user.lower() in loweredToken):
                        if(toFound and forUser == ''):
                            forUser = user
                        elif(againstUser == ''):
                            againstUser = user
    if(againstUser == ''):
        if(forUser == ''):
            # No user named at all: ID + 1 is treated as the removed edit and
            # ID + 2 as the version that was restored.
            toEdit = data.loc[data['ID'] == (row['ID'] + 2 )]
            removedEdit = data.loc[data['ID'] == (row['ID'] + 1 )]
            removedUser = removedEdit['user'].item()
            addComplaintEdges(row,removedUser)
            addEdge(row['user'],removedUser,weights['revert'],row)
            # only credit the ID + 2 author when the page length really matches
            if(row['pageLength'] == toEdit['pageLength'].item()):
                addEdge(row['user'],toEdit['user'].item(),weights['for'],row)
        else:
            # Only the restored user is named: every edit between this revert and
            # that user's matching version (same page length) counts as reverted.
            previousCommitsByForUser = data.loc[(data['ID']>row['ID']) & (data['user'] == forUser) & (data['pageLength'] == row['pageLength'])]
            previousCommitIndex = previousCommitsByForUser['ID'].min()
            commitsInBetween = data.loc[(data['ID']>row['ID']) & (data['ID'] < previousCommitIndex)]
            for user in list(commitsInBetween['user']):
                addComplaintEdges(row,user)
                addEdge(row['user'],user,weights['revert'],row)
            addEdge(row['user'],forUser,weights['for'],row)
    elif(forUser == ''):
        if(editsReverted !=0):
            # "reverted N edits by X": the restored version sits N + 1 rows further on
            toEdit = data.loc[data['ID'] == (row['ID'] + 1 + editsReverted)]
            forUser = toEdit['user'].item()
            addComplaintEdges(row,againstUser,editsReverted)
            addEdge(row['user'],againstUser,weights['revert'] * editsReverted,row)
            addEdge(row['user'],forUser,weights['for'],row)
        else:
            toEdit = data.loc[data['ID'] == (row['ID'] + 2 )]
            if('good faith' in row['comment']):
                addEdge(row['user'],againstUser,weights['goodFaith'])
            addComplaintEdges(row,againstUser)
            addEdge(row['user'],againstUser,weights['revert'],row)
            if(row['pageLength'] == toEdit['pageLength'].item()):
                addEdge(row['user'],toEdit['user'].item(),weights['for'],row)
    else:
        # both users are named in the comment; scale by the edit count when one was given
        multiplier = editsReverted if editsReverted != 0 else 1
        addComplaintEdges(row,againstUser,multiplier)
        addEdge(row['user'],againstUser,weights['revert'] * multiplier,row)
        addEdge(row['user'],forUser,weights['for'],row)


In [76]:
# Static network plot: users are nodes, `edges` plus the big-edit sequence are the links.
graph = nx.Graph()
# hub node that every 'negative' faction user is attached to
graph.add_node('negative')
# graph.add_node('positive')
# graph.add_edge('positive', 'negative', weight=-500 )
users = list(bigEdits['user'])
for edge in edges:
    users.append(edge[0])
    users.append(edge[1])
users = list(set(users))

# faction of each user's first big edit
firstFaction = {}
for user, faction in zip(bigEdits['user'], bigEdits['faction']):
    firstFaction.setdefault(user, faction)

for user in users:
    if(user in bots):
        continue
    graph.add_node(user)
    if(firstFaction.get(user) == 'negative'):
        graph.add_edge(user, 'negative', weight=10 )
#     positive users would be tied to a 'positive' hub here:
#         graph.add_edge(user, 'positive', weight=10 )

# link each 'positive' big edit to the author of the big edit listed just before it
prevUser = 'none'
for row in bigEdits.iterrows():
    if(row[1]['faction'] == 'positive'):
        graph.add_edge(row[1]['user'], prevUser, weight=-10 )
    prevUser = row[1]['user']

for edge in edges:
    graph.add_edge(edge[0],edge[1],weight = edge[2])

# Style the nodes only after all edges exist: add_edge can create nodes that were never
# added explicitly (the 'none' placeholder, bots), and every node needs a colour and size
# in the same order as the renderer's node index.
fillColor = []
size = []
for node in graph.nodes():
    if(node == 'negative'):
        fillColor.append(red)
        size.append(20)
    elif(node in firstFaction):
        fillColor.append(red if firstFaction[node] == 'negative' else green)
        size.append(10)
    else:
        fillColor.append(grey)
        size.append(10)

hover = HoverTool(tooltips = [("User", "@index")])

plot = figure(title="Wiki Edits Network with 'Undid' edits", x_range=(-5,5), y_range=(-5,5),tools=[hover,TapTool(),
    PanTool(),BoxZoomTool(),ResetTool()])
plotGraph = from_networkx(graph, nx.spring_layout, scale=5, center=(0,0))
# add() registers the columns with the data source; writing into .data[...] directly
# is what made the validator report E-1001 (BAD_COLUMN_NAME) for fill_color and size
plotGraph.node_renderer.data_source.add(fillColor, 'fill_color')
plotGraph.node_renderer.data_source.add(size, 'size')
plotGraph.node_renderer.glyph = Circle(size='size',fill_color='fill_color',line_color='fill_color')
plotGraph.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=1)
plot.renderers.append(plotGraph)
show(plot)

ERROR:C:\ProgramData\Anaconda3\lib\site-packages\bokeh\core\validation\check.py:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color, size [renderer: GlyphRenderer(id='ad0eb6cb-6fcc-429b-ab05-90707b1c0408', ...)]


 #### Word: Vandalism

In [77]:
# Conflict edits whose comment contains the whole word "vandalism";
# edits whose ID appears in revertRows are left out.
revertIds = list(revertRows['ID'])
vandalismRows = pandas.DataFrame(columns=list(conflictRows.columns))
for conflictRow in conflictRows.iterrows():
    edit = conflictRow[1]
    mentionsVandalism = findWholeWord('vandalism')(edit['comment'])
    if(mentionsVandalism and edit['ID'] not in revertIds):
        appendRow(conflictRow, vandalismRows)

len(vandalismRows)

2

In [78]:
vandalismRows['comment'].tolist()

['restored article to edit before vandalism/edit-warring by 66.66.125.x',
 'Removed blatant vandalism from lead Pablo6213: "Paraiso is a lie......"']

In [79]:
# every username mentioned in a vandalism comment gets a combined revert + vandalism edge
for position,vandalismRow in vandalismRows.iterrows():
    for word in vandalismRow['comment'].split(" "):
        loweredWord = word.lower()
        for mentionedUser in usernames:
            if(mentionedUser.lower() not in loweredWord):
                continue
            addEdge(vandalismRow['user'],mentionedUser,weights['revert'] + weights['vandalism'],vandalismRows.iloc[position])

#### Word: POV

In [80]:
# Conflict edits whose comment contains the whole word "pov";
# edits whose ID appears in revertRows or undidRows are left out.
undidIds = list(undidRows['ID'])
povRows = pandas.DataFrame(columns=list(conflictRows.columns))
for conflictRow in conflictRows.iterrows():
    edit = conflictRow[1]
    mentionsPov = findWholeWord('pov')(edit['comment'])
    if(mentionsPov and edit['ID'] not in revertIds and edit['ID'] not in undidIds):
        appendRow(conflictRow, povRows)

len(povRows)

1

In [81]:
povRows['comment'].tolist()

['Stop your POV pushing Rm99!!!!']

In [82]:
# every username mentioned in a POV comment gets a combined revert + accusation edge
for position,povRow in povRows.iterrows():
    for word in povRow['comment'].split(" "):
        loweredWord = word.lower()
        for mentionedUser in usernames:
            if(mentionedUser.lower() not in loweredWord):
                continue
            addEdge(povRow['user'],mentionedUser,weights['revert'] + weights['accusation'],povRows.iloc[position])

#### Word: Unsourced

In [83]:
# Conflict edits whose comment contains the whole word "unsourced";
# edits whose ID appears in revertIds are left out.
unsourcedRows = pandas.DataFrame(columns=list(conflictRows.columns))
for conflictRow in conflictRows.iterrows():
    edit = conflictRow[1]
    mentionsUnsourced = findWholeWord('unsourced')(edit['comment'])
    if(mentionsUnsourced and edit['ID'] not in revertIds):
        appendRow(conflictRow, unsourcedRows)

len(unsourcedRows)

0

In [113]:
# graph.add_node('negative')
# fillColor.append(red)
# size.append(20)
# graph.add_node('positive')
# fillColor.append(green)
# size.append(20)
# graph.add_edge('positive', 'negative', weight=-500 )
#########################################################################################


def modify_doc3(doc):
    "build the interactive network document: a user dropdown, the graph plot and a panel listing the edits behind each edge of the selected user"
    import html  # local import, only used to escape text that is placed inside the Div

    # tokens that are highlighted in the detail panel
    revertWords = {'undid','undo','reverted','revert','rv','rvt','reverting'}
    accusationWords = {'vandalism','pov'}

    def wrapText(text):
        "wrap an html fragment in a paragraph"
        return "<p>"+text+"</p>"

    def highlightComment(commentText):
        "escape an edit comment and colour the tokens holding usernames, revert words and accusation words"
        comment = ""
        for token in re.split(" ",commentText):
            # comments are free text written by wiki editors, so escape before embedding in html
            safeToken = html.escape(token)
            if(any(username in token for username in usernames)):
                comment += " <span style='background:#9EFFB7'>" + safeToken + "</span>"
            elif(token.lower() in revertWords):
                comment += " <span style='background:#FFD6E1'>" + safeToken + "</span>"
            elif(token.lower() in accusationWords):
                comment += " <span style='background:#B0E8FF'>" + safeToken + "</span>"
            else:
                comment += " " + safeToken
        return comment

    def selectUser(user,dropdown):
        """show the relationships of `user` in the detail panel and keep plot and dropdown in sync

        dropdown is True when the call comes from the dropdown (the node and its edges are
        then selected in the plot) and False when it comes from a click on a node (the
        dropdown value is then updated instead).
        """
        entireText = ""
        entireText += "<h3>" + html.escape(user) + '</h3>'
        for edge in edges:
            if(user == edge[1] or user == edge[0]):
                if(user == edge[1]):
                    otherUser = edge[0]
                else:
                    otherUser = edge[1]
                entireText += "<div style='border:1px solid grey;'>"
                entireText += wrapText("<b>Relationship with "+html.escape(otherUser)+": </b>")
                entireText += wrapText("<b>Weight:</b> "+str(edge[2]))
                # edge[3] holds the edits that produced this edge; entries may be None
                for edit in edge[3]:
                    if(edit is not None):
                        entireText += wrapText("<b>user:</b> "+html.escape(edit['user']))
                        entireText += wrapText("<b>comment:</b> "+highlightComment(edit['comment']))
                entireText += "</div>"
        layout.children[1].children[1] = Div(text= entireText,width=400, height=100)
        if(dropdown):
            nodeNames = list(graphRenderer.node_renderer.data_source.data['index'])
            if(user not in nodeNames):
                # the dropdown can list users that are not nodes (e.g. skipped bots):
                # the panel is updated above, but there is nothing to highlight
                return
            userIndex = nodeNames.index(user)
            # collect every edge that starts or ends at the selected user
            edgeIndices = []
            for index,username in enumerate(graphRenderer.edge_renderer.data_source.data['start']):
                if(username == user):
                    edgeIndices.append(index)
            for index,username in enumerate(graphRenderer.edge_renderer.data_source.data['end']):
                if(username == user):
                    edgeIndices.append(index)

            edgeIndices.sort()
            edgeObject = {}
            for edgeIndex in edgeIndices:
                edgeObject[str(edgeIndex)] = [0]

            # only touch the selection when it is not already on this user
            if(len(graphRenderer.node_renderer.data_source.selected['1d']['indices'])== 0 or 
                graphRenderer.node_renderer.data_source.selected['1d']['indices'][0] != userIndex):

                graphRenderer.edge_renderer.data_source.selected['0d'] =  {'glyph': None, 'get_view': {}, 'indices': []}
                graphRenderer.edge_renderer.data_source.selected['1d']['indices'] = []
                graphRenderer.edge_renderer.data_source.selected['2d']['indices'] = edgeObject

                graphRenderer.node_renderer.data_source.selected['1d']['indices'] = [userIndex]
                copy.copy(graphRenderer.edge_renderer.data_source.selected)
                copy.copy(graphRenderer.node_renderer.data_source.selected) #triggers change for some reason
        else:
            if(userDropdown.value != user):
                userDropdown.value = user

    def update(attr, old, new):
        "selection callback of the node data source: show the clicked user or clear the panel"
        selectedIndex = new['1d']['indices']
        if(len(selectedIndex) != 0):
            user = graphRenderer.node_renderer.data_source.data['index'][selectedIndex[0]]
            selectUser(user,False)
        else:
            layout.children[1].children[1] = Div(text="",width=400, height=100)


    def userDropdownCallback(attr, old, new):
        "dropdown callback: select the chosen user in the plot"
        selectUser(userDropdown.value,True)

    # Every user with a big edit or an edge. The loop used to append to the notebook-level
    # `users` list, which grew that global on every session and ignored the list built here.
    dropdownUsers = list(bigEdits['user'])
    for edge in edges:
        dropdownUsers.append(edge[0])
        dropdownUsers.append(edge[1])

    dropdownUsers = sorted(set(dropdownUsers))

    userDropdown = Select(title='select user',options=dropdownUsers)
    userDropdown.on_change('value',userDropdownCallback)



    def plotGraph():
        "build the network figure and store its graph renderer in the global graphRenderer"
        global graphRenderer

        graph = nx.Graph()
        users = list(bigEdits['user'])
        for edge in edges:
            users.append(edge[0])
            users.append(edge[1])

        users = list(set(users))
        for user in users:
            if(user in bots):
                continue
            graph.add_node(user)

        # link each 'positive' big edit to the author of the big edit listed just before it
        prevUser = 'none'
        for row in bigEdits.iterrows():
            if(row[1]['faction'] == 'positive'):
                graph.add_edge(row[1]['user'], prevUser, weight=weights['revert'] )
            prevUser = row[1]['user']

        for edge in edges:
            graph.add_edge(edge[0],edge[1],weight = edge[2])

        # faction of each user's first big edit
        firstFaction = {}
        for user, faction in zip(bigEdits['user'], bigEdits['faction']):
            firstFaction.setdefault(user, faction)

        # Style the nodes only after all edges exist: add_edge can create nodes that were
        # never added explicitly (the 'none' placeholder, bots), and every node needs a
        # colour and size in the same order as the renderer's node index.
        fillColor = []
        size = []
        for node in graph.nodes():
            if(node in firstFaction):
                fillColor.append(red if firstFaction[node] == 'negative' else green)
            else:
                fillColor.append(grey)
            size.append(10)

        hover = HoverTool(tooltips = [("User", "@index")])
        plot = figure(title="Wiki Edits Network with 'Undid' edits", x_range=(-5,5), y_range=(-5,5),tools=[hover,TapTool()] + getDefaultTools())
        graphRenderer = from_networkx(graph, nx.spring_layout, scale=5, center=(0,0))
        # add() registers the columns with the data source; writing into .data[...] directly
        # is what made the validator report E-1001 (BAD_COLUMN_NAME) for fill_color and size
        graphRenderer.node_renderer.data_source.add(fillColor, 'fill_color')
        graphRenderer.node_renderer.data_source.add(size, 'size')
        graphRenderer.node_renderer.glyph = Circle(size='size',fill_color='fill_color',line_color='fill_color')

        graphRenderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.8, line_width=1)
        graphRenderer.edge_renderer.selection_glyph = MultiLine(line_color="black", line_alpha=1, line_width=3)
        graphRenderer.node_renderer.data_source.on_change('selected',update)
        graphRenderer.selection_policy = NodesAndLinkedEdges()
        plot.renderers.append(graphRenderer)
        return plot

    div = Div(text="",width=400, height=100)
    plot = plotGraph()
    # layout.children[1] is the Row; its second child is the detail panel replaced by the callbacks
    layout = Column(userDropdown,Row(plot,div))
    doc.add_root(layout)

# Wrap modify_doc3 in a Bokeh Application so that its Python callbacks can run
# while the document is displayed in the notebook.
handler3 = FunctionHandler(modify_doc3)
app3 = Application(handler3)
# NOTE(review): notebook_url is hard-coded; presumably it has to match the address
# this notebook is served from - confirm when running on another host or port.
show(app3,notebook_url='localhost:8888')

ERROR:C:\ProgramData\Anaconda3\lib\site-packages\bokeh\core\validation\check.py:E-1001 (BAD_COLUMN_NAME): Glyph refers to nonexistent column name: fill_color, size [renderer: GlyphRenderer(id='6d178c2b-e59c-4c3e-9f0a-584ac7d1471b', ...)]


**Pro-tip:** For some reason, the edges are only highlighted correctly if a node in the plot has been clicked at least once before a user is selected from the dropdown.