## Initial data visualization

**Note:** This visualization lives in a separate notebook because the graph is too large to display inline the way the graphs in the original notebook are.

In this visualization, we will focus on visualizing the user-wise edits on a timeline. We will color an edit red if bytes were removed and blue if bytes were added in that edit. The size of each circle represents the number of bytes changed.

In [6]:
import numpy as np
import pandas
import re
from bokeh.plotting import figure
from bokeh.io import output_notebook, show,output_file,install_notebook_hook
from bokeh.models import ColumnDataSource, Select,LabelSet, HoverTool, DatetimeAxis, TapTool, CustomJS, BoxZoomTool, PanTool
from bokeh.models import WheelZoomTool, UndoTool, RedoTool, ResetTool, ZoomInTool, ZoomOutTool
import nltk
import re
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans,DBSCAN
import pprint
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bokeh.models.widgets import DataTable, TableColumn, Div, RangeSlider, Slider, Select
from bokeh.layouts import Row,widgetbox, Column
from sklearn.manifold import MDS
from sklearn.decomposition import TruncatedSVD
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from bokeh.palettes import Category10
import math

In [7]:
# Parse the raw edit-history dump into `data`, one DataFrame row per edit.


def _parse_edit_line(line):
    """Turn one raw history line into the six fields stored in `data`.

    The line is split on every '(' and ')' and fields are picked by position:
    token 4 supplies the timestamp and user name, token 6 the minor-edit
    marker, token 7 the page length and token 9 the comment.
    NOTE(review): the positional layout is inferred from the indices used
    here; confirm against the actual format of the dump.

    Returns:
        [timestamp, user, minorEdit, pageLength, comment, entireEdit], where
        pageLength is -1 when no numeric length could be read.

    Raises:
        IndexError: if the line has fewer parenthesised groups than expected.
    """
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\(|\)', line)

    # The last two space-separated pieces are the user name and a trailing
    # empty string; everything before them is the timestamp text.
    stampAndName = tokens[4].split(' ')
    name = stampAndName[-2]
    timestamp = pandas.to_datetime(" ".join(stampAndName[:-2]))

    minorEdit = tokens[6] == ' m '

    pageLength = tokens[7].split(' ')[0].replace(',', '')
    if pageLength.isdigit():
        pageLength = int(pageLength)
        comment = tokens[9] if len(tokens) >= 10 else ""
    else:
        # No byte count in this position: token 7 is used as the comment
        # and the length is flagged as unknown with the -1 sentinel.
        pageLength = -1
        comment = tokens[7]

    return [timestamp, name, minorEdit, pageLength, comment, line]


# The context manager guarantees the handle is closed even if reading fails.
with open('data/Paraiso Edits.txt', 'r', errors='ignore') as file:
    lines = file.readlines()

# Skip the first three lines of the file (they are not parsed as edits).
lines = lines[3:]

# Collect rows in a list and build the frame once; appending with
# data.loc[len(data)] re-allocates the frame on every edit.
rows = [_parse_edit_line(line) for line in lines if line.strip()]
data = pandas.DataFrame(
    rows,
    columns=['timestamp', 'user', 'minorEdit', 'pageLength', 'comment', 'entireEdit'],
)
        


Since this visualization works on the bytes added in edits, we will need to add another column to the dataframe (editDiff). We are assuming that the given data is complete and that none of the edits are omitted from the data.

In [8]:
# Derive `editDiff`: the change in page length relative to the most recent
# earlier edit whose length is known. Edits with an unknown length
# (pageLength == -1) get a diff of 0 and do not move the running baseline.
data = data.sort_values('timestamp', ascending=True)

# Accumulate the diffs in a plain list and assign the column once, instead of
# writing into `data` while iterating over it with iterrows().
diffs = []
previousPageLength = 0
for pageLength in data['pageLength']:
    if pageLength == -1:
        diffs.append(0)
        continue
    diffs.append(pageLength - previousPageLength)
    previousPageLength = pageLength

# Float dtype matches what the row-by-row .loc assignment used to produce;
# the column now also exists when `data` is empty.
data['editDiff'] = pandas.Series(diffs, index=data.index, dtype=float)

In [9]:
# Interactive Bokeh timeline: one categorical row per user, one circle per
# edit (blue = bytes added, red = bytes removed), with a side panel that
# shows the details of the tapped point. Output goes to wiki-edits.html.
# NOTE(review): `source.callback`, `source.selected['1d']`, `legend=` and
# `plot_width`/`plot_height` are legacy Bokeh APIs; they are kept as-is
# because the installed Bokeh version is not visible here - confirm before
# upgrading.
contributions = data['user'].value_counts()
blue = '#2E18F4'
red = '#FB412C'
pointColumns = ['timestamp', 'user', 'size', 'color', 'editDiff', 'comment']

# value_counts() sorts by descending count; reversing it puts the least
# active user first in the categorical y range.
users = contributions.index[::-1].tolist()

# Scale circle sizes into 6..100 using the largest *absolute* change, so a
# removal bigger than every addition cannot overshoot the range, and guard
# against a zero (or missing) maximum instead of dividing by it.
editDiffs = pandas.to_numeric(data['editDiff'])
maxDiff = editDiffs.abs().max()
sizeScale = 94 / maxDiff if maxDiff > 0 else 0

points = pandas.DataFrame({
    'timestamp': pandas.to_datetime(data['timestamp']),
    'user': data['user'],
    'size': editDiffs.abs() * sizeScale + 6,
    'editDiff': editDiffs,
    'comment': data['comment'],
})

# Split once with a boolean mask rather than appending row by row.
isRemoval = points['editDiff'] < 0
additions = points[~isRemoval].assign(color=blue)[pointColumns].reset_index(drop=True)
deletions = points[isRemoval].assign(color=red)[pointColumns].reset_index(drop=True)


def _edit_source(frame):
    """Wrap an additions/deletions frame in the ColumnDataSource the glyphs read."""
    return ColumnDataSource(data=dict(
        x=frame['timestamp'],
        y=frame['user'],
        size=frame['size'],
        color=frame['color'],
        editDiff=frame['editDiff'],
        comment=frame['comment'],
    ))


blueSource = _edit_source(additions)
redSource = _edit_source(deletions)

p = figure(plot_width=1000, plot_height=3500, x_axis_type="datetime", y_range=users,
           x_axis_location="above",
           title='Wiki edits over time (Click on the points for more details)',
           tools=[TapTool(), BoxZoomTool(), PanTool(), WheelZoomTool(), UndoTool(),
                  RedoTool(), ResetTool(), ZoomInTool(), ZoomOutTool()])
# Second date axis at the bottom so the timeline is readable on a tall plot.
p.add_layout(DatetimeAxis(), 'below')
p.yaxis.major_label_text_font_size = "6pt"

# Callback body shared by both sources. Compared with a naive version it:
#   * declares every variable with `var` (no leaked globals such as `name`);
#   * returns early on an empty selection so a cleared source does not
#     overwrite the panel with "undefined";
#   * HTML-escapes user names and comments, which come from the edit log and
#     must not be interpreted as markup.
code = """
    var data = source.data;
    var selected = source.selected['1d']['indices'];
    if (!selected || selected.length === 0) {
        return;
    }
    var select_inds = selected[0];
    function escapeHtml(value) {
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
    }
    var date = new Date(data['x'][select_inds]);
    var name = "<p><b>User: </b>"+escapeHtml(data['y'][select_inds])+"</p>";
    var timestamp = "<p><b>Timestamp: </b>"+date+"</p>";
    var editDiff = "<p><b>Bytes Changed: </b>"+data['editDiff'][select_inds]+" Bytes</p>";
    var comment = "<p><b>Comment: </b>"+escapeHtml(data['comment'][select_inds])+"</p>";
    div.text = name+timestamp+editDiff+comment;
"""

div = Div(text="", width=400, height=100)

blueSource.callback = CustomJS(args={'source': blueSource, 'div': div}, code=code)
redSource.callback = CustomJS(args={'source': redSource, 'div': div}, code=code)

p.circle('x', 'y', size='size', color=blue, alpha=0.3, source=blueSource,
         legend='Bytes added', line_width=0,
         selection_fill_alpha=1, selection_fill_color=blue,
         nonselection_fill_color=blue, selection_line_color=blue)
p.circle('x', 'y', size='size', color=red, alpha=0.3, source=redSource,
         legend='Bytes removed', line_width=0,
         selection_fill_alpha=1, selection_fill_color=red,
         nonselection_fill_color=red, selection_line_color=red)
# Clicking a legend entry toggles the corresponding set of circles.
p.legend.click_policy = "hide"

heading = Div(text='<h3>Details of the selected point</h3>', width=400, height=20)

output_file('wiki-edits.html')

show(Row(p, Column(heading, div)))

**Conclusion:** This visualization gives a clear picture of the given data set. We can see that there is a major difference between the size of the changes in minor edits and in major edits. Most of the big red dots represent edits in which someone has removed the entire page. This is an act of vandalism, and these users could be categorized as anti-Paraiso. Most of the big blue dots are reverts of these acts of vandalism. The big cluster of blue dots on the top left consists of reverts that were done by a bot: BakBOT.

Unlike the sample data scraped from Wikipedia for previous homeworks, the size of each edit in this dataset can be easily classified as either small or big. There are no intermediate-sized edits.