In [1]:
import wikipedia
import pandas as pd
import numpy as np
import os
import sys
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import matplotlib.pyplot as plt


In [2]:
def read_lightdump(fp):
    '''
	Reads in n lightdump pages and returns a list of all titles 
    read and their corresponding data as a DataFrame
	:param fp: input filepath
	:param n: number of articles to read
	:return: list of article titles, list of corresponding article lightdump data as DataFrame
	'''
    
    with open(fp) as file:
        df = pd.DataFrame(columns = ['timestamp', 'revert', 'revision_id', 'length', 'user'])
        for line in file:
            if '^^^_' not in line:
                title = line.strip('\n').strip()

            else:
                data = line.strip("^^^_").strip('\n').split()
                row = pd.Series(dtype = 'object')

                row['timestamp'] = data[0]
                row['revert'] = int(data[1])
                row['revision_id'] = int(data[2])
                row['length'] = int(data[3])
                row['user'] = data[4]

                df = df.append(row, ignore_index = True)

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    return title, df

In [3]:
# M STAT CALCULATION: single article
def calculate_M(edits):
    edits = edits.values.tolist()
    edits.reverse()
    
    #cannot have edit war with 2 edits
    if len(edits) <= 2:
        return 0
    
    #cannot have edit war with less than 2 reverts
    try:
        num_reverts = sum([int(x[1]) for x in edits])
        if num_reverts < 2:
            return 0
    except:
        pass # bad data
    
    #M STAT: find revert pairs
    revert_pairs = []

    for lst in edits:
        if len(lst) < 4: # skip bad data
            continue
        
        if lst[1]=='1':  # is a revert
            user_one = lst[3]
            org_idx = int(lst[2])-1
            try:
                user_two = edits[org_idx][3]
            except:
                continue
            
            # exclude self revert
            if user_one == user_two:
                continue
            
            if (user_one, user_two) not in revert_pairs:
                revert_pairs.append((user_one, user_two))
                
                
    if len(revert_pairs) == 0:
        print("There are no reverting pairs")

    #M STAT: find mutual reverts
    mutual_rev_users = []
    mutual_rev_pairs = []
    for pair in revert_pairs:
        one = pair[0]
        two = pair[1]

        #mutual revert found
        if (two, one) in revert_pairs:
            mutual_rev_pairs.append((one, two))
            mutual_rev_users.append(two)
            mutual_rev_users.append(one)

    #remove duplicates, calculate num
    E = len(list(set(mutual_rev_users)))

    if E == 0:
        return 0
    
    #get num edits per user
    users = [x[3] for x in edits if len(x) == 4]
    user_edits = dict((x,users.count(x)) for x in set(users))
    
    #calculate M
    M = 0
    
    for pair in list(set(mutual_rev_pairs)):
        one = pair[0]
        two = pair[1]
        if user_edits[one] < user_edits[two]:
            N = user_edits[one]
        else:
            N = user_edits[two]

        M += N
    
    M *= E
    return M

In [4]:
#Read in data
files = [
    '../data/raw/light_dump/bts_light_dump.txt',
    '../data/raw/light_dump/Blackpink_light_dump.txt',
    '../data/raw/light_dump/Girls_Generation_light_dump.txt',
    '../data/raw/light_dump/Backstreet_Boys_light_dump.txt',
    '../data/raw/light_dump/Justin_Bieber_light_dump.txt',
    '../data/raw/light_dump/Taylor_Swift_light_dump.txt',
    '../data/raw/light_dump/All_Lives_Matter_light_dump.txt',
    '../data/raw/light_dump/Black_Lives_Matter_light_dump.txt',
    '../data/raw/light_dump/Blue_Lives_Matter_light_dump.txt',
]

In [122]:
summary = pd.DataFrame([], columns = ['title', 
                                      'M', 
                                      'num_edits', 
                                      'num_reverts', 
                                      'num_users', 
                                      'avg_edits_per_user',
                                      'num_bots', 
                                      'bot_edits', 
                                      'bot_reverts'
                                     ])

In [127]:
for fp in files:
    title, df = read_lightdump(fp)
    data = [title]
    data.append(calculate_M(df))
    data.append(df.shape[0])
    data.append(df.revert.sum())
    unique_users = df.user.str.lower().unique()
    data.append(len(unique_users))
    data.append(df.groupby('user').count().revert.mean())
    data.append(len([x for x in unique_users if 'bot' in x]))
    df['bots'] = df.user.apply(lambda x: 1 if 'bot' in x else 0)
    data.append(df[df.bots == 1].shape[0])
    data.append(df[(df.revert == 1)&(df.bots == 1)].shape[0])
    data = pd.Series(data, index = summary.columns)
    summary = summary.append(data, ignore_index = True)

There are no reverting pairs
There are no reverting pairs
There are no reverting pairs
There are no reverting pairs
There are no reverting pairs
There are no reverting pairs
There are no reverting pairs
There are no reverting pairs
There are no reverting pairs


In [130]:
summary['revert/edit'] = summary.num_reverts / summary.num_edits
summary['bot/users'] = summary.num_bots / summary.num_users
summary.iloc[9:]

Unnamed: 0,title,M,num_edits,num_reverts,num_users,avg_edits_per_user,num_bots,bot_edits,bot_reverts,revert/edit,bot/users
9,BTS,0,6242,1009,1565,3.988498,24,34,1,0.161647,0.0153355
10,Blackpink,0,4316,823,1167,3.698372,19,23,0,0.190686,0.0162811
11,Girls' Generation,0,12609,1496,3782,3.333069,47,42,0,0.118645,0.0124273
12,Backstreet Boys,0,10392,2299,4088,2.542074,78,63,9,0.221228,0.0190802
13,Justin Bieber,0,9466,2260,3114,3.038844,62,76,3,0.238749,0.0199101
14,Taylor Swift,0,6540,617,1619,4.039531,54,33,0,0.0943425,0.0333539
15,All Lives Matter,0,356,94,180,1.977778,7,2,0,0.264045,0.0388889
16,Black Lives Matter,0,4226,563,1023,4.130987,19,29,0,0.133223,0.0185728
17,Blue Lives Matter,0,452,128,238,1.89916,12,7,0,0.283186,0.0504202


========== Scratch ==========

## bts

In [45]:
fp = files[0]
title, bts = read_lightdump(fp)

In [46]:
bts.head()

Unnamed: 0,timestamp,revert,revision_id,length,user
0,2013-07-04 19:45:15+00:00,0,1,7716,Hinorisakamachi
1,2013-07-04 19:47:39+00:00,0,2,7670,Hinorisakamachi
2,2013-07-04 19:59:17+00:00,0,3,7270,Hinorisakamachi
3,2013-07-04 19:59:53+00:00,0,4,7270,Hinorisakamachi
4,2013-07-04 20:19:54+00:00,0,5,7146,39.198.179.240


In [120]:
bts.groupby('user').count().revert.mean()

3.9884984025559107

In [105]:
data = [title]

In [106]:
data.append(calculate_M(bts))
calculate_M(bts)

There are no reverting pairs
There are no reverting pairs


0

In [107]:
data.append(bts.shape[0])
data.append(bts.revert.sum())
bts.revert.sum() #num reverts
bts.shape[0] #num edits

6242

In [108]:
# revert to edit ratio
bts.revert.sum() / bts.shape[0] 

0.16164690804229415

In [109]:
# bot ratio
unique_users = bts.user.str.lower().unique()
len([x for x in unique_users if 'bot' in x]) / len(unique_users)

data.append(len(unique_users))
data.append(len([x for x in unique_users if 'bot' in x]))

In [110]:
# ratio of bot edits
bots = [x for x in unique_users if 'bot' in x]
bts['bots'] = bts.user.apply(lambda x: 1 if 'bot' in x else 0)
bts[bts.bots == 1].shape[0] / bts.shape[0]

data.append(bts[bts.bots == 1].shape[0])

In [111]:
#ratio of bot reverts
bts[(bts.revert == 1)&(bts.bots == 1)].shape[0] / bts[bts.revert == 1].shape[0]

data.append(bts[(bts.revert == 1)&(bts.bots == 1)].shape[0])

In [112]:
data = pd.Series(data, index = summary.columns)
summary = summary.append(data, ignore_index = True)

## justin biebs

In [30]:
fp = files[4]
title, df = read_lightdump(fp)

In [31]:
df.head()

Unnamed: 0,timestamp,revert,revision_id,length,user
0,2008-04-22 13:44:17+00:00,0,1,1427,Jescelle777
1,2008-04-22 13:49:00+00:00,0,2,1507,Jescelle777
2,2009-06-14 01:33:04+00:00,0,3,492,Morts623
3,2009-06-14 01:33:41+00:00,0,4,506,Blanchardb
4,2009-06-14 01:36:06+00:00,0,5,507,Morts623


In [32]:
calculate_M(df)

There are no reverting pairs


0

In [54]:
df.revert.sum() #num reverts
df.shape[0] #num edits

9466

In [33]:
# revert to edit ratio
df.revert.sum() / df.shape[0] 

0.23874920769068245

In [34]:
# bot ratio
unique_users = df.user.str.lower().unique()
len([x for x in unique_users if 'bot' in x]) / len(unique_users)

0.01991008349389852

In [40]:
# ratio of bot edits
bots = [x for x in unique_users if 'bot' in x]
df['bots'] = df.user.apply(lambda x: 1 if 'bot' in x else 0)
df[df.bots == 1].shape[0] / df.shape[0]

0.008028734417916754

In [44]:
#ratio of bot reverts
df[(df.revert == 1)&(df.bots == 1)].shape[0] / df[df.revert == 1].shape[0]

0.001327433628318584

## Summary

In [113]:
summary

Unnamed: 0,title,M,num_edits,num_reverts,num_users,num_bots,bot_edits,bot_reverts
0,BTS,0,6242,1009,1565,24,34,1


In [114]:
summary['revert/edit'] = summary.num_reverts / summary.num_edits

In [115]:
summary['bot/users'] = summary.num_bots / summary.num_users

In [116]:
summary

Unnamed: 0,title,M,num_edits,num_reverts,num_users,num_bots,bot_edits,bot_reverts,revert/edit,bot/users
0,BTS,0,6242,1009,1565,24,34,1,0.161647,0.0153355
