## This notebook explores the relationship between the topics of the Wikipedia pages edited by users and the occurrence of votes between pairs of users. The goal is to identify correlations and patterns that would show that editing similar topics influences the motivation to cast a vote.

In [15]:
import pandas as pd
import numpy as np
from itertools import combinations
import gzip
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
# Load the votes dataset from the local data directory.
# The first CSV column is used as the index.
# NOTE: the frame is bound to `df` only. It was previously also bound to
# `edits_df`, a name that a later cell reuses for a different dataset.
df = pd.read_csv("../data/wiki-RfA-cleaned.csv", index_col=0)

# Set of users that are present in the adminship dataset: every label found in
# the index (presumably the voter -- TODO confirm) or in the 'target' column.
admin_set = set(df.index.to_list()) | set(df['target'].to_list())

In [17]:
# Source of the raw data: https://snap.stanford.edu/data/wiki-meta.html
# The file loaded here is an already pre-processed version of that dataset:
# each row gives a user, a page they modified and the number of edits.

edits_df = pd.read_csv(
    "../data/interactions_edits_grouped.zip",
    compression='zip',
    index_col=0,
)
edits_df

Unnamed: 0_level_0,username,counts
article_title,Unnamed: 1_level_1,Unnamed: 2_level_1
!,MPerel,1
!!,Gracenotes,1
!!,Gwern,1
!!,JHunterJ,8
!!,Kane5187,1
...,...,...
ﻲ,Dbenbenn,1
ﻳ,Dbenbenn,1
ﻴ,Dbenbenn,1
﻿,Glen,1


In [18]:
# For each user, collect the index labels (page titles) of the rows they appear
# in, i.e. the list of Wikipedia pages that user modified.
user_indices = (
    edits_df
    .groupby('username')
    .apply(lambda group: group.index.tolist())
    .reset_index(name='Indices')
)
user_indices

Unnamed: 0,username,Indices
0,'sed,"[Bloody_Mary_(South_Park), Cartoon_Wars_Part_I..."
1,(.Y.),"[Blue_Water_High, List_of_bass_guitarists, Lis..."
2,(:Julien:),"[1993_Russian_constitutional_crisis, 1999_NBA_..."
3,(aeropagitica),"['Aho'eitu, 'Ilaheva, ...Und_Morgen_Fliegen_Wi..."
4,*Kat*,"[&_Yet_&_Yet, *MTV, .nc.tr, 1670, 1964_(emulat..."
...,...,...
6586,^demon,"[""Comedy_Time""_Ganesh, ""Them""_(King_Diamond_al..."
6587,~K,"[(Bis(trifluoroacetoxy)iodo)benzene, 1,2,4-Tri..."
6588,Вasil,"[1809, 1870, 1877, 1964_Minnesota_Twins_season..."
6589,Еdit,"[1600_Pennsylvania_Avenue_(musical), 2005_Atla..."


In [19]:
# Distinct usernames that appear in the edits dataset
users = set(edits_df['username'].to_list())

#### Create a matrix with the Jaccard index (computed on the lists of modified pages) for all pairs of users

In [20]:
# Load the stored user-by-user Jaccard similarity matrix
# (the first CSV column is used as the row index).
matrix_similarity = pd.read_csv(
    "../data/jaccard.csv.zip",
    compression='zip',
    index_col=0,
)
matrix_similarity

Unnamed: 0,Bellhalla,Complainer,Werdan7,Grön,Weasel,Filll,Thejerm,Cyclone1,RJFJR,Mkweise,...,UtherSRG,Pyrop,AngelOfSadness,Clarkk,Appleboy,Voyaging,JLaTondre,InShaneee,Ed,Advanced
Bellhalla,1.000000,0.000173,0.000808,0.000000,0.000000,0.000579,0.000315,0.000173,0.001307,0.000426,...,0.001782,0.000296,0.001437,0.000543,0.000334,0.000000,0.004429,0.001600,0.000168,0.000000
Complainer,0.000173,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000492,0.000000,0.000369,0.000000,0.000000,0.000000,0.000098,0.000000,0.000000,0.000000
Werdan7,0.000808,0.000000,1.000000,0.000000,0.000000,0.001804,0.000000,0.000000,0.000596,0.002238,...,0.001802,0.002651,0.008425,0.000504,0.009511,0.002049,0.002832,0.006304,0.004392,0.005298
Grön,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000104,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.003106,0.000000,0.000196,0.000324,0.000000,0.000000
Weasel,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000246,0.000000,0.000000,0.000000,0.000000,0.000000,0.000195,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Voyaging,0.000000,0.000000,0.002049,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.003484,1.000000,0.000197,0.000327,0.004348,0.000000
JLaTondre,0.004429,0.000098,0.002832,0.000196,0.000195,0.001678,0.002697,0.000000,0.005550,0.004555,...,0.008998,0.002688,0.009551,0.002571,0.002116,0.000197,1.000000,0.011813,0.000676,0.000959
InShaneee,0.001600,0.000000,0.006304,0.000324,0.000000,0.004031,0.003552,0.000644,0.005090,0.007861,...,0.007856,0.006660,0.014829,0.001686,0.009451,0.000327,0.011813,1.000000,0.001847,0.001505
Ed,0.000168,0.000000,0.004392,0.000000,0.000000,0.000711,0.000000,0.000000,0.000408,0.001306,...,0.000549,0.001596,0.001755,0.000270,0.006224,0.004348,0.000676,0.001847,1.000000,0.002000


Create a DataFrame with all pairs of users and a binary variable that indicates if a vote exists for each pair.

In [21]:
# Load the stored table of user pairs: one row per pair, with a binary 'vote'
# flag and the pair's 'jaccard' similarity.
similarity_and_vote = pd.read_csv(
    "../data/jaccard_and_votes.csv.zip",
    compression='zip',
    index_col=0,
)
similarity_and_vote

Unnamed: 0,vote,jaccard
"('Bellhalla', 'Complainer')",0,0.000173
"('Bellhalla', 'Werdan7')",0,0.000808
"('Bellhalla', 'Grön')",0,0.000000
"('Bellhalla', 'Weasel')",0,0.000000
"('Bellhalla', 'Filll')",0,0.000579
...,...,...
"('JLaTondre', 'Ed')",0,0.000676
"('JLaTondre', 'Advanced')",0,0.000959
"('InShaneee', 'Ed')",0,0.001847
"('InShaneee', 'Advanced')",0,0.001505


### Now we compute some statistics on this relation.

In [22]:
# Pearson correlation between the binary vote indicator and the Jaccard score
pearsonr(
    x=similarity_and_vote['vote'],
    y=similarity_and_vote['jaccard'],
)

PearsonRResult(statistic=0.08914982798437746, pvalue=0.0)

The correlation between the similarity score on edited pages and the existence of a vote is weak (r ≈ 0.09) but positive and statistically significant (the reported p-value is numerically 0).

In [23]:
# Average Jaccard similarity taken over every pair of users
mean_sim_all = similarity_and_vote.loc[:, 'jaccard'].mean()
mean_sim_all

0.0012019944311329655

In [24]:
# Average Jaccard similarity restricted to the pairs flagged with vote == 1
mean_sim_vote = similarity_and_vote.loc[
    similarity_and_vote['vote'] == 1, 'jaccard'
].mean()
mean_sim_vote

0.005487495987236598

In [25]:
# Report how the mean similarity of vote-linked pairs compares to the overall mean
print(
    "People that are linked by a vote have {:.2f} times more common edited pages than the average.".format(
        mean_sim_vote / mean_sim_all
    )
)

People that are linked by a vote have 4.57 times more common edited pages than the average.
