In [3]:
import pandas as pd
import numpy as np
import networkx as nx

In [4]:
# Root directory of the dataset; the input and output CSV paths below are built from it.
DATA_PATH = '/dlabdata1/turkish_wiki'

# Preprocessing edits for analysis

In [5]:
# Start and end of the block period, as timezone-aware UTC timestamps.
block_dates = [
    pd.to_datetime('2017-04-29', utc=True),
    pd.to_datetime('2020-01-15 19:00', utc=True),
]

# Load account_edits.csv, keep only the three columns used below, and parse the
# timestamps as timezone-aware UTC so they can be compared with block_dates.
edits = (
    pd.read_csv(f'{DATA_PATH}/processed_data/account_edits.csv', index_col=0)
    .loc[:, ['event_user_id', 'event_timestamp', 'page_id']]
    .assign(event_timestamp=lambda frame: pd.to_datetime(frame['event_timestamp'], utc=True))
)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


# Community graph


In [7]:
# dropna=False keeps the count equal to len(unique()), which counts NaN as a value.
print(f"Number of distinct editors : {edits['event_user_id'].nunique(dropna=False)}")

Number of distinct editors : 212768


In [8]:
# Length of the block period in whole days: Timedelta.days drops the partial
# day (the end bound is at 19:00 while the start bound is at midnight).
blocked_days = (block_dates[1] - block_dates[0]).days

## 1) Pre-block graph (restricted to the window just before the block, of the same length in whole days as the block period)

In [9]:
# Edits made in the blocked_days days leading up to the block start
# (both bounds exclusive).
preblock_edits = edits.loc[
    (edits['event_timestamp'] > block_dates[0] - pd.Timedelta(days=blocked_days))
    & (edits['event_timestamp'] < block_dates[0])
]

In [13]:
# Non-null value counts per editor, most active editors first.
(
    preblock_edits
    .groupby('event_user_id')
    .count()
    .reset_index()
    .sort_values('event_timestamp', ascending=False)
)

Unnamed: 0,event_user_id,event_timestamp,page_id
10332,573665,222814,222814
1105,90501,216000,216000
2800,221544,73737,73737
1061,86010,47235,47235
1471,123656,35633,35633
...,...,...,...
46323,842788,1,1
46322,842779,1,1
46321,842773,1,1
21400,675719,1,1


In [14]:
# Number of pre-block edits per editor; reset_index() turns the grouped sizes
# into a two-column frame (group key, size).
preblock_editors = preblock_edits.groupby('event_user_id').size().reset_index()

In [15]:
# Name the two columns produced by groupby(...).size().reset_index(): the group
# key and the unnamed size column (labelled 0). Renaming by label, instead of
# overwriting .columns with a fixed-length list, keeps this cell safe to re-run
# after further columns have been added to the frame.
preblock_editors = preblock_editors.rename(columns={'event_user_id': 'user_id', 0: 'edit_count'})

In [17]:
# Average daily activity over the pre-block window (blocked_days days long).
preblock_editors['edits_per_day'] = preblock_editors['edit_count'].div(blocked_days)

In [18]:
# Drop editors with a single edit in the window.
preblock_editors = preblock_editors.query('edit_count > 1')

In [19]:
# Keep only the edits made by the editors retained in preblock_editors.
preblock_edits = preblock_edits.loc[
    preblock_edits['event_user_id'].isin(preblock_editors['user_id'])
]

In [20]:
# For every remaining editor, the set of distinct page ids they edited in the
# pre-block window (a Series indexed by event_user_id).
pageset_edits = preblock_edits.groupby('event_user_id')['page_id'].apply(set)

In [21]:
# Turn the Series into a frame with event_user_id / page_id columns and a fresh
# 0..n-1 index; later cells use these row positions as graph node ids.
# NOTE(review): not idempotent - re-running this cell on the resulting frame
# adds an extra 'index' column.
pageset_edits = pageset_edits.reset_index()

In [22]:
# Ids of every account with at least one edit inside the block period
# (start inclusive, end exclusive).
block_editors = edits.loc[
    (edits['event_timestamp'] >= block_dates[0]) & (edits['event_timestamp'] < block_dates[1]),
    'event_user_id',
].unique()

In [23]:
# Flag the pre-block editors who also edited during the block period. isin()
# yields a complete boolean column in one step, so there are no NaN
# placeholders and the column never becomes object dtype.
pageset_edits['migrated'] = pageset_edits['event_user_id'].isin(block_editors)

In [24]:
# Editors that were not flagged count as not migrated. The explicit astype(bool)
# guarantees a boolean column instead of depending on fillna() downcasting an
# object column, which differs between pandas versions.
pageset_edits['migrated'] = pageset_edits['migrated'].fillna(False).astype(bool)

In [25]:
# Inspect the per-editor table: page set and migrated flag.
pageset_edits

Unnamed: 0,event_user_id,page_id,migrated
0,25,"{815064, 222351}",False
1,39,"{9312, 1067105, 1571688, 644809, 414218, 18022...",False
2,47,"{1664134, 2124426, 2124428, 2124437, 61624}",False
3,137,"{647363, 473637, 1631751, 16958, 8148, 1908, 9...",False
4,146,"{518121, 554324, 919417}",True
...,...,...,...
39462,964121,{1149349},False
39463,964166,"{2230151, 2230081, 2230137}",False
39464,964213,{1679269},False
39465,964230,{2230208},False


In [14]:
def cartesian_product(*arrays):
    """Return every combination of one element taken from each input array.

    Parameters
    ----------
    *arrays : one-dimensional numpy arrays.

    Returns
    -------
    numpy.ndarray of shape ``(len(a0) * len(a1) * ..., len(arrays))`` whose
    dtype is the common result type of the inputs. Rows are ordered with the
    last input varying fastest.
    """
    n_arrays = len(arrays)
    common_dtype = np.result_type(*arrays)
    grid_shape = [len(values) for values in arrays]
    product = np.empty(grid_shape + [n_arrays], dtype=common_dtype)
    open_mesh = np.ix_(*arrays)
    for position in range(n_arrays):
        # Broadcasting spreads each input along its own axis of the grid.
        product[..., position] = open_mesh[position]
    return product.reshape(-1, n_arrays)

In [41]:
def get_pageset_adjacency(pageset):
    """Count the pages shared by every pair of editors.

    Parameters
    ----------
    pageset : sequence of sets
        One set of page ids per editor (e.g. the ``page_id`` column values).

    Returns
    -------
    numpy.ndarray of int, shape ``(n, n)``
        Entry ``[i, j]`` is the number of pages that appear in both
        ``pageset[i]`` and ``pageset[j]``; the diagonal is zero. An empty
        input gives a ``(0, 0)`` array.

    Notes
    -----
    The counts are accumulated from an inverted index (page -> editors), so
    only editors that actually share a page are visited. This avoids building
    an ``n x n x 2`` array of set pairs and intersecting every pair in Python.
    """
    n = len(pageset)
    pageset_adjacency = np.zeros((n, n), dtype=int)

    # Inverted index: page id -> positions of the editors whose set contains it.
    editors_by_page = {}
    for position, pages in enumerate(pageset):
        for page in pages:
            editors_by_page.setdefault(page, []).append(position)

    for positions in editors_by_page.values():
        if len(positions) < 2:
            continue  # a page with a single editor links nobody
        idx = np.asarray(positions)
        # Each editor appears at most once per page (the inputs are sets), so
        # the indexed cells are distinct and the in-place increment is exact.
        pageset_adjacency[np.ix_(idx, idx)] += 1

    # An editor is not its own neighbour.
    np.fill_diagonal(pageset_adjacency, 0)

    return pageset_adjacency

In [42]:
from sys import getsizeof

In [43]:
# Debug aid: print sys.getsizeof() for every name returned by dir(), looking
# each one up in globals().
for elem in dir():
    print(f"{elem} : {getsizeof(globals()[elem])}")

DATA_PATH : 72
In : 440
Out : 376
_ : 112
_13 : 60749544
_24 : 96
_25 : 64
_33 : 112
_38 : 8096
_40 : 112
__ : 8096
___ : 112
__builtin__ : 88
__builtins__ : 88
__doc__ : 113
__loader__ : 16
__name__ : 57
__package__ : 16
__spec__ : 16
_dh : 80
_i : 74
_i1 : 109
_i10 : 153
_i11 : 126
_i12 : 92
_i13 : 62
_i14 : 300
_i15 : 397
_i16 : 417
_i17 : 74
_i18 : 127
_i19 : 437
_i2 : 86
_i20 : 127
_i21 : 66
_i22 : 66
_i23 : 72
_i24 : 84
_i25 : 90
_i26 : 437
_i27 : 127
_i28 : 72
_i29 : 580
_i3 : 382
_i30 : 127
_i31 : 584
_i32 : 127
_i33 : 66
_i34 : 87
_i35 : 87
_i36 : 89
_i37 : 115
_i38 : 171
_i39 : 121
_i4 : 123
_i40 : 66
_i41 : 603
_i42 : 74
_i43 : 119
_i5 : 102
_i6 : 194
_i7 : 128
_i8 : 101
_i9 : 120
_ih : 440
_ii : 603
_iii : 66
_oh : 376
block_dates : 88
blocked_days : 28
cartesian_product : 144
edits : 305224800
exit : 64
get_ipython : 72
get_pageset_adjacency : 144
getsizeof : 80
n : 28
np : 88
nx : 88
pageset_adjacency : 112
pageset_edits : 60749544
pd : 88
preblock_editors : 947240
preblo

In [None]:
# Shared-page counts between all pairs of editors, in the row order of pageset_edits.
# NOTE(review): the two shapes in this cell's saved output are not printed by
# the get_pageset_adjacency definition shown in this notebook - the output
# appears to come from an earlier version of the function.
pageset_adjacency = get_pageset_adjacency(pageset_edits.page_id.values)

(1557644089, 2)
(39467, 39467, 2)


In [None]:
# Build the co-editing graph from the adjacency matrix; nodes correspond to the
# rows of the matrix and therefore to the rows of pageset_edits.
pageset_graph = nx.from_numpy_array(pageset_adjacency)

In [51]:
# Export the graph as GEXF. The path is relative, so the file is written to the
# current working directory rather than under DATA_PATH.
nx.write_gexf(pageset_graph, 'pageset.gexf')

In [50]:
# Basic size and connectivity statistics of the co-editing graph.
print(
    'Number of nodes: {}, Number of edges: {}'.format(
        pageset_graph.number_of_nodes(),
        pageset_graph.number_of_edges(),
    )
)
print(
    'Number of self-loops: {}, Number of connected components: {}'.format(
        nx.number_of_selfloops(pageset_graph),
        nx.number_connected_components(pageset_graph),
    )
)

Number of nodes: 39467, Number of edges: 1701636
Number of self-loops: 0, Number of connected components: 2485


In [107]:
# NOTE(review): the saved output of this cell already shows number_of_neighbors
# and centrality, which are only assigned in cells further down - the cells
# were executed out of order.
pageset_edits

Unnamed: 0,event_user_id,page_id,number_of_neighbors,centrality,migrated
0,25,"{815064, 222351}",5,4.578354e-04,False
1,39,"{9312, 1067105, 1571688, 644809, 414218, 18022...",178,6.456035e-03,False
2,47,"{1664134, 2124426, 2124428, 2124437, 61624}",12,6.382531e-04,False
3,137,"{647363, 473637, 1631751, 16958, 8148, 1908, 9...",98,4.934069e-03,False
4,146,"{518121, 554324, 919417}",11,8.682507e-04,True
...,...,...,...,...,...
39462,964121,{1149349},81,3.654886e-03,False
39463,964166,"{2230151, 2230081, 2230137}",0,4.410460e-58,False
39464,964213,{1679269},7,4.855382e-04,False
39465,964230,{2230208},0,4.410460e-58,False


In [62]:
# Non-zero entries per row of the adjacency matrix, i.e. how many other editors
# share at least one page with each editor.
pageset_edits['number_of_neighbors'] = np.count_nonzero(pageset_adjacency, axis = 1)

In [69]:
# Eigenvector centrality of every node, computed with the networkx defaults.
centrality = nx.eigenvector_centrality(pageset_graph)

In [76]:
# Look up each row's score by node id (node ids are the row labels of
# pageset_edits) instead of relying on the iteration order of the centrality
# mapping matching the row order.
pageset_edits['centrality'] = pageset_edits.index.map(centrality)

In [164]:
from tqdm import tqdm
tqdm.pandas()

In [165]:
def get_neighbours_and_migration_pct(graph, migrated=None):
    """Compute, for every node, the share of its neighbours flagged as migrated.

    Parameters
    ----------
    graph : object exposing ``number_of_nodes()``, ``nodes()`` and
        ``neighbors(node)`` (e.g. a networkx graph). Node ids must be the
        integers ``0 .. n-1``: they are used both as positions in the result
        and as labels into ``migrated``.
    migrated : pandas.Series of bool, optional
        Migration flag per node, indexed by node id. Defaults to the
        ``migrated`` column of the notebook-level ``pageset_edits`` frame.

    Returns
    -------
    numpy.ndarray of float with one entry per node, each a fraction between 0
    and 1. Nodes without neighbours get NaN, because the share is undefined.
    """
    if migrated is None:
        migrated = pageset_edits['migrated']

    pcts = np.full(graph.number_of_nodes(), np.nan)
    for node in tqdm(graph.nodes()):
        neighbour_ids = list(graph.neighbors(node))
        if not neighbour_ids:
            # Isolated node: keep NaN explicitly rather than evaluating 0/0,
            # which only reached NaN through a RuntimeWarning.
            continue
        pcts[node] = migrated.loc[neighbour_ids].sum() / len(neighbour_ids)
    return pcts

In [166]:
# Share of migrated neighbours for every node of the co-editing graph.
pcts = get_neighbours_and_migration_pct(pageset_graph)

  """
100%|██████████| 39467/39467 [00:32<00:00, 1215.75it/s]


In [151]:
# Spot check: user ids of the neighbours of node 0.
set(pageset_edits.loc[list(pageset_graph.neighbors(0)), 'event_user_id'].values)

{90501, 369043, 573665, 730823, 881112}

In [171]:
# Despite the column name, the values are fractions (migrated neighbours
# divided by the number of neighbours), not percentages.
pageset_edits['migration_percentage'] = pcts

In [41]:
# Attach each editor's edits_per_day (inner join on the user id).
pageset_edits = pageset_edits.merge(
    preblock_editors[['user_id', 'edits_per_day']],
    how='inner',
    left_on='event_user_id',
    right_on='user_id',
)

In [43]:
# Keep the final feature columns; this drops the duplicate user_id key left by
# the merge.
pageset_edits = pageset_edits.loc[:, [
    'event_user_id',
    'page_id',
    'number_of_neighbors',
    'centrality',
    'migrated',
    'migration_percentage',
    'edits_per_day',
]]

In [45]:
# Persist the per-editor table. The page_id column holds Python sets, which CSV
# stores as text, so a plain read_csv will return strings rather than sets.
pageset_edits.to_csv(f'{DATA_PATH}/processed_data/pageset_edits_pct.csv', index=False)