In [3]:
import pandas as pd
import numpy as np

## Calculate the Gini Coefficient representing the centralization or inequality in contributions across registered account edits.
This is a normalized measure of inequality, that we will apply to study the
distribution (or dispersion) of contributions from Wikipedia authors.
We consider a population comprising $n$ individuals. Let $p(i)$ be the cumulative percentage of the population represented by all contributors up to the $i$-th individual (sorted by their amount of contribution, in ascending order). Let $q(i)$ be the cumulative percentage of the parameter under study contributed by all individuals up to and including the $i$-th one. Then, the value of the Gini coefficient is given by the following equation:
$G=\frac{\sum_{i=1}^{n-1}[p(i)-q(i)]}{\sum_{i=1}^{n-1} p(i)}$

Consequently, values of the Gini coefficient close to 0 correspond to equal or almost equal
distributions (lower departures from the line of perfect equality), while values close to 1 are good
indicators of high inequalities.

In [4]:
# Root directory of the dataset; the CSV files read and written below are located under it.
DATA_PATH = '/dlabdata1/turkish_wiki'

In [5]:
# Page edits made by registered accounts; only the three columns used below are loaded,
# and the first of them (in file order) becomes the index.
edits = pd.read_csv(
    f'{DATA_PATH}/processed_data/account_edits.csv',
    usecols=['event_user_id', 'event_timestamp', 'page_id'],
    index_col=0,
)

# Date the ban started and date it was lifted, as timezone-aware (UTC) timestamps
block_dates = [pd.to_datetime(day, utc=True) for day in ('2017-04-29', '2020-01-15')]

In [6]:
# read_csv was called with index_col=0, so the first selected column ended up as the index;
# move it back into a regular column. Not idempotent: re-running this cell on its own
# adds an extra 'index' column.
edits = edits.reset_index()

In [7]:
# Parse the timestamps as timezone-aware UTC datetimes so they can be compared with block_dates.
edits['event_timestamp'] = pd.to_datetime(edits['event_timestamp'],   utc = True)

In [8]:
# Label every edit with the phase it falls in relative to the two block dates.
# The column is created with object dtype up front: filling a float NaN column with
# strings only works through implicit upcasting, which recent pandas versions deprecate.
# Rows whose timestamp is NaT match none of the conditions and keep NaN.
edits['period'] = pd.Series(np.nan, index=edits.index, dtype=object)
edits.loc[edits['event_timestamp'] < block_dates[0], 'period'] = 'pre-block'
edits.loc[(edits['event_timestamp'] >= block_dates[0]) & (edits['event_timestamp'] < block_dates[1]), 'period'] = 'block'
edits.loc[edits['event_timestamp'] >= block_dates[1], 'period'] = 'after-block'

We will calculate the Gini coefficient of contributions on a daily, weekly and monthly basis to evaluate its evolution over time due to the ban.
### 1) Daily Gini Coefficient

In [281]:
def calculate_gini(edits, timeframe, stratification_column = None):
    """Gini coefficient of per-user edit counts for every period / time bin.

    Edits are counted per user inside each ``timeframe`` bin (and, when given, per
    value of ``stratification_column``). Inside every bin the users are ordered by
    ascending edit count and

        G = sum_{i=1}^{n-1} [p(i) - q(i)] / sum_{i=1}^{n-1} p(i)

    is evaluated, with p(i) the cumulative share of users and q(i) the cumulative
    share of edits. The top contributor (i = n, where p = q = 1) is excluded from
    both sums. A bin with a single contributor has an empty sum and is reported
    as 0.

    Parameters
    ----------
    edits : pandas.DataFrame
        Must contain 'period', 'event_timestamp' (datetime-like) and
        'event_user_id', plus ``stratification_column`` when one is given.
        The frame is not modified.
    timeframe : str
        pandas frequency string handed to ``pd.Grouper`` (e.g. 'D', '7D', '30D').
    stratification_column : str, optional
        Column whose values split every bin into separate strata.

    Returns
    -------
    pandas.DataFrame
        Without stratification: columns 'period', 'date', 'gini', one row per
        observed bin. With stratification: columns 'date', 'period',
        ``stratification_column``, 'gini'; every observed (date, period) bin gets
        one row per stratum occurring anywhere in the data, with gini 0 where the
        stratum had no edits in that bin. 'period' is categorical, ordered
        pre-block < block < after-block; rows with any other label are ignored.
    """
    period_order = ["pre-block", "block", "after-block"]

    count_keys = ['period', pd.Grouper(key = 'event_timestamp', freq = timeframe), 'event_user_id']
    count_columns = ['period', 'date', 'user_id']
    bin_keys = ['period', 'date']
    if stratification_column is not None:
        count_keys.append(stratification_column)
        count_columns.append(stratification_column)
        bin_keys.append(stratification_column)

    # Number of edits per user in each bin.
    user_edits = edits.groupby(count_keys, observed = True).size().reset_index()
    user_edits.columns = count_columns + ['number_of_edits']
    user_edits['period'] = pd.Categorical(user_edits['period'], period_order)
    user_edits = user_edits.dropna(subset = ['period'])

    # Ascending contribution order inside every bin, as the formula requires.
    user_edits = user_edits.sort_values(bin_keys + ['number_of_edits'])

    # transform / cumsum / cumcount return results aligned with the frame's own index,
    # unlike groupby.apply whose output index depends on the pandas version.
    # observed=True keeps only bins that actually occur instead of the full
    # cartesian product of the categorical 'period' with the other keys.
    per_bin = user_edits.groupby(bin_keys, observed = True)
    rank = per_bin.cumcount() + 1
    n_users = per_bin['number_of_edits'].transform('count')
    total_edits = per_bin['number_of_edits'].transform('sum')

    cumulative_user_pct = rank / n_users
    cumulative_edit_pct = per_bin['number_of_edits'].cumsum() / total_edits

    # Both sums stop at i = n - 1: zero out the contribution of the last user of each bin.
    below_top = rank < n_users
    user_edits['p_i - q_i'] = (cumulative_user_pct - cumulative_edit_pct).where(below_top, 0.0)
    user_edits['p_i'] = cumulative_user_pct.where(below_top, 0.0)

    sums = user_edits.groupby(bin_keys, observed = True)[['p_i - q_i', 'p_i']].sum()
    # A zero denominator only happens for single-contributor bins; report those as 0.
    gini = (sums['p_i - q_i'] / sums['p_i']).where(sums['p_i'] > 0, 0.0)

    periodical_gini = gini.rename('gini').reset_index()
    periodical_gini['date'] = pd.to_datetime(periodical_gini['date'], utc = True)

    if stratification_column is not None:
        # Give every observed (date, period) bin one row per stratum. The grid is built
        # from the bins that really occur, so a bin whose strata all have gini 0 is kept.
        bins = periodical_gini[['date', 'period']].drop_duplicates()
        strata = periodical_gini[stratification_column].drop_duplicates().sort_values()

        grid = bins.iloc[np.repeat(np.arange(len(bins)), len(strata))].reset_index(drop = True)
        grid[stratification_column] = np.tile(strata.to_numpy(), len(bins))

        periodical_gini = grid.merge(periodical_gini, how = 'left',
                                     on = ['date', 'period', stratification_column])
        periodical_gini['gini'] = periodical_gini['gini'].fillna(0.0)
        periodical_gini = (periodical_gini
                           .sort_values(['date', 'period', stratification_column])
                           .reset_index(drop = True))

    return periodical_gini

In [282]:
# One Gini value per calendar-day bin ('D') over all registered-account edits.
daily_gini = calculate_gini(edits, timeframe='D')

In [302]:
# Write the daily series to CSV, leaving out the row index.
daily_gini.to_csv(f'{DATA_PATH}/processed_data/daily_gini.csv', index = False)

### 2) Weekly Gini Coefficient 

In [285]:
# "Weekly" means consecutive fixed 7-day bins ('7D'), which are not aligned to calendar weeks.
weekly_gini = calculate_gini(edits, timeframe='7D')

In [303]:
# Write the weekly series to CSV, leaving out the row index.
weekly_gini.to_csv(f'{DATA_PATH}/processed_data/weekly_gini.csv', index = False)

### 3) Monthly Gini Coefficient 

In [288]:
# "Monthly" means consecutive fixed 30-day bins ('30D'), an approximation of calendar months.
monthly_gini = calculate_gini(edits, timeframe='30D')

In [304]:
# Write the monthly series to CSV, leaving out the row index.
# NOTE(review): the file name is spelled 'monhtly' (sic). It is left unchanged because other
# code may read this exact path — confirm there are no consumers before renaming it.
monthly_gini.to_csv(f'{DATA_PATH}/processed_data/monhtly_gini.csv', index = False)

## Calculate the Gini Coefficient for article topic stratification
The Gini Coefficient measuring the collaboration dynamics between registered users can be stratified with the Article Topics that we obtain through WikiPDA. The idea of the formula is the same, except that instead of considering all articles and all edits in a given timeframe as we did above, we stratify articles by their categories and calculate the periodical Gini Coefficient separately for each Topic.


In [11]:
# Topic labels per page; the path is built from DATA_PATH like every other input/output here.
topic_df = pd.read_csv(f'{DATA_PATH}/processed_data/thresholded_topics.csv')

In [17]:
# Attach the topic labels to every edit of the same page. The join key is spelled out so a
# column that happens to exist in both frames can never silently become part of the key.
# A page carrying several topics yields one row per (edit, topic) pair.
topic_edits = pd.merge(edits, topic_df, on='page_id', how='inner')

In [19]:
# Preview the first rows of the merged frame.
topic_edits.head()

Unnamed: 0,event_timestamp,event_user_id,page_id,period,page_title,topic
0,2003-03-13 16:30:04+00:00,6,10,pre-block,Cengiz_Han,Society
1,2003-03-13 16:30:04+00:00,6,10,pre-block,Cengiz_Han,West Asia
2,2003-03-13 16:30:04+00:00,6,10,pre-block,Cengiz_Han,Asia_
3,2003-03-13 16:30:04+00:00,6,10,pre-block,Cengiz_Han,Central Asia
4,2003-03-13 16:30:04+00:00,6,10,pre-block,Cengiz_Han,East Asia


### 1) Daily Stratified Gini Coefficient 

In [291]:
# Daily ('D') Gini computed separately for each value of the 'topic' column.
daily_stratified_gini = calculate_gini(topic_edits, 'D', stratification_column='topic')

In [307]:
# Write the daily per-topic series to CSV, leaving out the row index.
daily_stratified_gini.to_csv(f'{DATA_PATH}/processed_data/daily_topic_gini.csv', index = False)

### 2) Weekly Stratified Gini Coefficient 

In [292]:
# Per-topic Gini over consecutive fixed 7-day bins ('7D').
weekly_stratified_gini = calculate_gini(topic_edits, '7D', stratification_column='topic')

In [306]:
# Write the weekly per-topic series to CSV, leaving out the row index.
weekly_stratified_gini.to_csv(f'{DATA_PATH}/processed_data/weekly_topic_gini.csv', index = False)

### 3) Monthly Stratified Gini Coefficient 

In [293]:
# Per-topic Gini over consecutive fixed 30-day bins ('30D').
monthly_stratified_gini = calculate_gini(topic_edits, '30D', stratification_column='topic')

In [308]:
# Write the monthly per-topic series to CSV, leaving out the row index.
monthly_stratified_gini.to_csv(f'{DATA_PATH}/processed_data/monthly_topic_gini.csv', index = False)