In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from mw_api_diff_utils import *
import pandas as pd
from db_utils import query_hive_ssh

# Random Sample

Consider only comments made since `min_timestamp`. Take up to `k` random comments from each of `n` random users.

In [5]:
# Parameters of the random-sample query.
min_timestamp = '2014-03-01T00:00:00Z'  # only revisions newer than this are considered
n = 1000  # number of random users to draw
k = 1  # maximum number of random comments kept per user

In [6]:
# HiveQL template for the random sample; the %(k)d, %(n)d and
# %(min_timestamp)s placeholders are filled in with Python %-formatting.
#   - `revisions`: every row of enwiki.user_talk_diff newer than min_timestamp,
#     ranked in random order within each user_id, keeping rank <= k.
#   - `users`: one random key per user_id (same timestamp filter), ordered by
#     that key and limited to n rows, i.e. n randomly chosen users.
# The join on user_id keeps only the ranked revisions of the chosen users.
query = """
SELECT 
    revisions.*
FROM
    (SELECT
        a.*
        FROM
        (
        SELECT
            d.*,
            RANK() OVER (PARTITION BY d.user_id ORDER BY RAND()) as rank
        FROM
            enwiki.user_talk_diff d
        WHERE 
            rev_timestamp > '%(min_timestamp)s'
        ) a
    WHERE rank <= %(k)d
    )
    revisions

JOIN
    (
    SELECT
        a.*
    FROM (
        SELECT
            user_id,
            RAND() as key
        FROM
            enwiki.user_talk_diff d
        WHERE 
            rev_timestamp > '%(min_timestamp)s'
        GROUP BY user_id
        ) a
    ORDER BY key
    LIMIT %(n)d
    ) users
ON
    revisions.user_id = users.user_id
"""

# Values substituted into the named placeholders of the query template.
params = dict(k=k, n=n, min_timestamp=min_timestamp)

# Render the template and run it through the Hive helper.
random_sample_df = query_hive_ssh(
    query % params,
    'random_sample.tsv',
    priority=True,
)

In [9]:
# Column names are expected to look like '<prefix>.<name>'; keep only the
# second dot-separated segment. Names without a dot are left as they are, so
# re-running this cell on an already-renamed frame is a no-op instead of an
# IndexError.
random_sample_df.columns = [
    c.split('.')[1] if '.' in c else c
    for c in random_sample_df.columns
]
# Save the raw (not yet cleaned) sample as a tab-separated file.
random_sample_df.to_csv('../data/mini_random_sample.tsv', sep = '\t')

# Blocked Users
For `n` random user block events that occurred after `min_timestamp`, take up to the `k` most recent comments made before the user was blocked for harassment or personal attacks.

In [7]:
# Parameters of the blocked-user query (these rebind the names used for the
# random sample above).
min_timestamp = '2010-03-01T00:00:00Z'  # only block events newer than this are considered
n = 1000  # number of random block events to draw
k = 25  # maximum number of most recent pre-block comments kept per event

In [8]:
# HiveQL template for the blocked-user sample; the %(k)d, %(n)d and
# %(min_timestamp)s placeholders are filled in with Python %-formatting.
#   - `events`: rows of enwiki.logging with log_type = log_action = 'block',
#     a log_comment matching 'harassment|personal attack' and a log_timestamp
#     after min_timestamp; each row gets a random key, and the n rows with the
#     smallest keys are kept (n random block events).
#   - The events are joined to enwiki.user_talk_diff on user_text, keeping
#     only diffs whose rev_timestamp is not later than the block's
#     log_timestamp.
#   - Diffs are ranked per (key, user_text), newest first, and the outer
#     query keeps rank <= k.
query = """

SELECT
    revisions.*
FROM
    (
    SELECT 
        diffs.*,
        RANK() OVER (PARTITION BY events.key, events.user_text ORDER BY diffs.rev_timestamp DESC) as rank
    FROM
        (
        SELECT
            a.*
        FROM
            (
            SELECT 
              log_title as user_text,
              log_timestamp,
              RAND() as key
            FROM enwiki.logging 
            WHERE
              log_type = 'block'
              AND log_action = 'block'
              AND log_comment RLIKE 'harassment|personal attack'
              AND log_timestamp > '%(min_timestamp)s'
            ) a
        ORDER BY key
        LIMIT %(n)d
        ) events
    JOIN
        enwiki.user_talk_diff diffs
    ON
        diffs.user_text = events.user_text
    WHERE
        diffs.rev_timestamp <= events.log_timestamp
    ) revisions
WHERE
    rank <= %(k)d
"""

# Values substituted into the named placeholders of the query template.
params = {
    'k': k,
    'n': n,
    'min_timestamp': min_timestamp
    }


# NOTE(review): the second argument looks like the name of the file that
# receives the query output -- confirm against query_hive_ssh. It used to be
# 'random_sample.tsv', the same name the random-sample query uses, so the two
# result sets would share (and overwrite) one file. Give this query its own.
blocked_user_df = query_hive_ssh(query % params, 'blocked_user_sample.tsv', priority = True)

In [10]:
# Column names are expected to look like '<prefix>.<name>'; keep only the
# second dot-separated segment. Names without a dot are left as they are, so
# re-running this cell on an already-renamed frame is a no-op instead of an
# IndexError.
blocked_user_df.columns = [
    c.split('.')[1] if '.' in c else c
    for c in blocked_user_df.columns
]
# Save the raw (not yet cleaned) sample as a tab-separated file.
blocked_user_df.to_csv('../data/blocked_user_sample.tsv', sep = '\t')

# Cleaning

1. remove duplicate rev_ids
2. remove duplicate diffs
3. clean diffs
     - ignore diffs with no content added
     - replace 'NEWLINE' with '\n'
     - remove comments from 'The Wikipedia Adventure'
     - strip mw markup


In [65]:
def clean(df):
    """
    Return a cleaned copy of a frame of talk-page diffs.

    The input frame is never modified, so the function can safely be called
    several times on the same frame, or on its own output.

    Steps, in order:
      1. rename the 'insertion' column to 'diff' (no-op if it is already
         called 'diff')
      2. drop rows whose diff is missing
      3. drop rows whose diff contains the
         'Welcome to The Wikipedia Adventure!' banner
      4. add a 'clean_diff' column: the diff with every 'NEWLINE'
         placeholder turned into a real newline, then passed through
         strip_mw
      5. drop the 'rank' helper column when it is present
      6. keep the first row for each rev_id, then the first row for each
         diff text

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'rev_id' and either 'insertion' or 'diff'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the original index labels of surviving rows are kept.
    """
    # rename/dropna without inplace=True return new frames, leaving `df` alone.
    cleaned = df.rename(columns={'insertion': 'diff'}).dropna(subset=['diff'])

    # astype(bool) keeps the mask boolean even when no rows are left.
    is_adventure = cleaned['diff'].apply(
        lambda text: 'Welcome to The Wikipedia Adventure!' in text
    ).astype(bool)
    # Explicit copy so the column assignment below does not write into a slice.
    cleaned = cleaned[~is_adventure].copy()

    cleaned['clean_diff'] = cleaned['diff'].apply(
        lambda text: strip_mw(text.replace('NEWLINE', '\n'))
    )

    # 'rank' only exists on frames coming straight from the ranking queries.
    if 'rank' in cleaned.columns:
        cleaned = cleaned.drop('rank', axis=1)

    cleaned = cleaned.drop_duplicates(subset=['rev_id'])
    return cleaned.drop_duplicates(subset=['diff'])

In [66]:
# Clean each sample once and reuse the result for both the export and the
# preview below.
random_sample_clean = clean(random_sample_df)
blocked_user_clean = clean(blocked_user_df)

# Overwrite the raw exports with the cleaned versions.
random_sample_clean.to_csv('../data/mini_random_sample.tsv', sep = '\t')
blocked_user_clean.to_csv('../data/blocked_user_sample.tsv', sep = '\t')

# Preview the first 100 cleaned diffs. The cut-off is positional (.head)
# rather than a comparison against the index label, because the labels have
# gaps after filtering and de-duplication.
for diff_text in blocked_user_clean['clean_diff'].head(100):
    print(diff_text)
    print('_' * 60)