In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
import sys
sys.path.append('/srv/home/christinedk/wp_internship/collaboration/')
from utils import get_edits_pre_tag
from features.article_history import *
from features.politeness import *

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None
from dateutil import parser
import datetime
import numpy as np
from tqdm import tqdm
import json
import matplotlib.pylab as plt
from math import log2
from features.talk_history import ConvParser

In [None]:
# chained

In [12]:
# Project root on the analysis host; the extraction loops below build their
# input paths (via DATA_DIR) and output paths (HOME + 'features/...') from it.
HOME = '/srv/home/christinedk/wp_internship/'
# Directory with the inputs read below: page_history/, talk_history/,
# article_talk_mappings/ and labels/.
DATA_DIR = HOME + 'data/'

def read_revisions(filename, rename=False):
    """Load a JSON-lines revision dump, ordered oldest revision first.

    Parameters
    ----------
    filename : path of a JSON-lines file (one revision record per line).
    rename : when True, the columns ``revision_timestamp`` and ``user_id``
        are renamed to ``event_timestamp`` and ``event_user_id`` before any
        further processing.

    Returns
    -------
    pandas.DataFrame whose ``event_timestamp`` column has been converted with
    ``pd.to_datetime`` and whose rows are sorted ascending by that column.
    """
    frame = pd.read_json(filename, lines=True)
    if rename:
        column_map = {'revision_timestamp':'event_timestamp','user_id':'event_user_id'}
        frame = frame.rename(columns=column_map)
    # Parse the timestamps first so the sort is chronological.
    frame['event_timestamp'] = pd.to_datetime(frame['event_timestamp'])
    return frame.sort_values(by='event_timestamp', ascending=True)

def np_encoder(object):
    """``default`` hook for ``json.dump``/``json.dumps`` that unwraps numpy values.

    Parameters
    ----------
    object : the value the JSON encoder could not serialize on its own.
        (The parameter name shadows the builtin; it is kept so that existing
        callers are unaffected.)

    Returns
    -------
    The plain Python equivalent: ``.item()`` for numpy scalars and
    ``.tolist()`` for numpy arrays.

    Raises
    ------
    TypeError
        For any other type. Falling off the end of the function would return
        ``None``, which the encoder writes as ``null`` and silently loses the
        value.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError('Object of type {} is not JSON serializable'.format(type(object).__name__))

In [13]:
ls /srv/home/christinedk/wp_internship/

[0m[01;34mcollaboration[0m/  [01;34mdata[0m/  [01;34mfeatures[0m/  [01;34mnotebooks[0m/  README.md  [01;34mscripts[0m/


In [25]:
# For every template: load the page history and the tag labels, compute
# article and user-article features from the revisions up to each tag date,
# and write the result to features/activity_<template>.json.
# NOTE(review): the next extraction cell writes the same output file with an
# additional 'talk' entry, so its result replaces this one.
for template in ['fanpov','weasel','autobiography','advert','peacock']:
    print(template)

    print('reading data')
    # read and format things
    page_revisions = read_revisions(DATA_DIR+'page_history/page_history-{}-meta-info.json'.format(template))
    article_talk_mappings = pd.read_csv(DATA_DIR+'article_talk_mappings/{}.csv'.format(template),
                                       usecols=['talk_page_id','article_page_id'])
    labels = pd.read_csv(DATA_DIR+'labels/{}.csv'.format(template),parse_dates=['event_timestamp'])
    labels = labels.join(article_talk_mappings.set_index('article_page_id')[['talk_page_id']],on='page_id')

    # prepare to extract by page
    pages = page_revisions.groupby('page_id')

    features = []
    # The unpacking relies on labels having exactly three columns in the
    # order (tag timestamp, page_id, talk_page_id).
    for tag_date, page_id, talk_page_id in tqdm(labels.values):
        # Separate dict objects: a chained `a = b = {}` would bind both names
        # to one dict, so a later mutation of one would show up in the other.
        user_article_feat, article_feat = {}, {}

        # Use a per-tag name so the full `page_revisions` frame is not
        # overwritten inside the loop.
        try:
            tag_page_revisions = pages.get_group(page_id)
        except KeyError:
            # Labelled page without any revision in the dump: handle it like
            # a page with no revisions before the tag (empty features).
            tag_page_revisions = page_revisions.iloc[0:0]
        tag_page_revisions = tag_page_revisions[tag_page_revisions.event_timestamp <= tag_date]

        if len(tag_page_revisions) > 0:
            tag_page_revisions = calculate_page_metrics(tag_page_revisions)
            # user-article
            user_article_feat = get_user_article_features(tag_page_revisions)
            # article
            article_feat = get_article_features(tag_page_revisions, tag_date)

        # str(tag_date): the timestamp is not JSON serializable as is.
        features.append({'page':page_id,'date':str(tag_date),
                        'user_article':user_article_feat,
                        'article':article_feat})
    with open(HOME +'features/activity_'+template+'.json','w') as f:
        json.dump(features,f,default=np_encoder)


fanpov
read data


100%|██████████| 721/721 [00:13<00:00, 53.17it/s]


weasel
read data


100%|██████████| 1322/1322 [00:27<00:00, 48.91it/s]


autobiography
read data


100%|██████████| 4224/4224 [01:51<00:00, 37.78it/s]


advert
read data


100%|██████████| 7570/7570 [02:25<00:00, 52.18it/s]


peacock
read data


100%|██████████| 5174/5174 [01:50<00:00, 46.86it/s]


In [25]:
# For every template: load page history, talk-page history and tag labels,
# compute article, user-article and talk features from the revisions up to
# each tag, and write the result to features/activity_<template>.json.
for template in ['fanpov','weasel','autobiography','advert','peacock']:
    print(template)

    print('reading data')
    # read and format things
    page_revisions = read_revisions(DATA_DIR+'page_history/page_history-{}-meta-info.json'.format(template))
    talk_revisions = read_revisions(DATA_DIR+'talk_history/talk-activity-{}-meta-info.json'.format(template),
                                    rename=True)
    article_talk_mappings = pd.read_csv(DATA_DIR+'article_talk_mappings/{}.csv'.format(template),
                                       usecols=['talk_page_id','article_page_id'])
    labels = pd.read_csv(DATA_DIR+'labels/{}.csv'.format(template),parse_dates=['event_timestamp'])
    labels = labels.join(article_talk_mappings.set_index('article_page_id')[['talk_page_id']],on='page_id')

    # prepare to extract by page
    pages = page_revisions.groupby('page_id')
    talk_pages = talk_revisions.groupby('page_id')

    features = []
    # The unpacking relies on labels having exactly three columns in the
    # order (tag timestamp, page_id, talk_page_id).
    for tag_date, page_id, talk_page_id in tqdm(labels.values):
        # Separate dict objects: a chained `a = b = c = {}` would bind all
        # names to one dict, so mutating one would change the others.
        talk_features, user_article_feat, article_feat = {}, {}, {}

        # Use a per-tag name so the full `page_revisions` frame is not
        # overwritten inside the loop.
        try:
            tag_page_revisions = pages.get_group(page_id)
        except KeyError:
            # Labelled page without any revision in the dump: handle it like
            # a page with no revisions before the tag (empty features).
            tag_page_revisions = page_revisions.iloc[0:0]
        tag_page_revisions = tag_page_revisions[tag_page_revisions.event_timestamp <= tag_date]

        if len(tag_page_revisions) > 0:
            tag_page_revisions = calculate_page_metrics(tag_page_revisions)
            # user-article
            user_article_feat = get_user_article_features(tag_page_revisions)
            # article
            article_feat = get_article_features(tag_page_revisions, tag_date)

        # talk_page_id is NaN when the join found no talk page for the article.
        if pd.notna(talk_page_id):
            try:
                tag_talk_revisions = talk_pages.get_group(talk_page_id)
            except KeyError:
                # Mapped talk page with no revisions in the dump.
                tag_talk_revisions = talk_revisions.iloc[0:0]
            # Keep talk revisions made on or before the tag's calendar day.
            # Both sides are datetime.date objects: ordering a date against a
            # Timestamp raises TypeError on pandas >= 2.0.
            tag_talk_revisions = tag_talk_revisions[tag_talk_revisions.event_timestamp.dt.date <= tag_date.date()]

            if len(tag_talk_revisions) > 0:
                tag_talk_revisions = calculate_page_metrics(tag_talk_revisions)

                # talk page; volume
                talk_features = get_talk_features(tag_talk_revisions)
                talk_features['page_talk_ratio'] = len(tag_page_revisions)/len(tag_talk_revisions)

        # str(tag_date): the timestamp is not JSON serializable as is.
        features.append({'page':page_id,'date':str(tag_date),
                        'user_article':user_article_feat,
                        'article':article_feat,
                        'talk':talk_features})
    with open(HOME +'features/activity_'+template+'.json','w') as f:
        json.dump(features,f,default=np_encoder)


fanpov
read data


100%|██████████| 721/721 [00:13<00:00, 53.17it/s]


weasel
read data


100%|██████████| 1322/1322 [00:27<00:00, 48.91it/s]


autobiography
read data


100%|██████████| 4224/4224 [01:51<00:00, 37.78it/s]


advert
read data


100%|██████████| 7570/7570 [02:25<00:00, 52.18it/s]


peacock
read data


100%|██████████| 5174/5174 [01:50<00:00, 46.86it/s]


In [None]:
# Load everything for a single template (step-by-step version of the extraction loop above)

In [6]:
# Template whose files the single-template cells below load; 'fanpov' is one
# of the names iterated by the extraction loops above.
template = 'fanpov'

In [None]:
# Page revision history of `template`, oldest revision first.
# read_revisions() performs the same read / to_datetime / sort steps that were
# spelled out here, and DATA_DIR resolves to the same directory as the
# previously hard-coded absolute path.
revisions = read_revisions(DATA_DIR + 'page_history/page_history-{}-meta-info.json'.format(template))

In [None]:
# Tag labels of `template`; event_timestamp is parsed as a datetime.
# Built from DATA_DIR (same location as the previously hard-coded absolute
# path) so the data root is configured in one place.
labels = pd.read_csv(DATA_DIR + 'labels/{}.csv'.format(template),
                     parse_dates=['event_timestamp'])

In [None]:
# Talk-page revision history of `template`, oldest revision first.
# read_revisions(rename=True) renames revision_timestamp/user_id to
# event_timestamp/event_user_id, parses the timestamps and sorts, which
# replaces the inline copy of those steps and its in-place rename.
talk_revisions = read_revisions(DATA_DIR + 'talk_history/talk-activity-{}-meta-info.json'.format(template),
                                rename=True)

# Article -> talk page id mapping, reduced to the two id columns.
article_talk_mappings = pd.read_csv(DATA_DIR + 'article_talk_mappings/{}.csv'.format(template))[['talk_page_id','article_page_id']]

In [None]:
# Attach each labelled article's talk_page_id (NaN when the mapping has none).
# Dropping a talk_page_id column left by a previous run keeps the cell
# re-runnable: joining a second time would otherwise fail on the overlapping
# column name.
labels = (labels.drop(columns=['talk_page_id'], errors='ignore')
                .join(article_talk_mappings.set_index('article_page_id')[['talk_page_id']],on='page_id'))

In [None]:
# Single-template version of the extraction loop: article, user-article,
# talk-volume and talk-language features for every tag in `labels`.
pages = revisions.groupby('page_id')
talk_pages = talk_revisions.groupby('page_id')
conv_parser = ConvParser()

features = []

# The unpacking relies on labels having exactly three columns in the order
# (tag timestamp, page_id, talk_page_id).
for tag_date, page_id, talk_page_id in tqdm(labels.values):
    # Separate dict objects: a chained `a = b = c = d = {}` would bind all
    # names to one dict, so mutating one would change the others.
    talk_features, lang_features, user_article_feat, article_feat = {}, {}, {}, {}

    try:
        page_revisions = pages.get_group(page_id)
    except KeyError:
        # Labelled page without any revision in the dump: handle it like a
        # page with no revisions before the tag (empty features).
        page_revisions = revisions.iloc[0:0]
    page_revisions = page_revisions[page_revisions.event_timestamp <= tag_date]

    if len(page_revisions) > 0:
        page_revisions = calculate_page_metrics(page_revisions)
        # user-article
        user_article_feat = get_user_article_features(page_revisions)
        # article
        article_feat = get_article_features(page_revisions, tag_date)

    # talk_page_id is NaN when the join found no talk page for the article.
    if pd.notna(talk_page_id):
        try:
            tag_talk_revisions = talk_pages.get_group(talk_page_id)
        except KeyError:
            # Mapped talk page with no revisions in the dump.
            tag_talk_revisions = talk_revisions.iloc[0:0]
        # Keep talk revisions made on or before the tag's calendar day.
        # Both sides are datetime.date objects: ordering a date against a
        # Timestamp raises TypeError on pandas >= 2.0.
        tag_talk_revisions = tag_talk_revisions[tag_talk_revisions.event_timestamp.dt.date <= tag_date.date()]

        if len(tag_talk_revisions) > 0:
            tag_talk_revisions = calculate_page_metrics(tag_talk_revisions)

            # talk page; volume
            talk_features = get_talk_features(tag_talk_revisions)
            talk_features['page_talk_ratio'] = len(page_revisions)/len(tag_talk_revisions)

            # talk page; language (computed from the latest revision kept above)
            talk_latest = tag_talk_revisions[['revision_text','page_id']].iloc[-1].values
            lang_features = conv_parser.get_language_features(*talk_latest)

    # 'language' stores the language features, which were previously computed
    # and then discarded. 'date' stays a Timestamp here; convert it with
    # str() before dumping the list to JSON.
    features.append({'page':page_id,'date':tag_date,
                    'user_article':user_article_feat,
                    'article':article_feat,
                    'talk':talk_features,
                    'language':lang_features})