In [7]:
# Enable IPython's autoreload extension in mode 2 so edits to the project
# modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Root of the project checkout; every input and output path is built from it.
HOME = '/srv/home/christinedk/wp_internship/'
# Directory holding the labels, article/talk mappings and history dumps.
DATA_DIR = '{}data/'.format(HOME)

In [9]:
import pandas as pd
from dateutil import parser
import datetime
import numpy as np
import json
import matplotlib.pylab as plt
from tqdm import tqdm

import sys
sys.path.append('/srv/home/christinedk/wp_internship/collaboration/')
from features.article_history import *
from features.talk_history import *
from utils import read_revisions, np_encoder

In [6]:
ls /srv/home/christinedk/wp_internship/data/page_history/

page_history-advert-meta-info.json         page_history-peacock-meta-info.json
page_history-autobiography-meta-info.json  page_history-weasel-meta-info.json
page_history-fanpov-meta-info.json


# Get subset from Talk

In [6]:
# For every template, scan the talk-page text dump (one JSON snapshot per line)
# and keep, per (talk page, label date) pair, the snapshot with the smallest
# non-negative day difference to that label. The selection is written to
# talk_history/talk-subset-<template>.json.
for template in ['autobiography','weasel','advert','fanpov','peacock']:
    print(template)

    article_talk_mappings = pd.read_csv(DATA_DIR+'article_talk_mappings/{}.csv'.format(template),
                                       usecols=['talk_page_id','article_page_id'])
    labels = pd.read_csv(DATA_DIR+'labels/{}.csv'.format(template),parse_dates=['event_timestamp'])
    # Attach the talk page id to each label; labels without a mapped talk page
    # (NaN after the join) are dropped.
    labels = labels.join(article_talk_mappings.set_index('article_page_id')[['talk_page_id']],on='page_id').dropna()
    print('number of labels: ',len(labels))

    page_labels = labels.groupby('talk_page_id')
    # A set gives constant-time membership tests; the dump is scanned line by
    # line, so testing against the raw array would rescan it for every snapshot.
    pages = set(labels.talk_page_id.unique().tolist())

    # (talk_page_id, label timestamp) -> (day difference, snapshot dict)
    talk = {}
    with open(DATA_DIR + 'talk_history/talk-text-{}-meta-info.json'.format(template),'rb') as f:
        for line in f:
            snapshot = json.loads(line)
            page_id = snapshot['page_id']
            if page_id not in pages:
                continue

            # Drop the timezone so the subtraction against the label timestamps
            # parsed by read_csv works.
            snapshot_date = parser.parse(snapshot['revision_timestamp']).replace(tzinfo=None)
            page_label_dates = page_labels.get_group(page_id)['event_timestamp']
            date_diffs = (snapshot_date - page_label_dates).dt.days
            # NOTE(review): idxmin() runs over *all* labels of the page, so a
            # snapshot that predates any label of its page has a negative
            # minimum and is skipped below - confirm this is intended for
            # pages carrying several labels.
            min_ind = date_diffs.idxmin()
            min_diff = date_diffs[min_ind]

            key = (page_id, page_label_dates[min_ind])
            # Look up the best difference stored so far for this exact key.
            # (Previously the lookup used the bare page_id, which is never a
            # key of `talk`, so every qualifying snapshot overwrote the
            # previous one and the last snapshot won instead of the closest.)
            best_diff = talk.get(key, (np.inf,))[0]
            if 0 <= min_diff < best_diff:
                talk[key] = (min_diff, snapshot)

    print(len(talk))
    # Flatten to a list of records: the key parts plus every snapshot field.
    talk_dump = [{'talk_page_id':key[0],'event_timestamp':str(key[1]),**value[1]} for key, value in talk.items()]
    with open(DATA_DIR+'talk_history/talk-subset-{}.json'.format(template),'w') as f:
        json.dump(talk_dump, f, default=np_encoder)

autobiography
number of labels:  4224
2993
weasel
number of labels:  1322
1040
advert
number of labels:  7570
5315
fanpov
number of labels:  721
483
peacock
number of labels:  5174
3842


# Extract Talk features

In [28]:
%%time 
conv_parser = ConvParser()
feature_extractor = FeatureExtractor()

# For every template, build one feature record per label (talk-page volume
# features, talk-page language features and the parsed conversation) and write
# the records to features/talk_<template>.json.
for template in ['advert','autobiography','fanpov','weasel','peacock']:
    print(template)
    
    print('reading data')
    # read and format things
    page_revisions = read_revisions(DATA_DIR+'page_history/page_history-{}-meta-info.json'.format(template))
    talk_revisions  = read_revisions(DATA_DIR+'talk_history/talk-activity-{}-meta-info.json'.format(template),
                                    rename=True)
    
    # Snapshot subset written by the "Get subset from Talk" cell, indexed by
    # (talk page id, label timestamp) for direct lookup per label.
    talk_text = pd.read_json(DATA_DIR+'talk_history/talk-subset-{}.json'.format(template))
    talk_text['event_timestamp'] = pd.to_datetime(talk_text['event_timestamp'])
    talk_text = talk_text.set_index(['talk_page_id','event_timestamp'])[['revision_text','page_id']]

    article_talk_mappings = pd.read_csv(DATA_DIR+'article_talk_mappings/{}.csv'.format(template),
                                       usecols=['talk_page_id','article_page_id'])
    labels = pd.read_csv(DATA_DIR+'labels/{}.csv'.format(template),parse_dates=['event_timestamp'])
    labels = labels.join(article_talk_mappings.set_index('article_page_id')[['talk_page_id']],on='page_id').dropna()
    
    # prepare to extract by page
    talk_pages = talk_revisions.groupby('page_id')
    pages = page_revisions.groupby('page_id')
    
    features = []
    # NOTE(review): the unpacking assumes `labels` has exactly the columns
    # (event_timestamp, page_id, talk_page_id) in this order - confirm against
    # the labels CSV.
    for tag_date, page_id, talk_page_id in tqdm(labels.values):
        # Fresh, independent defaults for every label. Previously the default
        # was assigned to an unused name (`talk_volume`), so a label without
        # earlier talk revisions either raised NameError or silently reused
        # the previous label's `talk_features`.
        talk_features = {}
        lang_features = {}
        conv = []
        
        # Only revisions made on or before the tag date are considered.
        tag_page_revisions = pages.get_group(page_id)
        tag_page_revisions = tag_page_revisions[tag_page_revisions.event_timestamp.dt.date <= tag_date]

        tag_talk_revisions = talk_pages.get_group(talk_page_id)
        tag_talk_revisions = tag_talk_revisions[tag_talk_revisions.event_timestamp.dt.date <= tag_date]
        
        # talk page; volume
        if len(tag_talk_revisions) > 0:
            tag_talk_revisions = calculate_page_metrics(tag_talk_revisions)
            talk_features = get_talk_features(tag_talk_revisions)
            talk_features['page_talk_ratio'] = len(tag_page_revisions)/len(tag_talk_revisions)

        # talk page; language
        if (talk_page_id,tag_date) in talk_text.index:
            # The selected columns are (revision_text, page_id); they are
            # passed positionally to format_conv.
            talk_latest = talk_text.loc[(talk_page_id,tag_date)].values
            conv = conv_parser.format_conv(*talk_latest)
            lang_features = feature_extractor.get_language_features(conv)

        features.append({'page':page_id,'date':str(tag_date),
                         'conversation':list(conv),
                        'talk_volume':talk_features,
                        'talk_language':lang_features})
        
    with open(HOME +'features/talk_'+template+'.json','w') as f:
        json.dump(features,f,default=np_encoder)


advert
reading data


100%|██████████| 7570/7570 [09:36<00:00, 13.13it/s]  


autobiography
reading data


100%|██████████| 4224/4224 [04:07<00:00, 17.04it/s]


fanpov
reading data


100%|██████████| 721/721 [01:01<00:00, 11.73it/s]


weasel
reading data


100%|██████████| 1322/1322 [04:47<00:00,  4.59it/s]


peacock
reading data


100%|██████████| 5174/5174 [05:34<00:00, 15.45it/s] 


CPU times: user 27min 36s, sys: 49.5 s, total: 28min 26s
Wall time: 28min 18s


In [13]:
# preceding unsigned comment
# heading "section"
# section of current conversation