In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project's feature/utility code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Root directory of the project; every other path in this notebook hangs off it.
HOME = '/srv/home/christinedk/wp_internship/'
# Input data (labels, article/talk mappings, page and talk histories) lives here.
DATA_DIR = '{}data/'.format(HOME)

In [4]:
import pandas as pd
from dateutil import parser
import datetime
import numpy as np
import json
import matplotlib.pylab as plt
from tqdm import tqdm
from collections import defaultdict

import sys
sys.path.append('/srv/home/christinedk/wp_internship/collaboration/')
from features.article_history import *
from features.talk_history import *
from utils import read_revisions, np_encoder

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).e

In [5]:
# Sanity check: list the per-template page-history dumps available on disk.
ls /srv/home/christinedk/wp_internship/data/page_history/

page_history-advert-meta-info.json         page_history-peacock-meta-info.json
page_history-autobiography-meta-info.json  page_history-weasel-meta-info.json
page_history-fanpov-meta-info.json


# Extract Talk subset 

In [None]:
# A talk snapshot is only kept if it was taken at most this many days (exclusive)
# after the label date it is matched to.
MAX_SNAPSHOT_LAG_DAYS = 365

# For every template, stream the talk-page text dump and keep, per
# (talk page, label date) pair, the single snapshot with the smallest
# non-negative day offset from that label date. The result is written to
# talk_history/talk-subset-negative-<template>.json.
for template in ['autobiography','weasel','advert','fanpov','peacock']:
    print(template)

    article_talk_mappings = pd.read_csv(DATA_DIR+'article_talk_mappings/{}.csv'.format(template),
                                       usecols=['talk_page_id','article_page_id'])
    labels = pd.read_csv(DATA_DIR+'negative_labels/{}.csv'.format(template),parse_dates=['event_timestamp'])
    # Attach the talk page id to every label; labels without a talk page are dropped.
    labels = labels.join(article_talk_mappings.set_index('article_page_id')[['talk_page_id']],on='page_id')[['event_timestamp','page_id','talk_page_id']].dropna()
    print('number of labels: ',len(labels))

    page_labels = labels.groupby('talk_page_id')
    # A set gives constant-time membership tests; the dump is scanned line by
    # line, so a linear scan of an array per line would be needlessly slow.
    pages = set(labels.talk_page_id.unique())

    # (talk_page_id, label date) -> (day offset, snapshot) of the best snapshot so far.
    talk = {}
    with open(DATA_DIR + 'talk_history/talk-text-{}-meta-info.json'.format(template),'rb') as f:
        for line in f:
            snapshot = json.loads(line)
            page_id = snapshot['page_id']
            if page_id not in pages:
                continue

            snapshot_date = parser.parse(snapshot['revision_timestamp']).replace(tzinfo=None)
            page_label_dates = page_labels.get_group(page_id)['event_timestamp']
            date_diffs = (snapshot_date - page_label_dates).dt.days
            min_ind = date_diffs.idxmin()
            min_diff = date_diffs[min_ind]

            key = (page_id, page_label_dates[min_ind])
            # Look up the offset already stored for this exact key. The previous
            # code indexed the key tuple instead of the stored value, so the
            # lookup never hit and the threshold was always the default.
            best_diff_so_far = talk.get(key, (MAX_SNAPSHOT_LAG_DAYS, None))[0]
            if 0 <= min_diff < best_diff_so_far:
                talk[key] = (min_diff, snapshot)

    print(len(talk))
    talk_dump = [{'talk_page_id':key[0],'event_timestamp':str(key[1]),**value[1]} for key, value in talk.items()]
    with open(DATA_DIR+'talk_history/talk-subset-negative-{}.json'.format(template),'w') as f:
        json.dump(talk_dump, f, default=np_encoder)

# Extract features

In [None]:
%%time 

conv_parser = ConvParser()
feature_extractor = FeatureExtractor()

for template in ['autobiography','fanpov','weasel','advert','peacock']:
    print(template)
    
    print('reading data')
    # read and format things
    page_revisions = read_revisions(DATA_DIR+'page_history/page_history-{}-meta-info.json'.format(template))
    talk_revisions = read_revisions(DATA_DIR+'talk_history/talk-activity-{}-meta-info.json'.format(template),
                                    rename=True)
    
    talk_text = pd.read_json(DATA_DIR+'talk_history/talk-subset-negative-{}.json'.format(template))
    talk_text['event_timestamp'] = pd.to_datetime(talk_text['event_timestamp'])
    talk_text = talk_text.set_index(['talk_page_id','event_timestamp'])[['revision_text','page_id']]

    article_talk_mappings = pd.read_csv(DATA_DIR+'article_talk_mappings/{}.csv'.format(template),
                                       usecols=['talk_page_id','article_page_id'])
    labels = pd.read_csv(DATA_DIR+'negative_labels/{}.csv'.format(template),
                         parse_dates=['event_timestamp'])
    labels = labels.join(article_talk_mappings.set_index('article_page_id')[['talk_page_id']],on='page_id')[['event_timestamp','page_id','talk_page_id']].dropna()
    
    # prepare to extract by page
    talk_pages = talk_revisions.groupby('page_id')
    pages = page_revisions.groupby('page_id')
    
    features = []
    counter = 0
    for tag_date, page_id, talk_page_id in tqdm(labels.values):
        lang_features = conv = talk_volume = {}
        
        tag_page_revisions = pages.get_group(page_id)
        tag_page_revisions = tag_page_revisions[tag_page_revisions.event_timestamp.dt.date <= tag_date]

        tag_talk_revisions = talk_pages.get_group(talk_page_id)
        tag_talk_revisions = tag_talk_revisions[tag_talk_revisions.event_timestamp.dt.date <= tag_date]
        
        # talk page; volume
        if len(tag_talk_revisions) > 0:
            tag_page_revisions = calculate_page_metrics(tag_talk_revisions)
            talk_features = get_talk_features(tag_talk_revisions)
            talk_features['page_talk_ratio'] = len(tag_page_revisions)/len(tag_talk_revisions)

        # talk page; language
        if (talk_page_id,tag_date) in talk_text.index:
            talk_latest = talk_text.loc[(talk_page_id,tag_date)].values
            conv = conv_parser.format_conv(*talk_latest)
            lang_features = feature_extractor.get_language_features(conv)

        features.append({'page':page_id,'date':str(tag_date),
                         'conversation':conv,
                        'talk_volume':talk_features,
                        'talk_language':lang_features})
        
    with open(HOME +'negative_features/talk_'+template+'.json','w') as f:
        json.dump(features,f,default=np_encoder)


autobiography
reading data


100%|██████████| 21116/21116 [03:53<00:00, 90.56it/s] 


fanpov
reading data


100%|██████████| 3605/3605 [00:43<00:00, 83.42it/s] 


weasel
reading data


100%|██████████| 6610/6610 [04:15<00:00, 25.88it/s] 


autobiography
reading data


100%|██████████| 21116/21116 [03:47<00:00, 92.96it/s] 


advert
reading data


100%|██████████| 37843/37843 [06:52<00:00, 91.78it/s] 


peacock
reading data


 81%|████████  | 20833/25863 [03:43<00:48, 102.76it/s]