In [1]:
%load_ext autoreload
%autoreload 2

import sys
# Project root on the analysis host; the local helper modules live under it.
HOME = '/srv/home/christinedk/wp_internship/'

# Make the project's `collaboration/` package directory importable.
# Guarded so that re-running this cell does not keep appending duplicates.
_collaboration_dir = HOME + 'collaboration/'
if _collaboration_dir not in sys.path:
    sys.path.append(_collaboration_dir)

In [13]:
import pandas as pd
from time  import time
from pyspark.sql.functions import udf, col, explode, regexp_replace
from math import log2

from data_export import getTemplatesRegexRelaibility, getTemplatesRegex

In [3]:
TEMPLATES = ['weasel','peacock','autobiography','advert','fanpov']
outputHDFS = 'page_history'
!hadoop fs -mkdir $outputHDFS

mkdir: `page_history': File exists


In [4]:
# Page/namespace table. The path is built from HOME (same location as before)
# so the project root is defined in exactly one place.
blp = pd.read_csv(HOME + 'data/namespaces.csv')
# Keep rows with page_namespace == 0, one row per page_id.
blp_articles = blp.query('page_namespace == 0').drop_duplicates(subset=['page_id'])

In [5]:
# Plain Python list of the selected page ids (used below in a Spark isin() filter).
blp_ids = blp_articles['page_id'].tolist()

In [6]:
# Print the available partitions of the wikitext history table (first 20 rows).
wikitext_partitions = spark.sql('show partitions wmf.mediawiki_wikitext_history')
wikitext_partitions.show()

+--------------------+
|           partition|
+--------------------+
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
|snapshot=2021-01/...|
+--------------------+
only showing top 20 rows



# Get all revisions with templates from WikiText

In [7]:
# Partition of the wikitext dump to read.
snapshot ="2021-01"
wikidb = "enwiki"

# Fill the partition values into the query text, then hand it to Spark.
query_params = {'snapshot': snapshot, 'wikidb': wikidb}
wikitext_query = '''SELECT page_id,page_title,revision_id,revision_text,user_id
    FROM wmf.mediawiki_wikitext_history 
    WHERE snapshot ="{snapshot}" and wiki_db ="{wikidb}"
    '''.format(**query_params)
wikitext_history = spark.sql(wikitext_query)

# NOTE(review): this marks the *unfiltered* table for caching; the page filter
# is only applied in the next cell. Confirm that caching at this stage is intended.
wikitext_history.cache()

DataFrame[page_id: bigint, page_title: string, revision_id: bigint, revision_text: string, user_id: bigint]

In [9]:
# Restrict the wikitext history to the pages listed in blp_ids.
is_selected_page = col('page_id').isin(blp_ids)
blp_history = wikitext_history.where(is_selected_page)

In [12]:
# Derive a "templates" column from each revision's text with the helper
# imported from data_export.
# NOTE(review): presumably a collection of template names per revision (it is
# exploded in the next cell) — confirm against data_export.
templates_column = getTemplatesRegexRelaibility(col('revision_text'))
revisions_with_template = blp_history.withColumn("templates", templates_column)

In [14]:
# One output row per element of "templates" for every revision.
# The exploded column is named 'col' explicitly because the export loop further
# down filters on revisions_with_template['col']; spaces in page titles are
# replaced by underscores.
# Parentheses replace the backslash continuations, which previously ran into a
# commented-out line.
revisions_with_template = (
    revisions_with_template
    .select('page_id', 'page_title', 'user_id', 'revision_id',
            explode(col('templates')).alias('col'))
    .withColumn('page_title', regexp_replace('page_title', ' ', '_'))
)
# NOTE(review): a filter excluding user_id 7328338 used to be chained here and
# is currently disabled:
#   .filter(revisions_with_template['user_id'] != 7328338)

In [16]:
## Persist outputs
# Replace any earlier export at this path, so the cell can be re-run.
templates_writer = revisions_with_template.write.mode('overwrite')
templates_writer.parquet('page_history/templates.parquet')

# Get full edit history of pages and editors

In [8]:
# Reload the persisted (revision, template) rows written by the cell above.
revisions_with_template = spark.read.format('parquet').load('page_history/templates.parquet')

In [9]:
# Number of (revision, template) rows in the persisted table.
template_row_count = revisions_with_template.count()
template_row_count

595759

In [10]:
## Select subset of mediawiki history containing all page titles with revisions

# Expose the distinct page titles to Spark SQL as a temporary view.
revisions_with_template.select('page_title').distinct() \
    .createOrReplaceTempView('pages_templates_subset')
pages_templates_subset = spark.table('pages_templates_subset')

# Revision events of every page whose title appears in that view.
# NOTE(review): the snapshot here ("2020-09") differs from the wikitext snapshot
# ("2021-01") used earlier in the notebook — confirm this is intentional.
# NOTE(review): the match is on page_title only, with no namespace condition — verify.
history_subset_query = '''
        SELECT w.event_timestamp, w.page_title,w.page_id,w.page_namespace, 
        w.revision_id, w.revision_is_identity_reverted,  w.revision_is_identity_revert,
        w.revision_minor_edit, w.revision_text_bytes, 
        w.revision_first_identity_reverting_revision_id, w.revision_seconds_to_identity_revert,
        w.event_user_id,w.event_user_registration_timestamp, 
        w.event_user_is_anonymous,w.event_user_revision_count,

        w.event_comment
        FROM wmf.mediawiki_history w
        WHERE w.snapshot ="2020-09" and w.wiki_db ="enwiki" AND  
        w.event_entity = 'revision' AND w.page_title IN (
                    SELECT page_title FROM pages_templates_subset)                   
        '''

# cache() returns the same DataFrame, so it can be chained onto the query.
mediawiki_history_subset = spark.sql(history_subset_query).cache()
mediawiki_history_subset.createOrReplaceTempView('mediawiki_history_subset')

In [11]:
## Get full histories of these pages

for template in TEMPLATES:
    try:
        outputHDFS = 'page_history'
        t1 = time()
        print(template)
        df = revisions_with_template.where(revisions_with_template['col']==template) # 
        df.cache()
        t2 = time()
        print('read table, done',t2-t1)
        t1 = time()
        page_ids = df.select('page_title').distinct()
        page_ids.createOrReplaceTempView('tmp_page_ids')
        revision_ids = df.select('revision_id').distinct()
        revision_ids.createOrReplaceTempView('tmp_revision_ids')
        reverts = spark.sql('''
        SELECT w.event_timestamp, w.page_title,w.page_id, 
        w.page_namespace,w.revision_id, 
        w.revision_is_identity_reverted, w.revision_is_identity_revert,
        w.revision_minor_edit, w.revision_text_bytes, 
        w.revision_first_identity_reverting_revision_id,
        w.event_user_id,w.event_user_registration_timestamp, 
        w.event_user_is_anonymous,w.event_user_revision_count,
        CASE WHEN r.revision_id IS NOT NULL  THEN 1 ELSE 0 END has_template,
        w.event_comment
        FROM mediawiki_history_subset w 
        LEFT OUTER JOIN tmp_revision_ids r 
        ON (w.revision_id = r.revision_id)
        WHERE  w.page_title IN (SELECT page_title FROM tmp_page_ids) 
        ORDER BY page_title, w.revision_id
        ''') 
        reverts.write.format('json').save(outputHDFS+'/'+template,mode='overwrite')
        print('save table, done',t2-t1)
        t1 = time()   
        templateout = template.replace(' ','_')
        !hadoop fs -text "$outputHDFS/$template/*" > $outputHDFS-$template-meta-info.json
        t2 = time()
        print('-----',t2-t1)
        
    except Exception as e:
        print('error',e)

weasel
read table, done 0.32538866996765137
save table, done -0.0001862049102783203
21/03/23 13:25:03 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 8.036737442016602
peacock
read table, done 0.08455944061279297
save table, done -0.00018930435180664062
21/03/23 13:26:00 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 8.262783527374268
autobiography
read table, done 0.08574676513671875
save table, done -0.00019311904907226562
21/03/23 13:26:38 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 6.469390869140625
advert
read table, done 0.08558392524719238
save table, done -0.0001220703125
21/03/23 13:27:14 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 7.935754060745239
fanpov
read table, done 0.08685660362243652
save table, done -0.00018525123596191406
21/03/23 13:27:49 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 6.46190619468689


In [12]:
mv page_history* ../data/page_history/

In [12]:
ls

article_history.ipynb           extract_talk_features.ipynb
categories.ipynb                preprocess_editor_history.ipynb
[0m[01;34mdeprecated[0m/                     tag_addition_analysis.ipynb
EDA_revisions.ipynb             tags_meta.ipynb
editor_history.ipynb            talk_page_history.ipynb
extract_article_features.ipynb  user_histories_analysis.ipynb
extract_editor_features.ipynb
