In [1]:
# Enable the autoreload extension in mode 2, so that imported modules are
# reloaded before code is executed and edits to local modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
# Add the project directory to the module search path.
# NOTE(review): presumably this is where the `data_export` module imported below
# lives - confirm. The absolute, user-specific path will likely not resolve for
# other users or on other machines.
sys.path.append('/home/christinedk/wp_internship/collaboration/')

In [2]:
from time import time

from pyspark.sql.functions import col, concat, explode, lit, regexp_replace, udf

from data_export import getTemplatesRegexRelaibility, getTemplatesRegex

In [3]:
TEMPLATES = ['weasel','peacock','autobiography','advert','fanpov']
outputHDFS = 'tmp_talk'
!hadoop fs -mkdir $outputHDFS

In [None]:
## Load the revisions that carry a template
## (this parquet file is generated by the revision_history notebook)

revisions_with_template = spark.read.format('parquet').load('tmp/templates.parquet')

In [19]:
## Format page titles so that they will be recognised in the WikiText database:
## underscores become spaces, and a "Talk:"-prefixed title is derived for each page.
## `concat` and `lit` come from pyspark.sql.functions (imported in the imports cell).

revisions_with_template = (
    revisions_with_template
    .withColumn('page_title', regexp_replace('page_title', '_', ' '))
    .withColumn('page_title_talk', concat(lit("Talk:"), col("page_title")))
)

# Distinct Talk-page titles, exposed to Spark SQL for the sub-select in the next query.
pages_templates_subset = revisions_with_template.select('page_title_talk').distinct()
pages_templates_subset.createOrReplaceTempView('pages_templates_subset')

In [26]:
## Narrow the WikiText history (enwiki, 2020-09 snapshot, namespace 1) down to the
## Talk pages of pages with templates; the result is cached and registered as a
## temp view so that later queries can select from it by name.

mediawiki_history_subset = spark.sql('''
        SELECT w.revision_text, w.user_id, w.page_id, w.page_title, w.revision_id, w.revision_timestamp, w.revision_minor_edit
        FROM wmf.mediawiki_wikitext_history w
        WHERE w.snapshot ="2020-09" and w.wiki_db ="enwiki" AND w.page_namespace=1
        AND w.page_title IN (SELECT page_title_talk FROM pages_templates_subset)                   
        ''').cache()
mediawiki_history_subset.createOrReplaceTempView('mediawiki_history_subset')

In [27]:
for template in templates:
    try:
        t1 = time()
        print(template)
        df = revisions_with_template.where(revisions_with_template['col']==template) # revisions for specific template
        df.cache()
        t2 = time()
        print('read table, done',t2-t1)
        t1 = time()        
        page_ids = df.select('page_title_talk').distinct() # titles of revisions with specific template
        page_ids.createOrReplaceTempView('tmp_page_ids')
        reverts = spark.sql('''
        SELECT w.revision_text, w.user_id, w.page_id, w.page_title, w.revision_id, w.revision_timestamp, w.revision_minor_edit
        FROM mediawiki_history_subset w
        WHERE w.page_title IN (SELECT page_title_talk FROM tmp_page_ids) 
        ''') 
        reverts.repartition(1).write.format('json').save(outputHDFS+'/'+template,mode='overwrite')
        t2 = time()
        print('save table, done',t2-t1)
        t1 = time()   
        templateout = template.replace(' ','_')
        !hadoop fs -text "$outputHDFS/$template/*" > $outputHDFS-$template-meta-info.json
        t2 = time()
        print('-----',t2-t1)
    except Exception as e:
        print('error',e)

weasel
read table, done 0.11290431022644043
save table, done 6186.8497948646545
21/02/22 17:16:59 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 1460.379364490509
peacock
read table, done 0.08336520195007324
save table, done 1367.1914565563202
21/02/22 18:04:07 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 319.7799370288849
autobiography
read table, done 0.06857490539550781
save table, done 76.77443432807922
21/02/22 18:10:44 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 13.020766735076904
advert
read table, done 0.06012535095214844
save table, done 655.9413843154907
21/02/22 18:21:53 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 341.4727337360382
fanpov
read table, done 0.06925773620605469
save table, done 81.3644962310791
21/02/22 18:28:55 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 14.45546007156372


In [29]:
ls -lh /srv/home/christinedk/wp_internship/notebooks/tmp_talk*

-rw-r--r-- 1 christinedk wikidev  50G Feb 22 18:27 /srv/home/christinedk/wp_internship/notebooks/tmp_talk-advert-meta-info.json
-rw-r--r-- 1 christinedk wikidev 2.2G Feb 22 18:10 /srv/home/christinedk/wp_internship/notebooks/tmp_talk-autobiography-meta-info.json
-rw-r--r-- 1 christinedk wikidev 2.6G Feb 22 18:29 /srv/home/christinedk/wp_internship/notebooks/tmp_talk-fanpov-meta-info.json
-rw-r--r-- 1 christinedk wikidev  46G Feb 22 18:09 /srv/home/christinedk/wp_internship/notebooks/tmp_talk-peacock-meta-info.json
-rw-r--r-- 1 christinedk wikidev 230G Feb 22 17:41 /srv/home/christinedk/wp_internship/notebooks/tmp_talk-weasel-meta-info.json
