In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/home/christinedk/wp_internship/collaboration/')

In [2]:
from time  import time
from data_export import getTemplatesRegexRelaibility, getTemplatesRegex
from pyspark.sql.functions import udf, col, explode, regexp_replace, concat, lit 

In [3]:
TEMPLATES = ['weasel','peacock','autobiography','advert','fanpov']
outputHDFS = 'talk'
!hadoop fs -mkdir $outputHDFS

mkdir: `talk': File exists


In [4]:
## Find revisions with templates (generated by revision_history notebook)

revisions_with_template = spark.read.parquet('tmp/templates.parquet')

In [5]:
## Format page titles so that they will be recognised in the WikiText database

revisions_with_template = revisions_with_template.withColumn('page_title', regexp_replace('page_title', '_', ' '))
revisions_with_template = revisions_with_template.withColumn('page_title_talk',
                                                         concat(lit("Talk:"),col("page_title")))

pages_templates_subset = revisions_with_template.select('page_title_talk').distinct()
pages_templates_subset.createOrReplaceTempView('pages_templates_subset')

In [6]:
## Select subset of WikiText history for Talk pages of pages with templates

mediawiki_history_subset =  spark.sql('''
        SELECT w.revision_text, w.user_id, w.page_id, w.page_title, w.revision_id, w.revision_timestamp, w.user_text,
        w.revision_minor_edit,revision_text_bytes, w.revision_parent_id
        FROM wmf.mediawiki_wikitext_history w
        WHERE w.snapshot ="2021-01" and w.wiki_db ="enwiki" AND w.page_namespace=1
        AND w.page_title IN (SELECT page_title_talk FROM pages_templates_subset)                   
        ''')
#mediawiki_history_subset.cache()
mediawiki_history_subset.createOrReplaceTempView('mediawiki_history_subset')

In [8]:
for template in TEMPLATES:
    try:
        t1 = time()
        print(template)
        df = revisions_with_template.where(revisions_with_template['col']==template) # revisions for specific template
        df.cache()
        t2 = time()
        print('read table, done',t2-t1)
        t1 = time()        
        page_ids = df.select('page_title_talk').distinct() # titles of revisions with specific template
        page_ids.createOrReplaceTempView('tmp_page_ids')
        reverts = spark.sql('''
        SELECT w.revision_text, w.user_id, w.page_id, w.page_title, w.revision_text_bytes, w.revision_id, w.revision_timestamp, w.revision_minor_edit, w.revision_parent_id, w.user_text
        FROM mediawiki_history_subset w
        WHERE w.page_title IN (SELECT page_title_talk FROM tmp_page_ids) 
        SORT BY page_id
        ''') 
        reverts.cache()
        reverts.write.format('json').save('talk-text/'+template,mode='overwrite')
        reverts.drop('revision_text').write.format('json').save('talk-activity/'+template,mode='overwrite')
        t2 = time()
        print('save table, done',t2-t1)
        t1 = time()   
        templateout = template.replace(' ','_')
        !hadoop fs -text "talk-text/$template/*" > $outputHDFS-text-$template-meta-info.json
        !hadoop fs -text "talk-activity/$template/*" > $outputHDFS-activity-$template-meta-info.json
        t2 = time()
        print('-----',t2-t1)
    except Exception as e:
        print('error',e)

weasel
read table, done 0.10941886901855469
save table, done 754.5110597610474
21/03/26 13:34:07 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
21/03/26 14:03:46 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 1794.709993839264
peacock
read table, done 0.0868842601776123
save table, done 99.95339035987854
21/03/26 14:05:43 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
21/03/26 14:10:27 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 291.4793632030487
autobiography
read table, done 0.09740376472473145
save table, done 2829.065163373947
21/03/26 14:57:42 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
21/03/26 14:57:59 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
----- 22.26380729675293
advert
read table, done 0.10959792137145996
save table, done 3170.939197540283
21/03/26 15:50:56 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
21/03/26 15:56:08 INFO compress.CodecPool: Go

In [9]:
ls -lh /srv/home/christinedk/wp_internship/notebooks/talk*

ls: cannot access '/srv/home/christinedk/wp_internship/notebooks/talk*': No such file or directory


In [10]:
mv talk* /srv/home/christinedk/wp_internship/data/talk_history/