In [1]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *
import json
import urllib
import argparse
%matplotlib inline

# Spark configuration for a local run (master "local[*]"); every key/value
# below is passed to Spark unchanged.
conf = (
    pyspark.SparkConf()
    .setMaster("local[*]")
    # driver-side memory limits
    .set('spark.driver.memory', '64g')
    .set('spark.driver.maxResultSize', '32G')
    # local, staging and warehouse directories all point at /scratch/tmp/
    .set('spark.local.dir', '/scratch/tmp/')
    .set('spark.yarn.stagingDir', '/scratch/tmp/')
    .set('spark.sql.warehouse.dir', '/scratch/tmp/')
)

# Build the SparkSession from that configuration (or reuse one that already exists).
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()
# Handle to the underlying SparkContext, used by the RDD API.
sc = spark.sparkContext

In [2]:
# Bare expression as the last line of the cell: the notebook displays the session object.
spark

In [3]:
# Wiki identifiers processed by the loops below; each one is substituted into
# the "datasets/{}/..." paths. Order is preserved from the original list.
sites = [
    "jawiki",
    "cswiki",
    "cawiki",
    "svwiki",
    "arwiki",
    "dewiki",
    "elwiki",
    "enwiki",
    "eswiki",
    "fawiki",
    "fiwiki",
    "frwiki",
    "hewiki",
    "idwiki",
    "itwiki",
    "kowiki",
    "nlwiki",
    "plwiki",
    "ptwiki",
    "rowiki",
    "ruwiki",
    "sqwiki",
    "srwiki",
    "trwiki",
    "ukwiki",
    "viwiki",
    "warwiki",
    "zhwiki",
]

In [19]:
# `from pyspark.sql.functions import *` shadows Python's built-in min(), so the
# built-in is reached through the `builtins` module.
import builtins

# Held-out split parameters: 1% of each wiki's distinct qids, capped at 10,000.
HELDOUT_FRACTION = 0.01
HELDOUT_MAX_SIZE = 10000
HELDOUT_SEED = 123

# Not used in this cell; kept because other cells may read it.
all_links_rdd = sc.emptyRDD()

# For every wiki: read its distinct qids, pick a random held-out subset, and
# write a (qid, heldout) table to datasets/<site>/heldout_documents.parquet.
for s in sites:
    # Cached because the DataFrame is evaluated twice: by count() and by the
    # join that feeds the final write.
    qids = (
        spark.read.parquet("datasets/{}/anchors_info_qid.parquet".format(s))
        .select("qid")
        .distinct()
        .cache()
    )
    articles_count = qids.count()
    one_percent = int(articles_count * HELDOUT_FRACTION)
    n = builtins.min(HELDOUT_MAX_SIZE, one_percent)
    print("{} - Total articles: {}, Held-out size: {}".format(s, articles_count, n))

    # Shuffle with a seeded random key and keep the first n rows. Unlike
    # DataFrame.sample (whose fraction is only approximate), this yields
    # exactly n rows, so the printed size matches what is written; it also
    # avoids dividing by articles_count, which would fail on an empty dataset.
    # NOTE(review): the seeded draw is repeatable only as long as the row
    # order/partitioning of `qids` is stable between runs.
    heldout = qids.orderBy(rand(seed=HELDOUT_SEED)).limit(n)

    # createOrReplaceTempView replaces the deprecated registerTempTable; the
    # view names are unchanged.
    qids.createOrReplaceTempView("qids")
    heldout.createOrReplaceTempView("heldout")

    # Left join keeps every qid; rows without a match in `heldout` get FALSE.
    query = """
    SELECT qids.qid, CASE WHEN heldout.qid is NOT NULL THEN TRUE ELSE FALSE END AS heldout 
    FROM qids 
    LEFT JOIN heldout
    ON qids.qid = heldout.qid
    """
    heldout_documents = spark.sql(query)
    heldout_documents.write.mode("overwrite").parquet("datasets/{}/heldout_documents.parquet".format(s))

    # Release the cached qids before moving on to the next wiki.
    qids.unpersist()

jawiki - Total articles: 1097468, Held-out size: 10000
cswiki - Total articles: 410485, Held-out size: 4104
cawiki - Total articles: 611272, Held-out size: 6112
svwiki - Total articles: 3452729, Held-out size: 10000
arwiki - Total articles: 987435, Held-out size: 9874
dewiki - Total articles: 2042711, Held-out size: 10000
elwiki - Total articles: 163512, Held-out size: 1635
enwiki - Total articles: 5571501, Held-out size: 10000
eswiki - Total articles: 1461492, Held-out size: 10000
fawiki - Total articles: 673943, Held-out size: 6739
fiwiki - Total articles: 451007, Held-out size: 4510
frwiki - Total articles: 2012837, Held-out size: 10000
hewiki - Total articles: 239310, Held-out size: 2393
idwiki - Total articles: 495316, Held-out size: 4953
itwiki - Total articles: 1458314, Held-out size: 10000
kowiki - Total articles: 418045, Held-out size: 4180
nlwiki - Total articles: 1888944, Held-out size: 10000
plwiki - Total articles: 1288704, Held-out size: 10000
ptwiki - Total articles: 964

In [20]:
# Ends the running Python session.
# NOTE(review): presumably here to shut down Spark and free the driver's memory once the work is done — confirm.
exit()