In [1]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *
import json
import urllib
import argparse
from pyspark.ml.feature import CountVectorizerModel
from pyspark.ml.clustering import LDA


# Spark properties for this notebook: driver memory / result-size limits and
# the directories Spark is pointed at for local, staging and warehouse data.
spark_settings = [
    ('spark.driver.memory', '240g'),
    ('spark.driver.maxResultSize', '32G'),
    ('spark.local.dir', '/scratch/tmp/'),
    ('spark.yarn.stagingDir', '/scratch/tmp/'),
    ('spark.sql.warehouse.dir', '/scratch/tmp/'),
]

# "local[*]" master: Spark runs inside this machine rather than on a cluster.
conf = pyspark.SparkConf().setMaster("local[*]")
conf = conf.setAll(spark_settings)

# Session (DataFrame / SQL entry point); getOrCreate() reuses a session that
# is already running in this kernel instead of starting a second one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Underlying SparkContext (RDD entry point) of that session.
sc = spark.sparkContext


In [2]:
# Wikipedia language editions to process. Each name is substituted into the
# per-site dataset path when the link tables are loaded.
sites = [
    "enwiki", "eswiki", "itwiki", "dewiki",
    "frwiki", "cswiki", "cawiki", "svwiki",
    "arwiki", "elwiki", "fiwiki", "hewiki",
    "idwiki", "kowiki", "nlwiki", "plwiki",
    "ptwiki", "rowiki", "ruwiki", "sqwiki",
    "srwiki", "trwiki", "ukwiki", "viwiki",
    "warwiki", "zhwiki", "fawiki", "jawiki",
]


In [7]:
# Build the distinct (site, qid) pairs over every language edition in `sites`.
#
# The work stays in the DataFrame API: each per-site parquet file is reduced
# to the two columns that are actually needed and tagged with its site name
# before the union. This avoids converting every row (with all its columns)
# to a Python RDD and then re-inferring a schema with createDataFrame().
# NOTE: the intermediate `all_links_rdd` is therefore no longer created.
per_site_articles = [
    spark.read.parquet("datasets/{}/anchors_info_qid.parquet".format(site))
         .selectExpr("'{}' as site".format(site), "qid")
    for site in sites
]

# DataFrame.union() matches columns by position; every frame here is exactly
# (site, qid), so the positional union is safe.
articles = per_site_articles[0]
for site_articles in per_site_articles[1:]:
    articles = articles.union(site_articles)

# A qid may be linked many times within one site; keep one row per pair.
articles = articles.distinct()
articles

DataFrame[site: string, qid: string]

In [9]:
# Count the rows of `articles` per site on the Spark side, then bring the
# small per-site summary (columns: site, total) into pandas.
per_site_totals = articles.groupBy("site").agg(count("*").alias("total"))
articles_count = per_site_totals.toPandas()

In [10]:
# Display the per-site totals (pandas DataFrame with columns `site`, `total`).
articles_count

Unnamed: 0,site,total
0,warwiki,1250531
1,cswiki,410485
2,ruwiki,1405781
3,ukwiki,905414
4,enwiki,5571501
5,nlwiki,1888944
6,idwiki,495316
7,sqwiki,70796
8,srwiki,579483
9,jawiki,1097468


In [11]:
# Show the sites ordered from largest to smallest total; sort_values returns
# a new frame, so `articles_count` itself is left unsorted.
articles_count.sort_values(by="total", ascending=False)

Unnamed: 0,site,total
4,enwiki,5571501
15,svwiki,3452729
17,dewiki,2042711
13,frwiki,2012837
5,nlwiki,1888944
18,eswiki,1461492
10,itwiki,1458314
2,ruwiki,1405781
22,plwiki,1288704
0,warwiki,1250531


In [None]:
# Dump the per-site totals as a plain dict. With pandas' default
# orient="dict" the result is keyed by column name, then by row index:
# {"site": {0: ..., 1: ...}, "total": {0: ..., 1: ...}}.
# NOTE(review): if a direct site -> total mapping is what is wanted, this
# shape needs post-processing — confirm the intended use.
articles_count.to_dict()