In [1]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *
import json
import argparse
%matplotlib inline

# Spark properties applied on top of the local master: driver heap size, the cap
# on results collected back to the driver, and the scratch/staging/warehouse
# directories, all of which point at /scratch/tmp/.
spark_settings = [
    ('spark.driver.memory', '64g'),
    ('spark.driver.maxResultSize', '32G'),
    ('spark.local.dir', '/scratch/tmp/'),
    ('spark.yarn.stagingDir', '/scratch/tmp/'),
    ('spark.sql.warehouse.dir', '/scratch/tmp/'),
]

# "local[*]" runs Spark inside this machine using every available core.
conf = pyspark.SparkConf().setMaster("local[*]").setAll(spark_settings)

# Session: getOrCreate() hands back an existing session if one is already
# running, otherwise it starts a new one with the configuration above.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

# Context: the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# Echo the session object so the notebook renders it as this cell's output;
# a quick visual check that the session above was actually created.
spark

In [3]:
# Site identifiers processed by this notebook. Each one is substituted into the
# "datasets/{}/..." paths when the per-site parquet files are read, and is also
# stored in the `site` column of the combined tables.
sites = [
    "jawiki", "cswiki", "cawiki", "svwiki", "arwiki", "dewiki", "elwiki",
    "enwiki", "eswiki", "fawiki", "fiwiki", "frwiki", "hewiki", "idwiki",
    "itwiki", "kowiki", "nlwiki", "plwiki", "ptwiki", "rowiki", "ruwiki",
    "sqwiki", "srwiki", "trwiki", "ukwiki", "viwiki", "warwiki", "zhwiki",
]

In [4]:
# Stack the per-site "enriched_all_links" tables into one DataFrame, adding a
# leading `site` column that records which site each row came from.
#
# The union is done on DataFrames directly. Starting from sc.emptyRDD(), unioning
# `.rdd` views and calling createDataFrame() on the result would push every row
# through Python and make Spark re-infer a schema it already knows from parquet.
all_links_enriched = None
for s in sites:
    links = spark.read.parquet("datasets/{}/enriched_all_links.parquet".format(s))
    tagged = links.selectExpr("'{}' as site".format(s), "*")
    # union() pairs columns by position (not by name), so every site's parquet
    # file is expected to expose the same columns in the same order.
    all_links_enriched = tagged if all_links_enriched is None else all_links_enriched.union(tagged)

# With no sites there is nothing to build a schema from; fail loudly here
# rather than handing None to the cells that follow.
if all_links_enriched is None:
    raise ValueError("sites is empty: no enriched link datasets to load")

all_links_enriched

DataFrame[site: string, qid: string, links: array<string>]

In [5]:
# Build the combined link table from the per-site anchor data:
#   1. drop rows whose anchor consists solely of digits (regex ^[0-9]+$),
#   2. group by `qid` and gather the `destination_qid` values into a `links` array,
#   3. prepend a `site` column naming the site the rows came from.
#
# The per-site results are unioned as DataFrames. Routing them through
# sc.emptyRDD()/`.rdd`/createDataFrame() would serialise every row into Python
# and make Spark re-infer the schema from the data.
all_links = None
for s in sites:
    links = spark.read.parquet("datasets/{}/anchors_info_qid.parquet".format(s))
    links_by_qid = (
        links.filter("anchor not rlike '^[0-9]+$'")
        .groupBy("qid")
        .agg(collect_list("destination_qid").alias("links"))
        .selectExpr("'{}' as site".format(s), "qid", "links")
    )
    # Every per-site frame has exactly (site, qid, links) in this order, so the
    # positional union() is safe.
    all_links = links_by_qid if all_links is None else all_links.union(links_by_qid)

# With no sites there is nothing to union; fail loudly here rather than handing
# None to the cells that follow.
if all_links is None:
    raise ValueError("sites is empty: no anchor datasets to load")

all_links

DataFrame[site: string, qid: string, links: array<string>]

In [None]:
# Distribution of `links` array sizes in the enriched table: one output row per
# (site, links_count) pair, with `total` = number of input rows having that size.
enriched_sizes = all_links_enriched.selectExpr("site", "SIZE(links) links_count")
enriched_histogram = enriched_sizes.groupBy("site", "links_count").agg(count("*").alias("total"))

# The aggregate is pulled onto the driver as a pandas DataFrame.
links_count_enriched = enriched_histogram.toPandas()

links_count_enriched.head()

Unnamed: 0,site,links_count,total
0,jawiki,542,28
1,jawiki,669,18
2,jawiki,832,11
3,jawiki,1643,2
4,jawiki,2597,1


In [None]:
# Distribution of `links` array sizes in the anchor-based table: one output row
# per (site, links_count) pair, with `total` = number of input rows of that size.
plain_sizes = all_links.selectExpr("site", "SIZE(links) links_count")
plain_histogram = plain_sizes.groupBy("site", "links_count").agg(count("*").alias("total"))

# The aggregate is pulled onto the driver as a pandas DataFrame.
links_count = plain_histogram.toPandas()

links_count.head()

Unnamed: 0,site,links_count,total
0,jawiki,861,6
1,jawiki,669,9
2,jawiki,542,22
3,jawiki,832,6
4,jawiki,1848,1


In [12]:
import os

# to_pickle() does not create missing parent directories, so make sure the
# output folder exists first; otherwise this cell fails on a machine where
# temp_data/ has not been created by hand.
os.makedirs("temp_data", exist_ok=True)

# Persist both link-count histograms (pandas DataFrames) for later reuse.
links_count.to_pickle("temp_data/links_count.pkl")
links_count_enriched.to_pickle("temp_data/links_count_enriched.pkl")