In [1]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *
import json
import urllib
import argparse
%matplotlib inline

# Spark settings for this notebook: driver memory, the cap on results
# collected to the driver, and the scratch/staging directories.
spark_settings = [
    ('spark.driver.memory', '240g'),
    ('spark.driver.maxResultSize', '32G'),
    ('spark.local.dir', '/scratch/tmp/'),
    ('spark.yarn.stagingDir', '/scratch/tmp/'),
]

# "local[*]" runs Spark inside this process using all available cores.
conf = pyspark.SparkConf().setMaster("local[*]").setAll(spark_settings)

# Session used for all DataFrame work below (an existing one is reused if present).
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Context belonging to that session, for the RDD-level operations.
sc = spark.sparkContext

In [2]:
# Display the SparkSession created in the setup cell.
spark

In [3]:
# Wiki editions to process. Each name is substituted into the
# "datasets/{}/..." paths read by the loading cells, in this order.
sites = [
    "jawiki", "cswiki", "cawiki", "svwiki", "arwiki", "dewiki", "elwiki",
    "enwiki", "eswiki", "fawiki", "fiwiki", "frwiki", "hewiki", "idwiki",
    "itwiki", "kowiki", "nlwiki", "plwiki", "ptwiki", "rowiki", "ruwiki",
    "sqwiki", "srwiki", "trwiki", "ukwiki", "viwiki", "warwiki", "zhwiki",
]

In [4]:
# Build one DataFrame with, for every site, each article (qid) and the list
# of qids it links to.
#
# The per-site frames are combined with DataFrame.union rather than being
# converted to RDDs and rebuilt with createDataFrame: that round trip pushes
# every row through Python and makes Spark re-infer a schema it already knows.
per_site_links = []
for site in sites:
    anchors = spark.read.parquet("datasets/{}/anchors_info_qid.parquet".format(site))
    per_site_links.append(
        anchors
        # Skip links whose anchor text consists only of digits.
        .filter("anchor not rlike '^[0-9]+$'")
        # One row per source article, with all its link targets collected.
        .groupBy("qid")
        .agg(collect_list("destination_qid").alias("links"))
        # Tag every row with the site it came from.
        .selectExpr("'{}' as site".format(site), "qid", "links")
    )

# union() matches columns by position; every frame above has the same
# (site, qid, links) layout.
all_links = per_site_links[0]
for site_links in per_site_links[1:]:
    all_links = all_links.union(site_links)

# Kept so that cells which still refer to the RDD form keep working.
all_links_rdd = all_links.rdd
all_links

DataFrame[site: string, qid: string, links: array<string>]

In [5]:
# `all_links` is already built by the previous cell; recreating it from
# `all_links_rdd` here would only repeat the same conversion, so just show it.
all_links

DataFrame[site: string, qid: string, links: array<string>]

In [6]:
# Collect, across all sites, the documents that are NOT held out; these
# form the training set.
#
# The per-site frames are combined with DataFrame.union rather than going
# through RDDs and createDataFrame, which would push every row through
# Python and re-infer the schema.
per_site_training = []
for site in sites:
    heldout_flags = spark.read.parquet("datasets/{}/heldout_documents.parquet".format(site))
    per_site_training.append(
        heldout_flags
        .where("heldout = FALSE")
        # Keep all original columns and append the site name.
        .selectExpr("*", "'{}' as site".format(site))
    )

# union() matches columns by position; this assumes every site's parquet
# file has the same column layout - TODO confirm.
training_qids = per_site_training[0]
for site_training in per_site_training[1:]:
    training_qids = training_qids.union(site_training)

# Kept so that any code still referring to the RDD form keeps working.
training_qids_rdd = training_qids.rdd
training_qids

DataFrame[qid: string, heldout: boolean, site: string]

In [7]:
# Restrict the link lists to training documents: a row is kept only when the
# same (qid, site) pair appears in `training_qids`.
# Joining on an expression keeps the key columns from both sides, so the
# result contains `qid` and `site` twice.
same_article = all_links.qid == training_qids.qid
same_site = all_links.site == training_qids.site
training = all_links.join(training_qids, same_article & same_site)
training

DataFrame[site: string, qid: string, links: array<string>, qid: string, heldout: boolean, site: string]

In [8]:
# Merge the per-site link lists so that each qid ends up with a single list.

# (qid, links) pairs, one per training row.
qid_links_pairs = training.rdd.map(lambda row: (row.qid, row.links))

# Concatenate the lists of all rows sharing a qid (duplicates are kept).
links_by_qid = qid_links_pairs.reduceByKey(lambda left, right: left + right)

# Append the article's own qid to its list of links.
merged_rdd = links_by_qid.map(
    lambda pair: Row(qid=pair[0], links=pair[1] + [pair[0]])
)

all_links_merged = spark.createDataFrame(merged_rdd)
all_links_merged

DataFrame[links: array<string>, qid: string]

In [9]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.clustering import LDA

# Treat each article's list of linked qids as a bag of tokens: learn the
# vocabulary from the merged lists, then add a `features` vector column.
wordsVector = CountVectorizer(
    inputCol="links",
    outputCol="features",
)
transformer = wordsVector.fit(all_links_merged)

# Mark the vectorized frame for caching; the following cells reuse it.
vectorized = transformer.transform(all_links_merged)
result = vectorized.cache()

result

DataFrame[links: array<string>, qid: string, features: vector]

In [10]:
# Number of rows in the vectorized dataset. `result` was marked for caching
# when it was created, so this full pass should also populate the cache.
result.count()

14347451

In [11]:
# Save the fitted CountVectorizer model, replacing anything already stored
# at this path.
model_writer = transformer.write().overwrite()
model_writer.save("models/OnlyLinksMerged/transformer.model")

In [13]:
# Write only the feature vectors to parquet, replacing any previous output.
# NOTE(review): the file name is spelled "traning_set"; it is left as-is
# because other code may read this exact path - confirm before renaming.
features_only = result.select("features")
features_only.write.mode('overwrite').parquet("models/OnlyLinksMerged/traning_set.parquet")