In [1]:
import pyspark
import pyspark.sql
from pyspark.sql import *
from pyspark.sql.functions import *
import json
import urllib
import argparse
%matplotlib inline

# Spark settings for a single-machine run; memory sizes and scratch paths are
# hard-coded for the host this notebook was executed on.
spark_settings = [
    ('spark.driver.memory', '240g'),
    ('spark.driver.maxResultSize', '32G'),
    ('spark.local.dir', '/scratch/tmp/'),
    ('spark.yarn.stagingDir', '/scratch/tmp/'),
]

# "local[*]" runs Spark inside this process using all available cores.
conf = pyspark.SparkConf()
conf.setMaster("local[*]")
conf.setAll(spark_settings)

# Reuse an existing session if one is already running, otherwise start one
# with the configuration above, then expose its SparkContext.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [2]:
# Display the session object as the cell output (quick check that it was created).
spark

In [3]:
# Wiki identifiers to process. Each one is substituted into the
# "datasets/{}/..." paths read further down; the order here is the order in
# which the per-site data is concatenated.
sites = [
    "jawiki",
    "cswiki",
    "cawiki",
    "svwiki",
    "arwiki",
    "dewiki",
    "elwiki",
    "enwiki",
    "eswiki",
    "fawiki",
    "fiwiki",
    "frwiki",
    "hewiki",
    "idwiki",
    "itwiki",
    "kowiki",
    "nlwiki",
    "plwiki",
    "ptwiki",
    "rowiki",
    "ruwiki",
    "sqwiki",
    "srwiki",
    "trwiki",
    "ukwiki",
    "viwiki",
    "warwiki",
    "zhwiki",
]

In [4]:
# Build one DataFrame holding, for every (site, source article qid), the list
# of destination QIDs that article links to.
#
# Per site: links whose anchor text consists only of digits are dropped, then
# the remaining destination_qid values are collected per source qid.
#
# The per-site frames are combined with DataFrame.union rather than being
# converted to RDDs and rebuilt with spark.createDataFrame. The RDD round trip
# serializes every row through Python and re-infers the schema by sampling
# rows; the DataFrame union keeps the schema already known from parquet and
# stays inside the JVM.
all_links = None
for site in sites:
    site_anchors = spark.read.parquet("datasets/{}/anchors_info_qid.parquet".format(site))
    site_links = (
        site_anchors
        .filter("anchor not rlike '^[0-9]+$'")
        .groupBy("qid")
        .agg(collect_list("destination_qid").alias("links"))
        .select(lit(site).alias("site"), "qid", "links")
    )
    # union() matches columns by position; every per-site frame is selected
    # in the same (site, qid, links) order above.
    all_links = site_links if all_links is None else all_links.union(site_links)

# Retained for any cell that still refers to the RDD form; no job runs here.
all_links_rdd = all_links.rdd

all_links

DataFrame[site: string, qid: string, links: array<string>]

In [5]:
# all_links = spark.createDataFrame(all_links_rdd)
# all_links

DataFrame[site: string, qid: string, links: array<string>]

In [6]:
# Build one DataFrame listing every article with its `heldout` flag, tagged
# with the site it came from.
#
# The per-site frames are combined with DataFrame.union instead of unioning
# their RDDs and calling spark.createDataFrame, which would push every row
# through Python and re-infer the schema. The per-site frame also gets its own
# name so `qids` only ever refers to the combined result.
qids = None
for site in sites:
    site_qids = (
        spark.read.parquet("datasets/{}/heldout_documents.parquet".format(site))
        .select("*", lit(site).alias("site"))
    )
    # union() matches columns by position, so this relies on every site's
    # parquet file sharing the same column order.
    qids = site_qids if qids is None else qids.union(site_qids)

# Retained for any cell that still refers to the RDD form; no job runs here.
qids_rdd = qids.rdd

qids

DataFrame[qid: string, heldout: boolean, site: string]

In [7]:
# Split the article list on the `heldout` flag:
# FALSE -> articles used for training, TRUE -> articles kept aside for testing.
training_qids = qids.filter("heldout = FALSE")
testing_qids = qids.filter("heldout = TRUE")

In [8]:
# Keep only the link lists of articles flagged as training (heldout = FALSE),
# matching on both qid and site.
training = all_links.join(
    training_qids,
    (all_links.qid == training_qids.qid) & (all_links.site == training_qids.site),
)

# Append each article's own qid to its list of links (a "self loop").
# This is done with native column expressions instead of rdd.map +
# createDataFrame, which would serialize every row through Python and
# re-infer the schema. Columns are referenced through `all_links` because the
# join output contains `qid` and `site` twice.
# NOTE: concat() on array columns requires Spark >= 2.4.
training_self_loop = training.select(
    concat(all_links.links, array(all_links.qid)).alias("links")
)
training_self_loop

DataFrame[links: array<string>]

In [8]:
# Keep only the link lists of articles flagged as held out (heldout = TRUE),
# matching on both qid and site.
testing = all_links.join(
    testing_qids,
    (all_links.qid == testing_qids.qid) & (all_links.site == testing_qids.site),
)

# Append each article's own qid to its list of links (a "self loop") and keep
# qid/site so every row can be traced back to its article.
# This is done with native column expressions instead of rdd.map +
# createDataFrame, which would serialize every row through Python and
# re-infer the schema. Columns are referenced through `all_links` because the
# join output contains `qid` and `site` twice. The column order
# (links, qid, site) matches the schema this cell produced before.
# NOTE: concat() on array columns requires Spark >= 2.4.
testing_self_loop = testing.select(
    concat(all_links.links, array(all_links.qid)).alias("links"),
    all_links.qid.alias("qid"),
    all_links.site.alias("site"),
)
testing_self_loop

DataFrame[links: array<string>, qid: string, site: string]

In [9]:
# Persist the testing set as parquet, replacing any previous copy at this path.
(
    testing_self_loop
    .write
    .mode("overwrite")
    .parquet("models/OnlyLinks/testing_self_loop.parquet")
)

In [17]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.clustering import LDA

# Turn each article's list of linked QIDs into a count vector: the values in
# `links` are treated as tokens and the vectors are written to `features`.
# NOTE(review): no vocabSize/minDF is passed, so the library defaults apply;
# confirm they are adequate for the number of distinct QIDs in the data.
wordsVector = CountVectorizer().setInputCol("links").setOutputCol("features")

# Fit on the training set, then vectorize that same set.
transformer = wordsVector.fit(training_self_loop)
result = transformer.transform(training_self_loop)

# Mark the vectorized frame for caching; it is used by several later cells.
result.cache()

result

DataFrame[links: array<string>, features: vector]

In [18]:
# Number of rows in the vectorized training set. This is an action, so it forces
# `result` to be computed; because `result` was marked with .cache(), the
# computed rows are kept for the cells that follow.
result.count()

32635430

In [19]:
# Save the fitted CountVectorizer model, replacing any previous copy at this path.
(
    transformer
    .write()
    .overwrite()
    .save("models/OnlyLinks/transformer.model")
)

In [20]:
# Persist only the `features` vectors of the training set as parquet,
# replacing any previous copy.
# NOTE(review): the file name is spelled "traning_set"; it is left unchanged
# here because other code may read from this exact path.
(
    result
    .select("features")
    .write
    .mode("overwrite")
    .parquet("models/OnlyLinks/traning_set.parquet")
)