In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit, udf
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer
import string
import re

# Shared session used by every function in this notebook; getOrCreate()
# hands back an already-running session when one exists.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [20]:
def saveMergedPublishers():
    """Join all by-publisher articles with their ground truth and save them.

    The tokenized training and validation by-publisher article sets are
    stacked into one DataFrame, as are the two matching ground-truth XML
    files. Both are inner-joined on ``_id`` and the result is written as
    parquet to ``../dataset/merged/publisher``.

    The write uses ``mode="error"``, so the call raises if the target
    location already exists.
    """
    training_articles = spark.read.load('../dataset/tokened/articles-training-bypublisher/')
    validation_articles = spark.read.load('../dataset/tokened/articles-validation-bypublisher/')
    all_articles = training_articles.union(validation_articles)

    label_schema = StructType([
        StructField("_id", IntegerType(), False),
        StructField("_hyperpartisan", BooleanType(), False),
        StructField("_labeled-by", StringType(), False),
    ])

    def read_truth(xml_path):
        # One <article> element per row; rows containing nulls are dropped.
        xml_reader = spark.read.format('com.databricks.spark.xml').options(rowTag='article')
        return xml_reader.load(xml_path, schema=label_schema).dropna()

    training_truth = read_truth('../dataset/ground-truth-training-bypublisher-20181122.xml')
    validation_truth = read_truth('../dataset/ground-truth-validation-bypublisher-20181122.xml')
    all_truth = training_truth.union(validation_truth)

    labeled_articles = all_articles.join(all_truth, on='_id', how='inner')
    labeled_articles.write.save('../dataset/merged/publisher', format='parquet', mode="error")

In [21]:
def saveMergedArticles():
    """Join the by-article training set with its ground truth and save it.

    Reads the tokenized by-article training data and the matching
    ground-truth XML file, inner-joins them on ``_id`` and writes the
    result as parquet to ``../dataset/merged/article``.

    The write uses ``mode="error"``, so the call raises if the target
    location already exists.
    """
    articles = spark.read.load('../dataset/tokened/articles-training-byarticle/')

    label_schema = StructType([
        StructField("_id", IntegerType(), False),
        StructField("_hyperpartisan", BooleanType(), False),
        StructField("_labeled-by", StringType(), False),
    ])

    # One <article> element per row; rows containing nulls are dropped.
    xml_reader = spark.read.format('com.databricks.spark.xml').options(rowTag='article')
    truth = xml_reader.load(
        '../dataset/ground-truth-training-byarticle-20181122.xml',
        schema=label_schema,
    ).dropna()

    labeled_articles = articles.join(truth, on='_id', how='inner')
    labeled_articles.write.save('../dataset/merged/article', format='parquet', mode="error")

In [2]:
def saveMergedPublisherTestAndValidation():
    """Save the by-publisher training and validation sets as separate outputs.

    Each tokenized by-publisher article set is inner-joined on ``_id`` with
    its own ground-truth XML file and written as parquet:

    * training articles + training truth -> ``../dataset/merged/publisherTest``
    * validation articles + validation truth -> ``../dataset/merged/publisherValidation``

    NOTE(review): the *training* split is stored under the name
    ``publisherTest``; confirm that downstream readers expect this.

    Both writes use ``mode="error"``, so the call raises if a target
    location already exists.
    """
    training_articles = spark.read.load('../dataset/tokened/articles-training-bypublisher/')
    validation_articles = spark.read.load('../dataset/tokened/articles-validation-bypublisher/')

    label_schema = StructType([
        StructField("_id", IntegerType(), False),
        StructField("_hyperpartisan", BooleanType(), False),
        StructField("_labeled-by", StringType(), False),
    ])

    def read_truth(xml_path):
        # One <article> element per row; rows containing nulls are dropped.
        xml_reader = spark.read.format('com.databricks.spark.xml').options(rowTag='article')
        return xml_reader.load(xml_path, schema=label_schema).dropna()

    training_truth = read_truth('../dataset/ground-truth-training-bypublisher-20181122.xml')
    validation_truth = read_truth('../dataset/ground-truth-validation-bypublisher-20181122.xml')

    # All inputs are opened above before anything is written; the outputs
    # are then produced in this fixed order.
    outputs = (
        (training_articles, training_truth, '../dataset/merged/publisherTest'),
        (validation_articles, validation_truth, '../dataset/merged/publisherValidation'),
    )
    for articles, truth, target in outputs:
        labeled_articles = articles.join(truth, on='_id', how='inner')
        labeled_articles.write.save(target, format='parquet', mode="error")

In [22]:
def saveMergedEverything():
    """Combine the merged by-publisher and by-article sets into one dataset.

    Reads the parquet data at ``../dataset/merged/publisher/`` and
    ``../dataset/merged/article/``, stacks the two DataFrames and writes the
    result as parquet to ``../dataset/merged/everything``.

    The write uses ``mode="error"``, so the call raises if the target
    location already exists.

    Raises:
        pyspark.sql.utils.AnalysisException: if the two inputs do not have
            the same column names, or if the target already exists.
    """
    publisher_rows = spark.read.load('../dataset/merged/publisher/')
    article_rows = spark.read.load('../dataset/merged/article/')

    # DataFrame.union pairs columns purely by position, so two inputs whose
    # columns are ordered differently would be combined with values landing
    # in the wrong columns. The inputs here are produced by separate merge
    # steps, so match columns by name instead.
    everything = publisher_rows.unionByName(article_rows)
    everything.write.save('../dataset/merged/everything', format='parquet', mode="error")

In [23]:
# Run every merge step. Order matters: saveMergedEverything reads the
# '../dataset/merged/publisher/' and '../dataset/merged/article/' outputs
# that saveMergedPublishers and saveMergedArticles write.
# Each step writes with mode="error", so re-running this cell fails once
# the outputs already exist.
saveMergedPublishers()
saveMergedArticles()
saveMergedPublisherTestAndValidation()
saveMergedEverything()